diff --git a/R_funcs.R b/R_funcs.R index c590423..b266059 100644 --- a/R_funcs.R +++ b/R_funcs.R @@ -1,61 +1,110 @@ corr_heatmap <- function(X) { ## Description: compute the pairwise correlation of matrix X columns ## and plot it as a heatmap. ## PARA: matrix X library(gplots) corr.pair <- cor(X) palette <- colorRampPalette(colors = c("blue", "white", "red")) ans <- heatmap.2(corr.pair, Rowv = NULL, Colv = NULL, dendrogram = "none", scale = "none", trace = "none", col = palette, key.title = "Pearson cor. coef.") ans } get_tpm <- function(x, org=c('hg19', 'mm9', 'rheMac8')) { ## Description: Compute TPM from a table of counts. ## Para: -x: table with raw counts, ensembl id as rownames ## -org: organism. countTable = x require(GenomicFeatures) org <- match.arg(org) ens <- makeTxDbFromUCSC(genome=org, tablename='ensGene', url = 'http://genome-euro.ucsc.edu/cgi-bin/', goldenPath_url = 'http://hgdownload.cse.ucsc.edu/goldenPath') exonic <- exonsBy(ens, by='gene') red.exonic <- reduce(exonic) exon.lengths <- sum(width(red.exonic)) names(exon.lengths) <- unlist(lapply(strsplit(names(exon.lengths), '\\.'), function(x) x[[1]])) exon.lengths.o <- exon.lengths[rownames(countTable)] length_norm <- countTable / exon.lengths.o ans <- t(t(length_norm)/colSums(length_norm)) * 1e6 # see https://rpubs.com/bbolker/sweep_divide ans } get_high_leverage_lm <- function(model, thresh){ ## Description: identify indices of high leverage observations in linear regression model ## Para: -model: linear regression model trained with lm() ## -thresh: high leverage threshold (mult. of expected lev. stat.) # expected leverage (p+1)/n, ref ISLR) n_coef = nrow(coef(summary(model))) n_obs = length(summary(model)$residuals) exp_lev = (n_coef + 1) / n_obs thresh_lev = thresh * exp_lev # identifying high leverage observations levs = hatvalues(model) high_levs = which(levs>thresh_lev) high_levs } get_aliased_preds_lm <- function(model) { ## Description: identify aliased predictors (causing singularities) in a linear ## regression model trained by lm() ## Para: -model: linear regression model trained with lm() aliases = alias(model) aliased_preds = row.names(aliases$Complete) aliased_preds } + +# this function is pure bullshit and is only there to keep a souvenir of my +# previous misunderstanding of 1) high leverage points and 2) collinear predictors +# in least square regression with or without regularization. In particular, +# high leverage points do NOT require calculating the model (of course, they only +# depend on the predictor matrix). Thus I do NOT need to care whether there are +# singularities in the model to get my leverage points. I fooled myself because of +# blind trust into the hatvalues() function (presented in ISLR), when I can instead +# use hat(). +purge_aliases_and_high_leverage_lm <- function(f, reg_mat, thresh){ + ## Description: identify aliased predictors (causing singularities), remove them, + ## identify high leverage observations, remove them, identify new aliased predictors, + ## remove them, identify high leverage observations, remove them until no + ## aliases or high leverage observations are left. Returns a list for + ## high leverage points and another one for aliases. + ## Para: -f: formula to train lm() + ## -reg_mat: dataframe containing observations and predictors as columns + ## -thresh: high leverage threshold (mult. of expected lev. stat.) + + #initialization + i = 1 + aliased_preds = c() + high_levs = c() + repeat{ + # identification and removal of aliased predictors + cat("Training model nr", i, "\n") + model = lm(f, data = reg_mat, singular.ok = T) + new_aliased_preds = get_aliased_preds_lm(model) + aliased_preds = append(aliased_preds, new_aliased_preds) + cat("New aliased preds: ", new_aliased_preds, "\n") + #updating reg_mat + reg_mat = reg_mat[, which(!names(reg_mat) %in% aliased_preds)] + + #identification and removal of high leverage observations + model = lm(f, data = reg_mat, singular.ok = F) + new_high_levs = get_high_leverage_lm(model, thresh) + cat("New high leverage observations: ", names(new_high_levs), "\n") + + reg_mat = reg_mat[-new_high_levs, ] + high_levs = append(high_levs, names(new_high_levs)) + + if(length(new_high_levs) == 0) { + break + } + i = i+1 + } + ans = list(aliased_preds, high_levs) + ans +}