diff --git a/R_funcs.R b/R_funcs.R index f438b6e..9d62134 100644 --- a/R_funcs.R +++ b/R_funcs.R @@ -1,163 +1,161 @@ corr_heatmap <- function(X) { ## Description: compute the pairwise correlation of matrix X columns ## and plot it as a heatmap. ## PARA: matrix X library(gplots) corr.pair <- cor(X) palette <- colorRampPalette(colors = c("blue", "white", "red")) ans <- heatmap.2(corr.pair, Rowv = NULL, Colv = NULL, dendrogram = "none", scale = "none", trace = "none", col = palette, key.title = "Pearson cor. coef.") ans } get_tpm <- function(x, org=c('hg19', 'mm9', 'rheMac8')) { ## Description: Compute TPM from a table of counts. ## Para: -x: table with raw counts, ensembl id as rownames ## -org: organism. countTable = x require(GenomicFeatures) org <- match.arg(org) ens <- makeTxDbFromUCSC(genome=org, tablename='ensGene', url = 'http://genome-euro.ucsc.edu/cgi-bin/', goldenPath_url = 'http://hgdownload.cse.ucsc.edu/goldenPath') exonic <- exonsBy(ens, by='gene') red.exonic <- reduce(exonic) exon.lengths <- sum(width(red.exonic)) names(exon.lengths) <- unlist(lapply(strsplit(names(exon.lengths), '\\.'), function(x) x[[1]])) exon.lengths.o <- exon.lengths[rownames(countTable)] length_norm <- countTable / exon.lengths.o ans <- t(t(length_norm)/colSums(length_norm)) * 1e6 # see https://rpubs.com/bbolker/sweep_divide ans } get_high_leverage_lm <- function(X, thresh){ ## Description: identify indices of high leverage observations in linear regression ## Para: -X: predictor matrix (predictors = columns) ## -thresh: high leverage threshold (mult. of expected lev. stat.) # expected leverage (p+1)/n, ref ISLR) p = ncol(X) n = nrow(X) exp_lev = (p + 1) / n thresh_lev = thresh * exp_lev # identifying high leverage observations levs = hat(X) high_levs = which(levs>thresh_lev) high_levs } get_aliased_preds_lm <- function(model) { ## Description: identify aliased predictors (causing singularities) in a linear ## regression model trained by lm() ## Para: -model: linear regression model trained with lm() aliases = alias(model) aliased_preds = row.names(aliases$Complete) aliased_preds } # this function is pure bullshit and is only there to keep a souvenir of my # previous misunderstanding of 1) high leverage points and 2) collinear predictors # in least square regression with or without regularization. In particular, # high leverage points do NOT require calculating the model (of course, they only # depend on the predictor matrix). Thus I do NOT need to care whether there are # singularities in the model to get my leverage points. I fooled myself because of # blind trust into the hatvalues() function (presented in ISLR), when I can instead # use hat(). purge_aliases_and_high_leverage_lm <- function(f, reg_mat, thresh){ ## Description: identify aliased predictors (causing singularities), remove them, ## identify high leverage observations, remove them, identify new aliased predictors, ## remove them, identify high leverage observations, remove them until no ## aliases or high leverage observations are left. Returns a list for ## high leverage points and another one for aliases. ## Para: -f: formula to train lm() ## -reg_mat: dataframe containing observations and predictors as columns ## -thresh: high leverage threshold (mult. of expected lev. stat.) #initialization i = 1 aliased_preds = c() high_levs = c() repeat{ # identification and removal of aliased predictors cat("Training model nr", i, "\n") model = lm(f, data = reg_mat, singular.ok = T) new_aliased_preds = get_aliased_preds_lm(model) aliased_preds = append(aliased_preds, new_aliased_preds) cat("New aliased preds: ", new_aliased_preds, "\n") #updating reg_mat reg_mat = reg_mat[, which(!names(reg_mat) %in% aliased_preds)] #identification and removal of high leverage observations model = lm(f, data = reg_mat, singular.ok = F) new_high_levs = get_high_leverage_lm(model, thresh) cat("New high leverage observations: ", names(new_high_levs), "\n") reg_mat = reg_mat[-new_high_levs, ] high_levs = append(high_levs, names(new_high_levs)) if(length(new_high_levs) == 0) { break } i = i+1 } ans = list(aliased_preds, high_levs) ans } -get_activities <- function(expr, X, alpha = 0.1, k = 5, ncores = 1) { +get_activities <- function(expr, X, alpha = 0.1, k = 5, + ncores = 1) { ## Description: estimate transcription factor activities by regressing each ## column of the expression matrix (1 col per sample) onto per promoter ## transcription factor binding sites. Multiple linear regression with the ## elastic net regularization is performed by the glmnet package. ## Para: -expr: row-centered TPM values in a gene (rows) x sample (columns) matrix. ## -X: per promoter (rows) TFBS (columns) counts. Column-centered. ## -k: number of folds for cross-validation ## -alpha: trades off between ridge (alpha = 0) and lasso (alpha = 1) penalties ## -ncores: number of cores for multiprocessing (1 -> no multiprocessing) require(glmnet) + require(Biobase) + require(doParallel) + print("Going parallel !") + cl <- makeCluster(ncores, type="FORK", outfile="./out.txt") - # enabling multiprocessing for glmnet - parallel = F - - if(ncores > 1) { - print("Going parallel !") - require(doMC) - registerDoMC(cores = ncores) - parallel = T - } - i = 1 n = ncol(expr) cat("Regressing activities of TFs for ", n, " samples\n") # regularized least squares on single expression vector f <- function(y) { - cat("sample ", i," / ", n, "\n") + print("started new sample") cv.out = cv.glmnet(x = X, y = y, alpha = alpha, - nfolds = k, parallel = parallel) + nfolds = k) coefs = predict(cv.out$glmnet.fit, type = "coefficients", s = cv.out$lambda.min) r_squared = cv.out$glmnet.fit$dev.ratio[which(cv.out$glmnet.fit$lambda == cv.out$lambda.min)] - i <<- i + 1 list("coefs" = coefs, "r_squared" = r_squared) } - res <- apply(expr, MARGIN = 2, FUN = f) + res = list() + res <- parApply(cl, expr, MARGIN = 2, FUN = f) + + # shutting down parallel clusters + stopCluster(cl) # renaming activities as they were entered at first (removing the "X" # added by default by glmnet names(res) = substr(names(res), 2, max(sapply(names(res), nchar))) # reshaping results into a two-list object: ans$r2 and ans$activities ans <- list() ans$r2 <- as.vector(subListExtract(res, "r_squared", simplify = T)) # Building the TF activities (rows) by samples (columns) table ans$activities <- t(do.call(rbind, lapply(subListExtract(res, "coefs"), as.vector))) rownames(ans$activities) <- rownames(subListExtract(res, "coefs")[[1]]) - + ans }