# Set of parallelizable R functions to compute motif activities # largely inspired from MARA (Nimwegen lab, Basel) get_lambda_max <- function(Y, X, alpha) { ## Description: computes the max value of lambda such that all coefficients ## remain 0 when running a penalized linear regression of Y onto X with the ## glmnet package. Based on the coordinate descent udpate formula. ## Para: -Y: observation/response vector ## -X: predictor matrix ## -alpha: mixing parameter for the penalty term (0 -> ridge, 1 -> lasso) # compute the standard deviation with the 1/N formula (not the 1/(N-1)) formula: mysd <- function(y) sqrt(sum((y-mean(y))^2)/length(y)) stopifnot(length(Y)==dim(X)[1]) return ((1/(length(Y)*alpha))*max(abs(apply(scale(X, center = T, scale = apply(X, 2, mysd))*(Y), MARGIN=2, sum)))) } getActivities <- function(expr, X, alpha = 0.1, k = 5, nlambda=100, ncores = 1, log_id = '') { ## Description: estimate motif activities (transcription factors, TE subfamilies) ## by regressing each ## column of the expression matrix (1 col per sample) onto per promoter ## motif site counts. Multiple linear regression with the ## elastic net regularization is performed by the glmnet package. ## Para: -expr: row-centered TPM values in a promoter/gene (rows) x sample (columns) matrix. ## -X: per promoter/gene (rows) motif (columns) counts. Column-centered. Also referred to as N. ## -k: number of folds for cross-validation ## -alpha: trades off between ridge (alpha = 0) and lasso (alpha = 1) penalties ## -ncores: number of cores for multiprocessing (1 -> no multiprocessing) ## -nlambda: number of lambda values in the grid from lambda max to lambda min ## Details: We aim to find the value of lambda which minimizes the error ## over all samples. Therefore, we must provide the same grid of lambda values ## to each regression problem (one per sample). The max value of lambda is ## defined as the maximum value of lambda_max across samples. ## lambda_max for a single sample is given by the formula: ## lambda_max = 1/(nobs*alpha)*max_s(abs(dot product(Z(x_s), y_s))) where Z ## is a function that centers AND standardizes its input. ## Author: Cyril Pulver (cyril.pulver@epfl.ch) require(glmnet) require(Biobase) require(doParallel) print("Going parallel !") cl <- makeCluster(ncores, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse='')) n = ncol(expr) cat("Computing lambda max over ", n, " samples\n") lambda_max_list = parApply(cl, expr, MARGIN = 2, FUN = get_lambda_max, X, alpha = 0.1) # stopping the cluster to communicate the grid of lambda to the workers after computing it stopCluster(cl) cat("Building the lambda grid") lambda_max = max(lambda_max_list) log_lambda_grid = seq(from=log(lambda_max), to = log(lambda_max*1e-4), length.out = nlambda) lambda_values = exp(log_lambda_grid) cat("Regressing activities of TFs for ", n, " samples\n") # regularized least squares on single expression vector f <- function(y) { print("started new sample") return(cv.glmnet(x = X, y = y, alpha = alpha, nfolds = k, lambda = lambda_values, type.measure='mse')) } cl <- makeCluster(ncores, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse='')) res = list() res <- parApply(cl, expr, MARGIN = 2, FUN = f) # shutting down parallel clusters stopCluster(cl) cat("Finding the optimal lambda value across samples...\n") cat("\t... 1) Computing the per lambda MSE across samples\n") # mse estimates: rows as lambda values, and columns as samples errors = do.call('cbind', subListExtract(res, 'cvm')) # average over mse estimates across samples, one value per lambda in the grid lambda_mse = rowSums(errors)/ncol(errors) # keeping sd(mse) to plot lambda vs mse plot later lambda_mse_sd = apply(errors, MARGIN=1, sd) #retrieving the value of lambda that yields the minimum across sample mse lambda_min_idx = which.min(lambda_mse) lambda_min = lambda_values[lambda_min_idx] cat("\t... 2) retrieving the coefficients, crossval MSE, r_squared for the optimal lambda value\n") coefs = lapply(X = subListExtract(res, 'glmnet.fit'), FUN = predict, type='coefficients', s=lambda_min) # r_squared at the overall best lambda temp = subListExtract(res, 'glmnet.fit') r_squared_temp = subListExtract(temp, 'dev.ratio') r_squared = lapply(r_squared_temp, FUN = function(x) x[lambda_min_idx]) rm(list = 'temp') # r_squared max for each sample (to see how much we lose): r_squared_max = lapply(r_squared_temp, FUN = max) # best lambda for each sample (for the record) best_lambda = subListExtract(res, 'lambda.min') # reshaping results into a five-list object: ans$bestlambda, ans$r2, # ans$test_err_est, ans$test_err_se and ans$activities ans <- list() ans$r2 <- unlist(r_squared) ans$r2_max <- unlist(r_squared_max) ans$lambdas <- lambda_values ans$lambda_min_overall <- lambda_min ans$lambdas_best <- unlist(best_lambda) ans$lambda_mse <- unlist(lambda_mse) ans$lambda_mse_sd <- unlist(lambda_mse_sd) # Building the TF activities (rows) by samples (columns) table ans$activities <- do.call(cbind, coefs) colnames(ans$activities) <- names(coefs) return(ans) } plotActivities <- function(act, TF, metadata, sort_by, decreasing=F, palette_type='sequential', palette_add_black = T, palette_black_idx=1, title=NULL, act_lim=NULL) { ## Description: scatter plot of TF activities per sample ## Para: -act: TF x samples matrix, with row and col names set ## -TF: string for TF of interest, e.g "MYC" ## -metadata: samples x metadata dataframe. Metadata should be set ## as factors ## -sort_by: string corresponding to a column of metadata ## -decreasing: bool to sort samples by metadata[sort_by] in decr. order. ## -palette_type: string, one of c("sequential", "diverging", "categorical") ## -palette_add_black: bool to enable setting a color to black (useful ## for baselines) ## -palette_black_idx: category to plot in black ## -title: title if it is to be different than the TF name ## -act_lim: limits of activities to plot, useful to get comparable y axes ## Author: Cyril Pulver (cyril.pulver@epfl.ch) require(RColorBrewer) # to reset after fun old_pal = palette() old_par = par(no.readonly=T) # matching TF name to idx tf_idx = which(rownames(act)==TF) # sorting RNA seq samples by selected metadata sample_order_idx = seq(1, dim(act)[2]) if(!is.null(sort_by)){ sample_order_idx = order(metadata[sort_by], decreasing=decreasing) } # generating a palette new_pal = palette("default") # sorting by categorical factor -> n colors in the palette = n factors if(is.factor(metadata[[sort_by]])) { if (palette_type=='sequential'){ new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "OrRd") } else if (palette_type=='diverging') { new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "Spectral") } else if (palette_type=='qualitative') { new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "Set1") } } # In case of ref level chosen as black if (palette_add_black){ new_pal[palette_black_idx] = "black" } palette(new_pal) # enlarging margins for legend out of plot, forcing square plot area par(mai=c(0.2, 0.8, 0.4, 1.2), pty='s') # title main_title=title if (is.null(title)) { main_title=TF } plot(x = seq(from = 1, to = ncol(act)), y = act[tf_idx, sample_order_idx], col = metadata[[sort_by]][sample_order_idx], main = main_title, xlab = paste("Samples (ordered by ", sort_by, ")", sep=""), ylab = "Activity (deviation from average sample activity)", ylim=act_lim, cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2) legend("right", legend = levels(metadata[[sort_by]]), col = new_pal, pch = 'o', inset=c(-0.26, 0), box.lwd = 0, bty = "n", xpd=T, cex=1.2) abline(a = 0, b = 0, col = "blue", lty=2, lwd = 2) # resetting palette and par #palette(old_pal) #par(old_par) }