diff --git a/estimateRMSE.R b/estimateRMSE.R
old mode 100755
new mode 100644
index 26a98cc..8cd2524
--- a/estimateRMSE.R
+++ b/estimateRMSE.R
@@ -1,76 +1,76 @@
-estimateRMSE <- function(E_centered, N, treatment_group, control_group, idx_training, idx_test) {
-  # Estimates the differential activities of TE subfamilies by multiple regression
-  # Uses the activities estimated on the training set to estimate the root mean squared error (RMSE) on the test set
-
-  # E centered is the matrix of gene expression values with genes as rows and samples as columns
-  # N is the predictor matrix with genes as rows and TE subfamilies as columns
-  # treatment_group is vector with the name of treated samples (correspond to columns in E_centered)
-  # control_group: same for control samples
-  # idx_training defines the training set
-  # idx_test defines the test set
+estimateRMSE <- function(E_centered, N, treatment_group, control_group, idx_training, idx_val) {
+  ## Description: estimates the differential activities of TE subfamilies by multiple regression,
+  # and uses the activities estimated on the training observations to estimate the root mean squared error (RMSE) on the validation observations
+  ## Para: -E_centered: matrix of normalized gene expression values with genes as rows and samples as columns
+  #-N: TE cis-regulatory predictor matrix with genes as rows and TE subfamilies as columns
+  #-treatment_group: vector with the names of treated samples (must correspond to columns in E_centered)
+  #-control_group: vector with the names of control samples (must correspond to columns in E_centered)
+  #-idx_training: indexes the training observations
+  #-idx_val: indexes the validation observations
+
+  print('Starting new sample')

   # defensive programming
   stopifnot(dim(E_centered)[1]==dim(N)[1])
   stopifnot(all(sapply(treatment_group, function(x) x %in% colnames(E_centered))))
   stopifnot(all(sapply(control_group, function(x) x %in% colnames(E_centered))))
   stopifnot(all(sapply(control_group, function(x) ! x %in% treatment_group)))
-  stopifnot(all(idx_training != idx_test))
+  stopifnot(all(idx_training != idx_val))

   # separating the training from the test set
   E_training = E_centered[idx_training, ]
-  E_test = E_centered[idx_test, ]
+  E_test = E_centered[idx_val, ]
   N_training = N[idx_training, ]
-  N_test = N[idx_test, ]
+  N_test = N[idx_val, ]

   # training set
   # generating the difference in expression vector
   control = c(sapply(control_group, (function(x) E_training[, x])))
   treatment = c(sapply(treatment_group, (function(x) E_training[, x])))
   delta = treatment-control

   # generating the augmented N matrix to match the dimensions of delta
   n_replicates = length(control_group)
   N_augmented = N_training
   i = 1
   while (i < n_replicates) {
     N_augmented = rbind(N_augmented, N_training)
     i = i+1
   }

   # assembling the design matrix, regressing delta onto all columns in N_augmented
   design_matrix = as.data.frame(cbind(delta, N_augmented))
   fit = lm(delta ~ ., data = design_matrix)

   # test set
   # generating the difference in expression vector
   control = c(sapply(control_group, (function(x) E_test[, x])))
   treatment = c(sapply(treatment_group, (function(x) E_test[, x])))
   delta = treatment-control

   # generating the augmented N matrix to match the dimensions of delta
   n_replicates = length(control_group)
   N_augmented = N_test
   i = 1
   while (i < n_replicates) {
     N_augmented = rbind(N_augmented, N_test)
     i = i+1
   }

   # assembling the design matrix, predicting delta from all columns in N_augmented
   design_matrix = as.data.frame(N_augmented)
   pred = predict(fit, newdata=design_matrix)

   rmse = sqrt(mean((delta-pred)^2))
   return(rmse)
 }
\ No newline at end of file
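A minimal usage sketch of the updated signature, on hypothetical toy matrices (names and dimensions are illustrative only):

set.seed(1)
E_centered <- matrix(rnorm(200), nrow = 50,
                     dimnames = list(paste0("gene", 1:50), c("ctl1", "ctl2", "trt1", "trt2")))
N <- matrix(rpois(150, lambda = 2), nrow = 50,
            dimnames = list(rownames(E_centered), paste0("TE", 1:3)))
idx_training <- seq_len(50) <= 35   # genes 1-35 fit the model, the rest validate it
estimateRMSE(E_centered, N,
             treatment_group = c("trt1", "trt2"),
             control_group = c("ctl1", "ctl2"),
             idx_training = idx_training, idx_val = !idx_training)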
diff --git a/estimate_rmse_crossval.R b/estimate_rmse_crossval.R
new file mode 100644
index 0000000..002d9f2
--- /dev/null
+++ b/estimate_rmse_crossval.R
@@ -0,0 +1,25 @@
+estimate_rmse_crossval <- function(preprocessed, control_group, treatment_group,
+                                   N_list, N_metadata,
+                                   folds, f) {
+  ## Description: computes the RMSE in the context of a parallelizable cross-validation scheme
+  ## Para: -preprocessed: data produced by preprocess_E_N_for_activities()
+  #-control_group: vector containing the names of the control samples (column names in preprocessed$E_centered)
+  #-treatment_group: vector containing the names of the treatment samples (column names in preprocessed$E_centered)
+  #-N_list: list of the different N matrices. Rows must already correspond to those of preprocessed$E_centered
+  #-N_metadata: dataframe with a number of rows equal to the number of elements in N_list
+  #-folds: vector of the same length as the number of observations in E_centered, with integers from 1:nFolds, each defining a fold
+  #-f: fold index for which to estimate the RMSE
+
+  # fold f serves as the training set; all remaining observations form the validation set
+  idx_training = folds==f
+  idx_test = !idx_training
+
+  # initializing for temporary results
+  rmse = N_metadata
+  rmse$fold = as.integer(0)
+  rmse$rmse = 0
+  rmse['fold'] = f
+
+  for(i in 1:nrow(N_metadata)) {
+    rmse[[i, 'rmse']] = estimateRMSE(preprocessed$E_centered, N=N_list[[i]], treatment_group = treatment_group, control_group=control_group, idx_training = idx_training, idx_val = idx_test)
+  }
+  return(rmse)
+}
\ No newline at end of file
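Continuing the toy data above, a sketch of evaluating one fold; estimateRMSE.R must be sourced first, and N_list/N_metadata are hypothetical single-candidate stand-ins:

preprocessed <- list(E_centered = E_centered)  # mimics preprocess_E_N_for_activities() output
N_list <- list(N)                              # one candidate predictor matrix
N_metadata <- data.frame(name = "all_subfams")
folds <- sample(rep(1:5, length.out = nrow(E_centered)))
estimate_rmse_crossval(preprocessed,
                       control_group = c("ctl1", "ctl2"),
                       treatment_group = c("trt1", "trt2"),
                       N_list, N_metadata, folds, f = 1)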
diff --git a/getActivities.R b/getActivities.R
index c5af10c..fdda869 100644
--- a/getActivities.R
+++ b/getActivities.R
@@ -1,231 +1,225 @@
 # Set of parallelizable R functions to compute motif activities
 # largely inspired from MARA (Nimwegen lab, Basel)

 get_lambda_max <- function(Y, X, alpha) {
   ## Description: computes the max value of lambda such that all coefficients
   ## remain 0 when running a penalized linear regression of Y onto X with the
   ## glmnet package. Based on the coordinate descent update formula.
   ## Para: -Y: observation/response vector
   ##       -X: predictor matrix
   ##       -alpha: mixing parameter for the penalty term (0 -> ridge, 1 -> lasso)

   # compute the standard deviation with the 1/N formula (not the 1/(N-1) formula):
   mysd <- function(y) sqrt(sum((y-mean(y))^2)/length(y))
   stopifnot(length(Y)==dim(X)[1])
   return ((1/(length(Y)*alpha))*max(abs(apply(scale(X, center = T, scale = apply(X, 2, mysd))*(Y), MARGIN=2, sum))))
 }

 getActivities <- function(expr, X, alpha = 0.1, k = 5, nlambda=100,
-                          ncores = 1, log_id = '', standardize=T, large_N=F) {
+                          ncores = 1, log_id = '') {
   ## Description: estimate motif activities (transcription factors, TE subfamilies)
   ## by regressing each column of the expression matrix (1 col per sample)
   ## onto per promoter motif site counts. Multiple linear regression with the
   ## elastic net regularization is performed by the glmnet package.
   ## Para: -expr: row-centered TPM values in a promoter/gene (rows) x sample (columns) matrix.
   ##       -X: per promoter/gene (rows) motif (columns) counts. Column-centered. Also referred to as N.
   ##       -k: number of folds for cross-validation
   ##       -alpha: trades off between ridge (alpha = 0) and lasso (alpha = 1) penalties
   ##       -ncores: number of cores for multiprocessing (1 -> no multiprocessing)
   ##       -nlambda: number of lambda values in the grid from lambda max to lambda min
   ## Details: We aim to find the value of lambda which minimizes the error
   ## over all samples. Therefore, we must provide the same grid of lambda values
   ## to each regression problem (one per sample). The max value of lambda is
   ## defined as the maximum value of lambda_max across samples.
   ## lambda_max for a single sample is given by the formula:
   ## lambda_max = 1/(nobs*alpha)*max_s(abs(dot product(Z(x_s), y_s))) where Z
   ## is a function that centers AND standardizes its input.
   ## Author: Cyril Pulver (cyril.pulver@epfl.ch)

   require(glmnet)
   require(Biobase)
   require(doParallel)

   print("Going parallel !")
-  ncores_lambda_max = ncores
-
-  if(large_N) {
-    ncores_lambda_max=1
-  }
-
-  cl <- makeCluster(ncores_lambda_max, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse=''))
+  cl <- makeCluster(ncores, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse=''))

   n = ncol(expr)
   cat("Computing lambda max over ", n, " samples\n")
   lambda_max_list = parApply(cl, expr, MARGIN = 2, FUN = get_lambda_max, X, alpha = alpha)

   # stopping the cluster to communicate the grid of lambda to the workers after computing it
   stopCluster(cl)

   cat("Building the lambda grid\n")
   lambda_max = max(lambda_max_list)
   log_lambda_grid = seq(from=log(lambda_max), to = log(lambda_max*1e-4), length.out = nlambda)
   lambda_values = exp(log_lambda_grid)

   cat("Regressing activities of TFs for ", n, " samples\n")
   # regularized least squares on single expression vector
   f <- function(y) {
     print("started new sample")
     return(cv.glmnet(x = X, y = y, alpha = alpha,
-                     nfolds = k, lambda = lambda_values, type.measure='mse', standardize=standardize))
+                     nfolds = k, lambda = lambda_values, type.measure='mse'))
   }

   cl <- makeCluster(ncores, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse=''))
   res = list()
   res <- parApply(cl, expr, MARGIN = 2, FUN = f)

   # shutting down parallel clusters
   stopCluster(cl)

   cat("Finding the optimal lambda value across samples...\n")
   cat("\t... 1) Computing the per lambda MSE across samples\n")
   # mse estimates: rows as lambda values, and columns as samples
   errors = do.call('cbind', subListExtract(res, 'cvm'))
   # average over mse estimates across samples, one value per lambda in the grid
   lambda_mse = rowSums(errors)/ncol(errors)
   # keeping sd(mse) to plot lambda vs mse plot later
   lambda_mse_sd = apply(errors, MARGIN=1, sd)
   # retrieving the value of lambda that yields the minimum across sample mse
   lambda_min_idx = which.min(lambda_mse)
   lambda_min = lambda_values[lambda_min_idx]

   cat("\t... 2) retrieving the coefficients, crossval MSE, r_squared for the optimal lambda value\n")
   coefs = lapply(X = subListExtract(res, 'glmnet.fit'), FUN = predict, type='coefficients', s=lambda_min)

   # r_squared at the overall best lambda
   temp = subListExtract(res, 'glmnet.fit')
   r_squared_temp = subListExtract(temp, 'dev.ratio')
   r_squared = lapply(r_squared_temp, FUN = function(x) x[lambda_min_idx])
   rm(list = 'temp')
   # r_squared max for each sample (to see how much we lose):
   r_squared_max = lapply(r_squared_temp, FUN = max)
   # best lambda for each sample (for the record)
   best_lambda = subListExtract(res, 'lambda.min')

   # reshaping results into a five-list object: ans$bestlambda, ans$r2,
   # ans$test_err_est, ans$test_err_se and ans$activities
   ans <- list()
   ans$r2 <- unlist(r_squared)
   ans$r2_max <- unlist(r_squared_max)
   ans$lambdas <- lambda_values
   ans$lambda_min_overall <- lambda_min
   ans$lambdas_best <- unlist(best_lambda)
   ans$lambda_mse <- unlist(lambda_mse)
   ans$lambda_mse_sd <- unlist(lambda_mse_sd)

   # Building the TF activities (rows) by samples (columns) table
   ans$activities <- do.call(cbind, coefs)
   colnames(ans$activities) <- names(coefs)

   return(ans)
 }

 plotActivities <- function(act, TF, metadata, sort_by, decreasing=F,
                            palette_type='sequential', palette_add_black = T,
                            palette_black_idx=1, title=NULL, act_lim=NULL) {
   ## Description: scatter plot of TF activities per sample
   ## Para: -act: TF x samples matrix, with row and col names set
   ##       -TF: string for TF of interest, e.g. "MYC"
   ##       -metadata: samples x metadata dataframe. Metadata should be set as factors
   ##       -sort_by: string corresponding to a column of metadata
   ##       -decreasing: bool to sort samples by metadata[sort_by] in decr. order.
   ##       -palette_type: string, one of c("sequential", "diverging", "qualitative")
   ##       -palette_add_black: bool to enable setting a color to black (useful for baselines)
   ##       -palette_black_idx: category to plot in black
   ##       -title: title if it is to be different than the TF name
   ##       -act_lim: limits of activities to plot, useful to get comparable y axes
   ## Author: Cyril Pulver (cyril.pulver@epfl.ch)

   require(RColorBrewer)

   # to reset after fun
   old_pal = palette()
   old_par = par(no.readonly=T)

   # matching TF name to idx
   tf_idx = which(rownames(act)==TF)

   # sorting RNA seq samples by selected metadata
   sample_order_idx = seq(1, dim(act)[2])
   if(!is.null(sort_by)){
     sample_order_idx = order(metadata[sort_by], decreasing=decreasing)
   }

   # generating a palette
   new_pal = palette("default")
   # sorting by categorical factor -> n colors in the palette = n factors
   if(is.factor(metadata[[sort_by]])) {
     if (palette_type=='sequential'){
       new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "OrRd")
     } else if (palette_type=='diverging') {
       new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "Spectral")
     } else if (palette_type=='qualitative') {
       new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "Set1")
     }
   }

   # In case of ref level chosen as black
   if (palette_add_black){
     new_pal[palette_black_idx] = "black"
   }
   palette(new_pal)

   # enlarging margins for legend out of plot, forcing square plot area
   par(mai=c(0.2, 0.8, 0.4, 1.2), pty='s')

   # title
   main_title=title
   if (is.null(title)) {
     main_title=TF
   }

   plot(x = seq(from = 1, to = ncol(act)),
        y = act[tf_idx, sample_order_idx],
        col = metadata[[sort_by]][sample_order_idx],
        main = main_title,
        xlab = paste("Samples (ordered by ", sort_by, ")", sep=""),
        ylab = "Activity (deviation from average sample activity)",
        ylim=act_lim,
        cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2)
   legend("right", legend = levels(metadata[[sort_by]]),
          col = new_pal, pch = 'o', inset=c(-0.26, 0),
          box.lwd = 0, bty = "n", xpd=T, cex=1.2)
   abline(a = 0, b = 0, col = "blue", lty=2, lwd = 2)

   # resetting palette and par
   #palette(old_pal)
   #par(old_par)
 }
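A quick sanity check for get_lambda_max, assuming glmnet is installed and getActivities.R is sourced: if the formula indeed reproduces glmnet's internal lambda_max (with its default standardize = TRUE), all penalized coefficients should already be zero at that penalty.

library(glmnet)
set.seed(7)
X <- matrix(rnorm(100 * 10), nrow = 100)
y <- rnorm(100)
lmax <- get_lambda_max(y, X, alpha = 0.1)
fit <- glmnet(X, y, alpha = 0.1, lambda = lmax)  # single-lambda fit, for checking only
all(abs(coef(fit)[-1]) < 1e-8)                   # expected TRUE: only the intercept survives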
diff --git a/get_exonic_lengths.R b/get_exonic_lengths.R
new file mode 100644
index 0000000..4bdd409
--- /dev/null
+++ b/get_exonic_lengths.R
@@ -0,0 +1,16 @@
+get_exonic_lengths <- function(org='hg19') {
+  ## Description: returns the cumulative exonic length for each gene
+  ## Para: -org: organism
+  require(GenomicFeatures)
+  org <- match.arg(org)
+  ens <- makeTxDbFromUCSC(genome=org,
+                          tablename='ensGene',
+                          url = 'http://genome-euro.ucsc.edu/cgi-bin/')
+  exonic <- exonsBy(ens, by='gene')
+  # merging overlapping exons so shared bases are counted only once
+  red.exonic <- reduce(exonic)
+  exon.lengths <- sum(width(red.exonic))
+  # Removing ENSEMBL gene versions
+  names(exon.lengths) <- unlist(lapply(strsplit(names(exon.lengths), '\\.'), function(x) x[[1]]))
+  return(exon.lengths)
+}
\ No newline at end of file
diff --git a/get_pseudocounts.R b/get_pseudocounts.R
index 73d0af0..ed3b9bd 100644
--- a/get_pseudocounts.R
+++ b/get_pseudocounts.R
@@ -1,14 +1,15 @@
 get_pseudocounts <- function(count_table, q=0.05) {
-  ## compute pseudocounts on a gene count matrix (genes are rows, samples are columns)
+  ## Description: compute pseudocounts on a gene count matrix (genes are rows, samples are columns)
+  # to replace zero values before logging

-  ## count_table: gene count table with rows as genes, columns as samples
-  ## q: quantile of non-zero values used as a pseudocount in each sample (default: quantile = 0.05)
+  ## Para: -count_table: gene count table with rows as genes, columns as samples
+  #-q: quantile of non-zero values used as a pseudocount in each sample (default: quantile = 0.05)

   stopifnot(typeof(count_table)=='integer' & is.matrix(count_table))
   pseudocounts = apply(X = count_table, MARGIN = 2, FUN = function(x) quantile(x[x!=0], q))

   # Broadcasting only works correctly by rows in R, so when broadcasting by columns
   # one has to transpose to broadcast by rows (instead of cols), do the operation
   # and then transpose again to get back to the original conformation.
   return(t(t(count_table) + pseudocounts))
 }
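A toy illustration of the double-transpose broadcast used in get_pseudocounts; sweep() is the more self-documenting equivalent (values hypothetical):

cnt <- matrix(c(0L, 2L, 4L, 0L, 10L, 20L), nrow = 3)  # integer counts, 3 genes x 2 samples
ps <- get_pseudocounts(cnt, q = 0.05)
pseudocounts <- apply(cnt, 2, function(x) quantile(x[x != 0], 0.05))
all(ps == sweep(cnt, MARGIN = 2, STATS = pseudocounts, FUN = "+"))  # TRUE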
diff --git a/get_tpm_local.R b/get_tpm_local.R
new file mode 100644
index 0000000..60d314b
--- /dev/null
+++ b/get_tpm_local.R
@@ -0,0 +1,44 @@
+get_tpm_local <- function(countTable, org='hg19', exonic_lengths_path=NULL, redownload_exons=FALSE) {
+
+  ## Description: computes TPM from a count table
+  ## Para: -countTable: table with ENSEMBL gene IDs as row names, sample IDs as columns
+  #-org: organism (defaults to 'hg19')
+  #-exonic_lengths_path: path to where computed exonic lengths are stored, to avoid recomputing them every time
+  #-redownload_exons: forces the download and computation of exon lengths from scratch
+
+  # Defensive programming
+  stopifnot(is.matrix(countTable))
+
+  if(is.null(exonic_lengths_path)) {
+    exonic_lengths_path = paste0('../data/temp/', org, '_exon_lengths.RDS')
+  }
+
+  # checking for the existence of a previously generated exon_length vector
+  if(!dir.exists(dirname(exonic_lengths_path))){
+    dir.create(dirname(exonic_lengths_path), showWarnings = FALSE)
+  }
+
+  exonic_lengths = NULL
+  if(!file.exists(exonic_lengths_path) || redownload_exons) {
+    unlink(exonic_lengths_path)  # no-op if the file does not exist yet
+    exonic_lengths = get_exonic_lengths(org)
+    saveRDS(exonic_lengths, exonic_lengths_path)
+  } else {
+    exonic_lengths = readRDS(exonic_lengths_path)
+  }
+
+  # normalizing to TPM
+
+  # filtering of genes present in the countTable is done at this stage, therefore the full table should be stored somewhere.
+  exonic_lengths_filtered <- exonic_lengths[rownames(countTable)]
+
+  print(head(exonic_lengths_filtered))  # quick QC glance at the matched lengths
+
+  # normalizing for gene length
+  length_norm <- countTable / exonic_lengths_filtered
+
+  # normalizing for library size
+  ans <- t(t(length_norm)/colSums(length_norm)) * 1e6 # see https://rpubs.com/bbolker/sweep_divide
+  return(ans)
+}
\ No newline at end of file
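The two normalization steps of get_tpm_local, isolated on toy numbers (illustrative only; the real function looks exonic lengths up by ENSEMBL gene ID):

counts <- matrix(c(100, 200, 300, 600), nrow = 2)  # 2 genes x 2 samples
len <- c(1000, 2000)                               # exonic lengths in bp
rate <- counts / len                               # reads per base
tpm <- t(t(rate) / colSums(rate)) * 1e6            # scale each sample to one million
colSums(tpm)                                       # both columns sum to 1e6 by construction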
diff --git a/plot_signif_subfams.R b/plot_signif_subfams.R
new file mode 100644
index 0000000..a8f0317
--- /dev/null
+++ b/plot_signif_subfams.R
@@ -0,0 +1,75 @@
+plot_signif_subfams <- function(res, alpha, n_label=NULL, main=NULL, n_subplots=1, idx=1) {
+  ## Description: volcano plot with TE subfamily activities on the x axis, activity coefficient significance on the y axis
+  ## Para: -res: result file returned by getSignifActivities.R
+  #-alpha: BH-adjusted threshold for significance (black circles)
+  #-n_label: number of subfamilies to label on the plot, starting from the most significant subfamily
+  #-main: title of the plot
+  #-n_subplots: when plotting several plots on the same figure (e.g. par(mfrow=c(2, 2))), we only reset par() after the last subplot is drawn
+  #-idx: belongs to the interval [1, n_subplots] and triggers the reset of par() when it equals n_subplots
+
+  stopifnot(alpha <= 1 & alpha >= 0)
+  stopifnot(idx <= n_subplots)
+  stopifnot(n_subplots >= 1)
+
+  # saving current plot margins
+  old_par = par(no.readonly = TRUE)
+  if(idx == n_subplots) {
+    on.exit(par(old_par))
+  }
+
+  # changing margin sizes
+  par(mar=c(4.1, 4.5, 2.5, 1))
+
+  # identifying significant coefficients (will be plotted in black, not grey)
+  signif = which(res$coefs$p_adj <= alpha)
+  coefs_signif = res$coefs[signif,]
+  coefs_non_signif = res$coefs[setdiff(seq_len(nrow(res$coefs)), signif),]
+
+  coef_labels = NULL
+  largest_label_length = NULL
+  if (! is.null(n_label)) {
+    coef_labels = coefs_signif[head(order(coefs_signif$p_adj), n_label), ]
+    largest_label_length = max(sapply(rownames(coef_labels), nchar))
+  }
+
+  # correcting the plot margins for the error bars and the labels to fit in
+  std_offset = 1.96*coefs_signif$`Std. Error`
+  std_offset_margin = max(std_offset)
+
+  # empty plot with limits
+  x_min = min(res$coefs$Estimate)
+  x_max = max(res$coefs$Estimate)
+  x_margin = 0
+  if (! is.null(n_label)) {
+    x_margin = max(std_offset_margin, largest_label_length/300)
+  } else {
+    x_margin = std_offset_margin
+  }
+
+  y_min = 0
+  y_max = max(na.omit((-log10(res$coefs[, 'p_adj']))))
+  y_margin = 0.04*(y_max-y_min)
+
+  plot(NULL, type='n', xlim = c(x_min - x_margin, x_max + 2*x_margin), ylim = c(y_min, y_max + y_margin),
+       xlab = 'Activity', ylab = '-log10(adj. p-value)', cex.lab = 2, cex.axis = 2)
+
+  # moving main title up
+  title(main, line=1, cex.main = 2)
+
+  # plotting the 0.95 confidence interval for the estimated coefficient
+  arrows(x0=coefs_signif$Estimate - std_offset, y0=-log10(coefs_signif$p_adj),
+         x1=coefs_signif$Estimate + std_offset, y1=-log10(coefs_signif$p_adj),
+         code=3, angle=90, length=0.05,
+         col="grey", lwd=1.5)
+
+  # non significant activities
+  points(coefs_non_signif$Estimate, -log10(coefs_non_signif$p_adj), col = 'grey')
+
+  # significant activities
+  points(coefs_signif$Estimate, -log10(coefs_signif$p_adj), col = 'black', cex=1.3)
+
+  if (! is.null(n_label)) {
+    # adding the name of the top n TE subfams to the plot
+    text(x = coef_labels$Estimate, y=-log10(coef_labels$p_adj), labels = rownames(coef_labels), adj = c(-0.1, -0.4), cex = 1.4)
+  }
+}
\ No newline at end of file
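A self-contained toy call; the real res comes from getSignifActivities.R, and the column names follow summary.lm conventions, hence `Std. Error` and check.names = FALSE:

res <- list(coefs = data.frame(
  Estimate = seq(-1, 1, length.out = 20),
  `Std. Error` = rep(0.05, 20),
  p_adj = seq(0.001, 1, length.out = 20),
  row.names = paste0("TE", 1:20),
  check.names = FALSE))
plot_signif_subfams(res, alpha = 0.2, n_label = 3, main = "toy volcano")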
diff --git a/preprocess_E_N_for_activities.R b/preprocess_E_N_for_activities.R
index 1f998f4..2f484cc 100644
--- a/preprocess_E_N_for_activities.R
+++ b/preprocess_E_N_for_activities.R
@@ -1,87 +1,88 @@
 preprocess_E_N_for_activities <- function(count_table,
                                           N,
                                           count_threshold=c(10, 1),
                                           N_threshold = 150,
                                           pseudocount_quantile=0.05,
                                           log_tpm_plot_path,
-                                          import_path
+                                          import_path,
+                                          preds_protected=NULL,
+                                          redownload_exons=FALSE
                                           ) {
-  ## Details: process the count_table to E along with N to regress activities onto occurences of DNA sequences
-  ## the set of genes in N is assumed to be the overall set of genes we
-  ## are interested in (e.g: if N only contains protein coding genes, we
-  ## only look at protein coding genes)
-
-  ## E: gene count matrix (rows are genes, columns are samples). Contains integer values
-  ## N: per promoter occurences of DNA sequences. Promoters/genes are rows, DNA sequence prevalences are columns
-  ## count_threhold: [1] minimum number of counts in at least [2] genes to filter in gene in E
-  ##
+  ## Description: filter, normalize and process a table of raw counts with genes as rows and samples as columns to
+  ## the expression matrix E. Build a corresponding N matrix with remaining genes as rows and TEs as columns.
+
+  ## Para: -count_table: gene count matrix (rows are genes, columns are samples).
+  #-N: per gene occurrences of TE integrants. Genes are rows, TE subfamilies are columns
+  #-count_threshold: c(a,b) with a the minimum number of counts required in at least b samples to keep a gene in E
+  #-N_threshold: minimum sum over all genes that a TE subfamily has to reach in order to be kept as a predictor in matrix N
+  #-pseudocount_quantile: quantile of non-zero values used to compute per-sample pseudocounts prior to log-transformation
+  #-log_tpm_plot_path: path where histograms of expression values (in logged TPM) are drawn (useful to check quasi-normality of expression values)
+  #-import_path: path pointing to the other R functions required for the pre-processing
+  #-preds_protected: vector containing the names of predictors that cannot be filtered out based on N_threshold
+  #-redownload_exons: TRUE forces re-downloading exon annotations to recompute exonic lengths; FALSE reuses previously stored lengths
+
   stopifnot(typeof(count_table)=='integer' & is.matrix(count_table) & all(count_table>=0))
   stopifnot(is.matrix(N) & all(N>=0))

   # importing useful functions
   source(file.path(import_path, 'get_pseudocounts.R'))
-  source(file.path(import_path, 'get_tpm.R'))
+  source(file.path(import_path, 'get_exonic_lengths.R'))
+  source(file.path(import_path, 'get_tpm_local.R'))
   source(file.path(import_path, 'row_center.R'))

   # filtering genes not expressed at enough counts in at least ... samples
   genes_kept = colSums(apply(count_table, MARGIN=1, (function(x) x>=count_threshold[1])))>=count_threshold[2]
   count_table = count_table[genes_kept, ]

   # finding genes in count_table that are in N
   in_N = rownames(count_table) %in% rownames(N)
   count_table = count_table[in_N, ]

   # ordering rows of N as rows of count_table
   N = N[rownames(count_table), ]

   # Selecting cols in N with at least ... hits
   # saving lost fams to return them
   lost_cols = colSums(N)[which(colSums(N)<N_threshold)]
-  N = N[, which(colSums(N)>=N_threshold)]
-
+  N = N[, which(colSums(N)>=N_threshold | colnames(N) %in% preds_protected)]

   # removing genes that now have 0 hits in N:
   to_keep = rowSums(N)>0
   N = N[to_keep, ]

   # in case we lost some genes in N, removing them in the count_table
   count_table = count_table[rownames(N), ]

   # adding pseudocounts to the count_table
   count_table = get_pseudocounts(count_table, pseudocount_quantile)

   # computing log2(tpm), keeping them for the graph
-  log_tpm = log2(get_tpm(count_table))
+  log_tpm = log2(get_tpm_local(count_table, redownload_exons=redownload_exons))
   gc()

   # plotting log tpm for quality control
   pdf(log_tpm_plot_path)
   hist(log_tpm)
   dev.off()

   # col centering E
   E = scale(log_tpm, center = T, scale = F)
   gc()

   # row centering E
   E = row_center(E)

-  # col centering N
-  N_centered = scale(N, center=T, scale=F)
-
   stopifnot(all(rownames(E)==rownames(N)))

   return(list('E_centered' = E,
-              'N_centered' = N_centered,
               'N' = N,
               'lost_cols' = lost_cols))
 }
diff --git a/pval_from_fstat.R b/pval_from_fstat.R
new file mode 100644
index 0000000..23cdeb1
--- /dev/null
+++ b/pval_from_fstat.R
@@ -0,0 +1,7 @@
+pval_from_fstat <- function(res) {
+  ## Description: returns the p-value corresponding to the F-statistic of the complete versus the null model (mean only)
+  ## Para: -res: result returned by getSignifActivities
+
+  pval = pf(q = res$fstatistic['value'], df1 = res$fstatistic['numdf'], df2 = res$fstatistic['dendf'], lower.tail = F)
+  return(pval)
+}
\ No newline at end of file
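pval_from_fstat only relies on an lm-style fstatistic component (value, numdf, dendf), so a summary.lm object can stand in for the getSignifActivities result in a quick check:

res_like <- summary(lm(mpg ~ wt + hp, data = mtcars))
pval_from_fstat(res_like)  # matches the overall F-test p-value printed by summary()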
diff --git a/row_center.R b/row_center.R
index 5565590..d68d60f 100644
--- a/row_center.R
+++ b/row_center.R
@@ -1,6 +1,8 @@
 row_center <- function(X) {
-  # center the matrix X by rows
+
+  ## Description: centers the matrix X by rows
+  ## Para: -X: matrix with rows to be centered.

   stopifnot(is.matrix(X))
   return (t(scale(x = t(X), center = T, scale = F)))
 }
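A one-line check of row_center on a toy matrix:

X <- matrix(1:6, nrow = 2)
rowMeans(row_center(X))  # c(0, 0): every row is now centered on zero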