diff --git a/estimateRMSE.R b/estimateRMSE.R
old mode 100755
new mode 100644
index 26a98cc..8cd2524
--- a/estimateRMSE.R
+++ b/estimateRMSE.R
@@ -1,76 +1,76 @@
-estimateRMSE <- function(E_centered, N, treatment_group, control_group, idx_training, idx_test) {
-  # Estimates the differential activities of TE subfamilies by multiple regression
-  # Uses the activities estimated on the training set to estimate the root mean squared error (RMSE) on the test set
-
-  # E centered is the matrix of gene expression values with genes as rows and samples as columns
-  # N is the predictor matrix with genes as rows and TE subfamilies as columns
-  # treatment_group is vector with the name of treated samples (correspond to columns in E_centered)
-  # control_group: same for control samples
-  # idx_training defines the training set
-  # idx_test defines the test set
+estimateRMSE <- function(E_centered, N, treatment_group, control_group, idx_training, idx_val) {
+  ## Description: estimates the differential activities of TE subfamilies by multiple regression,
+  # and uses the activities estimated on the training observations to estimate the root mean squared error (RMSE) on the validation observations
+  ## Para: -E_centered: matrix of normalized gene expression values with genes as rows and samples as columns
+  #-N: TE cis-regulatory predictor matrix with genes as rows and TE subfamilies as columns
+  #-treatment_group: vector with the names of treated samples (must correspond to columns in E_centered)
+  #-control_group: vector with the names of control samples (must correspond to columns in E_centered)
+  #-idx_training: indexes the training observations
+  #-idx_val: indexes the validation observations
+
+  print('Starting new sample')

   # defensive programming
   stopifnot(dim(E_centered)[1]==dim(N)[1])
   stopifnot(all(sapply(treatment_group, function(x) x %in% colnames(E_centered))))
   stopifnot(all(sapply(control_group, function(x) x %in% colnames(E_centered))))
   stopifnot(all(sapply(control_group, function(x) ! x %in% treatment_group)))
-  stopifnot(all(idx_training != idx_test))
+  stopifnot(all(idx_training != idx_val))

   # separating the training from the test set
   E_training = E_centered[idx_training, ]
-  E_test = E_centered[idx_test, ]
+  E_test = E_centered[idx_val, ]
   N_training = N[idx_training, ]
-  N_test = N[idx_test, ]
+  N_test = N[idx_val, ]

   # training set
   # generating the difference in expression vector
   control = c(sapply(control_group, (function(x) E_training[, x])))
   treatment = c(sapply(treatment_group, (function(x) E_training[, x])))
   delta = treatment-control

   # generating the augmented N matrix to match the dimensions of delta
   n_replicates = length(control_group)
   N_augmented = N_training
   i = 1
   while (i < n_replicates) {
     N_augmented = rbind(N_augmented, N_training)
     i = i+1
   }

   # assembling the design matrix, regressing delta onto all columns in N_augmented
   design_matrix = as.data.frame(cbind(delta, N_augmented))
   fit = lm(delta ~ ., data = design_matrix)

   # test set
   # generating the difference in expression vector
   control = c(sapply(control_group, (function(x) E_test[, x])))
   treatment = c(sapply(treatment_group, (function(x) E_test[, x])))
   delta = treatment-control

   # generating the augmented N matrix to match the dimensions of delta
   n_replicates = length(control_group)
   N_augmented = N_test
   i = 1
   while (i < n_replicates) {
     N_augmented = rbind(N_augmented, N_test)
     i = i+1
   }

   # assembling the design matrix, predicting delta from all columns in N_augmented
   design_matrix = as.data.frame(N_augmented)
   pred = predict(fit, newdata=design_matrix)

   rmse = sqrt(mean((delta-pred)^2))
   return(rmse)
 }
\ No newline at end of file
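A minimal usage sketch of the updated signature, on hypothetical toy matrices (names and dimensions are illustrative only):

set.seed(1)
E_centered <- matrix(rnorm(200), nrow = 50,
                     dimnames = list(paste0("gene", 1:50), c("ctl1", "ctl2", "trt1", "trt2")))
N <- matrix(rpois(150, lambda = 2), nrow = 50,
            dimnames = list(rownames(E_centered), paste0("TE", 1:3)))
idx_training <- seq_len(50) <= 35   # genes 1-35 fit the model, the rest validate it
estimateRMSE(E_centered, N,
             treatment_group = c("trt1", "trt2"),
             control_group = c("ctl1", "ctl2"),
             idx_training = idx_training, idx_val = !idx_training)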
diff --git a/estimate_rmse_crossval.R b/estimate_rmse_crossval.R
new file mode 100644
index 0000000..002d9f2
--- /dev/null
+++ b/estimate_rmse_crossval.R
@@ -0,0 +1,25 @@
+estimate_rmse_crossval <- function(preprocessed, control_group, treatment_group,
+                                   N_list, N_metadata,
+                                   folds, f) {
+  ## Description: computes the RMSE in the context of a parallelizable cross-validation scheme
+  ## Para: -preprocessed: data produced by preprocess_E_N_for_activities()
+  #-control_group: vector containing the names of the control samples (column names in preprocessed$E_centered)
+  #-treatment_group: vector containing the names of the treatment samples (column names in preprocessed$E_centered)
+  #-N_list: list of the different N matrices. Rows must already correspond to those of preprocessed$E_centered
+  #-N_metadata: dataframe with a number of rows equal to the number of elements in N_list
+  #-folds: vector of the same length as the number of observations in E_centered, with integers from 1:nFolds, each defining a fold
+  #-f: fold index for which to estimate the RMSE
+
+  # fold f serves as the training set; all remaining observations form the validation set
+  idx_training = folds==f
+  idx_test = !idx_training
+
+  # initializing for temporary results
+  rmse = N_metadata
+  rmse$fold = as.integer(0)
+  rmse$rmse = 0
+  rmse['fold'] = f
+
+  for(i in 1:nrow(N_metadata)) {
+    rmse[[i, 'rmse']] = estimateRMSE(preprocessed$E_centered, N=N_list[[i]], treatment_group = treatment_group, control_group=control_group, idx_training = idx_training, idx_val = idx_test)
+  }
+  return(rmse)
+}
\ No newline at end of file
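Continuing the toy data above, a sketch of evaluating one fold; estimateRMSE.R must be sourced first, and N_list/N_metadata are hypothetical single-candidate stand-ins:

preprocessed <- list(E_centered = E_centered)  # mimics preprocess_E_N_for_activities() output
N_list <- list(N)                              # one candidate predictor matrix
N_metadata <- data.frame(name = "all_subfams")
folds <- sample(rep(1:5, length.out = nrow(E_centered)))
estimate_rmse_crossval(preprocessed,
                       control_group = c("ctl1", "ctl2"),
                       treatment_group = c("trt1", "trt2"),
                       N_list, N_metadata, folds, f = 1)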
diff --git a/getActivities.R b/getActivities.R
index c5af10c..fdda869 100644
--- a/getActivities.R
+++ b/getActivities.R
@@ -1,231 +1,225 @@
 # Set of parallelizable R functions to compute motif activities
 # largely inspired from MARA (Nimwegen lab, Basel)

 get_lambda_max <- function(Y, X, alpha) {
   ## Description: computes the max value of lambda such that all coefficients
   ## remain 0 when running a penalized linear regression of Y onto X with the
   ## glmnet package. Based on the coordinate descent update formula.
   ## Para: -Y: observation/response vector
   ##       -X: predictor matrix
   ##       -alpha: mixing parameter for the penalty term (0 -> ridge, 1 -> lasso)

   # compute the standard deviation with the 1/N formula (not the 1/(N-1) formula):
   mysd <- function(y) sqrt(sum((y-mean(y))^2)/length(y))
   stopifnot(length(Y)==dim(X)[1])
   return ((1/(length(Y)*alpha))*max(abs(apply(scale(X, center = T, scale = apply(X, 2, mysd))*(Y), MARGIN=2, sum))))
 }

 getActivities <- function(expr, X, alpha = 0.1, k = 5, nlambda=100,
-                          ncores = 1, log_id = '', standardize=T, large_N=F) {
+                          ncores = 1, log_id = '') {
   ## Description: estimate motif activities (transcription factors, TE subfamilies)
   ## by regressing each column of the expression matrix (1 col per sample)
   ## onto per promoter motif site counts. Multiple linear regression with the
   ## elastic net regularization is performed by the glmnet package.
   ## Para: -expr: row-centered TPM values in a promoter/gene (rows) x sample (columns) matrix.
   ##       -X: per promoter/gene (rows) motif (columns) counts. Column-centered. Also referred to as N.
   ##       -k: number of folds for cross-validation
   ##       -alpha: trades off between ridge (alpha = 0) and lasso (alpha = 1) penalties
   ##       -ncores: number of cores for multiprocessing (1 -> no multiprocessing)
   ##       -nlambda: number of lambda values in the grid from lambda max to lambda min
   ## Details: We aim to find the value of lambda which minimizes the error
   ## over all samples. Therefore, we must provide the same grid of lambda values
   ## to each regression problem (one per sample). The max value of lambda is
   ## defined as the maximum value of lambda_max across samples.
   ## lambda_max for a single sample is given by the formula:
   ## lambda_max = 1/(nobs*alpha)*max_s(abs(dot product(Z(x_s), y_s))) where Z
   ## is a function that centers AND standardizes its input.
   ## Author: Cyril Pulver (cyril.pulver@epfl.ch)

   require(glmnet)
   require(Biobase)
   require(doParallel)

   print("Going parallel !")
-  ncores_lambda_max = ncores
-
-  if(large_N) {
-    ncores_lambda_max=1
-  }
-
-  cl <- makeCluster(ncores_lambda_max, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse=''))
+  cl <- makeCluster(ncores, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse=''))

   n = ncol(expr)
   cat("Computing lambda max over ", n, " samples\n")
   lambda_max_list = parApply(cl, expr, MARGIN = 2, FUN = get_lambda_max, X, alpha = alpha)

   # stopping the cluster to communicate the grid of lambda to the workers after computing it
   stopCluster(cl)

   cat("Building the lambda grid\n")
   lambda_max = max(lambda_max_list)
   log_lambda_grid = seq(from=log(lambda_max), to = log(lambda_max*1e-4), length.out = nlambda)
   lambda_values = exp(log_lambda_grid)

   cat("Regressing activities of TFs for ", n, " samples\n")
   # regularized least squares on single expression vector
   f <- function(y) {
     print("started new sample")
     return(cv.glmnet(x = X, y = y, alpha = alpha,
-                     nfolds = k, lambda = lambda_values, type.measure='mse', standardize=standardize))
+                     nfolds = k, lambda = lambda_values, type.measure='mse'))
   }

   cl <- makeCluster(ncores, type="FORK", outfile=paste(c("./getActivities_log", log_id, ".txt"), collapse=''))
   res = list()
   res <- parApply(cl, expr, MARGIN = 2, FUN = f)

   # shutting down parallel clusters
   stopCluster(cl)

   cat("Finding the optimal lambda value across samples...\n")
   cat("\t... 1) Computing the per lambda MSE across samples\n")
   # mse estimates: rows as lambda values, and columns as samples
   errors = do.call('cbind', subListExtract(res, 'cvm'))
   # average over mse estimates across samples, one value per lambda in the grid
   lambda_mse = rowSums(errors)/ncol(errors)
   # keeping sd(mse) to plot lambda vs mse plot later
   lambda_mse_sd = apply(errors, MARGIN=1, sd)
   # retrieving the value of lambda that yields the minimum across sample mse
   lambda_min_idx = which.min(lambda_mse)
   lambda_min = lambda_values[lambda_min_idx]

   cat("\t... 2) retrieving the coefficients, crossval MSE, r_squared for the optimal lambda value\n")
   coefs = lapply(X = subListExtract(res, 'glmnet.fit'), FUN = predict, type='coefficients', s=lambda_min)

   # r_squared at the overall best lambda
   temp = subListExtract(res, 'glmnet.fit')
   r_squared_temp = subListExtract(temp, 'dev.ratio')
   r_squared = lapply(r_squared_temp, FUN = function(x) x[lambda_min_idx])
   rm(list = 'temp')
   # r_squared max for each sample (to see how much we lose):
   r_squared_max = lapply(r_squared_temp, FUN = max)
   # best lambda for each sample (for the record)
   best_lambda = subListExtract(res, 'lambda.min')

   # reshaping results into a five-list object: ans$bestlambda, ans$r2,
   # ans$test_err_est, ans$test_err_se and ans$activities
   ans <- list()
   ans$r2 <- unlist(r_squared)
   ans$r2_max <- unlist(r_squared_max)
   ans$lambdas <- lambda_values
   ans$lambda_min_overall <- lambda_min
   ans$lambdas_best <- unlist(best_lambda)
   ans$lambda_mse <- unlist(lambda_mse)
   ans$lambda_mse_sd <- unlist(lambda_mse_sd)

   # Building the TF activities (rows) by samples (columns) table
   ans$activities <- do.call(cbind, coefs)
   colnames(ans$activities) <- names(coefs)

   return(ans)
 }

 plotActivities <- function(act, TF, metadata, sort_by, decreasing=F,
                            palette_type='sequential', palette_add_black = T,
                            palette_black_idx=1, title=NULL, act_lim=NULL) {
   ## Description: scatter plot of TF activities per sample
   ## Para: -act: TF x samples matrix, with row and col names set
   ##       -TF: string for TF of interest, e.g. "MYC"
   ##       -metadata: samples x metadata dataframe. Metadata should be set as factors
   ##       -sort_by: string corresponding to a column of metadata
   ##       -decreasing: bool to sort samples by metadata[sort_by] in decr. order.
   ##       -palette_type: string, one of c("sequential", "diverging", "qualitative")
   ##       -palette_add_black: bool to enable setting a color to black (useful for baselines)
   ##       -palette_black_idx: category to plot in black
   ##       -title: title if it is to be different than the TF name
   ##       -act_lim: limits of activities to plot, useful to get comparable y axes
   ## Author: Cyril Pulver (cyril.pulver@epfl.ch)

   require(RColorBrewer)

   # to reset after fun
   old_pal = palette()
   old_par = par(no.readonly=T)

   # matching TF name to idx
   tf_idx = which(rownames(act)==TF)

   # sorting RNA seq samples by selected metadata
   sample_order_idx = seq(1, dim(act)[2])
   if(!is.null(sort_by)){
     sample_order_idx = order(metadata[sort_by], decreasing=decreasing)
   }

   # generating a palette
   new_pal = palette("default")
   # sorting by categorical factor -> n colors in the palette = n factors
   if(is.factor(metadata[[sort_by]])) {
     if (palette_type=='sequential'){
       new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "OrRd")
     } else if (palette_type=='diverging') {
       new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "Spectral")
     } else if (palette_type=='qualitative') {
       new_pal = brewer.pal(n = length(levels(metadata[[sort_by]])), name = "Set1")
     }
   }

   # In case of ref level chosen as black
   if (palette_add_black){
     new_pal[palette_black_idx] = "black"
   }
   palette(new_pal)

   # enlarging margins for legend out of plot, forcing square plot area
   par(mai=c(0.2, 0.8, 0.4, 1.2), pty='s')

   # title
   main_title=title
   if (is.null(title)) {
     main_title=TF
   }

   plot(x = seq(from = 1, to = ncol(act)),
        y = act[tf_idx, sample_order_idx],
        col = metadata[[sort_by]][sample_order_idx],
        main = main_title,
        xlab = paste("Samples (ordered by ", sort_by, ")", sep=""),
        ylab = "Activity (deviation from average sample activity)",
        ylim=act_lim,
        cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2)
   legend("right", legend = levels(metadata[[sort_by]]),
          col = new_pal, pch = 'o', inset=c(-0.26, 0),
          box.lwd = 0, bty = "n", xpd=T, cex=1.2)
   abline(a = 0, b = 0, col = "blue", lty=2, lwd = 2)

   # resetting palette and par
   #palette(old_pal)
   #par(old_par)
 }
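A quick sanity check for get_lambda_max, assuming glmnet is installed and getActivities.R is sourced: if the formula indeed reproduces glmnet's internal lambda_max (with its default standardize = TRUE), all penalized coefficients should already be zero at that penalty.

library(glmnet)
set.seed(7)
X <- matrix(rnorm(100 * 10), nrow = 100)
y <- rnorm(100)
lmax <- get_lambda_max(y, X, alpha = 0.1)
fit <- glmnet(X, y, alpha = 0.1, lambda = lmax)  # single-lambda fit, for checking only
all(abs(coef(fit)[-1]) < 1e-8)                   # expected TRUE: only the intercept survives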
diff --git a/get_exonic_lengths.R b/get_exonic_lengths.R
new file mode 100644
index 0000000..4bdd409
--- /dev/null
+++ b/get_exonic_lengths.R
@@ -0,0 +1,16 @@
+get_exonic_lengths <- function(org='hg19') {
+  ## Description: returns the cumulative exonic length for each gene
+  ## Para: -org: organism
+  require(GenomicFeatures)
+  org <- match.arg(org)
+  ens <- makeTxDbFromUCSC(genome=org,
+                          tablename='ensGene',
+                          url = 'http://genome-euro.ucsc.edu/cgi-bin/')
+  exonic <- exonsBy(ens, by='gene')
+  # merging overlapping exons so shared bases are counted only once
+  red.exonic <- reduce(exonic)
+  exon.lengths <- sum(width(red.exonic))
+  # Removing ENSEMBL gene versions
+  names(exon.lengths) <- unlist(lapply(strsplit(names(exon.lengths), '\\.'), function(x) x[[1]]))
+  return(exon.lengths)
+}
\ No newline at end of file
diff --git a/get_pseudocounts.R b/get_pseudocounts.R
index 73d0af0..ed3b9bd 100644
--- a/get_pseudocounts.R
+++ b/get_pseudocounts.R
@@ -1,14 +1,15 @@
 get_pseudocounts <- function(count_table, q=0.05) {
-  ## compute pseudocounts on a gene count matrix (genes are rows, samples are columns)
+  ## Description: compute pseudocounts on a gene count matrix (genes are rows, samples are columns)
+  # to replace zero values before logging

-  ## count_table: gene count table with rows as genes, columns as samples
-  ## q: quantile of non-zero values used as a pseudocount in each sample (default: quantile = 0.05)
+  ## Para: -count_table: gene count table with rows as genes, columns as samples
+  #-q: quantile of non-zero values used as a pseudocount in each sample (default: quantile = 0.05)

   stopifnot(typeof(count_table)=='integer' & is.matrix(count_table))
   pseudocounts = apply(X = count_table, MARGIN = 2, FUN = function(x) quantile(x[x!=0], q))

   # Broadcasting only works correctly by rows in R, so when broadcasting by columns
   # one has to transpose to broadcast by rows (instead of cols), do the operation
   # and then transpose again to get back to the original conformation.
   return(t(t(count_table) + pseudocounts))
 }
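A toy illustration of the double-transpose broadcast used in get_pseudocounts; sweep() is the more self-documenting equivalent (values hypothetical):

cnt <- matrix(c(0L, 2L, 4L, 0L, 10L, 20L), nrow = 3)  # integer counts, 3 genes x 2 samples
ps <- get_pseudocounts(cnt, q = 0.05)
pseudocounts <- apply(cnt, 2, function(x) quantile(x[x != 0], 0.05))
all(ps == sweep(cnt, MARGIN = 2, STATS = pseudocounts, FUN = "+"))  # TRUE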
diff --git a/get_tpm_local.R b/get_tpm_local.R
new file mode 100644
index 0000000..60d314b
--- /dev/null
+++ b/get_tpm_local.R
@@ -0,0 +1,44 @@
+get_tpm_local <- function(countTable, org='hg19', exonic_lengths_path=NULL, redownload_exons=FALSE) {
+
+  ## Description: computes TPM from a count table
+  ## Para: -countTable: table with ENSEMBL gene IDs as row names, sample IDs as columns
+  #-org: organism (defaults to 'hg19')
+  #-exonic_lengths_path: path to where computed exonic lengths are stored, to avoid recomputing them every time
+  #-redownload_exons: forces the download and computation of exon lengths from scratch
+
+  # Defensive programming
+  stopifnot(is.matrix(countTable))
+
+  if(is.null(exonic_lengths_path)) {
+    exonic_lengths_path = paste0('../data/temp/', org, '_exon_lengths.RDS')
+  }
+
+  # checking for the existence of a previously generated exon_length vector
+  if(!dir.exists(dirname(exonic_lengths_path))){
+    dir.create(dirname(exonic_lengths_path), showWarnings = FALSE)
+  }
+
+  exonic_lengths = NULL
+  if(!file.exists(exonic_lengths_path) || redownload_exons) {
+    unlink(exonic_lengths_path)  # no-op if the file does not exist yet
+    exonic_lengths = get_exonic_lengths(org)
+    saveRDS(exonic_lengths, exonic_lengths_path)
+  } else {
+    exonic_lengths = readRDS(exonic_lengths_path)
+  }
+
+  # normalizing to TPM
+
+  # filtering of genes present in the countTable is done at this stage, therefore the full table should be stored somewhere.
+  exonic_lengths_filtered <- exonic_lengths[rownames(countTable)]
+
+  print(head(exonic_lengths_filtered))  # quick QC glance at the matched lengths
+
+  # normalizing for gene length
+  length_norm <- countTable / exonic_lengths_filtered
+
+  # normalizing for library size
+  ans <- t(t(length_norm)/colSums(length_norm)) * 1e6 # see https://rpubs.com/bbolker/sweep_divide
+  return(ans)
+}
\ No newline at end of file
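The two normalization steps of get_tpm_local, isolated on toy numbers (illustrative only; the real function looks exonic lengths up by ENSEMBL gene ID):

counts <- matrix(c(100, 200, 300, 600), nrow = 2)  # 2 genes x 2 samples
len <- c(1000, 2000)                               # exonic lengths in bp
rate <- counts / len                               # reads per base
tpm <- t(t(rate) / colSums(rate)) * 1e6            # scale each sample to one million
colSums(tpm)                                       # both columns sum to 1e6 by construction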
diff --git a/plot_signif_subfams.R b/plot_signif_subfams.R
new file mode 100644
index 0000000..a8f0317
--- /dev/null
+++ b/plot_signif_subfams.R
@@ -0,0 +1,75 @@
+plot_signif_subfams <- function(res, alpha, n_label=NULL, main=NULL, n_subplots=1, idx=1) {
+  ## Description: volcano plot with TE subfamily activities on the x axis, activity coefficient significance on the y axis
+  ## Para: -res: result file returned by getSignifActivities.R
+  #-alpha: BH-adjusted threshold for significance (black circles)
+  #-n_label: number of subfamilies to label on the plot, starting from the most significant subfamily
+  #-main: title of the plot
+  #-n_subplots: when plotting several plots on the same figure (e.g. par(mfrow=c(2, 2))), we only reset par() after the last subplot is drawn
+  #-idx: belongs to the interval [1, n_subplots] and triggers the reset of par() when it equals n_subplots
+
+  stopifnot(alpha <= 1 & alpha >= 0)
+  stopifnot(idx <= n_subplots)
+  stopifnot(n_subplots >= 1)
+
+  # saving current plot margins
+  old_par = par(no.readonly = TRUE)
+  if(idx == n_subplots) {
+    on.exit(par(old_par))
+  }
+
+  # changing margin sizes
+  par(mar=c(4.1, 4.5, 2.5, 1))
+
+  # identifying significant coefficients (will be plotted in black, not grey)
+  signif = which(res$coefs$p_adj <= alpha)
+  coefs_signif = res$coefs[signif,]
+  coefs_non_signif = res$coefs[setdiff(seq_len(nrow(res$coefs)), signif),]
+
+  coef_labels = NULL
+  largest_label_length = NULL
+  if (! is.null(n_label)) {
+    coef_labels = coefs_signif[head(order(coefs_signif$p_adj), n_label), ]
+    largest_label_length = max(sapply(rownames(coef_labels), nchar))
+  }
+
+  # correcting the plot margins for the error bars and the labels to fit in
+  std_offset = 1.96*coefs_signif$`Std. Error`
+  std_offset_margin = max(std_offset)
+
+  # empty plot with limits
+  x_min = min(res$coefs$Estimate)
+  x_max = max(res$coefs$Estimate)
+  x_margin = 0
+  if (! is.null(n_label)) {
+    x_margin = max(std_offset_margin, largest_label_length/300)
+  } else {
+    x_margin = std_offset_margin
+  }
+
+  y_min = 0
+  y_max = max(na.omit((-log10(res$coefs[, 'p_adj']))))
+  y_margin = 0.04*(y_max-y_min)
+
+  plot(NULL, type='n', xlim = c(x_min - x_margin, x_max + 2*x_margin), ylim = c(y_min, y_max + y_margin),
+       xlab = 'Activity', ylab = '-log10(adj. p-value)', cex.lab = 2, cex.axis = 2)
+
+  # moving main title up
+  title(main, line=1, cex.main = 2)
+
+  # plotting the 0.95 confidence interval for the estimated coefficient
+  arrows(x0=coefs_signif$Estimate - std_offset, y0=-log10(coefs_signif$p_adj),
+         x1=coefs_signif$Estimate + std_offset, y1=-log10(coefs_signif$p_adj),
+         code=3, angle=90, length=0.05,
+         col="grey", lwd=1.5)
+
+  # non significant activities
+  points(coefs_non_signif$Estimate, -log10(coefs_non_signif$p_adj), col = 'grey')
+
+  # significant activities
+  points(coefs_signif$Estimate, -log10(coefs_signif$p_adj), col = 'black', cex=1.3)
+
+  if (! is.null(n_label)) {
+    # adding the name of the top n TE subfams to the plot
+    text(x = coef_labels$Estimate, y=-log10(coef_labels$p_adj), labels = rownames(coef_labels), adj = c(-0.1, -0.4), cex = 1.4)
+  }
+}
\ No newline at end of file
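A self-contained toy call; the real res comes from getSignifActivities.R, and the column names follow summary.lm conventions, hence `Std. Error` and check.names = FALSE:

res <- list(coefs = data.frame(
  Estimate = seq(-1, 1, length.out = 20),
  `Std. Error` = rep(0.05, 20),
  p_adj = seq(0.001, 1, length.out = 20),
  row.names = paste0("TE", 1:20),
  check.names = FALSE))
plot_signif_subfams(res, alpha = 0.2, n_label = 3, main = "toy volcano")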
diff --git a/preprocess_E_N_for_activities.R b/preprocess_E_N_for_activities.R
index 1f998f4..2f484cc 100644
--- a/preprocess_E_N_for_activities.R
+++ b/preprocess_E_N_for_activities.R
@@ -1,87 +1,88 @@
 preprocess_E_N_for_activities <- function(count_table,
                                           N,
                                           count_threshold=c(10, 1),
                                           N_threshold = 150,
                                           pseudocount_quantile=0.05,
                                           log_tpm_plot_path,
-                                          import_path
+                                          import_path,
+                                          preds_protected=NULL,
+                                          redownload_exons=FALSE
                                           ) {
-  ## Details: process the count_table to E along with N to regress activities onto occurences of DNA sequences
-  ## the set of genes in N is assumed to be the overall set of genes we
-  ## are interested in (e.g: if N only contains protein coding genes, we
-  ## only look at protein coding genes)
-
-  ## E: gene count matrix (rows are genes, columns are samples). Contains integer values
-  ## N: per promoter occurences of DNA sequences. Promoters/genes are rows, DNA sequence prevalences are columns
-  ## count_threhold: [1] minimum number of counts in at least [2] genes to filter in gene in E
-  ##
+  ## Description: filter, normalize and process a table of raw counts with genes as rows and samples as columns to
+  ## the expression matrix E. Build a corresponding N matrix with remaining genes as rows and TEs as columns.
+
+  ## Para: -count_table: gene count matrix (rows are genes, columns are samples).
+  #-N: per gene occurrences of TE integrants. Genes are rows, TE subfamilies are columns
+  #-count_threshold: c(a,b) with a the minimum number of counts required in at least b samples to keep a gene in E
+  #-N_threshold: minimum sum over all genes that a TE subfamily has to reach in order to be kept as a predictor in matrix N
+  #-pseudocount_quantile: quantile of non-zero values used to compute per-sample pseudocounts prior to log-transformation
+  #-log_tpm_plot_path: path where histograms of expression values (in logged TPM) are drawn (useful to check quasi-normality of expression values)
+  #-import_path: path pointing to the other R functions required for the pre-processing
+  #-preds_protected: vector containing the names of predictors that cannot be filtered out based on N_threshold
+  #-redownload_exons: TRUE forces re-downloading exon annotations to recompute exonic lengths; FALSE reuses previously stored lengths
+
   stopifnot(typeof(count_table)=='integer' & is.matrix(count_table) & all(count_table>=0))
   stopifnot(is.matrix(N) & all(N>=0))

   # importing useful functions
   source(file.path(import_path, 'get_pseudocounts.R'))
-  source(file.path(import_path, 'get_tpm.R'))
+  source(file.path(import_path, 'get_exonic_lengths.R'))
+  source(file.path(import_path, 'get_tpm_local.R'))
   source(file.path(import_path, 'row_center.R'))

   # filtering genes not expressed at enough counts in at least ... samples
   genes_kept = colSums(apply(count_table, MARGIN=1, (function(x) x>=count_threshold[1])))>=count_threshold[2]
   count_table = count_table[genes_kept, ]

   # finding genes in count_table that are in N
   in_N = rownames(count_table) %in% rownames(N)
   count_table = count_table[in_N, ]

   # ordering rows of N as rows of count_table
   N = N[rownames(count_table), ]

   # Selecting cols in N with at least ... hits
   # saving lost fams to return them
   lost_cols = colSums(N)[which(colSums(N)<N_threshold)]
-  N = N[, which(colSums(N)>=N_threshold)]
-
+  N = N[, which(colSums(N)>=N_threshold | colnames(N) %in% preds_protected)]

   # removing genes that now have 0 hits in N:
   to_keep = rowSums(N)>0
   N = N[to_keep, ]

   # in case we lost some genes in N, removing them in the count_table
   count_table = count_table[rownames(N), ]

   # adding pseudocounts to the count_table
   count_table = get_pseudocounts(count_table, pseudocount_quantile)

   # computing log2(tpm), keeping them for the graph
-  log_tpm = log2(get_tpm(count_table))
+  log_tpm = log2(get_tpm_local(count_table, redownload_exons=redownload_exons))
   gc()

   # plotting log tpm for quality control
   pdf(log_tpm_plot_path)
   hist(log_tpm)
   dev.off()

   # col centering E
   E = scale(log_tpm, center = T, scale = F)
   gc()

   # row centering E
   E = row_center(E)

-  # col centering N
-  N_centered = scale(N, center=T, scale=F)
-
   stopifnot(all(rownames(E)==rownames(N)))

   return(list('E_centered' = E,
-              'N_centered' = N_centered,
               'N' = N,
               'lost_cols' = lost_cols))
 }
diff --git a/pval_from_fstat.R b/pval_from_fstat.R
new file mode 100644
index 0000000..23cdeb1
--- /dev/null
+++ b/pval_from_fstat.R
@@ -0,0 +1,7 @@
+pval_from_fstat <- function(res) {
+  ## Description: returns the p-value corresponding to the F-statistic of the complete versus the null model (mean only)
+  ## Para: -res: result returned by getSignifActivities
+
+  pval = pf(q = res$fstatistic['value'], df1 = res$fstatistic['numdf'], df2 = res$fstatistic['dendf'], lower.tail = F)
+  return(pval)
+}
\ No newline at end of file
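pval_from_fstat only relies on an lm-style fstatistic component (value, numdf, dendf), so a summary.lm object can stand in for the getSignifActivities result in a quick check:

res_like <- summary(lm(mpg ~ wt + hp, data = mtcars))
pval_from_fstat(res_like)  # matches the overall F-test p-value printed by summary()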
diff --git a/row_center.R b/row_center.R
index 5565590..d68d60f 100644
--- a/row_center.R
+++ b/row_center.R
@@ -1,6 +1,8 @@
 row_center <- function(X) {
-  # center the matrix X by rows
+
+  ## Description: centers the matrix X by rows
+  ## Para: -X: matrix with rows to be centered.

   stopifnot(is.matrix(X))
   return (t(scale(x = t(X), center = T, scale = F)))
 }
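A one-line check of row_center on a toy matrix:

X <- matrix(1:6, nrow = 2)
rowMeans(row_center(X))  # c(0, 0): every row is now centered on zero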