diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_8/analysis_test.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.R similarity index 91% rename from scripts/10xgenomics_PBMC_5k_peaks_classification_8/analysis_test.R rename to scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.R index 042553b..f1d48fc 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_8/analysis_test.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.R @@ -1,121 +1,122 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") # the TF names class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1", "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF", "NR4A1", "EGR", "GATA", "NFAT", "RUNX") # the number of classes searched for each TF -# n.classes = 2:10 -n.classes = 10:10 +n.classes = 2:10 +# n.classes = 2 # the methods used for the classification for each TF -# em.methods = c("read", "consensussequence", "read_consensussequence") -em.methods = c("consensussequence_kmer") +em.methods = c("read", "consensussequence", "read_consensussequence") +# em.methods = c("read") # make a loop here for final analysis for(tf in class.tf) { # make a loop here for final analysis for(method in em.methods) { dir.tf = file.path("results", "10xgenomics_PBMC_5k_peaks_classification_8", tf, method) for(k in n.classes) { # sequence data = read.sequence.models(file.path(dir.tf, sprintf("data_class%s_%dclass_model_sequence.mat2d", tf, k))) model.seq = data$models model.prob = data$prob data = NULL # open chromatin model.open = read.read.models(file.path(dir.tf, sprintf("data_class%s_%dclass_model_open.mat2d", tf, k)))$models # nucleosomes model.nucl = read.read.models(file.path(dir.tf, sprintf("data_class%s_%dclass_model_nucl.mat2d", tf, k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=20, height=10) png(filename=file.path(dir.tf, sprintf("data_class%s_%dclass.png", tf, k)), units="in", res=720, width=20, height=10) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F][,] ref.nucl = model.nucl[ord,, drop=F][,] ref.seq = model.seq[,,ord, drop=F][,,] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, - main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + main=sprintf("class %d (p=%.2f)", class[i], prob[i]), + cex.main=2) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) - axis(1, at=x.at, labels=x.lab) + axis(1, at=x.at, labels=x.lab, cex.axis=2) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") - axis(2, at=y.at, labels=y.lab) + axis(2, at=y.at, labels=y.lab, cex.axis=2) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter row_h = 1/nrow(m) # height of row col_w = 1/ncol(m) # width of column row_cor = row_h / 3 col_cor = col_w / 3 for(i in 1:nrow(ref.open)) { # plot logo center left = (col_w*col_n) - col_w right = left + col_w left = right - col_cor bottom = 1 - (row_h*row_n) top = bottom + row_h bottom = top - row_cor p= par(fig=c(left, right, bottom, top), mar=c(0,0,0,0), new=T) idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis # x.at = seq(1, length(idx), length.out = 3) # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] x.at = ceiling(length(idx)/2) x.lab = 0 - axis(1, at=x.at, labels=x.lab) + axis(1, at=x.at, labels=x.lab, cex.axis=1) # yaxis - axis(2, at=y.at, labels=y.lab) + axis(2, at=y.at, labels=y.lab, cex.axis=1) row_n = row_n + 1 if(i %% nrow(m) == 0) { col_n = col_n + 1 row_n = 1 } par(p) } dev.off() } } } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_8/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/run_all.sh new file mode 100755 index 0000000..231ae7f --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/run_all.sh @@ -0,0 +1,3 @@ +scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.sh +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.R + diff --git a/src/Applications/ApplicationInterface.hpp b/src/Applications/ApplicationInterface.hpp index 7e000f8..b05d3ef 100644 --- a/src/Applications/ApplicationInterface.hpp +++ b/src/Applications/ApplicationInterface.hpp @@ -1,20 +1,22 @@ /*! * \brief The ApplicationInterface class provides an interface for - * program wrapper which only require to have one object constructed - * and its run() method called. + * classes that allow to create an application. + * Running the application then only requires to run the run() + * method that is declared in this class. */ class ApplicationInterface { public: /*! * \brief Destructor. */ virtual ~ApplicationInterface() ; /*! * \brief Runs the application, with all its * functionalities. - * \return the exit code. + * \return the exit code to return, for instance, + * to the OS. */ virtual int run() = 0 ; } ; diff --git a/src/Applications/CorrelationMatrixCreatorApplication.cpp b/src/Applications/CorrelationMatrixCreatorApplication.cpp index 4fe08f4..45ff2a4 100644 --- a/src/Applications/CorrelationMatrixCreatorApplication.cpp +++ b/src/Applications/CorrelationMatrixCreatorApplication.cpp @@ -1,190 +1,191 @@ #include #include #include #include #include #include // std::invalid_argument namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv) : file_bed(""), file_bam(""), file_bai(""), from(0), to(0), bin_size(0), method(CorrelationMatrixCreator::FRAGMENT), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int CorrelationMatrixCreatorApplication::run() { if(this->runnable) { CorrelationMatrixCreator mc(this->file_bed, this->file_bam, this->file_bai, this->from, this->to, this->bin_size, this->method) ; std::cout << mc.create_matrix() << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "CorrelationMatrixCreator is an application that creates a\n" "count matrix from a BED file and a BAM file and returnes it\n" "through stdout.\n" - "The matrix contains one row per region (reference region)\n" - "present in the BED file. For each region, its center is\n" - "computed and a set of equally sized, non-overlapping bins,\n" - "centered on the region center and covering the interval [from,to]\n" - "is build. Then, each bin is assigned the number of read/fragment\n" - "positions (targets) present in the BAM file that are mapped at\n" - "that position.\n" + "The center of each region is computed as the center of the\n" + "BED regions. Then a set of equally sized, non-overlapping\n" + "bins, centered on the region center and covering the interval\n" + "[from,to] is build for each region. Then, each bin is assigned\n" + "the number of read/fragment positions (targets) present in\n" + "the BAM file that are mapped at that position.\n" + "The read/fragment counts are then computed, for each bin,\n" + "from the BAM file.\n" "The matrix is a 2D matrix which dimensions are :\n" "1) number of regions\n" "2) length of region (to - from + 1) / bin_size\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_bed_msg = "The path to the BED file containing the references."; std::string opt_bam_msg = "The path to the BAM file containing the targets."; std::string opt_bai_msg = "The path to the BAI file containing the BAM file index."; std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build\n" "around each reference center." ; std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build\n" "around each reference center." ; std::string opt_binsize_msg = "The size of the bins." ; char tmp[4096] ; sprintf(tmp, "How the data in the BAM file should be handled when computing\n" "the number of counts in each bin.\n" "\t\"%s\" uses each position within the reads (by default)\n" "\t\"%s\" uses only the insertion site for ATAC-seq data\n" "\t\"%s\" uses each position within the fragments\n" "\t\"%s\" uses only the fragment central positions\n", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; std::string opt_method_msg = tmp ; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string method(method_read) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) ("method", po::value(&(method)), opt_method_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { CorrelationMatrixCreatorApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/CorrelationMatrixCreatorApplication.hpp b/src/Applications/CorrelationMatrixCreatorApplication.hpp index b108055..0cfa646 100644 --- a/src/Applications/CorrelationMatrixCreatorApplication.hpp +++ b/src/Applications/CorrelationMatrixCreatorApplication.hpp @@ -1,102 +1,111 @@ #ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP #define CORRELATIONMATRIXCREATORAPPLICATION_HPP #include #include // CorrelationMatrixCreator::methods #include /*! * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a * CorrelationMatrixCreator instance creating an autonomous application to - * compute a count matrix from a BAM file by directly passing all the options - * and parameters from the command line. + * compute a read/fragment count matrix from a BAM file by directly passing + * all the options and parameters from the command line. + * + * The center of each region is computed as the center of the BED regions. + * Then a set of equally sized, non-overlapping bins, centered on the region + * center and covering the interval [from,to] is build for each region. + * Then, each bin is assigned the number of read/fragment positions (targets) + * present in the BAM file that are mapped at that position. + * + * The read/fragment counts are then computed, for each bin, + * from the BAM file. */ class CorrelationMatrixCreatorApplication: public ApplicationInterface { public: CorrelationMatrixCreatorApplication() = delete ; CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ CorrelationMatrixCreatorApplication(int argn, char** argv) ; /*! * \brief Runs the application. A matrix containing the * read densities around the center of the bed regions, +/- * the given offsets is created by searching the BAM file * and printed on the stdout. * The matrix is a 2D matrix with dimensions : * 1) number of regions * 2) length of region (to - from + 1) / bin_size. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the bam file. */ std::string file_bam ; /*! * \brief the path to the bam index file. */ std::string file_bai ; /*! * \brief a relative coordinate indicating the * most downstream position to consider around * each region in the bed file. */ int from ; /*! * \brief a relative coordinate indicating the * most upstream position to consider around * each region in the bed file. */ int to ; /*! * \brief the size of the bin that will be used * to bin the signal in the regions [from,to] around * each region in the bed file. */ int bin_size ; /*! * \brief How to consider the sequenced fragments when computing * the bin values. */ CorrelationMatrixCreator::methods method ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP diff --git a/src/Applications/EMJointApplication.cpp b/src/Applications/EMJointApplication.cpp index 110d9b8..180ea56 100644 --- a/src/Applications/EMJointApplication.cpp +++ b/src/Applications/EMJointApplication.cpp @@ -1,248 +1,250 @@ #include #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include #include #include // filter() namespace po = boost::program_options ; EMJointApplication::EMJointApplication(int argn, char** argv) : files_read(""), file_sequence(""), file_conssequence(""), file_filter(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMJointApplication::run() { if(this->runnable) { // read data std::vector read_paths ; boost::split(read_paths, this->files_read, [](char c){return c == ',';}) ; // row filter std::vector filter ; if(this->file_filter != "") { // it is a column vector, easier to use the Matrix2D interface // to read it rather than coding a function for :) filter = Matrix2D(this->file_filter).get_data() ; std::sort(filter.begin(), filter.end()) ; } std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } Matrix2D data(path) ; // filter out some rows if needed if(filter.size()) { data = filter_rows(filter, data) ; } data_read.push_back(std::move(data)) ; } // read data only EMJoint* em = nullptr ; if(this->file_sequence == "" and this->file_conssequence == "") { em = new EMJoint(std::move(data_read), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } // read and sequence data else if(this->file_sequence != "" and this->file_conssequence == "") { Matrix2D data_seq(this->file_sequence) ; // filter out some rows if needed if(filter.size()) { data_seq = filter_rows(filter, data_seq) ; } em = new EMJoint(std::move(data_read), std::move(data_seq), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } // read and consensus sequence data else if(this->file_sequence == "" and this->file_conssequence != "") { Matrix3D data_consseq ; data_consseq.load(this->file_conssequence) ; // filter out some rows if needed if(filter.size()) { data_consseq = filter_rows(filter, data_consseq) ; } em = new EMJoint(std::move(data_read), std::move(data_consseq), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } em->classify() ; em->get_post_prob().save(this->file_out) ; delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMJointApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMJoint is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given 1) the shapes \n" - "of the read densities over the regions and 2) the region sequence \n" - "motif contents. \n " - "The assignment probabilitiesare returned through stdout.\n\n" ; + "of the read/fragment count densities over the regions and\n" + "2) the region DNA sequences.\n " + "The assignment probabilities are written in a binary format as\n" + "a 4D matrix of dimensions number of regions x number of\n" + "classes x number of shift states x number of flip states\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations, \n" "by default 0 (no parallelization)." ; std::string opt_read_msg = "A coma separated list of paths to the file containing the \n" "read density data. At least one path is needed." ; std::string opt_seq_msg = "The path to the file containing the sequence data. If no path is \n" "given, the classification is only cares about the read density shapes." ; std::string opt_consseq_msg = "The path to the file containing the consensus sequence data. If no path is \n" "given, the classification only cares about the read density shapes." ; std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" "indices of rows to filter out in the data." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; std::string opt_bckg_msg = "Adds a class to model the sequence and the read signal background. This\n" "class contains sequence background probabilies (for the sequence model)\n" "and the mean number of reads (for the read count models) at each position\n" "and is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) ("seq", po::value(&(this->file_sequence)), opt_seq_msg.c_str()) ("consseq", po::value(&(this->file_conssequence)), opt_seq_msg.c_str()) ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->files_read == "" and this->file_sequence == "" and this->file_conssequence == "" and (not help)) { std::string msg("Error! No data were given (--read --seq --consseq)!") ; throw std::invalid_argument(msg) ; } if(this->files_read == "" and (not help)) { std::string msg("Error! No read density data were given (--read)!") ; throw std::invalid_argument(msg) ; } if(this->file_sequence != "" and this->file_conssequence != "") { std::string msg("Error! --seq and --consseq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMJointApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMJointApplication.hpp b/src/Applications/EMJointApplication.hpp index 81a8e92..274a05d 100644 --- a/src/Applications/EMJointApplication.hpp +++ b/src/Applications/EMJointApplication.hpp @@ -1,126 +1,127 @@ #ifndef EMJOINTAPPLICATION_HPP #define EMJOINTAPPLICATION_HPP #include #include #include /*! * \brief The EMJointApplication class is a wrapper around an EMJoint - * instance creating an autonomous application to classify data by directly + * instance creating an autonomous application to classify one or more + * read/fragment cout data and/or DNA sequences by directly * passing all the options and parameters from the command line. */ class EMJointApplication: public ApplicationInterface { public: EMJointApplication() = delete ; EMJointApplication(const EMJointApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMJointApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief a coma separated list of paths to the files * containing the read density data */ std::string files_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_sequence ; /*! * \brief the path to the file containing the * consensus sequence data. */ std::string file_conssequence ; /*! * \brief the path to the file containing the * (0-based) index of the rows to filter out * from the data. */ std::string file_filter ; /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief whether a constant class to model the * sequence background should be added. This * class has the sequence background probabilities * (for the sequence model) and the mean number of * read counts (for read signal model) at each * position. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // EMJOINTAPPLICATION_HPP diff --git a/src/Applications/EMReadApplication.cpp b/src/Applications/EMReadApplication.cpp index d2e12b3..7d7ce26 100644 --- a/src/Applications/EMReadApplication.cpp +++ b/src/Applications/EMReadApplication.cpp @@ -1,175 +1,177 @@ #include #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include #include #include // filter() namespace po = boost::program_options ; EMReadApplication::EMReadApplication(int argn, char** argv) : file_read(""), file_filter(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMReadApplication::run() { if(this->runnable) { // read data Matrix2D data(this->file_read) ; // filter out some rows if needed std::vector filter ; if(this->file_filter != "") { // it is a column vector, easier to use the Matrix2D interface // to read it rather than coding a function for :) filter = Matrix2D(this->file_filter).get_data() ; std::sort(filter.begin(), filter.end()) ; data = filter_rows(filter, data) ; } EMRead em(std::move(data), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; em.classify() ; em.get_post_prob().save(this->file_out) ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMReadApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMRead is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given the shape \n" - "of the read density over the region. The assignment \n" - "probabilities are returned through stdout.\n\n" ; + "of the read/fragment count density over the region. The\n" + "assignment probabilities are written in a binary format as a" + "4D matrix of dimensions number of regions x number of\n" + "classes x number of shift states x number of flip states\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_read_msg = "The path to the file containing the read density data" ; std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" "indices of rows to filter out in the data." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom to realign the data. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_bckg_msg = "Adds a class to model the signal background. This class\n" "contains the mean number of read at each position and\n" "is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_read == "" and (not help)) { std::string msg("Error! No data were given (--read)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMReadApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMReadApplication.hpp b/src/Applications/EMReadApplication.hpp index 5f76c66..896433c 100644 --- a/src/Applications/EMReadApplication.hpp +++ b/src/Applications/EMReadApplication.hpp @@ -1,109 +1,110 @@ #ifndef EMREADAPPLICATION_HPP #define EMREADAPPLICATION_HPP #include #include /*! * \brief The EMReadApplication class is a wrapper around an EMRead - * instance creating an autonomous application to classify data by directly - * passing all the options and parameters from the command line. + * instance creating an autonomous application to classify read/fragment + * count data by directly passing all the options and parameters from + * the command line. */ class EMReadApplication: public ApplicationInterface { public: EMReadApplication() = delete ; EMReadApplication(const EMReadApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMReadApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the paths to the file containing the read * density data. */ std::string file_read ; /*! * \brief the path to the file containing the * (0-based) index of the rows to filter out * from the data. */ std::string file_filter ; /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief whether a constant class to model the * read count background should be added. This * class has mean number of read count at each * position. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // EMREADAPPLICATION_HPP diff --git a/src/Applications/EMSequenceApplication.cpp b/src/Applications/EMSequenceApplication.cpp index cfed4c6..18589f3 100644 --- a/src/Applications/EMSequenceApplication.cpp +++ b/src/Applications/EMSequenceApplication.cpp @@ -1,378 +1,379 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include #include #include // filter() #include // kmer::compute_kmer_pvalue() #include // order() namespace po = boost::program_options ; EMSequenceApplication::EMSequenceApplication(int argn, char** argv) : file_seq(""), files_motif(""), file_filter(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMSequenceApplication::run() { if(this->runnable) { EMSequence* em(nullptr) ; // data Matrix2D data(this->file_seq) ; // filter out some rows if needed std::vector filter ; if(this->file_filter != "") { // it is a column vector, easier to use the Matrix2D interface // to read it rather than coding a function for :) filter = Matrix2D(this->file_filter).get_data() ; std::sort(filter.begin(), filter.end()) ; data = filter_rows(filter, data) ; } // seeds motifs randomly if(this->files_motif == "" and this->seed != "") { em = new EMSequence(std::move(data), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } // seeds motifs with the given matrices else if(this->files_motif != "") { // model std::vector motif_paths ; boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ; // this->n_class = motif_paths.size() + this->bckg_class ; size_t model_ncol = data.get_ncol() - this->n_shift + 1 ; // add the given motif, random motifs (if needed) and // background class (if needed) Matrix3D model = this->init_model(model_ncol, data, motif_paths) ; em = new EMSequence(std::move(data), std::move(model), this->n_iter, this->bckg_class, this->n_threads) ; } // seeds from enriched kmers else { size_t model_ncol = data.get_ncol() - this->n_shift + 1 ; Matrix3D model = this->init_model_kmer(model_ncol, data) ; em = new EMSequence(std::move(data), std::move(model), this->n_iter, this->flip, this->bckg_class, this->n_threads) ; } // classify em->classify() ; em->get_post_prob().save(this->file_out) ; // clean delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMSequenceApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMSequence is a probabilistic partitioning algorithm that \n" "sofetly assigns sequences to classes given their motif content.\n" "The assignment probabilities are written in binary format as a 4D " - "matrix.\n\n" ; + "matrix of dimensions number of regions x number of\n" + "classes x number of shift states x number of flip states\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_seq_msg = "The path to the file containing the sequences" ; std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n" "values. The motifs should be probability matrices in horizontal format.\n" "If the motifs are too short after accounting for shifting, extra\n" "columns with uniform probabilities will be added on each side. The\n" "given number of classes (--class) should at least be the number of\n" "initial motifs. If the number of classes is bigger than the number of" "given motifs, the remaining classes are initialised randomly\n." ; std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" "indices of rows to filter out in the data." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n" "the data. By default, shifting is disabled (equivalent to\n" "--shift 1)." ; std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n" "contains the sequence background probabilities at each position\n" "and is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("seq", po::value(&(this->file_seq)), opt_seq_msg.c_str()) ("motifs", po::value(&(this->files_motif)), opt_motifs_msg.c_str()) ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_seq == "" and (not help)) { std::string msg("Error! No data were given (--seq)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } Matrix3D EMSequenceApplication::init_model(size_t l_model, const Matrix2D& data, const std::vector& motif_paths) const { int n_class_given = motif_paths.size() ; int n_class_bckg = this->bckg_class ; int n_class_rand = this->n_class - n_class_given - n_class_bckg ; // number of classes should at least be number of motifs if(n_class_given > (int)this->n_class) { char msg[4096] ; sprintf(msg, "Error! number of class given (--class %zu) should at " "least be equal to number of motifs (--motifs %d)", this->n_class, n_class_given) ; throw std::invalid_argument(msg) ; } // check if there is room for a background class if((int)this->n_class < n_class_given+this->bckg_class) { char msg[4096] ; sprintf(msg, "Error! no class left to add a background " "class (--bgclass) with the given motifs (--motifs) (--class %zu)", this->n_class) ; throw std::invalid_argument(msg) ; } // init empty model Matrix3D model(this->n_class, l_model, 4, 0.25) ; // add given motifs for(size_t i=0; i matrix(motif_paths[i]) ; // motif is too big for this shift if(matrix.get_ncol() > l_model) { char msg[4096] ; sprintf(msg, "Error! In %s, motif column number is bigger " "than data column number - shift + 1 " "(%zu > %zu - %zu + 1)", motif_paths[i].c_str(), matrix.get_ncol(), data.get_ncol(), this->n_shift) ; throw std::invalid_argument(msg) ; } // insert motif in middle of matrix else { // size_t j_model = this->n_shift / 2 ; size_t j_model = (l_model - matrix.get_ncol()) / 2 ; for(size_t j_mat=0, j_mod=j_model; j_mat 0) { // initialise randomly EMSequence em(data, n_class_rand, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; Matrix3D model_rand = em.get_sequence_models() ; // copy them into model for(int i_rand=0, i_mod=n_class_given; i_rand EMSequenceApplication::init_model_kmer(size_t l_model, const Matrix2D& data) const { // leave space for 2N's on each side size_t l_kmer = l_model ; size_t n_n = 0 ; // so far, 0 N's added if(l_model > 4) { n_n = 2 ; // 2 N's on each side l_kmer -= (2*n_n) ; } // compute the pvalue associated to each kmer auto kmers_pvalues = kmers::compute_kmer_pvalue(data, l_kmer) ; // sort kmers by ascending pvalue std::vector index = order(kmers_pvalues.second, true) ; // get most significant std::vector kmers(this->n_class) ; for(size_t i=0; in_class; i++) { size_t idx = index[i] ; kmers[i] = kmers_pvalues.first[idx] ; std::cerr << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ; } // turn to motifs double p_base = 0.7 ; // the prob of the base matching these of the kmer double p_nbase = 0.1 ; // the prob of the bases not matching these of the kmer double p_n = 0.25 ; // the prob of N // only N's for now Matrix3D model(this->n_class, l_model, 4, p_n) ; for(size_t i=0; i #include #include #include #include /*! * \brief The EMSequenceApplication class is a wrapper around an EMSequence - * instance creating an autonomous application to classify sequences by directly - * passing all the options and parameters from the command line. + * instance creating an autonomous application to classify DNA sequences by + * directly passing all the options and parameters from the command line. */ class EMSequenceApplication: public ApplicationInterface { public: EMSequenceApplication() = delete ; EMSequenceApplication(const EMSequenceApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMSequenceApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief Initialise the class models from the * most significantly enriched kmers in the data. * The kmers are turned into probability matrices * using the following rules : * 1) the probability of a base matching a base at * a given position of the kmer is set to 0.7, all * other to 0.1 * 2) if the model length is bigger than 4, two * N's are added on each side of the central kmer * core (prob are 0.25, 0,25, 0.25, 0.25). * \param l_model the length of the model. * \param data the sequences in integer format * (A:0, C:1, G:2, T:3). * Its dimensions are the following : * 1st the number of sequences * 2nd the length of the sequences. * \return The models initialised. * Its dimensions are the following : * 1st the number of classes * 2nd the length of the model * 3rd 4 for A, C, G, T */ Matrix3D init_model_kmer(size_t l_model, const Matrix2D& data) const ; /*! * \brief Initialise the class models if matrices * are given as initial class motifs. * If the given class motifs are shorter than the * model after accounting for shifting, extra columns * with uniform probabilities will be added on each * side. * If the number of classes is higher than the * number of given motifs, extra classes will be * initialised randomly.A background class is included * if needed. * \param l_model the number of positions (columns) * of the model to initialise. * \param data the sequence matrix, in integer format. * \param motif_paths the paths to the files containing * the probability matrices to use to initialise the * class motifs. * \return The models initialised. * Its dimensions are the following : * 1st the number of classes * 2nd the length of the model * 3rd 4 for A, C, G, T */ Matrix3D init_model(size_t l_model, const Matrix2D& data, const std::vector& motif_paths) const ; /*! * \brief the paths to the file containing the sequence * data. */ std::string file_seq ; /*! * \brief a coma separated list of files containing the * initial motif matrices. */ std::string files_motif ; /*! * \brief the path to the file containing the * (0-based) index of the rows to filter out * from the data. */ std::string file_filter ; /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief whether a constant class to model the * sequence background should be added. This * class has the sequence background probabilities * at each position. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // EMSEQUENCEAPPLICATION_HPP diff --git a/src/Applications/MatrixBinToTxtApplication.cpp b/src/Applications/MatrixBinToTxtApplication.cpp index e3aa898..161a0e9 100644 --- a/src/Applications/MatrixBinToTxtApplication.cpp +++ b/src/Applications/MatrixBinToTxtApplication.cpp @@ -1,156 +1,157 @@ #include #include #include #include #include namespace po = boost::program_options ; -// valid types +// valid types in the matrix handled std::string type_int = "int" ; std::string type_char = "char" ; std::string type_double = "double" ; MatrixBinToTxtApplication::MatrixBinToTxtApplication(int argn, char** argv) : file_matrix(""), type(""), ndim(0), runnable(false) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int MatrixBinToTxtApplication::run() { if(this->runnable) { if(this->type == type_int) { if(this->ndim == 2) { Matrix2D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } else if(this->ndim == 3) { Matrix3D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } else if(this->ndim == 4) { Matrix4D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } } else if(this->type == type_char) { if(this->ndim == 2) { Matrix2D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } else if(this->ndim == 3) { Matrix3D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } else if(this->ndim == 4) { Matrix4D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } } else if(this->type == type_double) { if(this->ndim == 2) { Matrix2D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } else if(this->ndim == 3) { Matrix3D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } else if(this->ndim == 4) { Matrix4D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } } return EXIT_SUCCESS ; } else { return EXIT_FAILURE; } } void MatrixBinToTxtApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" - "MatrixBinToTxt reads a matrix file in binary format and displays it\n" - "in text format.\n\n" ; + "MatrixBinToTxt reads a Matrix that has been dumped in file in\n" + "binary format (using Matrix.save(const std::string&) method)and " + "displays it in text format through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_file_mat_msg = "The path to the matrix file." ; std::string opt_ndim_msg = "The number of dimensions that the matrix has." ; char opt_type_msg[4096] ; sprintf(opt_type_msg, "The type of the data stored in the matrix. It should be %s, %s or\n" "%s.", type_int.c_str(), type_char.c_str(), type_double.c_str()); // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("file", po::value(&(this->file_matrix)), opt_file_mat_msg.c_str()) ("type", po::value(&(this->type)), opt_type_msg) ("ndim", po::value(&(this->ndim)), opt_ndim_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_matrix == "" and (not help)) { std::string msg("Error! No input file was given (--file)!") ; throw std::invalid_argument(msg) ; } else if(this->type == "" and (not help)) { std::string msg("Error! No data type was not given (--type)!") ; throw std::invalid_argument(msg) ; } else if(this->ndim == 0 and (not help)) { std::string msg("Error! The matrix dimensionality was not given (--ndim)!") ; throw std::invalid_argument(msg) ; } else if((this->type != type_int) and (this->type != type_char) and (this->type != type_double) and (not help)) { char msg[4096] ; sprintf(msg, "Error! Unsupported data type %s! It should be %s, %s or %s (--type)", this->type.c_str(), type_int.c_str(), type_char.c_str(), type_double.c_str()) ; throw std::invalid_argument(msg) ; } else if(this->ndim == 1) { std::string msg("Error! The matrix dimensionality cannot be 1!") ; throw std::invalid_argument(msg) ; } else if(this->ndim > 4) { char msg[4096] ; sprintf(msg, "Error! The matrix dimensionality cannot be bigger than 4 (%zu)!", this->ndim) ; throw std::invalid_argument(msg) ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { MatrixBinToTxtApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/MatrixBinToTxtApplication.hpp b/src/Applications/MatrixBinToTxtApplication.hpp index 73b4f9d..d203db0 100644 --- a/src/Applications/MatrixBinToTxtApplication.hpp +++ b/src/Applications/MatrixBinToTxtApplication.hpp @@ -1,73 +1,81 @@ #ifndef MATRIXBINTOTXTAPPLICATION_HPP #define MATRIXBINTOTXTAPPLICATION_HPP #include #include +/*! + * \brief The MatrixBinToTxtApplication class implementes an autonomous + * application that allows to convert a Matrix object dumped (using + * Matrix.save(const std::string&) method) in binary format into text + * format. + * It can handle the conversion of Maitrx2D, 3D and 4D containing , + * and types. + */ class MatrixBinToTxtApplication : public ApplicationInterface { public: MatrixBinToTxtApplication() = delete ; MatrixBinToTxtApplication(const MatrixBinToTxtApplication& other) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ MatrixBinToTxtApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the file containing the matrix. */ std::string file_matrix ; /*! * \brief the type of the data stored. */ std::string type ; /*! * \brief the number of dimensions of the matrix. */ size_t ndim ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // MATRIXBINTOTXTAPPLICATION_HPP diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp index 8a0613f..1b26fbf 100644 --- a/src/Applications/ProbToModelApplication.cpp +++ b/src/Applications/ProbToModelApplication.cpp @@ -1,320 +1,324 @@ #include #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include #include #include // filter_rows() namespace po = boost::program_options ; typedef std::vector vector_d ; ProbToModelApplication::ProbToModelApplication(int argn, char** argv) : file_read(""), file_seq(""), file_prob(""), file_filter(""), bckg_class(false), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ProbToModelApplication::~ProbToModelApplication() {} int ProbToModelApplication::run() { if(this->runnable) { // load data std::string file_data ; bool read_data = false ; bool seq_data = false ; bool consseq_data = false ; if(this->file_read != "") { file_data = this->file_read ; seq_data = consseq_data = false ; read_data = true ; } else if(this->file_seq != "") { file_data = this->file_seq ; read_data = consseq_data = false ; seq_data = true ; } else if(this->file_consseq != "") { file_data = this->file_consseq ; read_data = seq_data = false ; consseq_data = true ; } else { std::string msg("Error! Could not determine the type of the data!") ; throw std::runtime_error(msg) ; } // row filter std::vector filter ; if(this->file_filter != "") { // it is a column vector, easier to use the Matrix2D interface // to read it rather than coding a function for :) filter = Matrix2D(this->file_filter).get_data() ; std::sort(filter.begin(), filter.end()) ; } // prob Matrix4D prob ; prob.load(this->file_prob) ; // get the data model ModelComputer* ptr = nullptr ; if(read_data) { Matrix2D data(file_data) ; // filter out some rows if needed if(filter.size()) { data = filter_rows(filter, data) ; } this->check_dimensions(data, prob) ; ptr = new ReadModelComputer(std::move(data), prob, this->bckg_class, this->n_threads) ; } else if(seq_data) { Matrix2D data(file_data) ; // filter out some rows if needed if(filter.size()) { data = filter_rows(filter, data) ; } this->check_dimensions(data, prob) ; ptr = new SequenceModelComputer(std::move(data), prob, this->bckg_class, this->n_threads) ; } else if(consseq_data) { Matrix3D data ; data.load(file_data) ; // filter out some rows if needed if(filter.size()) { data = filter_rows(filter, data) ; } this->check_dimensions(data, prob) ; ptr = new ConsensusSequenceModelComputer(std::move(data), prob, this->bckg_class, this->n_threads) ; } Matrix2D model = ptr->get_model() ; delete ptr ; ptr = nullptr ; // compute the class prob size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob if(read_data) { for(size_t i=0; i(&(this->file_read)), opt_read_msg.c_str()) ("seq,", po::value(&(this->file_seq)), opt_seq_msg.c_str()) ("consseq,", po::value(&(this->file_consseq)), opt_seq_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if((this->file_read == "") and (this->file_seq == "") and (this->file_consseq == "") and (not help)) { std::string msg("Error! No data file was given (none of --read --seq --consseq)!") ; throw std::invalid_argument(msg) ; } else if((this->file_read != "") and (this->file_seq != "") and (not help)) { std::string msg("Error! --read and --seq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } else if((this->file_read != "") and (this->file_consseq != "") and (not help)) { std::string msg("Error! --read and --consseq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } else if((this->file_seq != "") and (this->file_consseq != "") and (not help)) { std::string msg("Error! --seq and --consseq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probabily file was given (--prob)!") ; throw std::invalid_argument(msg) ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } void ProbToModelApplication::check_dimensions(const Matrix2D& data, const Matrix4D& prob) const { if(data.get_nrow() != prob.get_dim()[0]) { char msg[4096] ; sprintf(msg, "Error! data and prob matrices have unequal " "row numbers (%zu / %zu)!", data.get_nrow(), prob.get_dim()[0]) ; throw std::runtime_error(msg) ; } else if(data.get_ncol() < prob.get_dim()[2]) { char msg[4096] ; sprintf(msg, "Error! too many shift states for the data width!" "(%zu shift states and %zu columns in data)!", prob.get_dim()[2], data.get_ncol()) ; throw std::runtime_error(msg) ; } } void ProbToModelApplication::check_dimensions(const Matrix3D& data, const Matrix4D& prob) const { if(data.get_dim()[0] != prob.get_dim()[0]) { char msg[4096] ; sprintf(msg, "Error! data and prob matrices have unequal " "row numbers (%zu / %zu)!", data.get_dim()[0], prob.get_dim()[0]) ; throw std::runtime_error(msg) ; } else if(data.get_dim()[1] < prob.get_dim()[2]) { char msg[4096] ; sprintf(msg, "Error! too many shift states for the data width!" "(%zu shift states and %zu data width)!", prob.get_dim()[2], data.get_dim()[1]) ; throw std::runtime_error(msg) ; } else if(data.get_dim()[2]!= 4) { char msg[4096] ; sprintf(msg, "Error! data 3rd dimension is not 4!" "(%zu)!", data.get_dim()[2]) ; throw std::runtime_error(msg) ; } } int main(int argn, char** argv) { ProbToModelApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ProbToModelApplication.hpp b/src/Applications/ProbToModelApplication.hpp index 27954ae..7417a16 100644 --- a/src/Applications/ProbToModelApplication.hpp +++ b/src/Applications/ProbToModelApplication.hpp @@ -1,139 +1,139 @@ #ifndef PROBTOMODELAPPLICATION_HPP #define PROBTOMODELAPPLICATION_HPP #include #include #include #include #include #include /*! * \brief The ProbToModelApplication class is a wrapper around an ModelReferenceComputer - * instance creating an autonomous application to compute data models given the + * instance creating an autonomous application that computes data models given the * data and the results of the classification procedure (the posterior probability * matrix). */ class ProbToModelApplication : public ApplicationInterface { public: ProbToModelApplication() = delete ; ProbToModelApplication(const ProbToModelApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ ProbToModelApplication(int argn, char** argv) ; /*! * \brief Destructor. */ virtual ~ProbToModelApplication() override ; /*! * \brief Runs the application. The data model * is computed and displayed through the * stdout. * \return the exit code. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief Checks that both matrices have * compatible dimensions. * For this, both matrices should have the * same number of rows (1st dimension) and * the data number of columns should not be * smaller than the probability matrix 3rd * dimension (shifting freedom incompatible * with data width). * \param data the data matrix. * \param prob the probability matrix. * \throw std::runtime_error with a descriptive * message if both matrices are not compatible. */ void check_dimensions(const Matrix2D& data, const Matrix4D& prob) const ; /*! * \brief Checks that both matrices have * compatible dimensions. * For this, both matrices should have the * 1st dimension size and the data 2nd * dimensions should not be smaller than * the probability matrix 3rd dimensions * (shifting freedom incompatible * with data width). * Additionaly, the data matrix should have * a 3rd dimension of size 4 (for A,C,G,T). * \param data the data matrix. * \param prob the probability matrix. * \throw std::runtime_error with a descriptive * message if both matrices are not compatible. */ void check_dimensions(const Matrix3D& data, const Matrix4D& prob) const ; /*! * \brief the path to the file containing the * read count data. */ std::string file_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_seq ; /*! * \brief the path to the file containing the * consensus sequence data. */ std::string file_consseq ; /*! * \brief the path to the file containing the * classification posterior probabilities. */ std::string file_prob ; /*! * \brief the path to the file containing the * (0-based) index of the rows to filter out * from the data. */ std::string file_filter ; /*! * \brief whether the last class of the * classification (posterior probabilities) is a * background class. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief whether run() can be called. */ bool runnable ; } ; #endif // PROBTOMODELAPPLICATION_HPP diff --git a/src/Applications/SequenceMatrixCreatorApplication.cpp b/src/Applications/SequenceMatrixCreatorApplication.cpp index 2fbfebc..02266ad 100644 --- a/src/Applications/SequenceMatrixCreatorApplication.cpp +++ b/src/Applications/SequenceMatrixCreatorApplication.cpp @@ -1,138 +1,140 @@ #include #include #include #include #include #include // std::invalid_argument namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv) : file_bed(""), file_fasta(""), from(0), to(0), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int CorrelationMatrixCreatorApplication::run() { if(this->runnable) { SequenceMatrixCreator mc(this->file_bed, this->file_fasta, this->from, this->to) ; std::cout << mc.create_matrix() << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "SequenceMatrixCreator is an application that creates a\n" "sequence matrix (1 nucleotide per matrix cell in int format)\n" "from a BED file and a fasta file and returnes it through stdout.\n" - "The matrix contains the sequences around the center of the BED\n" - "regions (references), +/- the given offsets. The sequences are\n" - "found by searching the sequences in the fasta file. Thus, the\n" - "sequence IDs in the BED file are expected to match exactly the\n" - "sequence headers in the fasta file.\n" + "The sequence centers are defined as the center position of each\n" + "region contained in the bed file. The corresponding single bp\n" + "regions are then extended using the from/to parameters on each side.\n" + "The corresponding sequences are then extracted from the fasta file.\n" + "The sequences are found by searching the sequences in the fasta file\n. " + "Thus, the sequence IDs in the BED file are expected to match EXACTLY\n" + "the sequence headers in the fasta file.\n" "The DNA character to integer conversion code is the following :\n" "A->0, C->1, G->2, T->3, N->4.\n" "The matrix is a 2D matrix which dimensions are :\n" "1) number of regions\n" - "2) length of region (to - from + 1)\n\n" ; + "2) length of region ( - + 1)\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_bed_msg = "The path to the BED file containing the references."; - std::string opt_fasta_msg = "The path to the fasta file containing the targets. The file " - "extension should be .fa or .fasta."; + std::string opt_fasta_msg = "The path to the fasta file containing the target sequences. The file " + "extension must be .fa or .fasta."; std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build " "around each reference center." ; std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build " "around each reference center." ; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string method(method_read) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) ("fasta", po::value(&(this->file_fasta)), opt_fasta_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_fasta == "" and (not help)) { std::string msg("Error! No fasta file was given (--fasta)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { CorrelationMatrixCreatorApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/SequenceMatrixCreatorApplication.hpp b/src/Applications/SequenceMatrixCreatorApplication.hpp index d68296a..1b44ca2 100644 --- a/src/Applications/SequenceMatrixCreatorApplication.hpp +++ b/src/Applications/SequenceMatrixCreatorApplication.hpp @@ -1,87 +1,96 @@ #ifndef SEQUENCEMATRIXCREATORAPPLICATION_HPP #define SEQUENCEMATRIXCREATORAPPLICATION_HPP #include #include /*! * \brief The SequenceMatrixCreatorApplication class is a wrapper around a * SequenceMatrixCreator instance creating an autonomous application to - * compute a sequence matrix from a bed file containing the regions of interest - * and a fasta file containing the genome sequences by directly passing all the - * options and parameters from the command line. + * compute a DNA sequence matrix from a bed file containing the regions of + * interest and a fasta file containing the genome sequences by directly + * passing all the options and parameters from the command line. + * + * The sequence centers are defined as the center position of each + * region contained in the bed file. The corresponding single bp + * regions are then extended using the from/to parameters on each side. + * The corresponding sequences are then extracted from the fasta file. + * + * The resulting matrix contains one sequence per row and one character per + * column. DNA bases are converted into integer codes and printed on stdout. + * */ class CorrelationMatrixCreatorApplication: public ApplicationInterface { public: CorrelationMatrixCreatorApplication() = delete ; CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ CorrelationMatrixCreatorApplication(int argn, char** argv) ; /*! * \brief Runs the application. A matrix containing the * sequences around the center of the bed regions, +/- * the given offsets is created by searching the fasta file * and printed on the stdout. * The matrix is a 2D matrix with dimensions : * 1) number of regions * 2) length of region (to - from + 1). * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the fasta file. */ std::string file_fasta ; /*! * \brief a relative coordinate indicating the * most downstream position to consider around * each region in the bed file. */ int from ; /*! * \brief a relative coordinate indicating the * most upstream position to consider around * each region in the bed file. */ int to ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // SEQUENCEMATRIXCREATORAPPLICATION_HPP