diff --git a/src/Applications/ClassReadDataCreatorApplication.cpp b/src/Applications/ClassReadDataCreatorApplication.cpp index ce04d46..89d0f26 100644 --- a/src/Applications/ClassReadDataCreatorApplication.cpp +++ b/src/Applications/ClassReadDataCreatorApplication.cpp @@ -1,215 +1,212 @@ #include #include #include #include #include #include #include // std::invalid_argument #include namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; ClassReadDataCreatorApplication::ClassReadDataCreatorApplication(int argn, char** argv) : file_bed(""), file_bam(""), file_bai(""), file_prob(""), from(0), to(0), bin_size(0), class_k(0), method(CorrelationMatrixCreator::FRAGMENT), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int ClassReadDataCreatorApplication::run() -{ std::cerr << "ClassReadDataCreatorApplication::run() START" << std::endl ; - if(this->runnable) - { std::cerr << "ClassReadDataCreatorApplication::run() run" << std::endl ; - ClassReadDataCreator crc(this->file_bed, +{ if(this->runnable) + { ClassReadDataCreator crc(this->file_bed, this->file_bam, this->file_bai, this->file_prob, this->from, this->to, this->bin_size, this->class_k, this->method) ; // display integer matrix for given class std::cout << crc.create_matrix() << std::endl ; - std::cerr << "ClassReadDataCreatorApplication::run() END" << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void ClassReadDataCreatorApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "ClassReadDataCreator is an application that retro-compute\n" "a fictive count matrix of a given class K, from a partition, assuming\n" "that the classification is true.\n" "To construct the final matrix M3, the original matrix M1 that was\n" "partitioned is computed and extended into a matrix M2. The original\n" "matrix M1 has L1 column and was partitioned allowing S shifting freedom.\n" "A number of extra columns E, overall, is added to the matrix M1 (~E/2 on\n" "each side). The final matrix M3, of L1 columns, is then computed. A row\n" "of the final matrix M3 is the weighted average of each of the S possibles\n" "slices of the corresponding row in M2. The weight used are th\n" "probabilities with which this row was assigned to class K, for each of\n" "the S slices.\n" "Thus, the final result is a 2D matrix of dimensions N,L where N is the\n" "number of regions and L the number of bins in which the regions have\n" "divided\n." "The resulting 2D matrix is returned through stdout in text format." ; std::string opt_help_msg = "Produces this help message." ; std::string opt_bed_msg = "The path to the BED file containing the references."; std::string opt_bam_msg = "The path to the BAM file containing the targets."; std::string opt_bai_msg = "The path to the BAI file containing the BAM file index."; std::string opt_prob_msg = "The path to the file containing the assignment probabilities\n" "(the partition)." ; std::string opt_from_msg = "The most upstream position - in relative coordinate - of the regions\n" "in the original matrix." 
; std::string opt_to_msg = "The most downstream position - in relative coordinate - of the regions\n" "in the original matrix." ; std::string opt_classk_msg = "The index (1-based) of the class of interest." ; std::string opt_binsize_msg = "The size of the bins, in the original matrix." ; char tmp[4096] ; sprintf(tmp, "How the data in the BAM file should be handled when computing\n" "the number of counts in each bin.\n" "\t\"%s\" uses each position within the reads (by default)\n" "\t\"%s\" uses only the insertion site for ATAC-seq data\n" "\t\"%s\" uses each position within the fragments\n" "\t\"%s\" uses only the fragment central positions\n", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; std::string opt_method_msg = tmp ; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string method(method_read) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) ("prob", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) ("k", po::value(&(this->class_k)), opt_classk_msg.c_str()) ("method", po::value(&(method)), opt_method_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No probability (partition) file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->class_k == 0 and (not help)) { std::string msg("Error! no class of interest was given (--k)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! 
method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ClassReadDataCreatorApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ClassSequenceDataCreatorApplication.cpp b/src/Applications/ClassSequenceDataCreatorApplication.cpp index 495284f..67da3f3 100644 --- a/src/Applications/ClassSequenceDataCreatorApplication.cpp +++ b/src/Applications/ClassSequenceDataCreatorApplication.cpp @@ -1,164 +1,163 @@ #include #include #include #include #include #include #include // std::invalid_argument #include namespace po = boost::program_options ; ClassSequenceDataCreatorApplication::ClassSequenceDataCreatorApplication(int argn, char** argv) : file_bed(""), file_fasta(""), file_prob(""), file_out(""), from(0), to(0), class_k(0), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int ClassSequenceDataCreatorApplication::run() { if(this->runnable) { ClassSequenceDataCreator csc(this->file_bed, this->file_fasta, this->file_prob, this->from, this->to, this->class_k) ; // display integer matrix for given class csc.create_matrix().save(this->file_out) ; - return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void ClassSequenceDataCreatorApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "ClassSequenceDataCreator is an application that retro-compute\n" "a fictive sequence matrix of a given class K, from a partition, assuming\n" "that the classification is true.\n" "To construct the final matrix M3, the original matrix M1 that was\n" "partitioned is computed and extended into a matrix M2. The original\n" "matrix M1 has L1 column and was partitioned allowing S shifting freedom.\n" "A number of extra columns E, overall, is added to the matrix M1 (~E/2 on\n" "each side). The final matrix M3, of L1 columns, is then computed. A row\n" "of the final matrix M3 is the weighted average of each of the S possibles\n" "slices of the corresponding row in M2. The weight used are the\n" "probabilities with which this row was assigned to class K, for each of\n" "the S slices. Because sequences cannot be averaged (for instance\n" "0.5*AAA + 0.5*CCC), the final result is a collection of N consensus\n" "sequences represented as a sequence probability matrix where N is\n" "the number of original sequences in M1 (also in M2).\n" "Thus the final result is a 3D matrix of dimensions N,L,4 where N\n" "is the number of consensus sequences, L the length of the consensus\n" "sequences and 4 for A,C,G,T.\n" "The resulting 3D matrix is written to a given file in binary format." ; std::string opt_help_msg = "Produces this help message." 
; std::string opt_bed_msg = "The path to the BED file containing the references."; std::string opt_fasta_msg = "The path to the fasta file containing the sequences."; std::string opt_prob_msg = "The path to the file containing the assignment probabilities\n" "(the partition)." ; std::string opt_out_msg = "The path to the file in which the results will be written (3D matrix\n" "in binary format)." ; std::string opt_from_msg = "The most upstream position - in relative coordinate - of the regions\n" "in the original matrix." ; std::string opt_to_msg = "The most downstream position - in relative coordinate - of the regions\n" "in the original matrix." ; std::string opt_classk_msg = "The index (1-based) of the class of interest." ; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) ("fasta", po::value(&(this->file_fasta)), opt_fasta_msg.c_str()) ("prob", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("out", po::value(&(this->file_out)), opt_out_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("k", po::value(&(this->class_k)), opt_classk_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_fasta == "" and (not help)) { std::string msg("Error! No fasta file was given (--fasta)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No probability (partition) file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->file_out == "" and (not help)) { std::string msg("Error! No output file was given (--out)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->class_k == 0 and (not help)) { std::string msg("Error! 
no class of interest was given (--k)!") ; throw std::invalid_argument(msg) ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ClassSequenceDataCreatorApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMConsensusSequenceApplication.cpp b/src/Applications/EMConsensusSequenceApplication.cpp new file mode 100644 index 0000000..05ccddd --- /dev/null +++ b/src/Applications/EMConsensusSequenceApplication.cpp @@ -0,0 +1,187 @@ + +#include +#include + +#include +#include +#include +#include // std::move() +#include // std::invalid_argument +#include + +#include +#include +#include + +namespace po = boost::program_options ; + + +EMConsensusSequenceApplication::EMConsensusSequenceApplication(int argn, char** argv) + : file_consseq(""), file_filter(""), file_out(""), + n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), + n_threads(0), seed(""), runnable(true) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} + +int EMConsensusSequenceApplication::run() +{ if(this->runnable) + { EMConsensusSequence* em(nullptr) ; + + // row filter + std::vector filter ; + if(this->file_filter != "") + { // it is a column vector, easier to use the Matrix2D interface + // to read it rather than coding a function for :) + filter = Matrix2D(this->file_filter).get_data() ; + std::sort(filter.begin(), filter.end()) ; + } + + // data + Matrix3D data ; + data.load(this->file_consseq) ; + // filter out some rows if needed + if(filter.size()) + { data = filter_rows(filter, data) ; } + + // seeds motifs randomly + em = new EMConsensusSequence(std::move(data), + this->n_class, + this->n_iter, + this->n_shift, + this->flip, + this->bckg_class, + this->seed, + this->n_threads) ; + + // classify + em->classify() ; + em->get_post_prob().save(this->file_out) ; + + // clean + delete em ; + em = nullptr ; + + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE ; } +} + +void EMConsensusSequenceApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "EMConsensusSequence is a probabilistic partitioning algorithm that \n" + "sofetly assigns consensus sequences to classes given their motif\n" + "content.\n" + "The assignment probabilities are written in binary format as a 4D " + "matrix.\n\n" ; + std::string opt_help_msg = "Produces this help message." ; + std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " + "by default 0 (no parallelization)." ; + std::string opt_consseq_msg = "The path to the file containing the consensus sequences" ; + std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" + "indices of rows to filter out in the data." ; + std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" + "in binary format." ; + std::string opt_iter_msg = "The number of iterations." ; + std::string opt_class_msg = "The number of classes to find." ; + std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n" + "the data. By default, shifting is disabled (equivalent to\n" + "--shift 1)." 
; + std::string opt_flip_msg = "Enables flipping to realign the data."; + std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n" + "contains the sequence background probabilities at each position\n" + "and is never updated." ; + std::string opt_seed_msg = "A value to seed the random number generator."; + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + std::string seeding_tmp ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("consseq", po::value(&(this->file_consseq)), opt_consseq_msg.c_str()) + ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) + + ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) + + ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) + ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) + ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) + ("flip", opt_flip_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) + + ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occurred while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks improper option settings + if(this->file_consseq == "" and + (not help)) + { std::string msg("Error! No data were given (--consseq)!") ; + throw std::invalid_argument(msg) ; + } + if(this->file_out == "" and + (not help)) + { std::string msg("Error! No output file given (--out)!") ; + throw std::invalid_argument(msg) ; + } + + // no iter given -> 1 iter + if(this->n_iter == 0) + { this->n_iter = 1 ; } + // no class given -> 1 class + if(this->n_class == 0) + { this->n_class = 1 ; } + // no shift given, value of 1 -> no shift + if(this->n_shift == 0) + { this->n_shift = 1 ; } + // set flip + if(vm.count("flip")) + { this->flip = true ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + +int main(int argn, char** argv) +{ EMConsensusSequenceApplication app(argn, argv) ; + return app.run() ; +} + diff --git a/src/Applications/EMJointApplication.hpp b/src/Applications/EMConsensusSequenceApplication.hpp similarity index 72% copy from src/Applications/EMJointApplication.hpp copy to src/Applications/EMConsensusSequenceApplication.hpp index 3ef57ca..f356f4d 100644 --- a/src/Applications/EMJointApplication.hpp +++ b/src/Applications/EMConsensusSequenceApplication.hpp @@ -1,115 +1,117 @@ -#ifndef EMJOINTAPPLICATION_HPP -#define EMJOINTAPPLICATION_HPP +#ifndef EMCONSENSUSSEQUENCEAPPLICATION_HPP +#define EMCONSENSUSSEQUENCEAPPLICATION_HPP #include -#include #include +#include + +#include +#include /*! - * \brief The EMJointApplication class is a wrapper around an EMJoint - * instance creating an autonomous application to classify data by directly - * passing all the options and parameters from the command line.
+ * \brief The EMConsensusSequenceApplication class is a wrapper around an + * EMConsensusSequence instance creating an autonomous application to classify + * consensus sequences by directly passing all the options and parameters from + * the command line. */ -class EMJointApplication: public ApplicationInterface +class EMConsensusSequenceApplication: public ApplicationInterface { public: - EMJointApplication() = delete ; - EMJointApplication(const EMJointApplication& app) = delete ; + EMConsensusSequenceApplication() = delete ; + EMConsensusSequenceApplication(const EMConsensusSequenceApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - EMJointApplication(int argn, char** argv) ; + EMConsensusSequenceApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! - * \brief a coma separated list of paths to the files - * containing the read density data + * \brief the paths to the file containing the + * consensus sequence data. */ - std::string files_read ; + std::string file_consseq ; + /*! * \brief the path to the file containing the - * sequence data. + * (0-based) index of the rows to filter out + * from the data. */ - std::string file_sequence ; + std::string file_filter ; + /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; + /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief whether a constant class to model the * sequence background should be added. This * class has the sequence background probabilities - * (for the sequence model) and the mean number of - * read counts (for read signal model) at each - * position. + * at each position. */ bool bckg_class ; - /*! * \brief the number of threads. */ size_t n_threads ; - /*! * \brief a seed to initialise the random number generator. */ std::string seed ; - /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; - -#endif // EMJOINTAPPLICATION_HPP +#endif // EMCONSENSUSSEQUENCEAPPLICATION_HPP diff --git a/src/Applications/EMJointApplication.cpp b/src/Applications/EMJointApplication.cpp index e0ba459..110d9b8 100644 --- a/src/Applications/EMJointApplication.cpp +++ b/src/Applications/EMJointApplication.cpp @@ -1,202 +1,248 @@ #include #include #include #include +#include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include +#include +#include // filter() namespace po = boost::program_options ; -template -std::ostream& operator << (std::ostream& stream, - const std::vector& v) -{ for(const auto& x : v) - { stream << x << " " ; } - return stream ; -} EMJointApplication::EMJointApplication(int argn, char** argv) - : files_read(""), file_sequence(""), file_out(""), + : files_read(""), file_sequence(""), file_conssequence(""), + file_filter(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMJointApplication::run() { if(this->runnable) { // read data std::vector read_paths ; boost::split(read_paths, this->files_read, [](char c){return c == ',';}) ; + // row filter + std::vector filter ; + if(this->file_filter != "") + { // it is a column vector, easier to use the Matrix2D interface + // to read it rather than coding a function for :) + filter = Matrix2D(this->file_filter).get_data() ; + std::sort(filter.begin(), filter.end()) ; + } + std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } - data_read.push_back(Matrix2D(path)) ; + Matrix2D data(path) ; + // filter out some rows if needed + if(filter.size()) + { data = filter_rows(filter, data) ; } + data_read.push_back(std::move(data)) ; } - // sequence data + // read data only EMJoint* em = nullptr ; - if(this->file_sequence == "") + if(this->file_sequence == "" and + this->file_conssequence == "") { em = new EMJoint(std::move(data_read), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } - else + // read and sequence data + else if(this->file_sequence != "" and + this->file_conssequence == "") { Matrix2D data_seq(this->file_sequence) ; + // filter out some rows if needed + if(filter.size()) + { data_seq = filter_rows(filter, data_seq) ; } + em = new EMJoint(std::move(data_read), std::move(data_seq), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } + // read and consensus sequence data + else if(this->file_sequence == "" and + this->file_conssequence != "") + { Matrix3D data_consseq ; + data_consseq.load(this->file_conssequence) ; + // filter out some rows if needed + if(filter.size()) + { data_consseq = filter_rows(filter, data_consseq) ; } + em = new EMJoint(std::move(data_read), + std::move(data_consseq), + this->n_class, + this->n_iter, + this->n_shift, + this->flip, + this->bckg_class, + this->seed, + this->n_threads) ; + } em->classify() ; em->get_post_prob().save(this->file_out) ; delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMJointApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" 
; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMJoint is a probabilistic partitioning algorithm that \n" "softly assigns genomic regions to classes given 1) the shapes \n" "of the read densities over the regions and 2) the region sequence \n" "motif contents. \n " "The assignment probabilities are returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations, \n" "by default 0 (no parallelization)." ; std::string opt_read_msg = "A comma-separated list of paths to the file containing the \n" "read density data. At least one path is needed." ; std::string opt_seq_msg = "The path to the file containing the sequence data. If no path is \n" - "given, the classification is only cares about the read density \n" - "shapes." ; + "given, the classification only cares about the read density shapes." ; + std::string opt_consseq_msg = "The path to the file containing the consensus sequence data. If no path is \n" + "given, the classification only cares about the read density shapes." ; + std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" + "indices of rows to filter out in the data." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of columns of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; std::string opt_bckg_msg = "Adds a class to model the sequence and the read signal background. This\n" "class contains sequence background probabilities (for the sequence model)\n" "and the mean number of reads (for the read count models) at each position\n" "and is never updated."
; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) - ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) - ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) - ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) + ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) + ("seq", po::value(&(this->file_sequence)), opt_seq_msg.c_str()) + ("consseq", po::value(&(this->file_conssequence)), opt_consseq_msg.c_str()) + ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) + ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) - ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) - ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) - ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) + ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) + ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) + ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) - ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) - ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occurred while parsing the options") ; } bool help = vm.count("help") ; // checks improper option settings if(this->files_read == "" and this->file_sequence == "" and + this->file_conssequence == "" and (not help)) - { std::string msg("Error! No data were given (--read and --seq)!") ; + { std::string msg("Error! No data were given (--read, --seq or --consseq)!") ; throw std::invalid_argument(msg) ; } if(this->files_read == "" and (not help)) { std::string msg("Error! No read density data were given (--read)!") ; throw std::invalid_argument(msg) ; } + if(this->file_sequence != "" and + this->file_conssequence != "") + { std::string msg("Error! --seq and --consseq are mutually exclusive!") ; + throw std::invalid_argument(msg) ; + } if(this->file_out == "" and (not help)) { std::string msg("Error! 
No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMJointApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMJointApplication.hpp b/src/Applications/EMJointApplication.hpp index 3ef57ca..81a8e92 100644 --- a/src/Applications/EMJointApplication.hpp +++ b/src/Applications/EMJointApplication.hpp @@ -1,115 +1,126 @@ #ifndef EMJOINTAPPLICATION_HPP #define EMJOINTAPPLICATION_HPP #include #include #include /*! * \brief The EMJointApplication class is a wrapper around an EMJoint * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ class EMJointApplication: public ApplicationInterface { public: EMJointApplication() = delete ; EMJointApplication(const EMJointApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMJointApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief a coma separated list of paths to the files * containing the read density data */ std::string files_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_sequence ; + /*! + * \brief the path to the file containing the + * consensus sequence data. + */ + std::string file_conssequence ; + /*! + * \brief the path to the file containing the + * (0-based) index of the rows to filter out + * from the data. + */ + std::string file_filter ; /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; /*! * \brief whether a constant class to model the * sequence background should be added. 
This * class has the sequence background probabilities * (for the sequence model) and the mean number of * read counts (for read signal model) at each * position. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // EMJOINTAPPLICATION_HPP diff --git a/src/Applications/MatrixBinToTxtApplication.cpp b/src/Applications/MatrixBinToTxtApplication.cpp new file mode 100644 index 0000000..e3aa898 --- /dev/null +++ b/src/Applications/MatrixBinToTxtApplication.cpp @@ -0,0 +1,156 @@ +#include + +#include + +#include +#include +#include + + +namespace po = boost::program_options ; + + +// valid types +std::string type_int = "int" ; +std::string type_char = "char" ; +std::string type_double = "double" ; + + +MatrixBinToTxtApplication::MatrixBinToTxtApplication(int argn, char** argv) + : file_matrix(""), type(""), ndim(0), runnable(false) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} + +int MatrixBinToTxtApplication::run() +{ if(this->runnable) + { + if(this->type == type_int) + { if(this->ndim == 2) + { Matrix2D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + else if(this->ndim == 3) + { Matrix3D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + else if(this->ndim == 4) + { Matrix4D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + } + else if(this->type == type_char) + { if(this->ndim == 2) + { Matrix2D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + else if(this->ndim == 3) + { Matrix3D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + else if(this->ndim == 4) + { Matrix4D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + } + else if(this->type == type_double) + { if(this->ndim == 2) + { Matrix2D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + else if(this->ndim == 3) + { Matrix3D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + else if(this->ndim == 4) + { Matrix4D m ; m.load(this->file_matrix) ; std::cout << m << std::endl ; } + } + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE; } +} + +void MatrixBinToTxtApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "MatrixBinToTxt reads a matrix file in binary format and displays it\n" + "in text format.\n\n" ; + std::string opt_help_msg = "Produces this help message." ; + std::string opt_file_mat_msg = "The path to the matrix file." ; + std::string opt_ndim_msg = "The number of dimensions that the matrix has." ; + char opt_type_msg[4096] ; + sprintf(opt_type_msg, "The type of the data stored in the matrix. 
It should be %s, %s or\n" + "%s.", type_int.c_str(), type_char.c_str(), type_double.c_str()); + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("file", po::value(&(this->file_matrix)), opt_file_mat_msg.c_str()) + ("type", po::value(&(this->type)), opt_type_msg) + ("ndim", po::value(&(this->ndim)), opt_ndim_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occurred while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks improper option settings + if(this->file_matrix == "" and + (not help)) + { std::string msg("Error! No input file was given (--file)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->type == "" and + (not help)) + { std::string msg("Error! No data type was given (--type)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->ndim == 0 and + (not help)) + { std::string msg("Error! The matrix dimensionality was not given (--ndim)!") ; + throw std::invalid_argument(msg) ; + } + else if((this->type != type_int) and + (this->type != type_char) and + (this->type != type_double) and + (not help)) + { char msg[4096] ; + sprintf(msg, "Error! Unsupported data type %s! It should be %s, %s or %s (--type)", + this->type.c_str(), + type_int.c_str(), type_char.c_str(), type_double.c_str()) ; + throw std::invalid_argument(msg) ; + } + else if(this->ndim == 1) + { std::string msg("Error! The matrix dimensionality cannot be 1!") ; + throw std::invalid_argument(msg) ; + } + else if(this->ndim > 4) + { char msg[4096] ; + sprintf(msg, "Error! The matrix dimensionality cannot be bigger than 4 (%zu)!", + this->ndim) ; + throw std::invalid_argument(msg) ; + } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + +int main(int argn, char** argv) +{ MatrixBinToTxtApplication app(argn, argv) ; + return app.run() ; +} diff --git a/src/Applications/MatrixBinToTxtApplication.hpp b/src/Applications/MatrixBinToTxtApplication.hpp new file mode 100644 index 0000000..73b4f9d --- /dev/null +++ b/src/Applications/MatrixBinToTxtApplication.hpp @@ -0,0 +1,73 @@ +#ifndef MATRIXBINTOTXTAPPLICATION_HPP +#define MATRIXBINTOTXTAPPLICATION_HPP + +#include + +#include + + +class MatrixBinToTxtApplication : public ApplicationInterface +{ + + public: + MatrixBinToTxtApplication() = delete ; + MatrixBinToTxtApplication(const MatrixBinToTxtApplication& other) = delete ; + + /*! + * \brief Constructs an object from the command line + * options. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + */ + MatrixBinToTxtApplication(int argn, char** argv) ; + + /*! + * \brief Runs the application. The matrix is loaded + * from the given binary file and displayed through + * stdout in text format. + * \return an exit code EXIT_SUCCESS or EXIT_FAILURE + * to return to the OS. 
+ */ + virtual int run() override ; + + private: + /*! + * \brief Parses the program command line options and + * sets the object field accordingly. + * If the help option is detected, the "runnable" + * field is set to false and subsequent calls to + * run() will produce nothing. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + * \throw std::invalid_argument if an error is found + * in the program options. + */ + void parseOptions(int argn, char** argv) ; + + /*! + * \brief the path to the file containing the matrix. + */ + std::string file_matrix ; + /*! + * \brief the type of the data stored. + */ + std::string type ; + /*! + * \brief the number of dimensions of the matrix. + */ + size_t ndim ; + /*! + * \brief a flag indicating whether the core of run() can be + * run or not. + */ + bool runnable ; + +} ; + +#endif // MATRIXBINTOTXTAPPLICATION_HPP diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp index b8227a4..dce386b 100644 --- a/src/Applications/ProbToModelApplication.cpp +++ b/src/Applications/ProbToModelApplication.cpp @@ -1,223 +1,320 @@ #include #include #include +#include #include #include // std::move() #include // std::invalid_argument, std::runtime_error + #include #include #include +#include #include #include +#include // filter_rows() namespace po = boost::program_options ; typedef std::vector vector_d ; ProbToModelApplication::ProbToModelApplication(int argn, char** argv) - : file_read(""), file_seq(""), file_prob(""), bckg_class(false), + : file_read(""), file_seq(""), file_prob(""), file_filter(""), + bckg_class(false), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ProbToModelApplication::~ProbToModelApplication() {} int ProbToModelApplication::run() { if(this->runnable) { // load data std::string file_data ; - bool read_data = false ; - bool seq_data = false ; + bool read_data = false ; + bool seq_data = false ; + bool consseq_data = false ; if(this->file_read != "") { file_data = this->file_read ; - read_data = true ; seq_data = false ; + seq_data = consseq_data = false ; read_data = true ; } else if(this->file_seq != "") { file_data = this->file_seq ; - read_data = false ; seq_data = true ; + read_data = consseq_data = false ; seq_data = true ; + } + else if(this->file_consseq != "") + { file_data = this->file_consseq ; + read_data = seq_data = false ; consseq_data = true ; } else { std::string msg("Error! Could not determine the type of the data!") ; throw std::runtime_error(msg) ; } - Matrix2D data(file_data) ; - // Matrix4D prob(this->file_prob) ; - Matrix4D prob ; prob.load(this->file_prob) ; - if(data.get_nrow() != prob.get_dim()[0]) - { char msg[4096] ; - sprintf(msg, "Error! data and prob matrices have unequal " - "row numbers (%zu / %zu)!", - data.get_nrow(), prob.get_dim()[0]) ; - throw std::runtime_error(msg) ; - } - else if(data.get_ncol() < prob.get_dim()[2]) - { char msg[4096] ; - sprintf(msg, "Error! too many shift states for the data!" 
- "%zu shift states and %zu columns in data)!", - prob.get_dim()[2], data.get_ncol()) ; - throw std::runtime_error(msg) ; + + + // row filter + std::vector filter ; + if(this->file_filter != "") + { // it is a column vector, easier to use the Matrix2D interface + // to read it rather than coding a function for :) + filter = Matrix2D(this->file_filter).get_data() ; + std::sort(filter.begin(), filter.end()) ; } + // prob + Matrix4D prob ; prob.load(this->file_prob) ; + // get the data model ModelComputer* ptr = nullptr ; if(read_data) - { ptr = new ReadModelComputer(std::move(data), + { Matrix2D data(file_data) ; + // filter out some rows if needed + if(filter.size()) + { data = filter_rows(filter, data) ; } + this->check_dimensions(data, prob) ; + ptr = new ReadModelComputer(std::move(data), prob, this->bckg_class, this->n_threads) ; } else if(seq_data) - { ptr = new SequenceModelComputer(std::move(data), + { + Matrix2D data(file_data) ; + // filter out some rows if needed + if(filter.size()) + { data = filter_rows(filter, data) ; } + this->check_dimensions(data, prob) ; + ptr = new SequenceModelComputer(std::move(data), prob, this->bckg_class, this->n_threads) ; } + else if(consseq_data) + { Matrix3D data ; + data.load(file_data) ; + // filter out some rows if needed + if(filter.size()) + { data = filter_rows(filter, data) ; } + this->check_dimensions(data, prob) ; + ptr = new ConsensusSequenceModelComputer(std::move(data), + prob, + this->bckg_class, + this->n_threads) ; + } + Matrix2D model = ptr->get_model() ; delete ptr ; ptr = nullptr ; // compute the class prob size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob if(read_data) { for(size_t i=0; i(&(this->file_read)), opt_read_msg.c_str()) - ("seq,", po::value(&(this->file_seq)), opt_seq_msg.c_str()) - ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + ("read,", po::value(&(this->file_read)), opt_read_msg.c_str()) + ("seq,", po::value(&(this->file_seq)), opt_seq_msg.c_str()) + ("consseq,", po::value(&(this->file_consseq)), opt_seq_msg.c_str()) + ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) - ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings - if((this->file_read == "") and - (this->file_seq == "") and + if((this->file_read == "") and + (this->file_seq == "") and + (this->file_consseq == "") and (not help)) - { std::string msg("Error! No data file was given (--read or --seq)!") ; + { std::string msg("Error! No data file was given (none of --read --seq --consseq)!") ; throw std::invalid_argument(msg) ; } else if((this->file_read != "") and (this->file_seq != "") and (not help)) { std::string msg("Error! 
--read and --seq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } + else if((this->file_read != "") and + (this->file_consseq != "") and + (not help)) + { std::string msg("Error! --read and --consseq are mutually exclusive!") ; + throw std::invalid_argument(msg) ; + } + else if((this->file_seq != "") and + (this->file_consseq != "") and + (not help)) + { std::string msg("Error! --seq and --consseq are mutually exclusive!") ; + throw std::invalid_argument(msg) ; + } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probabily file was given (--prob)!") ; throw std::invalid_argument(msg) ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } +void ProbToModelApplication::check_dimensions(const Matrix2D& data, + const Matrix4D& prob) const +{ if(data.get_nrow() != prob.get_dim()[0]) + { char msg[4096] ; + sprintf(msg, "Error! data and prob matrices have unequal " + "row numbers (%zu / %zu)!", + data.get_nrow(), prob.get_dim()[0]) ; + throw std::runtime_error(msg) ; + } + else if(data.get_ncol() < prob.get_dim()[2]) + { char msg[4096] ; + sprintf(msg, "Error! too many shift states for the data width!" + "(%zu shift states and %zu columns in data)!", + prob.get_dim()[2], data.get_ncol()) ; + throw std::runtime_error(msg) ; + } +} + +void ProbToModelApplication::check_dimensions(const Matrix3D& data, + const Matrix4D& prob) const +{ if(data.get_dim()[0] != prob.get_dim()[0]) + { char msg[4096] ; + sprintf(msg, "Error! data and prob matrices have unequal " + "row numbers (%zu / %zu)!", + data.get_dim()[0], prob.get_dim()[0]) ; + throw std::runtime_error(msg) ; + } + else if(data.get_dim()[1] < prob.get_dim()[2]) + { char msg[4096] ; + sprintf(msg, "Error! too many shift states for the data width!" + "(%zu shift states and %zu data width)!", + prob.get_dim()[2], data.get_dim()[1]) ; + throw std::runtime_error(msg) ; + } + else if(data.get_dim()[2]!= 4) + { char msg[4096] ; + sprintf(msg, "Error! data 3rd dimension is not 4!" + "(%zu)!", + data.get_dim()[2]) ; + throw std::runtime_error(msg) ; + } +} + + int main(int argn, char** argv) { ProbToModelApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ProbToModelApplication.hpp b/src/Applications/ProbToModelApplication.hpp index 0ac126f..27954ae 100644 --- a/src/Applications/ProbToModelApplication.hpp +++ b/src/Applications/ProbToModelApplication.hpp @@ -1,88 +1,139 @@ #ifndef PROBTOMODELAPPLICATION_HPP #define PROBTOMODELAPPLICATION_HPP #include #include #include +#include +#include +#include + /*! * \brief The ProbToModelApplication class is a wrapper around an ModelReferenceComputer * instance creating an autonomous application to compute data models given the * data and the results of the classification procedure (the posterior probability * matrix). */ class ProbToModelApplication : public ApplicationInterface { public: ProbToModelApplication() = delete ; ProbToModelApplication(const ProbToModelApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ ProbToModelApplication(int argn, char** argv) ; /*! * \brief Destructor. 
*/ virtual ~ProbToModelApplication() override ; /*! * \brief Runs the application. The data model * is computed and displayed through the * stdout. * \return the exit code. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; + /*! + * \brief Checks that both matrices have + * compatible dimensions. + * For this, both matrices should have the + * same number of rows (1st dimension) and + * the data number of columns should not be + * smaller than the probability matrix 3rd + * dimension (shifting freedom incompatible + * with data width). + * \param data the data matrix. + * \param prob the probability matrix. + * \throw std::runtime_error with a descriptive + * message if both matrices are not compatible. + */ + void check_dimensions(const Matrix2D& data, + const Matrix4D& prob) const ; + + /*! + * \brief Checks that both matrices have + * compatible dimensions. + * For this, both matrices should have the + * 1st dimension size and the data 2nd + * dimensions should not be smaller than + * the probability matrix 3rd dimensions + * (shifting freedom incompatible + * with data width). + * Additionaly, the data matrix should have + * a 3rd dimension of size 4 (for A,C,G,T). + * \param data the data matrix. + * \param prob the probability matrix. + * \throw std::runtime_error with a descriptive + * message if both matrices are not compatible. + */ + void check_dimensions(const Matrix3D& data, + const Matrix4D& prob) const ; + /*! * \brief the path to the file containing the * read count data. */ std::string file_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_seq ; + /*! + * \brief the path to the file containing the + * consensus sequence data. + */ + std::string file_consseq ; /*! * \brief the path to the file containing the * classification posterior probabilities. */ std::string file_prob ; + /*! + * \brief the path to the file containing the + * (0-based) index of the rows to filter out + * from the data. + */ + std::string file_filter ; /*! * \brief whether the last class of the * classification (posterior probabilities) is a * background class. */ bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief whether run() can be called. 
*/ bool runnable ; } ; #endif // PROBTOMODELAPPLICATION_HPP diff --git a/src/Applications/WhichNullRowsApplication.cpp b/src/Applications/WhichNullRowsApplication.cpp new file mode 100644 index 0000000..58c833d --- /dev/null +++ b/src/Applications/WhichNullRowsApplication.cpp @@ -0,0 +1,116 @@ +#include + +#include + +#include + +#include + + +namespace po = boost::program_options ; + + + +WhichNullRowsApplication::WhichNullRowsApplication(int argn, char** argv) + :file_matrix(""), runnable(false) +{ this->parseOptions(argn, argv) ; } + +int WhichNullRowsApplication::run() +{ + if(this->runnable) + { Matrix2D m(this->file_matrix) ; + + std::vector null_rows ; + + for(size_t i=0; i() ; + + // put the vector into a single column matrix + // to use the << operator + Matrix2D rows(null_rows.size(), 1) ; + for(size_t i=0; i< null_rows.size(); i++) + { rows(i,0) = null_rows[i] ; } + + std::cout << rows << std::endl ; + + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE; } +} + +void WhichNullRowsApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "WhichNullRows reads a 2D matrix file in text format and\n" + "returns the indices (0-based) of the rows with no signal.\n\n" ; + std::string opt_help_msg = "Produces this help message." ; + std::string opt_file_mat_msg = "The path to the 2D matrix file." ; + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("mat", po::value(&(this->file_matrix)), opt_file_mat_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occurred while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks improper option settings + if(this->file_matrix == "" and + (not help)) + { std::string msg("Error! No input file was given (--mat)!") ; + throw std::invalid_argument(msg) ; + } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + +int main(int argn, char** argv) +{ WhichNullRowsApplication app(argn, argv) ; + return app.run() ; +} diff --git a/src/Applications/WhichNullRowsApplication.hpp b/src/Applications/WhichNullRowsApplication.hpp new file mode 100644 index 0000000..4c2a80d --- /dev/null +++ b/src/Applications/WhichNullRowsApplication.hpp @@ -0,0 +1,65 @@ +#ifndef WHICHNULLROWSAPPLICATION_HPP +#define WHICHNULLROWSAPPLICATION_HPP + +#include + +#include + + +class WhichNullRowsApplication : public ApplicationInterface +{ + + public: + WhichNullRowsApplication() = delete ; + WhichNullRowsApplication(const WhichNullRowsApplication& other) = delete ; + + /*! + * \brief Constructs an object from the command line + * options. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + */ + WhichNullRowsApplication(int argn, char** argv) ; + + /*! 
+ * \brief Runs the application. The data are classified + * using the given settings and the posterior probability + * matrix is returned through the stdout. + * The matrix is a 4D matrix with dimensions : + * regions, class, shift flip. + * \return an exit code EXIT_SUCCESS or EXIT_FAILURE + * to return to the OS. + */ + virtual int run() override ; + + private: + /*! + * \brief Parses the program command line options and + * sets the object field accordingly. + * If the help option is detected, the "runnable" + * field is set to false and subsequent calls to + * run() will produce nothing. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + * \throw std::invalid_argument if an error is found + * in the program options. + */ + void parseOptions(int argn, char** argv) ; + + /*! + * \brief the path to the file containing the matrix. + */ + std::string file_matrix ; + /*! + * \brief a flag indicating whether the core of run() can be + * run or not. + */ + bool runnable ; + +} ; + +#endif // WHICHNULLROWSAPPLICATION_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 294e25c..7fe6210 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,141 +1,157 @@ # compiler options add_compile_options(-std=c++14) add_compile_options(-O3) add_compile_options(-Wall) add_compile_options(-Wextra) add_compile_options(-Werror) add_compile_options(-Wfatal-errors) add_compile_options(-pedantic) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") add_definitions (${SEQAN_DEFINITIONS}) # include file location include_directories(${Boost_INCLUDE_DIRS}) include_directories(${SEQAN_INCLUDE_DIRS}) include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/Clustering") include_directories("${scATACseq_SOURCE_DIR}/src/Random") include_directories("${scATACseq_SOURCE_DIR}/src/Parallel") include_directories("${scATACseq_SOURCE_DIR}/src/Statistics") include_directories("${scATACseq_SOURCE_DIR}/src/GUI") include_directories("${scATACseq_SOURCE_DIR}/src/Applications") include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools") include_directories("${scATACseq_SOURCE_DIR}/src/Utility") # compile modules into static libraries ## set output directory set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib") ## build instructions add_library(Clustering "Clustering/DataLayer.cpp" "Clustering/Data2DLayer.cpp" "Clustering/Data3DLayer.cpp" "Clustering/ReadLayer.cpp" "Clustering/SequenceLayer.cpp" "Clustering/ConsensusSequenceLayer.cpp" "Clustering/ModelComputer.cpp" "Clustering/ReadModelComputer.cpp" "Clustering/SequenceModelComputer.cpp" + "Clustering/ConsensusSequenceModelComputer.cpp" "Clustering/EMBase.cpp" "Clustering/EMRead.cpp" "Clustering/EMSequence.cpp" "Clustering/EMConsensusSequence.cpp" "Clustering/EMJoint.cpp") add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp") add_library(Parallel "Parallel/ThreadPool.cpp") add_library(Statistics "Statistics/Statistics.cpp") add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp") add_library(GenomicTools "GenomicTools/MatrixCreator.cpp" "GenomicTools/ReadMatrixCreator.cpp" "GenomicTools/CorrelationMatrixCreator.cpp" "GenomicTools/ClassReadDataCreator.cpp" "GenomicTools/ClassSequenceDataCreator.cpp" "GenomicTools/SequenceMatrixCreator.cpp" "GenomicTools/GenomeRegion.cpp") -add_library(Utility 
"Utility/matrices.cpp" +add_library(Utility "Utility/matrix_utility.cpp" "Utility/dna_utility.cpp") ## resolve dependencies target_link_libraries(Utility ${SEQAN_LIBRARIES}) target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES}) target_link_libraries(Parallel Threads::Threads) target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES}) # executables ## a toy for seqan set(EXE_MAIN_TEST "main_test") add_executable(${EXE_MAIN_TEST} "main_test.cpp") target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering) set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a toy for correlation matrix set(EXE_MAIN_CORMAT "main_cormat") add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp") target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random) set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an application to read binary matrices and display them in txt +set(EXE_MAIN_MATBIN2TXT "MatrixBinToTxt") +add_executable(${EXE_MAIN_MATBIN2TXT} "Applications/MatrixBinToTxtApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_MAIN_MATBIN2TXT} Utility Boost::program_options) +set_target_properties(${EXE_MAIN_MATBIN2TXT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an application to know the indices of 0-signal rows in integer 2D matrices +set(EXE_MAIN_WHICHNULLROWS "WhichNullRows") +add_executable(${EXE_MAIN_WHICHNULLROWS} "Applications/WhichNullRowsApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_MAIN_WHICHNULLROWS} Boost::program_options) +set_target_properties(${EXE_MAIN_WHICHNULLROWS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a matrix from BED and a BAM file set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator") add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a sequence matrix from BED and a fasta file set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator") add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMRead standalone set(EXE_EMREAD "EMRead") add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp") -target_link_libraries(${EXE_EMREAD} Clustering Utility Boost::program_options) +target_link_libraries(${EXE_EMREAD} Clustering Boost::program_options) set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMSequence standalone set(EXE_EMSEQ "EMSequence") add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an EMConsensusSequence standalone +set(EXE_EMCONSSEQ 
"EMConsensusSequence") +add_executable(${EXE_EMCONSSEQ} "Applications/EMConsensusSequenceApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_EMCONSSEQ} Clustering Utility Boost::program_options) +set_target_properties(${EXE_EMCONSSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMJoint standalone set(EXE_EMJOINT "EMJoint") add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to compute data models from the data and the post prob of an EM classification set(EXE_PROB2REF "ProbToModel") add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options) set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_READMODELEXTENDER "ReadModelExtender") add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender") add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extract read data from an EM classification set(EXE_CLASSREADDATACREATOR "ClassReadDataCreator") add_executable(${EXE_CLASSREADDATACREATOR} "Applications/ClassReadDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_CLASSREADDATACREATOR} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_CLASSREADDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extract sequence data from an EM classification set(EXE_CLASSSEQDATACREATOR "ClassSequenceDataCreator") add_executable(${EXE_CLASSSEQDATACREATOR} "Applications/ClassSequenceDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_CLASSSEQDATACREATOR} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_CLASSSEQDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a test suite set(EXE_TESTS "unittests") add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp" "Unittests/unittests_genomictools.cpp") target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools) set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp index 26c8e18..b8a5a89 100644 --- a/src/Clustering/ConsensusSequenceLayer.cpp +++ 
b/src/Clustering/ConsensusSequenceLayer.cpp @@ -1,369 +1,371 @@ #include #include #include // std::promise, std::future #include // std::invalid_argument #include // log() #include // std::bind(), std::ref() #include #include #include #include #include // dna::base_composition() + double ConsensusSequenceLayer::score_consensus_subseq(const Matrix3D& cons_seq, size_t row, size_t start, const Matrix2D& model_log) { size_t ncol = cons_seq.get_dim()[1] ; size_t dim3 = cons_seq.get_dim()[2] ; if(start > ncol - model_log.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu", start, ncol - model_log.get_nrow()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_nrow() > ncol) { char msg[4096] ; sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", model_log.get_nrow(), ncol) ; throw std::invalid_argument(msg) ; } else if(model_log.get_ncol() != 4) { char msg[4096] ; sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)", model_log.get_ncol()) ; throw std::invalid_argument(msg) ; } else if(dim3 != 4) { char msg[4096] ; sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)", dim3) ; throw std::invalid_argument(msg) ; } size_t from = start ; size_t to = from + model_log.get_nrow() ; // will score [from, to) double ll = 0 ; for(size_t i=from, j=0; i& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) : Data3DLayer(data, n_class, n_shift, flip,bckg_class) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; // background class if(this->bckg_class) { this->create_bckg_class() ; } } ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) : Data3DLayer(std::move(data), n_class, n_shift, flip, bckg_class) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) 
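score_consensus_subseq() scores one window [start, start+K) of a probability-encoded (consensus) sequence against a K x 4 log-probability model. One common way to compute such a score, shown below purely as an illustration and not necessarily the exact formula used by this class, is to accumulate the expected log-probability at each position, i.e. the per-base probabilities of the consensus sequence weighted by the model log-probabilities:

#include <cstddef>
#include <iostream>
#include <vector>

// cons_seq_row : L x 4 per-position base probabilities of one consensus sequence
// model_log    : K x 4 log-probabilities of a model, K <= L
// start        : offset of the window to score, covering [start, start+K)
double score_consensus_window(const std::vector<std::vector<double>>& cons_seq_row,
                              const std::vector<std::vector<double>>& model_log,
                              size_t start)
{   double ll = 0. ;
    for(size_t i=start, j=0; j<model_log.size(); i++, j++)
    {   // expected log-probability at this position (assumed formula)
        for(size_t base=0; base<4; base++)
        {   ll += cons_seq_row[i][base] * model_log[j][base] ; }
    }
    return ll ;
}

int main()
{   // a 2-position consensus sequence scored against a 2-position model
    std::vector<std::vector<double>> cons_seq  = {{0.70, 0.10, 0.10, 0.10},
                                                  {0.25, 0.25, 0.25, 0.25}} ;
    std::vector<std::vector<double>> model_log = {{-0.2, -2.0, -2.0, -2.0},
                                                  {-2.0, -0.2, -2.0, -2.0}} ;
    std::cout << score_consensus_window(cons_seq, model_log, 0) << std::endl ;   // -2.29
    return 0 ;
}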
; // background class if(this->bckg_class) { this->create_bckg_class() ; } } ConsensusSequenceLayer::~ConsensusSequenceLayer() { ; } void ConsensusSequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // compute the log prob model and the log prob reverse-complement model std::vector> model_log(this->n_class, Matrix2D(this->l_model, this->n_category, 0.)) ; std::vector> model_log_rev = model_log ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { // forward model_log[i](j,k) = log(this->model(i,j,k)) ; // reverse model_log_rev[i](this->l_model-j-1,this->n_category-k-1) = log(this->model(i,j,k)) ; } } } // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_dim1, loglikelihood, loglikelihood_max, model_log, model_log_rev, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_dim1, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ConsensusSequenceLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(model_log), std::ref(model_log_rev), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ConsensusSequenceLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const { // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { // forward strand - { double ll_fw = score_consensus_subseq(this->data, i, s, model_log[j]) ; + { double ll_fw = std::max(score_consensus_subseq(this->data, i, s, model_log[j]), + ConsensusSequenceLayer::p_min_log); // loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // apply lower bound here -> log(1e-100) - loglikelihood(i,j,s,flip_states::FORWARD) = std::max(ll_fw, - ConsensusSequenceLayer::p_min_log) ; + loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // keep track of max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } } // reverse if(this->flip) - { double ll_rev = score_consensus_subseq(this->data, i, s, model_log_rev[j]) ; + { double ll_rev = std::max(score_consensus_subseq(this->data, i, s, model_log_rev[j]), + ConsensusSequenceLayer::p_min_log) ; // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // apply lower bound here -> log(1e-100) - loglikelihood(i,j,s,flip_states::REVERSE) = std::max(ll_rev, - ConsensusSequenceLayer::p_min_log) ; + loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // keep track of max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void ConsensusSequenceLayer::update_model(const Matrix4D& posterior_prob, 
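The threaded branch of compute_loglikelihoods() follows a pattern used throughout this code base: split the row range into slices with ThreadPool::split_range(), hand each slice to the pool together with a std::promise<bool>, and block on the matching futures until every slice is done. A stripped-down, self-contained version of that pattern, using plain std::thread instead of the project's ThreadPool (that substitution is an assumption of this sketch):

#include <cstddef>
#include <functional>   // std::ref()
#include <future>       // std::promise, std::future
#include <iostream>
#include <thread>
#include <utility>      // std::pair
#include <vector>

// each worker fills a slice [from, to) and signals completion through a promise
void fill_routine(size_t from, size_t to,
                  std::vector<double>& data,
                  std::promise<bool>& done)
{   for(size_t i=from; i<to; i++)
    {   data[i] = static_cast<double>(i) ; }
    done.set_value(true) ;
}

int main()
{   size_t n = 10 ;
    std::vector<double> data(n, 0.) ;

    // split [0, n) into roughly equal slices, as ThreadPool::split_range() does
    std::vector<std::pair<size_t,size_t>> slices = {{0,4}, {4,7}, {7,10}} ;

    std::vector<std::promise<bool>> promises(slices.size()) ;
    std::vector<std::future<bool>>  futures ;
    std::vector<std::thread>        workers ;
    for(size_t i=0; i<slices.size(); i++)
    {   futures.push_back(promises[i].get_future()) ;
        workers.emplace_back(fill_routine,
                             slices[i].first, slices[i].second,
                             std::ref(data), std::ref(promises[i])) ;
    }
    // wait until every slice has been processed
    for(auto& future : futures)
    {   future.get() ; }
    for(auto& worker : workers)
    {   worker.join() ; }

    for(double x : data)
    {   std::cout << x << " " ; }
    std::cout << std::endl ;
    return 0 ;
}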
ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_dim1, posterior_prob, promise) ; // this->model = future.get() ; auto model = future.get() ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_dim1, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ConsensusSequenceLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(promises[i])))) ; } // reinitialise the model size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. ; } } } // wait until all threads are done working // and update the model for(auto& future : futures) { Matrix3D model_part = future.get() ; // for(size_t i=0; in_class; i++) for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // make sure to have no 0 values for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), ConsensusSequenceLayer::p_min) ; } } } // normalize to get probs for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { double sum = 0. ; for(size_t k=0; kn_category; k++) { sum += this->model(i,j,k) ; } for(size_t k=0; kn_category; k++) { double p = this->model(i,j,k) / sum ; this->model(i,j,k) = p ; } } } } void ConsensusSequenceLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t k=0; k < n_class_to_update; k++) { for(size_t s=0; sn_shift; s++) { for(size_t j=0; jl_model; j++) { // base prob on fw and rv strand vector_d base_prob_fw(this->n_category, 0.) ; vector_d base_prob_rv(this->n_category, 0.) 
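Once the per-thread model parts have been summed, update_model() floors every model value at ConsensusSequenceLayer::p_min and renormalises each model position into a probability distribution over A,C,G,T. A minimal numeric illustration of that floor-then-normalise step (the 1e-100 value for p_min is an assumption, taken from the log(1e-100) comment in the log-likelihood code above):

#include <algorithm>
#include <iostream>
#include <vector>

int main()
{   // one model position with raw (possibly zero) weighted base counts for A,C,G,T
    std::vector<double> counts = {12.0, 0.0, 3.0, 5.0} ;
    double p_min = 1e-100 ;   // assumed to mirror the p_min floor used above

    // floor, then normalise so the four values sum to 1 (a probability row)
    double sum = 0. ;
    for(double& c : counts)
    {   c    = std::max(c, p_min) ;
        sum += c ;
    }
    for(double& c : counts)
    {   c /= sum ; }

    for(double p : counts)
    {   std::cout << p << " " ; }   // 0.6 5e-102 0.15 0.25
    std::cout << std::endl ;
    return 0 ;
}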
; for(size_t i=from; in_dim3; base++, base_rv--) { // --------------- forward --------------- { base_prob_fw[base] += this->data(i,s+j,base) * posterior_prob(i,k,s,ConsensusSequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[base_rv] += this->data(i,s+j,base) * posterior_prob(i,k,s,ConsensusSequenceLayer::REVERSE) ; } } } // update this position of the model for(size_t i=0,i_rev=base_prob_fw.size()-1; iflip) { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } } } } } promise.set_value(model) ; } void ConsensusSequenceLayer::create_bckg_class() { // sequence composition std::vector base_comp = dna::base_composition(this->data, this->flip) ; // set last class as background class for(size_t i=0; in_category; i++) { for(size_t j=0; jl_model; j++) { this->model(this->n_class-1,j,i) = base_comp[i] ; } } } diff --git a/src/Clustering/ConsensusSequenceModelComputer.cpp b/src/Clustering/ConsensusSequenceModelComputer.cpp new file mode 100644 index 0000000..f724022 --- /dev/null +++ b/src/Clustering/ConsensusSequenceModelComputer.cpp @@ -0,0 +1,49 @@ + +#include // std::move() + +#include +#include +#include +#include + +ConsensusSequenceModelComputer::ConsensusSequenceModelComputer(Matrix3D&& data, + const Matrix4D& post_prob, + bool bckg_class, + size_t n_threads) + : ModelComputer(), + threads(nullptr) +{ + // parameters + size_t n_class = post_prob.get_dim()[1] ; + size_t n_shift = post_prob.get_dim()[2] ; + size_t n_flip = post_prob.get_dim()[3] ; + bool flip = n_flip == 2 ; + + // the threads + if(n_threads) + { this->threads = new ThreadPool(n_threads) ; } + + // the data and the model + this->data_layer = new ConsensusSequenceLayer(std::move(data), + n_class, + n_shift, + flip, + bckg_class) ; + + this->data_layer->update_model(post_prob, + this->threads) ; +} + +ConsensusSequenceModelComputer::~ConsensusSequenceModelComputer() +{ // threads + if(this->threads != nullptr) + { this->threads->join() ; + delete this->threads ; + this->threads = nullptr ; + } + // data and model + if(this->data_layer != nullptr) + { delete this->data_layer ; + this->data_layer = nullptr ; + } +} diff --git a/src/Clustering/ConsensusSequenceModelComputer.hpp b/src/Clustering/ConsensusSequenceModelComputer.hpp new file mode 100644 index 0000000..c4cbdf6 --- /dev/null +++ b/src/Clustering/ConsensusSequenceModelComputer.hpp @@ -0,0 +1,46 @@ +#ifndef CONSENSUSSEQUENCEMODELCOMPUTER_HPP +#define CONSENSUSSEQUENCEMODELCOMPUTER_HPP + +#include +#include +#include +#include + + +class ConsensusSequenceModelComputer : public ModelComputer +{ + public: + + /*! + * \brief Constructs an object to retrieve + * the (consensus) sequence model given the data + * and their classification results. + * \param data the data. + * \param post_prob the data class assignment + * probabilities. + * \param bckg_class whether the last class of the + * classification (posterior probabilities) is a + * background class. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + */ + ConsensusSequenceModelComputer(Matrix3D&& data, + const Matrix4D& post_prob, + bool bckg_class, + size_t n_threads) ; + + /*! + * \brief Destructor. + */ + virtual ~ConsensusSequenceModelComputer() override ; + + protected: + /*! + * \brief the threads. 
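create_bckg_class() fills the last class of the model with the overall base composition of the data at every position, so that the background class never needs to be optimised. A tiny self-contained illustration of that idea:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{   // overall base composition of the data set (A,C,G,T), e.g. 30/20/20/30 %
    std::vector<double> base_comp = {0.3, 0.2, 0.2, 0.3} ;

    size_t l_model = 5 ;   // model length
    // background model: l_model x 4, every position gets the same composition
    std::vector<std::vector<double>> bckg(l_model, std::vector<double>(4, 0.)) ;
    for(size_t base=0; base<4; base++)
    {   for(size_t pos=0; pos<l_model; pos++)
        {   bckg[pos][base] = base_comp[base] ; }
    }

    for(const auto& row : bckg)
    {   for(double p : row)
        {   std::cout << p << " " ; }
        std::cout << std::endl ;
    }
    return 0 ;
}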
+ */ + ThreadPool* threads ; + +} ; + +#endif // CONSENSUSSEQUENCEMODELCOMPUTER_HPP diff --git a/src/Clustering/EMConsensusSequence.cpp b/src/Clustering/EMConsensusSequence.cpp index 9cac9ee..5a52a0e 100644 --- a/src/Clustering/EMConsensusSequence.cpp +++ b/src/Clustering/EMConsensusSequence.cpp @@ -1,293 +1,294 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // SequenceLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool #include // dna::base_composition() EMConsensusSequence::EMConsensusSequence(const Matrix3D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_dim()[0], seq_matrix.get_dim()[1], n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), cseq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->cseq_layer = new ConsensusSequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->cseq_layer->update_model(this->post_prob, this->threads) ; } EMConsensusSequence::EMConsensusSequence(Matrix3D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_dim()[0], seq_matrix.get_dim()[1], n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), cseq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->cseq_layer->update_model(this->post_prob, this->threads) ; } EMConsensusSequence::~EMConsensusSequence() { if(this->cseq_layer != nullptr) { delete this->cseq_layer ; this->cseq_layer = nullptr ; } if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } Matrix3D EMConsensusSequence::get_sequence_models() const { return this->cseq_layer->get_model() ; } EMConsensusSequence::exit_codes EMConsensusSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; + } bar.update() ; std::cerr << std::endl ; return EMConsensusSequence::exit_codes::ITER_MAX ; } void EMConsensusSequence::compute_loglikelihood() { // compute the loglikelihood this->cseq_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // 
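A hypothetical end-to-end use of EMConsensusSequence as constructed above: build it from a regions x length x 4 consensus-sequence matrix, run classify(), then read the class models back. The <double> template argument of Matrix3D is an assumption (template parameters are elided in this listing), and the project headers that would need to be included are omitted:

// Project headers (EMConsensusSequence / Matrix3D, exact names assumed) omitted.
#include <cstddef>
#include <string>

Matrix3D<double> classify_consensus_sequences(const Matrix3D<double>& consseq, // regions x length x 4
                                              size_t n_class,
                                              size_t n_iter,
                                              size_t n_shift,
                                              bool flip,
                                              bool bckg_class)
{   std::string seed      = "" ;   // empty seed -> default seeding
    size_t      n_threads = 4 ;

    EMConsensusSequence em(consseq,
                           n_class,
                           n_iter,
                           n_shift,
                           flip,
                           bckg_class,
                           seed,
                           n_threads) ;
    em.classify() ;                      // runs the n_iter E-step / M-step iterations
    return em.get_sequence_models() ;    // n_class x model length x 4
}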
get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMConsensusSequence::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMConsensusSequence::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMConsensusSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMConsensusSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMConsensusSequence::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. 
; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ConsensusSequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMConsensusSequence::update_models() { this->cseq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp index e8a9e1d..7ff0a16 100644 --- a/src/Clustering/EMJoint.cpp +++ b/src/Clustering/EMJoint.cpp @@ -1,626 +1,837 @@ #include #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include #include #include #include #include -#include #include #include // getRandomNumberGenerator() #include // ConsoleProgressBar +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ for(const auto& x : v) + { stream << x << " " ; } + return stream ; +} EMJoint::EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), - seq_layer(nullptr) - + seq_layer(nullptr), + consseq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
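compute_post_prob_routine() turns the (already max-rescaled) log-likelihoods into posteriors: p = exp(loglikelihood) * state prior for every (class, shift, flip) state, followed by a per-row normalisation floored at ConsensusSequenceLayer::p_min. Subtracting the per-row maximum beforehand, as compute_loglikelihood_routine() does, keeps exp() away from underflow. A small self-contained numeric version of those two steps (1e-100 for p_min is an assumption):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{   // log-likelihoods of one data row for 3 states, and the state priors
    std::vector<double> loglik = {-1210.0, -1205.0, -1203.0} ;
    std::vector<double> prior  = {0.2, 0.3, 0.5} ;
    double p_min = 1e-100 ;   // assumed to mirror the p_min floor used above

    // rescale by the row maximum so exp() does not underflow
    double ll_max = *std::max_element(loglik.begin(), loglik.end()) ;
    for(double& ll : loglik)
    {   ll -= ll_max ; }

    // posterior ~ exp(loglik) * prior, then normalise with a lower bound
    std::vector<double> post(loglik.size()) ;
    double rowsum = 0. ;
    for(size_t i=0; i<post.size(); i++)
    {   post[i]  = std::exp(loglik[i]) * prior[i] ;
        rowsum  += post[i] ;
    }
    for(double& p : post)
    {   p = std::max(p / rowsum, p_min) ; }

    for(double p : post)
    {   std::cout << p << " " ; }   // ~0.0003 0.0751 0.9246
    std::cout << std::endl ;
    return 0 ;
}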
Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } } EMJoint::EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), - seq_layer(nullptr) - + seq_layer(nullptr), + consseq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } } EMJoint::EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), - seq_layer(nullptr) + seq_layer(nullptr), + consseq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; - sprintf(msg, "Error! A sequence matrix row number is different than expected " + sprintf(msg, "Error! 
The sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; - sprintf(msg, "Error! A sequence matrix column number is different than expected " + sprintf(msg, "Error! The sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; this->seq_layer->update_model(this->post_prob, this->threads) ; } EMJoint::EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), - seq_layer(nullptr) + seq_layer(nullptr), + consseq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; - sprintf(msg, "Error! A sequence matrix row number is different than expected " + sprintf(msg, "Error! The sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; - sprintf(msg, "Error! A sequence matrix column number is different than expected " + sprintf(msg, "Error! 
The sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; } +EMJoint::EMJoint(const std::vector>& read_matrices, + const Matrix3D& consseq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed, + size_t n_threads) + : EMBase(read_matrices[0].get_nrow(), + read_matrices[0].get_ncol(), + n_class, + n_iter, + n_shift, + flip, + n_threads), + n_layer(read_matrices.size()+1), + loglikelihood_layer(this->n_layer, + Matrix4D(this->n_row, + this->n_class, + this->n_shift, + this->n_flip, + 0.)), + loglikelihood_max(this->n_layer, + vector_d(this->n_row, 0.)), + read_layers(), + seq_layer(nullptr), + consseq_layer(nullptr) +{ // check data matrices and their dimensions + for(const auto& matrix : read_matrices) + { if(matrix.get_nrow() != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! A read matrix row number is different than expected " + "(%zu instead of %zu)!", + matrix.get_nrow(), this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(matrix.get_ncol() != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! A read matrix column number is different than expected " + "(%zu instead of %zu)!", + matrix.get_ncol(), this->n_col) ; + throw std::invalid_argument(msg) ; + } + } + if(consseq_matrix.get_dim()[0] != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! The consensus sequence matrix row number is different than expected " + "(%zu instead of %zu)!", + consseq_matrix.get_dim()[0], this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(consseq_matrix.get_dim()[1] != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! 
The sequence matrix column number is different than expected " + "(%zu instead of %zu)!", + consseq_matrix.get_dim()[0], this->n_col) ; + throw std::invalid_argument(msg) ; + } + + // initialise post prob randomly + // getRandomGenerator(seed) ; + this->set_post_prob_random(seed) ; + + // data and models + // create read layer and initialise the models from the post prob + for(auto& matrix : read_matrices) + { // create the layer + this->read_layers.push_back(new ReadLayer(matrix, + this->n_class, + this->n_shift, + this->flip, + bckg_class, + this->threads)) ; + this->read_layers.back()->update_model(this->post_prob, + this->threads) ; + } + // create consensus sequence layer and initialise the models from the post prob + { + this->consseq_layer = new ConsensusSequenceLayer(consseq_matrix, + this->n_class, + this->n_shift, + this->flip, + bckg_class) ; + this->consseq_layer->update_model(this->post_prob, + this->threads) ; + } +} + +EMJoint::EMJoint(std::vector>&& read_matrices, + Matrix3D&& consseq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed, + size_t n_threads) + : EMBase(read_matrices[0].get_nrow(), + read_matrices[0].get_ncol(), + n_class, + n_iter, + n_shift, + flip, + n_threads), + n_layer(read_matrices.size()+1), + loglikelihood_layer(this->n_layer, + Matrix4D(this->n_row, + this->n_class, + this->n_shift, + this->n_flip, + 0.)), + loglikelihood_max(this->n_layer, + vector_d(this->n_row, 0.)), + read_layers(), + seq_layer(nullptr), + consseq_layer(nullptr) +{ // check data matrices and their dimensions + for(const auto& matrix : read_matrices) + { if(matrix.get_nrow() != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! A read matrix row number is different than expected " + "(%zu instead of %zu)!", + matrix.get_nrow(), this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(matrix.get_ncol() != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! A read matrix column number is different than expected " + "(%zu instead of %zu)!", + matrix.get_ncol(), this->n_col) ; + throw std::invalid_argument(msg) ; + } + } + if(consseq_matrix.get_dim()[0] != this->n_row) + { char msg[4096] ; + sprintf(msg, "Error! The consensus sequence matrix row number is different than expected " + "(%zu instead of %zu)!", + consseq_matrix.get_dim()[0], this->n_row) ; + throw std::invalid_argument(msg) ; + } + else if(consseq_matrix.get_dim()[1] != this->n_col) + { char msg[4096] ; + sprintf(msg, "Error! 
The sequence matrix column number is different than expected " + "(%zu instead of %zu)!", + consseq_matrix.get_dim()[0], this->n_col) ; + throw std::invalid_argument(msg) ; + } + + // initialise post prob randomly + // getRandomGenerator(seed) ; + this->set_post_prob_random(seed) ; + + // data and models + // create read layer and initialise the models from the post prob + for(auto& matrix : read_matrices) + { // create the layer + this->read_layers.push_back(new ReadLayer(std::move(matrix), + this->n_class, + this->n_shift, + this->flip, + bckg_class, + this->threads)) ; + this->read_layers.back()->update_model(this->post_prob, + this->threads) ; + } + // create consensus sequence layer and initialise the models from the post prob + { + this->consseq_layer = new ConsensusSequenceLayer(std::move(consseq_matrix), + this->n_class, + this->n_shift, + this->flip, + bckg_class) ; + this->consseq_layer->update_model(this->post_prob, + this->threads) ; + } +} + EMJoint::~EMJoint() { // join the threads in case // deleted by EMBase destructor if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } // read data and models for(auto& ptr : this->read_layers) { if(ptr != nullptr) { delete ptr ; ptr = nullptr ; } } // sequence data and models - if(seq_layer != nullptr) + if(this->seq_layer != nullptr) { delete seq_layer ; this->seq_layer = nullptr ; } + // consensus sequence data and models + if(this->consseq_layer != nullptr) + { delete this->consseq_layer ; + this->consseq_layer = nullptr ; + } } std::vector> EMJoint::get_read_models() const { std::vector> models ; for(const auto& ptr : this->read_layers) { models.push_back(ptr->get_model()) ; } return models ; } Matrix3D EMJoint::get_sequence_models() const -{ return this->seq_layer->get_model() ; } +{ if(this->seq_layer != nullptr) + { return this->seq_layer->get_model() ; } + return Matrix3D() ; +} + +Matrix3D EMJoint::get_consensus_sequence_models() const +{ if(this->consseq_layer != nullptr) + { return this->consseq_layer->get_model() ; } + return Matrix3D() ; +} + EMJoint::exit_codes EMJoint::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; + // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMJoint::exit_codes::ITER_MAX ; } void EMJoint::compute_loglikelihood() { // compute the loglikelihood for each layer size_t i = 0 ; for(auto& ptr : this->read_layers) { ptr->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } if(this->seq_layer != nullptr) { this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } - + if(this->consseq_layer != nullptr) + { this->consseq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], + this->loglikelihood_max[i], + this->threads) ; + i++ ; + } // sum the likelihood for each state, over all layers // and rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each 
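A hypothetical use of the new EMJoint constructor that combines read densities with a consensus-sequence layer, based on the declarations above. The Matrix2D<int> element type of the read matrices, the Matrix2D<double> element type of the returned read models and the Matrix3D<double> type of the consensus sequences are assumptions (template parameters are elided in this listing), and the required project headers are omitted:

// Project headers (EMJoint / Matrix2D / Matrix3D, exact names assumed) omitted.
#include <cstddef>
#include <string>
#include <vector>

void joint_classification(const std::vector<Matrix2D<int>>& read_matrices, // each: regions x length
                          const Matrix3D<double>& consseq_matrix,          // regions x length x 4
                          size_t n_class,
                          size_t n_iter,
                          size_t n_shift)
{   bool        flip       = true ;
    bool        bckg_class = true ;   // last class kept as a constant background class
    std::string seed       = "" ;
    size_t      n_threads  = 4 ;

    EMJoint em(read_matrices,
               consseq_matrix,
               n_class,
               n_iter,
               n_shift,
               flip,
               bckg_class,
               seed,
               n_threads) ;
    em.classify() ;

    // one model per read layer, plus the consensus sequence class models
    std::vector<Matrix2D<double>> read_models    = em.get_read_models() ;
    Matrix3D<double>              consseq_models = em.get_consensus_sequence_models() ;
    (void) read_models ;
    (void) consseq_models ;
}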
thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t j=0; jthreads->addJob(std::move( std::bind(&EMJoint::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[j])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } /* void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // limite value range for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } done.set_value(true) ; } */ void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // the max likelihood found per row std::vector rowmax(to-from, std::numeric_limits::lowest()) ; // sum over layers for(size_t i=from, i_rowmax=0; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { // add rescaled layer value this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } // keep track of max if(this->loglikelihood(i,j,k,l) > rowmax[i_rowmax]) { rowmax[i_rowmax] = this->loglikelihood(i,j,k,l) ; } } } } } // rescale for(size_t i=from, i_rowmax=0; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ; } } } } done.set_value(true) ; } void EMJoint::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) 
; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; // double p = std::max(exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * // this->post_state_prob(n_class,n_shift,n_flip), // ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; // double p = this->post_prob(i,n_class,n_shift,n_flip) / // this->post_prob_rowsum[i] ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMJoint::update_models() { // read data and models for(auto& ptr : this->read_layers) { ptr->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } // sequence data and models if(this->seq_layer != nullptr) { this->seq_layer->update_model(this->post_prob, this->threads) ; } + // consensus sequence data and models + if(this->consseq_layer != nullptr) + { this->consseq_layer->update_model(this->post_prob, + this->threads) ; + } } diff --git a/src/Clustering/EMJoint.hpp b/src/Clustering/EMJoint.hpp index 7ed8a51..aeea97b 100644 --- a/src/Clustering/EMJoint.hpp +++ b/src/Clustering/EMJoint.hpp @@ -1,313 +1,408 @@ #ifndef EMJOINT_HPP #define EMJOINT_HPP #include #include #include #include #include #include #include #include #include typedef std::vector vector_d ; class EMJoint : public EMBase { public: /*! * \brief Constructs an object to partition the * region according to all the given read densities * with the given shifting and flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. 
*/ EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities * with the given shifting and flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities * and sequences with the given shifting and * flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities * and sequences with the given shifting and * flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. 
* \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. The matrix * dimensions are as follow: * 1st the number of regions, * 2nd the region length. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; + /*! + * \brief Constructs an object to partition the + * region according to all the given read densities + * consensus sequences with the given shifting and + * flipping freedom. + * \param read_matrices a vector containing all + * the different data densities (ChIP-seq or related + * signal) for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. + * \param consseq_matrix a matrix containing the DNA + * consensus sequences for the regions of interest as + * a probability matrix. Its dimensions are: + * 1st the number of regions, + * 2nd the region length. + * 3rd 4 for A,C,G,T + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model + * the background by setting all its parameters, at all + * positions, to the background base probabilties (for + * sequence models) and the mean number of counts (for + * read signal models). Since the background is constant, + * this class will never be updated. + * \param seed a seed to initialise the random number + * generator. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + */ + EMJoint(const std::vector>& read_matrices, + const Matrix3D& consseq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed="", + size_t n_threads=0) ; + /*! + * \brief Constructs an object to partition the + * region according to all the given read densities + * consensus sequences with the given shifting and + * flipping freedom. + * \param read_matrices a vector containing all + * the different data densities (ChIP-seq or related + * signal) for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. + * \param consseq_matrix a matrix containing the DNA + * consensus sequences for the regions of interest as + * a probability matrix. Its dimensions are: + * 1st the number of regions, + * 2nd the region length. + * 3rd 4 for A,C,G,T + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. 
+ /*! + * \brief Constructs an object to partition the + * region according to all the given read densities + * and consensus sequences with the given shifting and + * flipping freedom. + * \param read_matrices a vector containing all + * the different data densities (ChIP-seq or related + * signal) for the regions of interest. The matrix + * dimensions are as follows: + * 1st the number of regions, + * 2nd the region length. + * \param consseq_matrix a matrix containing the DNA + * consensus sequences for the regions of interest as + * a probability matrix. Its dimensions are: + * 1st the number of regions, + * 2nd the region length, + * 3rd 4 for A,C,G,T. + * \param n_class the number of region classes + * to search. + * \param n_iter the number of optimization iterations. + * \param n_shift the number of shift states allowed. + * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model + * the background by setting all its parameters, at all + * positions, to the background base probabilities (for + * sequence models) and the mean number of counts (for + * read signal models). Since the background is constant, + * this class will never be updated. + * \param seed a seed to initialise the random number + * generator. + * \param n_threads the number of parallel threads + * to run the computations. 0 means no parallel + * computing, everything is run on the main thread. + */ + EMJoint(std::vector>&& read_matrices, + Matrix3D&& consseq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed="", + size_t n_threads=0) ;
EMJoint(const EMJoint& other) = delete ; /*! * \brief Destructor. */ virtual ~EMJoint() override ; /*! * \brief Returns all layer read models. * The models are in the same order * as the data were given to the * constructor. * \return a vector containing the * models. */ std::vector> get_read_models() const ; /*! * \brief Returns the sequence models. * \return a matrix containing the * models. */ Matrix3D get_sequence_models() const ; + /*! + * \brief Returns the consensus sequence + * models. + * \return a matrix containing the + * models. + */ + Matrix3D get_consensus_sequence_models() const ; + /*! * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMJoint::exit_codes classify() override ;
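// --- Illustrative usage sketch (not part of this header) --------------------
// How the consensus-sequence constructor and the accessors declared above
// might be used together. Loading the data is elided; read_data is assumed to
// hold one matrix per read-signal layer and consseq to be a regions x length
// x 4 probability matrix, as documented above. The <double> template
// arguments and the parameter values are assumptions for the example only.
void run_joint_em(const std::vector<Matrix2D<double>>& read_data,
                  const Matrix3D<double>& consseq)
{   EMJoint em(read_data,
               consseq,
               10,      // n_class
               20,      // n_iter
               21,      // n_shift
               true,    // flip
               true,    // bckg_class : last class stays constant
               "",      // seed
               4) ;     // n_threads
    em.classify() ;
    // one model per read layer, plus the consensus-sequence class models
    std::vector<Matrix3D<double>> read_models = em.get_read_models() ;
    Matrix3D<double> consseq_models = em.get_consensus_sequence_models() ;
}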
private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood() that * computes the joint loglikelihood by summing the * individual loglikelihoods obtained from each data layer. * At the same time, this method rescales the loglikelihood * values by subtracting from each value the maximum * loglikelihood value found in the same data row, * for each layer. * \param from the index of the first row * in the data to consider. * \param to the index one past the last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilities. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilities. * \param from the index of the first row * in the data to consider. * \param to the index one past the last row * in the data to consider. * \param post_prob_colsum a promise to fill with the partial * column (over the classes) sum of posterior probabilities. * If several routines are running together, the colsums are * retrieved by summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Updates the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the number of data layers. */ size_t n_layer ; /*! * \brief the log likelihood buffers for each individual * layer (one element per layer). */ std::vector> loglikelihood_layer ; /*! * \brief the max loglikelihood value for * each data layer (1st dimension) * and each data row of the given layer * (2nd dimension). */ std::vector loglikelihood_max ; + /*! * \brief A vector containing the pointers * to the objects managing all the read * data and models. */ std::vector read_layers ; /*! * \brief A pointer to the object managing * the sequence data and their models. */ SequenceLayer* seq_layer ; /*! * \brief A pointer to the object managing * the consensus sequences data and their * models. */ - ConsensusSequenceLayer* conseq_layer ; + ConsensusSequenceLayer* consseq_layer ; } ; #endif // EMJOINT_HPP
diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp index 88df13c..d3048b4 100644 --- a/src/Clustering/SequenceLayer.cpp +++ b/src/Clustering/SequenceLayer.cpp @@ -1,423 +1,423 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), pow() #include #include // std::max_element() #include // std::promise, std::future #include #include #include #include // dna::base_composition() double SequenceLayer::score_subseq(const Matrix2D& seq, size_t row, size_t start, const Matrix2D& model_log) { if(start > seq.get_ncol() - model_log.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu", start, seq.get_ncol() - model_log.get_nrow()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_nrow() > seq.get_ncol()) { char msg[4096] ; sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", model_log.get_nrow(), seq.get_ncol()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_ncol() != 4) { char msg[4096] ; sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)", model_log.get_ncol()) ; throw std::invalid_argument(msg) ; } size_t from = start ; size_t to = from + model_log.get_nrow() ; // will score [from, to) int n_code = dna::char_to_int('N') ; double ll = 0 ; for(size_t i=from, j=0; i get max score if(base == n_code) { std::vector row = model_log.get_row(j) ; ll += *(std::max_element(std::begin(row), std::end(row))) ; } // A,C,G,T -> get its score else { ll += model_log(j,base) ; } } return ll ; } SequenceLayer::SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) : Data2DLayer(data, n_class, n_shift, flip, bckg_class) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; // background class if(this->bckg_class) { this->create_bckg_class() ; } } SequenceLayer::SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) : Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.)
; // background class if(this->bckg_class) { this->create_bckg_class() ; } } SequenceLayer::SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool bckg_class) : Data2DLayer(data, model,flip, bckg_class) {} SequenceLayer::SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool bckg_class) : Data2DLayer(std::move(data), std::move(model),flip, bckg_class) {} SequenceLayer::~SequenceLayer() { ; } void SequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // compute the log prob model and the log prob reverse-complement model std::vector> model_log(this->n_class, Matrix2D(this->l_model, this->n_category, 0.)) ; std::vector> model_log_rev = model_log ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { // forward model_log[i](j,k) = log(this->model(i,j,k)) ; // reverse model_log_rev[i](this->l_model-j-1,this->n_category-k-1) = log(this->model(i,j,k)) ; } } } // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, loglikelihood, loglikelihood_max, model_log, model_log_rev, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(model_log), std::ref(model_log_rev), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void SequenceLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const { // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { // forward strand - { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ; + { double ll_fw = std::max(score_subseq(this->data, i, s, model_log[j]), + SequenceLayer::p_min_log) ; // loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // apply lower bound here -> log(1e-100) - loglikelihood(i,j,s,flip_states::FORWARD) = std::max(ll_fw, - SequenceLayer::p_min_log) ; + loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // keep track of max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } } // reverse if(this->flip) - { double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ; + { double ll_rev = std::max(score_subseq(this->data, i, s, model_log_rev[j]), + SequenceLayer::p_min_log) ; // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // apply lower bound here -> log(1e-100) - loglikelihood(i,j,s,flip_states::REVERSE) = std::max(ll_rev, - SequenceLayer::p_min_log) ; + loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // keep track of 
max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void SequenceLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, promise) ; // this->model = future.get() ; auto model = future.get() ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(promises[i])))) ; } // reinitialise the model size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. ; } } } // wait until all threads are done working // and update the model for(auto& future : futures) { Matrix3D model_part = future.get() ; // for(size_t i=0; in_class; i++) for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // make sure to have no 0 values for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), SequenceLayer::p_min) ; } } } // normalize to get probs for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { double sum = 0. ; for(size_t k=0; kn_category; k++) { sum += this->model(i,j,k) ; } for(size_t k=0; kn_category; k++) { double p = this->model(i,j,k) / sum ; this->model(i,j,k) = p ; } } } } void SequenceLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) ; // the int code of A, C, G, T, N static int a_code = dna::char_to_int('A') ; static int c_code = dna::char_to_int('C') ; static int g_code = dna::char_to_int('G') ; static int t_code = dna::char_to_int('T') ; static int n_code = dna::char_to_int('N') ; // the int code of the reverse complement of A, C, G, T static int a_code_r = dna::char_to_int('A', true) ; static int c_code_r = dna::char_to_int('C', true) ; static int g_code_r = dna::char_to_int('G', true) ; static int t_code_r = dna::char_to_int('T', true) ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t k=0; k < n_class_to_update; k++) { for(size_t s=0; sn_shift; s++) { for(size_t j=0; jl_model; j++) { // base prob on fw and rv strand vector_d base_prob_fw(this->n_category, 0.) ; vector_d base_prob_rv(this->n_category, 0.) 
; for(size_t i=from; idata(i,s+j) ; int base_rev = this->n_category - base - 1 ; // N if(base == n_code) { // --------------- forward --------------- { base_prob_fw[a_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[c_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[g_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[t_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[a_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[c_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[g_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[t_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } // A, C, G, T else { // --------------- forward --------------- { base_prob_fw[base] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[base_rev] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } } // update this position of the model for(size_t i=0,i_rev=base_prob_fw.size()-1; iflip) { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } } } } } promise.set_value(model) ; } void SequenceLayer::create_bckg_class() { // sequence composition std::vector base_comp = dna::base_composition(this->data, this->flip) ; // set last class as background class for(size_t i=0; in_category; i++) { for(size_t j=0; jl_model; j++) { this->model(this->n_class-1,j,i) = base_comp[i] ; } } }
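// --- Illustrative sketch (not part of SequenceLayer.cpp) --------------------
// The principle behind update_model_routine() above: at a given model
// position, each class motif is the posterior-weighted frequency of the bases
// observed at the corresponding data position, over all regions and shift
// states. The simplified function below shows this for a single class, shift
// and position on the forward strand only; the function name and the
// <int>/<double> template arguments are assumptions for the example.
std::vector<double> weighted_base_frequencies(const Matrix2D<int>& data,
                                              const Matrix4D<double>& posterior_prob,
                                              size_t class_k,
                                              size_t shift_s,
                                              size_t pos_j)
{   std::vector<double> freq(4, 0.) ;
    for(size_t i=0; i<data.get_nrow(); i++)
    {   int base = data(i, shift_s + pos_j) ;
        if(base == dna::char_to_int('N'))
        {   continue ; }                                    // N's do not vote
        freq[base] += posterior_prob(i, class_k, shift_s, 0) ;  // forward flip state
    }
    // normalisation into probabilities happens once all shifts are accumulated
    return freq ;
}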
diff --git a/src/GenomicTools/ClassSequenceDataCreator.cpp b/src/GenomicTools/ClassSequenceDataCreator.cpp index 8df2419..93e97f5 100644 --- a/src/GenomicTools/ClassSequenceDataCreator.cpp +++ b/src/GenomicTools/ClassSequenceDataCreator.cpp @@ -1,149 +1,172 @@ #include #include #include #include // std::invalid_argument #include #include #include #include // char_to_int() + + ClassSequenceDataCreator::ClassSequenceDataCreator(const std::string& bed_file_path, const std::string& fasta_file_path, const std::string& prob_file_path, int from, int to, size_t class_k) : bed_file_path(bed_file_path), fasta_file_path(fasta_file_path), prob_file_path(prob_file_path), from(from), to(to), class_k(class_k-1) { ; } ClassSequenceDataCreator::~ClassSequenceDataCreator() { ; } Matrix3D ClassSequenceDataCreator::create_matrix() { // load posterior prob Matrix4D prob ; prob.load(this->prob_file_path) ; - // prob.load(this->file_prob) ; + size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; bool flip = n_flip == 2 ; // compute class prob std::vector prob_colsum(n_class, 0.) ; double tot = 0. ; for(size_t i=0; i 0-based) if(this->class_k >= n_class) { char msg[4096] ; sprintf(msg, "Error! invalid class of interest (%zu and %zu found)", this->class_k, n_class) ; throw std::invalid_argument(msg) ; } // realigning the data using the same shift as the partition // will "trim" the matrix -> create a wider matrix such that // the realigned matrix will be of the desired width (original // from and to parameters). int ext = n_shift - 1 ; int ext_r = ext / 2 ; int ext_l = ext - ext_r ; SequenceMatrixCreator mc(this->bed_file_path, this->fasta_file_path, this->from - ext_l, this->to + ext_r) ; Matrix2D data = mc.create_matrix() ; // check the compatibility with the prob matrix if(data.get_nrow() != prob.get_dim()[0]) { char msg[4096] ; sprintf(msg, "Error! the number of references in the BED file is not equal to the " "1st dimension of the probability matrix (%zu, %zu).", data.get_nrow(), prob.get_dim()[0]) ; throw(std::invalid_argument(msg)) ; } else if(data.get_nrow() < prob.get_dim()[1]) { char msg[4096] ; sprintf(msg, "Error! the number of references in the BED file is smaller than " "the number of classes in the probability matrix (%zu, %zu).", data.get_nrow(), prob.get_dim()[1]) ; throw(std::invalid_argument(msg)) ; } else if(data.get_ncol() < prob.get_dim()[2]) { char msg[4096] ; sprintf(msg, "Error! the data matrix has fewer columns (bins) than " "the shifting freedom in the probability matrix (%zu, %zu).", data.get_ncol(), prob.get_dim()[2]) ; throw(std::invalid_argument(msg)) ; } // realign matrix size_t n_seq = data.get_nrow() ; size_t n_col2 = this->to - this->from + 1 ; - Matrix3D data2(n_seq, n_col2, 4, 0.) ; + Matrix3D data2(n_seq, n_col2, 4, 1e-100) ; // min value is 1e-100 to avoid 0 prob + // std::cerr << "ClassSequenceDataCreator::create_matrix()" << data2.get_dim() << std::endl ; // aggregate for(size_t i=0; i same as doing nothing - if(base == dna::char_to_int('N')) + int base_fw = data(i,s+j_fw) ; + // don't consider N's + if(base_fw == dna::char_to_int('N')) { continue ; } - int base_rv = 4 - base - 1 ; + int base_rv = 4 - base_fw - 1 ; // --------- forward --------- // std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ; - data2(i,j,base) += prob(i,class_k,s,0) ; + data2(i,j_fw,base_fw) += prob(i,class_k,s,0) ; // --------- reverse --------- if(flip) { // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ; - data2(i,j,base_rv) += prob(i,class_k,s,1) ; + data2(i,j_rv,base_rv) += prob(i,class_k,s,1) ; } } } // clean memory data = Matrix2D() ; prob = Matrix4D() ; // normalize for(size_t i=0; i data3(4, data2.get_dim()[1], 0.) ; + for(size_t i=0; i #include #include #include // std::invalid_argument #include #include #include // seqan::SeqFileIn int dna::map(char base, bool rev_compl) { static bool init = false ; static std::unordered_map hash_map ; static std::unordered_map hash_map_rev ; if(not init) { hash_map['A'] = 0 ; hash_map['a'] = 0 ; hash_map['C'] = 1 ; hash_map['c'] = 1 ; hash_map['G'] = 2 ; hash_map['g'] = 2 ; hash_map['T'] = 3 ; hash_map['t'] = 3 ; hash_map['N'] = 4 ; hash_map['n'] = 4 ; hash_map_rev['A'] = hash_map['T'] ; hash_map_rev['a'] = hash_map['t'] ; hash_map_rev['C'] = hash_map['G'] ; hash_map_rev['c'] = hash_map['g'] ; hash_map_rev['G'] = hash_map['C'] ; hash_map_rev['g'] = hash_map['c'] ; hash_map_rev['T'] = hash_map['A'] ; hash_map_rev['t'] = hash_map['a'] ; hash_map_rev['N'] = hash_map['N'] ; hash_map_rev['n'] = hash_map['n'] ; init = true ; } try { if(rev_compl) { return hash_map_rev.at(base) ; } else { return hash_map.at(base) ; } } // key could not be found catch(std::out_of_range& e) { char msg[256] ; sprintf(msg, "Error!
Invalid DNA base : %c", base) ; throw std::invalid_argument(msg) ; } } char dna::map(int base, bool rev_compl) { static bool init = false ; static std::unordered_map hash_map ; static std::unordered_map hash_map_rev ; if(not init) { hash_map[0] = 'A' ; hash_map[1] = 'C' ; hash_map[2] = 'G' ; hash_map[3] = 'T' ; hash_map[4] = 'N' ; hash_map_rev[4] = hash_map[4] ; hash_map_rev[3] = hash_map[0] ; hash_map_rev[2] = hash_map[1] ; hash_map_rev[1] = hash_map[2] ; hash_map_rev[0] = hash_map[3] ; init = true ; } try { if(rev_compl) { return hash_map_rev.at(base) ; } else { return hash_map.at(base) ; } } // key could not be found catch(std::out_of_range& e) { char msg[256] ; sprintf(msg, "Error! Invalid DNA code : %i", base) ; throw std::invalid_argument(msg) ; } } int dna::char_to_int(char c, bool rev_compl) { return dna::map(c, rev_compl) ; } Matrix2D dna::char_to_int(const Matrix2D& matrix) { size_t n_row = matrix.get_nrow() ; size_t n_col = matrix.get_ncol() ; Matrix2D data_int(n_row, n_col) ; for(size_t i=0; i dna::base_composition(const Matrix2D& sequences, bool both_strands) { double total = 0. ; std::vector base_comp(4,0.) ; int base_N = dna::map('N') ; for(size_t i=0; i adding a count to each - // is equivalent to not changing anything + // do not account for N's if(base == base_N) { continue ; } else { base_comp[base] += 1; total += 1. ; } // reverse complement strand if(both_strands) { // size_t c_hash_rev = dna::hash(c, true) ; base_comp[4-base-1] += 1. ; total += 1. ; } } } // normalize for(auto& i : base_comp) { i /= total ; } return base_comp ; } std::vector dna::base_composition(const Matrix3D& consensus_sequences, bool both_strands) { if(consensus_sequences.get_dim()[2] != 4) { char msg[4096] ; sprintf(msg, "Error! consensus sequences 3rd dimension not equal to 4 (%zu)", consensus_sequences.get_dim()[2]) ; throw std::invalid_argument(msg) ; } double total = 0. ; std::vector base_comp(4,0.) ; for(size_t i=0; i // std::vector -#include - -// some typedef -typedef std::vector vector_i ; -typedef std::vector vector_d ; -typedef std::vector vector_c ; -typedef std::vector matrix2d_i ; -typedef std::vector matrix2d_d ; -typedef std::vector matrix2d_c ; -typedef std::vector matrix3d_d ; -typedef std::vector matrix4d_d ; - - -/*! -* \brief Constructs a matrix from a text file. A matrix contructed -* from an empty file (or a file containing only one EOL char) returns -* an empty matrix (null dimensions). -* \param file_address the address of the file containing the matrix. -* \throw std::runtime_error if anything happen while reading the -* file (format error, file not found, etc). -*/ -matrix2d_i read_matrix2d_i(const std::string &file_address) ; - -/*! -* \brief Constructs a matrix from a text file. A matrix contructed -* from an empty file (or a file containing only one EOL char) returns -* an empty matrix (null dimensions). -* \param file_address the address of the file containing the matrix. -* \throw std::runtime_error if anything happen while reading the -* file (format error, file not found, etc). -*/ -matrix2d_c read_matrix2d_c(const std::string &file_address) ; - -/*! -* \brief Constructs a 4D matrix from a text file. A matrix contructed -* from an empty file (or a file containing only one EOL char) returns -* an empty matrix (null dimensions). -* \param file_address the address of the file containing the matrix. -* \throw std::runtime_error if anything happen while reading the -* file (format error, file not found, etc). 
-*/ -matrix4d_d read_matrix4d_d(const std::string &file_address) ; - -/*! - * \brief A routine to read_matrix() for 4D matrices. - * Checks whether a header is a 3D header or not. - * \param str the header to check. - * \return whether the header is a 3D header. - */ -bool is_header_3d(const std::string &str) ; - -/*! - * \brief A routine to read_matrix() for 4D matrices. - * Checks whether a header is a 4D header or not. - * \param str the header to check. - * \return whether the header is a 4D header. - */ -bool is_header_4d(const std::string &str) ; - -/*! -* \brief Routine of read_matrix for 4D matrices. -* This method reads from a std::ifstream object, -* from the current pointer location until i) a 4D -* header line is found (such as ',,,1') or ii) until -* it cannot read anymore from the stream. All -* data are pushed back into the data vector and -* the dimensions of the data read are stored into -* the dim vector (these data are actually a 3D -* matrix). If the method returned because it -* found another 4D header, it returns true, false -* otherwise. -* To read an entire 4D matrix from a file, simply -* use this scheme : i) read the 1st 4D header -* ii) call this function while it returns true. -* \param file_name a reference to a string containing -* the address of the file currently read (for exception -* messages). -* \param file a reference to the std::ifstream to read -* from. Obviously, the stream state will be modified as -* the method reads from it. However, it will never be -* closed by the method. -* \param data a buffer to contain the slice. -* \param dim a reference to an empty vector where the -* dimensions of the read data will be stored. -* \return whether the last piece of data read from the -* stream was a 4D header. -*/ -bool get_3d_slice(const std::string& file_name, std::ifstream& file, - vector_d& data, std::vector &dim) ; - -/*! - * \brief Sends a nice vector text representation - * to the given stream - * \param stream the stream of interest. - * \param vector the vector of interest. - * \return a reference to the given stream. - */ -std::ostream& operator << (std::ostream& stream, const vector_d& vector) ; - -/*! - * \brief Sends a nice matrix text representation - * to the given stream - * \param stream the stream of interest. - * \param matrix the matrix of interest. - * \return a reference to the given stream. - */ -std::ostream& operator << (std::ostream& stream, const matrix2d_i& matrix) ; - -/*! - * \brief Sends a nice matrix text representation - * to the given stream - * \param stream the stream of interest. - * \param matrix the matrix of interest. - * \return a reference to the given stream. - */ -std::ostream& operator << (std::ostream& stream, const matrix2d_c& matrix) ; - -/*! - * \brief Sends a nice matrix text representation - * to the given stream - * \param stream the stream of interest. - * \param matrix the matrix of interest. - * \return a reference to the given stream. - */ -std::ostream& operator << (std::ostream& stream, const matrix2d_d& matrix) ; - -/*! - * \brief Sends a nice matrix text representation - * to the given stream - * \param stream the stream of interest. - * \param matrix the matrix of interest. - * \return a reference to the given stream. - */ -std::ostream& operator << (std::ostream& stream, const matrix3d_d& matrix) ; - -/*! - * \brief Sends a nice matrix text representation - * to the given stream - * \param stream the stream of interest. - * \param matrix the matrix of interest. 
- * \return a reference to the given stream. - */ -std::ostream& operator << (std::ostream& stream, const matrix4d_d& matrix) ; - -#endif // MATRICESHPP
diff --git a/src/Utility/matrix_utility.cpp b/src/Utility/matrix_utility.cpp new file mode 100644 index 0000000..d82afa9 --- /dev/null +++ b/src/Utility/matrix_utility.cpp @@ -0,0 +1,126 @@ +#include + +#include +#include + +#include +#include +#include + + +Matrix2D filter_rows(const std::vector& indices, + const Matrix2D& m) +{ + size_t nrow = m.get_nrow() ; + size_t ncol = m.get_ncol() ; + size_t nrow_filtered = indices.size() ; + size_t nrow_final = nrow - nrow_filtered ; + + // ensure that all indices are usable + for(auto i : indices) + { // this index is too big + if(i >= nrow) + { char msg[4096] ; + sprintf(msg, "Error! Filter index greater than or equal to the matrix number of rows (%zu and %zu)!", + i, nrow) ; + throw std::invalid_argument(msg) ; + } + } + + Matrix2D m_final(nrow_final, ncol, 0) ; + + for(size_t i=0, i_idx=0, i_fin=0; i filter_rows(const std::vector& indices, + const Matrix3D& m) +{ + size_t ndim1 = m.get_dim()[0] ; + size_t ndim2 = m.get_dim()[1] ; + size_t ndim3 = m.get_dim()[2] ; + size_t ndim1_idx = indices.size() ; + size_t ndim1_final = ndim1 - ndim1_idx ; + + // ensure that all indices are usable + for(auto i : indices) + { // this index is too big + if(i >= ndim1) + { char msg[4096] ; + sprintf(msg, "Error! Filter index greater than or equal to the matrix number of rows (%zu and %zu)!", + i, ndim1) ; + throw std::invalid_argument(msg) ; + } + } + + Matrix3D m_final(ndim1_final,ndim2,ndim3,0) ; + + for(size_t i=0, i_idx=0, i_fin=0; i filter_rows(const std::vector& indices, + const Matrix4D& m) +{ + size_t ndim1 = m.get_dim()[0] ; + size_t ndim2 = m.get_dim()[1] ; + size_t ndim3 = m.get_dim()[2] ; + size_t ndim4 = m.get_dim()[3] ; + size_t ndim1_filtered = indices.size() ; + size_t ndim1_final = ndim1 - ndim1_filtered ; + + // ensure that all indices are usable + for(auto i : indices) + { // this index is too big + if(i >= ndim1) + { char msg[4096] ; + sprintf(msg, "Error! Filter index greater than or equal to the matrix number of rows (%zu and %zu)!", + i, ndim1) ; + throw std::invalid_argument(msg) ; + } + } + + Matrix4D m_final(ndim1_final, ndim2, ndim3, ndim4, 0) ; + + for(size_t i=0, i_idx=0, i_fin=0; i + +#include +#include +#include + + +/*! + * \brief Filters out given rows of a matrix. + * \param indices the 0-based indices of the + * rows to remove. The indices should be + * sorted in ascending order. + * \param m the matrix on which this operation + * should be performed. + * \return the matrix without the filtered + * rows. + * \throw std::invalid_argument if one index is + * invalid. + */ +Matrix2D filter_rows(const std::vector& indices, + const Matrix2D& m) ; + +/*! + * \brief Filters out given rows (1st dimension + * slices) of a matrix. + * \param indices the 0-based indices of the + * rows (1st dimension slices) to remove. The + * indices should be sorted in ascending order. + * \param m the matrix on which this operation + * should be performed. + * \return the matrix without the filtered + * rows. + * \throw std::invalid_argument if one index is + * invalid. + */ +Matrix3D filter_rows(const std::vector& indices, + const Matrix3D& m) ; + + +/*! + * \brief Filters out given rows (1st dimension + * slices) of a matrix. + * \param indices the 0-based indices of the + * rows (1st dimension slices) to remove. The + * indices should be sorted in ascending order. + * \param m the matrix on which this operation + * should be performed.
+ * \return the matrix without the filtered + * rows. + * \throw std::invalid_argument if one index is + * invalid. + */ +Matrix4D filter_rows(const std::vector& indices, + const Matrix4D& m) ; + + +#endif // MATRIXUTILITY_HPP diff --git a/src/main_test.cpp b/src/main_test.cpp index a6f2b20..de4471a 100644 --- a/src/main_test.cpp +++ b/src/main_test.cpp @@ -1,207 +1,148 @@ #include #include #include #include #include #include #include #include using namespace std ; template std::ostream& operator << (std::ostream& stream, const std::vector& v) { for(const auto& x : v) { stream << x << " " ; } return stream ; } Matrix3D compute_consensus_sequences(const Matrix2D& seq, const Matrix4D& prob, size_t class_k) { size_t n_seq = seq.get_nrow() ; size_t l_seq = seq.get_ncol() ; // size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; bool flip = n_flip == 2 ; size_t l_model = l_seq - n_shift + 1 ; Matrix3D cons_seq(n_seq, l_model, 4, 0.) ; // aggregate for(size_t i=0; i same as doing nothing if(base == dna::char_to_int('N')) { continue ; } int base_rv = 4 - base - 1 ; // --------- forward --------- // std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ; cons_seq(i,j,base) += prob(i,class_k,s,0) ; tot += prob(i,class_k,s,0) ; // --------- reverse --------- if(flip) { // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ; cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ; tot += prob(i,class_k,s,1) ; } } } } // normalize for(size_t i=0; i prob(prob_file) ; - // size_t n_row = prob.get_dim()[0] ; - // size_t n_class = prob.get_dim()[1] ; - size_t n_shift = prob.get_dim()[2] ; - // size_t n_flip = prob.get_dim()[3] ; - // bool flip = n_flip == 2 ; - std::cerr << "posterior prob " << prob.get_dim() << std::endl ; - - // these are from3 and to3 - int from1 = -500 ; - int to1 = 500 ; - - // class of interest - size_t class_k = 17 ; - class_k -= 1 ; - - // get a wider matrix to realign it - std::cerr << "creating data2 matrix" << std::endl ; - // extend limits - int ext = n_shift - 1 ; - int ext_r = ext / 2 ; - int ext_l = ext - ext_r ; - int from2 = from1 - ext_l ; - int to2 = to1 + ext_r ; - SequenceMatrixCreator mc(bed_file, - seq_file, - from2, - to2) ; - Matrix2D data2 = mc.create_matrix() ; - data2.save("test/data2.mat2d") ; - std::cerr << "data2 matrix " << data2.get_dim() << std::endl ; - - Matrix3D cons_seq= compute_consensus_sequences(data2, - prob, - class_k) ; - - data2.save("test/consensus_matrix.mat3d") ; - */ - - Matrix4D prob(prob_file) ; - std::cout << prob.get_dim() << std::endl ; - Matrix2D data2 ; - data2.load("test/data2.mat2d") ; - std::cout << data2.get_dim() << std::endl ; - - size_t class_k = 17 ; - class_k-- ; - Matrix3D cons_seq = compute_consensus_sequences(data2, - prob, - class_k) ; - cons_seq.save("test/consensus_sequences.mat3d") ; - - - /* // sequences Matrix2D seq(4, 7, 0) ; seq(0,0) = 0 ; seq(0,1) = 3 ; seq(0,2) = 3 ; seq(0,3) = 3 ; seq(0,4) = 1 ; seq(0,5) = 1 ; seq(0,6) = 1 ; seq(1,0) = 1 ; seq(1,1) = 0 ; seq(1,2) = 3 ; seq(1,3) = 3 ; seq(1,4) = 3 ; seq(1,5) = 1 ; seq(1,6) = 1 ; seq(2,0) = 1 ; seq(2,1) = 1 ; seq(2,2) = 3 ; seq(2,3) = 0 ; seq(2,4) = 0 ; seq(2,5) = 0 ; seq(2,6) = 1 ; seq(3,0) = 1 ; seq(3,1) = 1 ; seq(3,2) = 1 ; seq(3,3) = 3 ; seq(3,4) = 0 ; seq(3,5) = 0 ; seq(3,6) = 0 ; // probabilities Matrix4D prob(4, 1, 4, 2) ; // row 1, cluster 1 prob(0,0,0,0) = 999 ; 
prob(0,0,0,1) = .01 ; // shift 1, both flips prob(0,0,1,0) = .01 ; prob(0,0,1,1) = .01 ; // shift 2, both flips prob(0,0,2,0) = .01 ; prob(0,0,2,1) = .01 ; // shift 3, both flips prob(0,0,3,0) = .01 ; prob(0,0,3,1) = .01 ; // shift 4, both flips // row 2, cluster 1 prob(1,0,0,0) = .01 ; prob(1,0,0,1) = .01 ; // shift 1, both flips prob(1,0,1,0) = 999 ; prob(1,0,1,1) = .01 ; // shift 2, both flips prob(1,0,2,0) = .01 ; prob(1,0,2,1) = .01 ; // shift 3, both flips prob(1,0,3,0) = .01 ; prob(1,0,3,1) = .01 ; // shift 4, both flips // row 3, cluster 1 prob(2,0,0,0) = .01 ; prob(2,0,0,1) = .01 ; // shift 1, both flips prob(2,0,1,0) = .01 ; prob(2,0,1,1) = .01 ; // shift 2, both flips prob(2,0,2,0) = .01 ; prob(2,0,2,1) = 999 ; // shift 3, both flips prob(2,0,3,0) = .01 ; prob(2,0,3,1) = .01 ; // shift 4, both flips // row 4, cluster 1 prob(3,0,0,0) = .01 ; prob(3,0,0,1) = .01 ; // shift 1, both flips prob(3,0,1,0) = .01 ; prob(3,0,1,1) = .01 ; // shift 2, both flips prob(3,0,2,0) = .01 ; prob(3,0,2,1) = .01 ; // shift 3, both flips prob(3,0,3,0) = .01 ; prob(3,0,3,1) = 999 ; // shift 4, both flips //normalize for(size_t i=0; i cons_seq = compute_consensus_sequences(seq, prob, class_k) ; // print for(size_t i=0; i