diff --git a/src/Applications/EMJointApplication.cpp b/src/Applications/EMJointApplication.cpp index 22b9caa..5252ef2 100644 --- a/src/Applications/EMJointApplication.cpp +++ b/src/Applications/EMJointApplication.cpp @@ -1,184 +1,194 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include namespace po = boost::program_options ; EMJointApplication::EMJointApplication(int argn, char** argv) : files_read(""), file_sequence(""), file_out(""), - n_class(0), n_iter(0), n_shift(0), flip(false), + n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMJointApplication::run() { if(this->runnable) { // read data std::vector read_paths ; boost::split(read_paths, this->files_read, [](char c){return c == ',';}) ; std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } data_read.push_back(Matrix2D(path)) ; } // sequence data EMJoint* em = nullptr ; if(this->file_sequence == "") { em = new EMJoint(std::move(data_read), this->n_class, this->n_iter, this->n_shift, this->flip, + this->bckg_class, this->seed, this->n_threads) ; } else { Matrix2D data_seq(this->file_sequence) ; em = new EMJoint(std::move(data_read), std::move(data_seq), this->n_class, this->n_iter, this->n_shift, this->flip, + this->bckg_class, this->seed, this->n_threads) ; } em->classify() ; em->get_post_prob().save(this->file_out) ; delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMJointApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" 
; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMJoint is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given 1) the shapes \n" "of the read densities over the regions and 2) the region sequence \n" "motif contents. \n " "The assignment probabilitiesare returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations, \n" "by default 0 (no parallelization)." ; std::string opt_read_msg = "A coma separated list of paths to the file containing the \n" "read density data. At least one path is needed." ; std::string opt_seq_msg = "The path to the file containing the sequence data. If no path is \n" "given, the classification is only cares about the read density \n" "shapes." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; + std::string opt_bckg_msg = "Adds a class to model the sequence and the read signal background. This\n" + "class contains sequence background probabilies (for the sequence model)\n" + "and the mean number of reads (for the read count models) at each position\n" + "and is never updated." 
; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->files_read == "" and this->file_sequence == "" and (not help)) { std::string msg("Error! No data were given (--read and --seq)!") ; throw std::invalid_argument(msg) ; } if(this->files_read == "" and (not help)) { std::string msg("Error! No read density data were given (--read)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! 
No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMJointApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMJointApplication.hpp b/src/Applications/EMJointApplication.hpp index abe5296..3ef57ca 100644 --- a/src/Applications/EMJointApplication.hpp +++ b/src/Applications/EMJointApplication.hpp @@ -1,106 +1,115 @@ #ifndef EMJOINTAPPLICATION_HPP #define EMJOINTAPPLICATION_HPP #include #include #include /*! * \brief The EMJointApplication class is a wrapper around an EMJoint * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ class EMJointApplication: public ApplicationInterface { public: EMJointApplication() = delete ; EMJointApplication(const EMJointApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMJointApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. 
* \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief a coma separated list of paths to the files * containing the read density data */ std::string files_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_sequence ; /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; + /*! + * \brief whether a constant class to model the + * sequence background should be added. This + * class has the sequence background probabilities + * (for the sequence model) and the mean number of + * read counts (for read signal model) at each + * position. + */ + bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; #endif // EMJOINTAPPLICATION_HPP diff --git a/src/Applications/EMReadApplication.cpp b/src/Applications/EMReadApplication.cpp index fb521ce..5ece4c9 100644 --- a/src/Applications/EMReadApplication.cpp +++ b/src/Applications/EMReadApplication.cpp @@ -1,147 +1,155 @@ #include #include #include #include #include // std::invalid_argument #include #include #include namespace po = boost::program_options ; EMReadApplication::EMReadApplication(int argn, char** argv) : file_read(""), file_out(""), - n_class(0), n_iter(0), n_shift(0), flip(false), + n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMReadApplication::run() { if(this->runnable) { EMRead em(Matrix2D(this->file_read), this->n_class, this->n_iter, this->n_shift, this->flip, + this->bckg_class, this->seed, this->n_threads) ; em.classify() ; em.get_post_prob().save(this->file_out) ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMReadApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMRead is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given the shape \n" "of the read density over the region. The assignment \n" "probabilities are returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_read_msg = "The path to the file containing the read density data" ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." 
; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom to realign the data. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping to realign the data."; + std::string opt_bckg_msg = "Adds a class to model the signal background. This class\n" + "contains the mean number of read at each position and\n" + "is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->file_read)), opt_read_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_read == "" and (not help)) { std::string msg("Error! No data were given (--read)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! 
No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMReadApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/EMReadApplication.hpp b/src/Applications/EMReadApplication.hpp index fc98551..0847a54 100644 --- a/src/Applications/EMReadApplication.hpp +++ b/src/Applications/EMReadApplication.hpp @@ -1,96 +1,103 @@ #ifndef EMREADAPPLICATION_HPP #define EMREADAPPLICATION_HPP #include #include /*! * \brief The EMReadApplication class is a wrapper around an EMRead * instance creating an autonomous application to classify data by directly * passing all the options and parameters from the command line. */ class EMReadApplication: public ApplicationInterface { public: EMReadApplication() = delete ; EMReadApplication(const EMReadApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ EMReadApplication(int argn, char** argv) ; /*! * \brief Runs the application. The data are classified * using the given settings and the posterior probability * matrix is returned through the stdout. * The matrix is a 4D matrix with dimensions : * regions, class, shift flip. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. 
*/ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the paths to the file containing the read * density data. */ std::string file_read ; /*! * \brief the path to the file in which the probability * matrix will be saved. */ std::string file_out ; /*! * \brief the number of classes to partition the data into. */ size_t n_class ; /*! * \brief the number of iterations allowed. */ size_t n_iter ; /*! * \brief the shifting freedom. */ size_t n_shift ; /*! * \brief whether flipping freedom is allowed. */ bool flip ; + /*! + * \brief whether a constant class to model the + * read count background should be added. This + * class has mean number of read count at each + * position. + */ + bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief a seed to initialise the random number generator. */ std::string seed ; /*! * \brief a flag indicating whether the core of run() can be * run or not. 
*/ bool runnable ; } ; #endif // EMREADAPPLICATION_HPP diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/DataLayer.cpp index 6762291..e69ac60 100644 --- a/src/Clustering/DataLayer.cpp +++ b/src/Clustering/DataLayer.cpp @@ -1,184 +1,215 @@ #include #include // std::invalid_argument #include // log() #include #include #include #include DataLayer::DataLayer() {} DataLayer::DataLayer(const Matrix2D& data, size_t n_class, size_t n_shift, - bool flip) + bool flip, + bool last_class_cst) :data(data), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(n_class), l_model(n_col - n_shift + 1), n_shift(n_shift), - n_flip(flip + 1) + n_flip(flip + 1), + last_class_cst(last_class_cst) { // models cannot be initialise here // as the number of categories depend // on the exact class } DataLayer::DataLayer(Matrix2D&& data, size_t n_class, size_t n_shift, - bool flip) + bool flip, + bool last_class_cst) :data(data), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(n_class), l_model(n_col - n_shift + 1), n_shift(n_shift), - n_flip(flip + 1) + n_flip(flip + 1), + last_class_cst(last_class_cst) { // models cannot be initialise here // as the number of categories depend // on the exact class } DataLayer::DataLayer(const Matrix2D& data, const Matrix3D& model, - bool flip) + bool flip, + bool last_class_cst) : data(data), model(model), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(this->model.get_dim()[0]), l_model(this->model.get_dim()[1]), n_category(this->model.get_dim()[2]), n_shift(n_col - l_model + 1), - n_flip(flip + 1) + n_flip(flip + 1), + last_class_cst(last_class_cst) { // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! 
model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } this->n_shift = this->n_col - this->l_model + 1 ; } DataLayer::DataLayer(Matrix2D&& data, Matrix3D&& model, - bool flip) + bool flip, + bool last_class_cst) : data(data), model(model), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(this->model.get_dim()[0]), l_model(this->model.get_dim()[1]), n_category(this->model.get_dim()[2]), n_shift(n_col - l_model + 1), - n_flip(flip + 1) + n_flip(flip + 1), + last_class_cst(last_class_cst) { // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } this->n_shift = this->n_col - this->l_model + 1 ; } DataLayer::~DataLayer() {} +void DataLayer::set_class(size_t i, const Matrix2D& class_model) +{ // check dimensions + if(class_model.get_nrow() != this->n_category) + { char msg[4096] ; + sprintf(msg, "Error! the given class model is incompatible " + "with the model (%zu rows instead of %zu)", + class_model.get_nrow(), this->n_category) ; + throw std::invalid_argument(msg) ; + } + else if(class_model.get_ncol() != this->l_model) + { char msg[4096] ; + sprintf(msg, "Error! the given class model is incompatible " + "with the model (%zu columns instead of %zu)", + class_model.get_ncol(), this->l_model) ; + throw std::invalid_argument(msg) ; + } + + for(size_t j=0; jmodel(i,j,k) = class_model(k,j) ; } + } +} + Matrix3D DataLayer::get_model() const { return this->model ; } void DataLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const { if(loglikelihood.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! 
loglikelihood matrix 1st dimension is not " "equal to data row number : %zu / %zu", loglikelihood.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 2nd dimension is not " "equal to model class number : %zu / %zu", loglikelihood.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", loglikelihood.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", loglikelihood.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } void DataLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const { if(loglikelihood_max.size() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood_max length is not " "equal to data row number : %zu / %zu", loglikelihood_max.size(), this->n_flip) ; throw std::invalid_argument(msg) ; } } void DataLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const { if(posterior_prob.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 1st dimension is not " "equal to data row number : %zu / %zu", posterior_prob.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 2nd dimension is not " "equal to model class number : %zu / %zu", posterior_prob.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! 
posterior_prob matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", posterior_prob.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", posterior_prob.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } const double DataLayer::p_min = 1e-100 ; const double DataLayer::p_min_log = log(DataLayer::p_min) ; diff --git a/src/Clustering/DataLayer.hpp b/src/Clustering/DataLayer.hpp index 5b13153..3fbc159 100644 --- a/src/Clustering/DataLayer.hpp +++ b/src/Clustering/DataLayer.hpp @@ -1,256 +1,298 @@ #ifndef DATALAYER_HPP #define DATALAYER_HPP #include #include // std::promise, std::future #include #include #include #include typedef std::vector vector_d ; /*! * \brief The DataLayer class define the basic design * to handle probabilistic models together with * their data. * A DataLayer is made of two parts : * 1) a data matrix * 2) a model * The model contains the parameters of a probabilistic * model with one or more classes that fits the data. * The data likelihood given the model can be computed * and the model can be updated given a set of * posterior probabilities representing the data * assignments to the different classes. */ class DataLayer { public: /*! * \brief the smallest acceptable probability * for computations. */ static const double p_min ; /*! * \brief the log of the smallest probability. */ static const double p_min_log ; /*! * \brief The possible flip states. */ enum flip_states{FORWARD=0, REVERSE} ; /*! * \brief Default constructor. */ DataLayer() ; /*! * \brief Constructs an object with the * given data. * An empty model is not initialised yet * as the model number of categories * depends on the final class. * \param data the data. * \param n_class the number of classes * of the model. 
* \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). */ DataLayer(const Matrix2D& data, size_t n_class, size_t n_shift, - bool flip) ; + bool flip, + bool last_class_cst) ; /*! * \brief Constructs an object with the * given data. * An empty model is not initialised yet * as the model number of categories * depends on the final class. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). */ DataLayer(Matrix2D&& data, size_t n_class, size_t n_shift, - bool flip) ; + bool flip, + bool last_class_cst) ; /*! * \brief Constructs an object with the * given data and model. * The model dimensions set the number of * classes and the shifting freedom. * \param data the data. * \param the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). */ DataLayer(const Matrix2D& data, const Matrix3D& model, - bool flip) ; + bool flip, + bool last_class_cst) ; /*! * \brief Constructs an object with the * given data and model. * The model dimensions set the number of * classes and the shifting freedom. * \param data the data. * \param the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). */ DataLayer(Matrix2D&& data, Matrix3D&& model, - bool flip) ; + bool flip, + bool last_class_cst) ; /*! * \brief Destructor. 
*/ virtual ~DataLayer() ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const = 0 ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) = 0 ; + + /*! + * \brief Modify the values of the given class + * with the given parameters. + * \param i the index of the class to modify, 0-based. + * \param class_model the class model parameters. + * Its dimensions should be: + * 1st : the number of categories, 4 for sequence models (A,C,G,T), + * 1 for signal models. + * 2nd : the model length. + * \throw std::invalid_argument if the dimensions are not + * compatible with the current model classes. + */ + virtual void set_class(size_t i, + const Matrix2D& class_model) ; + /*! * \brief Returns a copy of the current model. * \return the current model. 
* 1st dim : the number of classes * 2nd dim : the model length * 3rd dim : the number of value categories. */ virtual Matrix3D get_model() const ; protected: /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_loglikelihood_dim(const Matrix4D& loglikelihood) const ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * It should have a length equal to the number of * the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const ; /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_prob a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_posterior_prob_dim(const Matrix4D& posterior_prob) const ; /*! * \brief the data. */ Matrix2D data ; /*! * \brief the data model. */ Matrix3D model ; /*! * \brief whether flip is enabled. */ bool flip ; /*! * \brief the number of row in the data. */ size_t n_row ; /*! 
* \brief the number of columns in the data. */ size_t n_col ; /*! * \brief the number of classes in the model. */ size_t n_class ; /*! * \brief the model length, its 2nd dimension. */ size_t l_model ; /*! * \brief the number of variable categories in * the data. This is also the model 3rd * dimension. * Read counts are quantitative values and * have a number of categories equal to one * whereas as DNA sequences are made of * A,C,G,T (at least) and have 4 different * categories. */ size_t n_category ; /*! * \brief the number of shift states. */ size_t n_shift ; /*! * \brief the number of flip states. */ size_t n_flip ; + /*! + * \brief A flag indicating that the last class of the model + * is constant and should not be updated when calling + * update_model(). + */ + bool last_class_cst ; } ; #endif // DATALAYER_HPP diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp index 29de0b7..afef19b 100644 --- a/src/Clustering/EMJoint.cpp +++ b/src/Clustering/EMJoint.cpp @@ -1,576 +1,660 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include #include #include #include #include #include #include // getRandomNumberGenerator() #include // ConsoleProgressBar - +#include // EMRead::generate_bckg_motif() +#include // EMSequence::generate_bckg_motif() EMJoint::EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! 
No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly - // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, + bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; + + // compute background and + // overwrite last class as background class + if(bckg_class) + { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; + Matrix2D bckg_motif = EMRead::generate_bckg_motif(matrix, + motif_len) ; + this->read_layers.back()->set_class(this->n_class-1, + bckg_motif) ; + } } } EMJoint::EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! 
Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly - // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) - { // create the layer + { + // compute background before giving data to + // ReadLayer + Matrix2D bckg_motif ; + if(bckg_class) + { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; + bckg_motif = EMRead::generate_bckg_motif(matrix, + motif_len) ; + } + + // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, + bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; + + // overwrite last class as background class + if(bckg_class) + { this->read_layers.back()->set_class(this->n_class-1, + bckg_motif) ; + } } } EMJoint::EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! 
A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, + bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; + + // compute background and + // overwrite last class as background class + if(bckg_class) + { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; + Matrix2D bckg_motif = EMRead::generate_bckg_motif(matrix, + motif_len) ; + this->read_layers.back()->set_class(this->n_class-1, + bckg_motif) ; + } } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, - false) ; + bckg_class) ; this->seq_layer->update_model(this->post_prob, this->threads) ; + // compute background and + // overwrite last class as background class + if(bckg_class) + { size_t motif_len = 
seq_matrix.get_ncol() - this->n_shift + 1 ; + Matrix2D bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, + motif_len, + this->flip) ; + this->seq_layer->set_class(this->n_class-1, + bckg_motif) ; + } } EMJoint::EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) - { // create the layer + { + // compute background before giving data to + // ReadLayer + Matrix2D bckg_motif ; + if(bckg_class) + { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; + bckg_motif = EMRead::generate_bckg_motif(matrix, + motif_len) ; + } + + // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, + bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; + // overwrite last class as background class + if(bckg_class) + { this->read_layers.back()->set_class(this->n_class-1, + bckg_motif) ; + } } + // create sequence layer and initialise the models from the post prob + // compute background before giving data to + // SequenceLayer + Matrix2D bckg_motif ; + if(bckg_class) + { size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ; + bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, + motif_len, + this->flip) ; + } this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, - false) ; + bckg_class) ; + // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; + // overwrite last class as background class + if(bckg_class) + { this->seq_layer->set_class(this->n_class-1, + bckg_motif) ; + } } EMJoint::~EMJoint() { // join the threads in case // deleted by EMBase destructor this->threads->join() ; // read data and models for(auto& ptr : this->read_layers) { if(ptr != nullptr) { delete ptr ; ptr = nullptr ; } } // sequence data and models if(seq_layer != nullptr) { 
delete seq_layer ; seq_layer = nullptr ; } } std::vector> EMJoint::get_read_models() const { std::vector> models ; for(const auto& ptr : this->read_layers) { models.push_back(ptr->get_model()) ; } return models ; } Matrix3D EMJoint::get_sequence_models() const { return this->seq_layer->get_model() ; } EMJoint::exit_codes EMJoint::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMJoint::exit_codes::ITER_MAX ; } void EMJoint::compute_loglikelihood() { // compute the loglikelihood for each layer size_t i = 0 ; for(auto& ptr : this->read_layers) { ptr->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; /* // sum the likelihood for each state, over all layers // this is the "joint likelihood" for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. 
; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } */ // sum the likelihood for each state, over all layers // and rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // limite value range for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } done.set_value(true) ; } void EMJoint::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. 
; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMJoint::update_models() { // read data and models for(auto& ptr : this->read_layers) { ptr->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } // sequence data and models this->seq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/EMJoint.hpp b/src/Clustering/EMJoint.hpp index ddbe7b9..41e4f9f 100644 --- a/src/Clustering/EMJoint.hpp +++ b/src/Clustering/EMJoint.hpp @@ -1,255 +1,283 @@ #ifndef EMJOINT_HPP #define EMJOINT_HPP #include #include #include #include #include #include #include #include typedef std::vector vector_d ; class EMJoint : public EMBase { public: /*! * \brief Constructs an object to partition the * region according to all the given read densities * with the given shifting and flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. 
+ * \param bckg_class the last class is used to model + * the background by setting all its parameters, at all + * positions, to the background base probabilties (for + * sequence models) and the mean number of counts (for + * read signal models). Since the background is constant, + * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities * with the given shifting and flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model + * the background by setting all its parameters, at all + * positions, to the background base probabilties (for + * sequence models) and the mean number of counts (for + * read signal models). Since the background is constant, + * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! 
* \brief Constructs an object to partition the * region according to all the given read densities * and region sequences with the given shifting and * flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model + * the background by setting all its parameters, at all + * positions, to the background base probabilties (for + * sequence models) and the mean number of counts (for + * read signal models). Since the background is constant, + * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities * and region sequences with the given shifting and * flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related * signal) for the regions of interest. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. 
+ * \param bckg_class the last class is used to model + * the background by setting all its parameters, at all + * positions, to the background base probabilties (for + * sequence models) and the mean number of counts (for + * read signal models). Since the background is constant, + * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed="", size_t n_threads=0) ; EMJoint(const EMJoint& other) = delete ; /*! * \brief Destructor. */ virtual ~EMJoint() override ; /*! * \brief Returns all layer read models. * The models are in the same order * as the data were given to the * constructor. * \return a vector containing the * models. */ std::vector> get_read_models() const ; /*! * \brief Returns the sequence models. * \return a vector containing the * models. */ Matrix3D get_sequence_models() const ; /*! * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMJoint::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood() that * computes the joint loglikelihood by summing the * individual loglikelihood obtained from each data layer. * At the same time, this method rescales the loglikelihood * values by substacting to each value the maximum * loglikelihood value found in the same data row, * for each layer. 
* \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the number of data layers. */ size_t n_layer ; /*! * \brief the log likelihood buffers for each individual * layer (one element per layer). */ std::vector> loglikelihood_layer ; /*! * \brief the max loglikelihood value for * each each data layer (1st dimension) * and each data row of the given layer * (2nd dimension). */ std::vector loglikelihood_max ; /*! * \brief A vector containing the pointers * to the objects managing all the read * layer data and models. */ std::vector read_layers ; /*! * \brief A pointer to the object managing * the data and their model. 
*/ SequenceLayer* seq_layer ; } ; #endif // EMJOINT_HPP diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp index 6535873..a5be742 100644 --- a/src/Clustering/EMRead.cpp +++ b/src/Clustering/EMRead.cpp @@ -1,309 +1,350 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include // ReadLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool -template -std::ostream& operator << (std::ostream& stream, const std::vector& v) -{ for(const auto x : v) - { stream << x << " " ; } - return stream ; -} +Matrix2D EMRead::generate_bckg_motif(const Matrix2D& read_matrix, + size_t motif_length) +{ + // sequence composition + double mean = 0. ; + double n = (double)read_matrix.get_nrow() * + (double)read_matrix.get_ncol() ; + for(size_t i=0; i bckg_motif(1,motif_length) ; + for(size_t j=0; j& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models this->read_layer = new ReadLayer(read_matrix, this->n_class, this->n_shift, flip, + bckg_class, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; + // compute background and + // overwrite last class as background class + if(bckg_class) + { size_t motif_len = read_matrix.get_ncol() - this->n_shift + 1 ; + Matrix2D bckg_motif = EMRead::generate_bckg_motif(read_matrix, + motif_len) ; + this->read_layer->set_class(this->n_class-1, + bckg_motif) ; + } } EMRead::EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly this->set_post_prob_random(seed) ; + + // compute background before giving data to + // ReadLayer + Matrix2D bckg_motif ; + if(bckg_class) + { size_t motif_len = read_matrix.get_ncol() - this->n_shift + 1 ; + bckg_motif = EMRead::generate_bckg_motif(read_matrix, + motif_len) ; + } + // data and models this->read_layer = new ReadLayer(std::move(read_matrix), this->n_class, this->n_shift, flip, + bckg_class, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; + + // overwrite last class as background class + if(bckg_class) + { this->read_layer->set_class(this->n_class-1, + bckg_motif) ; + } } EMRead::~EMRead() { if(this->read_layer == nullptr) { delete this->read_layer ; this->read_layer = nullptr ; } } Matrix3D EMRead::get_read_models() const { return read_layer->get_model() ; } EMRead::exit_codes EMRead::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 
60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; // std::cerr << this->post_prob_rowsum << std::endl ; // std::cerr << this->post_prob_colsum << std::endl ; this->compute_post_prob() ; // M-step // std::cerr << this->post_prob_rowsum << std::endl ; // std::cerr << this->post_prob_colsum << std::endl ; this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMRead::exit_codes::ITER_MAX ; } void EMRead::compute_loglikelihood() { // compute the loglikelihood this->read_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; /* // rescale the values for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } */ // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMRead::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t 
k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMRead::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMRead::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) 
; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { // avoid p=0. by rounding errors double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMRead::update_models() { this->read_layer->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMRead.hpp index 58f97af..fc2d51e 100644 --- a/src/Clustering/EMRead.hpp +++ b/src/Clustering/EMRead.hpp @@ -1,159 +1,183 @@ #ifndef EMREAD_HPP #define EMREAD_HPP #include #include #include #include // std::promise #include #include typedef std::vector vector_d ; class EMRead : public EMBase { public: + /*! + * \brief Generates a model with mean number of counts + * in the data,at each position. + * \param data the count data. + * \return the motif. Its dimensions are: + * 1st 1, a one row matrix + * 2nd the motif length. + */ + static Matrix2D generate_bckg_motif(const Matrix2D& read_matrix, + size_t motif_length) ; + + public: /*! * \brief Constructs an object to partition the * region (rows) according to the shape of the signal * with the given shifting and flipping freedom. 
* \param read_matrix a matrix containing the read * densitiy (ChIP-seq or related signal) for the * regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model the + * background by setting all its parameters, at all + * positions, to the mean number of counts. Since + * the background is constant, this class will never + * be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMRead(const Matrix2D& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region (rows) according to the shape of the signal * with the given shifting and flipping freedom. * \param read_matrix a matrix containing the read * densitiy (ChIP-seq or related signal) for the * regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. + * \param bckg_class the last class is used to model the + * background by setting all its parameters, at all + * positions, to the mean number of counts. Since + * the background is constant, this class will never + * be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. 
*/ EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, + bool bckg_class, const std::string& seed="", size_t n_threads=0) ; EMRead(const EMRead& other) = delete ; /*! * \brief Destructor. */ virtual ~EMRead() override ; /*! * \brief Returns the class read signal model. * \return the class read signal model. */ Matrix3D get_read_models() const ; /*! * \brief Runs the read signal model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMRead::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! 
* \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. */ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. */ ReadLayer* read_layer ; } ; #endif // EMREAD_HPP diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp index 89eba6f..fe69d06 100644 --- a/src/Clustering/EMSequence.cpp +++ b/src/Clustering/EMSequence.cpp @@ -1,398 +1,392 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include // SequenceLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool #include // dna::base_composition() +Matrix2D EMSequence::generate_bckg_motif(const Matrix2D& sequence_matrix, + size_t motif_length, + bool reverse_compl) +{ // sequence composition + std::vector base_comp = + dna::base_composition(sequence_matrix, + reverse_compl) ; + // create a motif + Matrix2D bckg_motif(4,motif_length) ; + for(size_t i=0; i& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; - // compute background before giving data to - // SequenceLayer - Matrix2D bckg_motif ; - if(bckg_class) - { // sequence composition - std::vector base_comp = - dna::base_composition(seq_matrix, - flip) ; - // create a motif - bckg_motif = Matrix2D(4, - seq_matrix.get_ncol()-this->n_shift+1) ; - for(size_t i=0; iseq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; + + // compute background and // overwrite last class as background class if(bckg_class) - { this->seq_layer->set_class(this->n_class-1, + { size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ; + Matrix2D bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, + motif_len, + this->flip) ; + this->seq_layer->set_class(this->n_class-1, bckg_motif) ; } } EMSequence::EMSequence(Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // compute background before giving data to // SequenceLayer Matrix2D bckg_motif ; if(bckg_class) - { // sequence composition - std::vector base_comp = - dna::base_composition(seq_matrix, - flip) ; - // create a motif - bckg_motif = Matrix2D(4, - seq_matrix.get_ncol()-this->n_shift+1) ; - for(size_t i=0; in_shift + 1 ; + bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, + motif_len, + this->flip) ; } // data and models this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; // overwrite last class as background class if(bckg_class) { this->seq_layer->set_class(this->n_class-1, bckg_motif) ; } } EMSequence::EMSequence(const Matrix2D& seq_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; - // initialise post prob randomly - // getRandomGenerator(seed) ; - // this->set_post_prob_random(seed) ; - // data and models + // background motif (if any) is the last of the given motifs this->seq_layer = new SequenceLayer(seq_matrix, motifs, this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::EMSequence(Matrix2D&& seq_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; - // initialise post prob randomly - // getRandomGenerator(seed) ; - // this->set_post_prob_random(seed) ; - // data and models + // background motif (if any) is the last of the given motifs this->seq_layer = new SequenceLayer(std::move(seq_matrix), std::move(motifs), this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::~EMSequence() { if(this->seq_layer == nullptr) { delete this->seq_layer ; this->seq_layer = nullptr ; } } Matrix3D EMSequence::get_sequence_models() const { return seq_layer->get_model() ; } EMSequence::exit_codes EMSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMSequence::exit_codes::ITER_MAX ; } void EMSequence::compute_loglikelihood() { // compute the loglikelihood this->seq_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { 
future.get() ; } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) 
; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], SequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMSequence::update_models() { this->seq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/EMSequence.hpp b/src/Clustering/EMSequence.hpp index f41f90b..8c177ac 100644 --- a/src/Clustering/EMSequence.hpp +++ b/src/Clustering/EMSequence.hpp @@ -1,238 +1,256 @@ #ifndef EMSEQUENCE_HPP #define EMSEQUENCE_HPP #include #include #include #include // std::promise #include #include typedef std::vector vector_d ; class EMSequence : public EMBase -{ public: +{ + public: + /*! + * \brief Generates a motif with the background + * base probabilites at each position. The background + * probabilities are computed from the data. 
+ * \param sequence_matrix the sequence matrix in integer + * format. + * \param reverse_compl whether the reverse complement of + * the sequences should also be considered. + * \return the motif. Its dimensions are: + * 1st 4 for A,C,G,T + * 2nd the motif length. + */ + static Matrix2D generate_bckg_motif(const Matrix2D& sequence_matrix, + size_t motif_length, + bool reverse_compl) ; + + public: /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences models are initialised randomly. * \param sequence_matrix a matrix containing the sequences * of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the background * by setting all its parameters, at all positions, to the * background base probabilties. Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(const Matrix2D& sequence_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences models are initialised randomly. * \param sequence_matrix a matrix containing the sequences * of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. 
* \param bckg_class the last class is used to model the background * by setting all its parameters, at all positions, to the * background base probabilties. Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(Matrix2D&& sequence_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences class models are initialised using * the given motifs. The class probabilities are * initialised uniformlly. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param sequence_matrix a matrix containing the sequences * of interest. * \param motifs a matrix containing the different initial * class models with the following dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 for A,C,G,T * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param flip whether flipping is allowed. * \param bckg_class indicates that the last class in the * given motifs is used to model the background and it * should never be updated. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(const Matrix2D& sequence_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences class models are initialised using * the given motifs. 
The class probabilities are * initialised uniformlly. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param sequence_matrix a matrix containing the sequences * of interest. * \param motifs a matrix containing the different initial * class models with the following dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 for A,C,G,T * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param flip whether flipping is allowed. * \param bckg_class indicates that the last class in the * given motifs is used to model the background and it * should never be updated. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(Matrix2D&& sequence_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads=0) ; EMSequence(const EMSequence& other) = delete ; /*! * \brief Destructor. */ virtual ~EMSequence() override ; /*! * \brief Returns the class sequence model. * \return the class sequence model. */ Matrix3D get_sequence_models() const ; /*! * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMSequence::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. 
* \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. */ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. 
*/ SequenceLayer* seq_layer ; } ; #endif // EMSEQUENCE_HPP diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp index 367e2b4..9555cf2 100644 --- a/src/Clustering/ReadLayer.cpp +++ b/src/Clustering/ReadLayer.cpp @@ -1,444 +1,468 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), exp(), pow() #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // beta_pmf(), poisson_pmf() #include // rand_real_uniform(), rand_int_uniform() #include #include #include #include #include typedef std::vector vector_d ; ReadLayer::ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, + bool last_class_cst, ThreadPool* threads) - : DataLayer(data, n_class, n_shift, flip), + : DataLayer(data, n_class, n_shift, flip, last_class_cst), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, + bool last_class_cst, ThreadPool* threads) - : DataLayer(std::move(data), n_class, n_shift, flip), + : DataLayer(std::move(data), n_class, n_shift, flip, last_class_cst), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, + bool last_class_cst, ThreadPool* threads) - : DataLayer(data, model, flip), + : DataLayer(data, model, flip, last_class_cst), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! 
model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, + bool last_class_cst, ThreadPool* threads) - : DataLayer(std::move(data), std::move(model), flip), + : DataLayer(std::move(data), std::move(model), flip, last_class_cst), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::~ReadLayer() {} void ReadLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, std::ref(loglikelihood), std::ref(loglikelihood_max), promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // 
-------------------------- threads stop --------------------------- } } void ReadLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const { // normalize the models Matrix3D model_norm = this->model ; for(size_t i=0; in_class; i++) { double mean = 0. ; for(size_t j=0; jl_model; j++) { mean += model_norm(i,j,0) ; } mean /= this->l_model ; for(size_t j=0; jl_model; j++) { model_norm(i,j,0) /= mean ; } } // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s_fw=0, s_rev=this->n_shift-1; s_fwn_shift; s_fw++, s_rev--) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood double ll_fw = 0. ; double ll_rev = 0. ; // --------------- forward --------------- size_t from_dat_fw = s_fw ; size_t to_dat_fw = from_dat_fw + this->l_model - 1 ; // --------------- reverse --------------- size_t from_dat_rev = this->n_col - 1 - s_fw ; // size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; j_dat_fwdata(i,j_dat_fw), model_norm(j,j_ref_fw,0)* this->window_means(i,s_fw))) ; // ll_fw += ll ; // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf ll_fw += std::max(ll, ReadLayer::p_min_log) ; // --------------- reverse --------------- if(this->flip) { ll = log(poisson_pmf(this->data(i,j_dat_rev), model_norm(j,j_ref_fw,0)* this->window_means(i,s_rev))) ; // ll_rev += ll ; // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf ll_rev += std::max(ll, ReadLayer::p_min_log) ; } } loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ; // keep track of the max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } 
if(this->flip) { loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ; // keep track of the max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // computing sum over the columns (classes) size_t n_row = posterior_prob.get_dim()[0] ; size_t n_class = posterior_prob.get_dim()[1] ; size_t n_shift = posterior_prob.get_dim()[2] ; size_t n_flip = posterior_prob.get_dim()[3] ; vector_d colsum(n_class, 0.) ; for(size_t i=0; iupdate_model(posterior_prob, colsum, threads) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, posterior_prob_colsum, promise) ; - this->model = future.get() ; + // this->model = future.get() ; + auto model = future.get() ; + size_t n_class_to_update = this->n_class - this->last_class_cst ; + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = model(i,j,k) ; } + } + } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(posterior_prob_colsum), std::ref(promises[i])))) ; } // reinitialise the model + /* this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) 
; + */ + size_t n_class_to_update = this->n_class - this->last_class_cst ; + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = 0. ; } + } + } // wait until all threads are done working // and update the mode for(auto& future : futures) { Matrix3D model_part = future.get() ; - for(size_t i=0; in_class; i++) + // for(size_t i=0; in_class; i++) + for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // avoid 0's in the model to ensure that pmf_poisson() never // return 0 for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), ReadLayer::p_min) ; } } } } void ReadLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ; // partial model Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) 
; - for(size_t n_class=0; n_class < this->n_class; n_class++) + size_t n_class_to_update = this->n_class - this->last_class_cst ; + + for(size_t n_class=0; n_class < n_class_to_update; n_class++) { for(size_t i=from; in_shift; n_shift++) { // --------------- forward --------------- int from_dat_fw = n_shift ; int to_dat_fw = from_dat_fw + this->l_model - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::FORWARD) * this->data(i,j_dat_fw)) / posterior_prob_colsum[n_class] ; } // --------------- reverse --------------- if(this->flip) { int from_dat_rev = this->n_col - 1 - n_shift ; int to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::REVERSE) * this->data(i,j_dat_rev)) / posterior_prob_colsum[n_class] ; } } } } } promise.set_value(model) ; } void ReadLayer::compute_window_means(ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_window_means_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_window_means_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop 
--------------------------- } } void ReadLayer::compute_window_means_routine(size_t from, size_t to, std::promise& done) { double l_window = double(this->l_model) ; for(size_t i=from; in_shift; from++) { double sum = 0. ; // slice is [from,to) size_t to = from + this->l_model ; for(size_t j=from; jdata(i,j) ;} this->window_means(i,from) = sum / l_window ; } } done.set_value(true) ; } void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const { if(posterior_prob_colsum.size() != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_class_prob matrix size is not " "equal to model class number : %zu / %zu", posterior_prob_colsum.size(), this->n_class) ; throw std::invalid_argument(msg) ; } } diff --git a/src/Clustering/ReadLayer.hpp b/src/Clustering/ReadLayer.hpp index f5c12d7..d6deef6 100644 --- a/src/Clustering/ReadLayer.hpp +++ b/src/Clustering/ReadLayer.hpp @@ -1,244 +1,264 @@ #ifndef READLAYER_HPP #define READLAYER_HPP #include #include #include #include #include typedef std::vector vector_d ; class ReadLayer : public DataLayer { public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, + bool last_class_cst, ThreadPool* threads = nullptr) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. 
* \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, + bool last_class_cst, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, + bool last_class_cst, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. * \param flip whether flipping is allowed. + * \param last_class_cst indicates that the + * last class of the model is constant + * and should never be updated by calls to + * update_model(). * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, + bool last_class_cst, ThreadPool* threads = nullptr) ; /*! * Destructor */ virtual ~ReadLayer() override ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. 
* During this process, a normalized version of the * models, having a sum of signal of 1 count in average, * is used (a copy of the models is normalized, meaning * that the original models can still be retrieved the * dedicated getter). * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * This method does the same as the virtual method it * overloads. The only difference is that, for run time * gain, it is given the sum over the columns of the * posterior_prob matrix which is computed by the virtual * method. 
* \param posterior_prob the data assignment probabilities to * the different classes. * \param posterior_prob_colsum the sum over the columns * (classes) of the posterior_prob matrix. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads=nullptr) ; protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param promise a promise containing the partial model * computed from the given data slice. If several routines * work together to update the model, the promise matrices * need to be summed up to get the final model. 
*/ void update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const ; /*! * \brief Computes the mean number of reads present in * each slice (of length l_model), in each row * of the data and store them in this->window_means. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void compute_window_means(ThreadPool* threads) ; /*! * \brief The routine that effectively computes the * window means. * \param from the index of the first row of the * data to considered. * \param to the index of the past last row of the * data to considered. * \param done a promise to fill when the routine * is done running. */ void compute_window_means_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_class_prob a vector containing the * class probabilities. * It should have a length equal to the number of * classes. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const ; /*! * \brief contains the data means, for * each window of size l_model. 
*/ Matrix2D window_means ; } ; #endif // READLAYER_HPP diff --git a/src/Clustering/ReadModelComputer.cpp b/src/Clustering/ReadModelComputer.cpp index b429f0d..212af00 100644 --- a/src/Clustering/ReadModelComputer.cpp +++ b/src/Clustering/ReadModelComputer.cpp @@ -1,48 +1,49 @@ #include // std::move() #include #include #include #include #include #include ReadModelComputer::ReadModelComputer(Matrix2D&& data, const Matrix4D& post_prob, size_t n_threads) : ModelComputer(), threads(nullptr) { // parameters size_t n_class = post_prob.get_dim()[1] ; size_t n_shift = post_prob.get_dim()[2] ; size_t n_flip = post_prob.get_dim()[3] ; bool flip = n_flip == 2 ; // the threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } // the data and the model this->data_layer = new ReadLayer(std::move(data), n_class, n_shift, - flip) ; + flip, + false) ; this->data_layer->update_model(post_prob, this->threads) ; } ReadModelComputer::~ReadModelComputer() { // threads if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } // data and model if(this->data_layer != nullptr) { delete this->data_layer ; this->data_layer = nullptr ; } } diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp index fab5961..ae6aeaf 100644 --- a/src/Clustering/SequenceLayer.cpp +++ b/src/Clustering/SequenceLayer.cpp @@ -1,438 +1,411 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), pow() #include #include // std::max_element() #include // beta_pmf() #include // rand_real_uniform(), rand_int_uniform() #include #include #include #include double SequenceLayer::score_subseq(const Matrix2D& seq, size_t row, size_t start, const Matrix2D& model_log) { if(start > seq.get_ncol() - model_log.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! given start (%zu) is too high. 
Max value is %zu", start, seq.get_ncol() - model_log.get_nrow()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_nrow() > seq.get_ncol()) { char msg[4096] ; sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", model_log.get_nrow(), seq.get_ncol()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_ncol() != 4) { char msg[4096] ; sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)", model_log.get_ncol()) ; throw std::invalid_argument(msg) ; } size_t from = start ; size_t to = from + model_log.get_nrow() ; // will score [from, to) int n_code = dna::char_to_int('N') ; double ll = 0 ; for(size_t i=from, j=0; i get max score if(base == n_code) { std::vector row = model_log.get_row(j) ; ll += *(std::max_element(std::begin(row), std::end(row))) ; } // A,C,G,T -> get its score else { ll += model_log(j,base) ; } } return ll ; } SequenceLayer::SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool last_class_cst) - : DataLayer(data, n_class, n_shift, flip), - last_class_cst(last_class_cst) + : DataLayer(data, n_class, n_shift, flip,last_class_cst) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; } SequenceLayer::SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool last_class_cst) - : DataLayer(std::move(data), n_class, n_shift, flip), - last_class_cst(last_class_cst) + : DataLayer(std::move(data), n_class, n_shift, flip,last_class_cst) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) 
; } SequenceLayer::SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool last_class_cst) - : DataLayer(data, model,flip), - last_class_cst(last_class_cst) + : DataLayer(data, model,flip,last_class_cst) {} SequenceLayer::SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool last_class_cst) - : DataLayer(std::move(data), std::move(model),flip), - last_class_cst(last_class_cst) + : DataLayer(std::move(data), std::move(model),flip,last_class_cst) {} SequenceLayer::~SequenceLayer() {} void SequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // compute the log prob model and the log prob reverse-complement model std::vector> model_log(this->n_class, Matrix2D(this->l_model, this->n_category, 0.)) ; std::vector> model_log_rev = model_log ; /* Matrix3D model_log(this->n_class, this->l_model, this->n_category, 0.) 
; Matrix3D model_log_rev = model_log ; */ for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { // forward model_log[i](j,k) = log(this->model(i,j,k)) ; // reverse model_log_rev[i](this->l_model-j-1,this->n_category-k-1) = log(this->model(i,j,k)) ; } } } // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, loglikelihood, loglikelihood_max, model_log, model_log_rev, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(model_log), std::ref(model_log_rev), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void SequenceLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const { // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { // forward strand { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ; loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // keep track of max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } } // reverse if(this->flip) { double ll_rev 
= score_subseq(this->data, i, s, model_log_rev[j]) ; loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // keep track of max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void SequenceLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, promise) ; // this->model = future.get() ; auto model = future.get() ; size_t n_class_to_update = this->n_class - this->last_class_cst ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(promises[i])))) ; } // reinitialise the model /* this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; */ size_t n_class_to_update = this->n_class - this->last_class_cst ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. 
; } } } // wait until all threads are done working // and update the model for(auto& future : futures) { Matrix3D model_part = future.get() ; - for(size_t i=0; in_class; i++) + for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // make sure to have no 0 values for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), SequenceLayer::p_min) ; } } } // normalize to get probs for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { double sum = 0. ; for(size_t k=0; kn_category; k++) { sum += this->model(i,j,k) ; } for(size_t k=0; kn_category; k++) { double p = this->model(i,j,k) / sum ; this->model(i,j,k) = p ; /* this->model(i,j,k) = std::max(p, SequenceLayer::p_min) ; */ } } } } void SequenceLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) 
; // the int code of A, C, G, T, N static int a_code = dna::char_to_int('A') ; static int c_code = dna::char_to_int('C') ; static int g_code = dna::char_to_int('G') ; static int t_code = dna::char_to_int('T') ; static int n_code = dna::char_to_int('N') ; // the int code of the reverse complement of A, C, G, T static int a_code_r = dna::char_to_int('A', true) ; static int c_code_r = dna::char_to_int('C', true) ; static int g_code_r = dna::char_to_int('G', true) ; static int t_code_r = dna::char_to_int('T', true) ; size_t n_class_to_update = this->n_class - this->last_class_cst ; for(size_t k=0; k < n_class_to_update; k++) // for(size_t k=0; k < this->n_class; k++) { for(size_t s=0; sn_shift; s++) { for(size_t j=0; jl_model; j++) { // base prob on fw and rv strand vector_d base_prob_fw(this->n_category, 0.) ; vector_d base_prob_rv(this->n_category, 0.) ; for(size_t i=from; idata(i,s+j) ; int base_rev = this->n_category - base - 1 ; // N if(base == n_code) { // --------------- forward --------------- { base_prob_fw[a_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[c_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[g_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[t_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[a_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[c_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[g_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[t_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } // A, C, G, T else { { base_prob_fw[base] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[base_rev] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } } // update this position of the model for(size_t i=0,i_rev=base_prob_fw.size()-1; iflip) { 
model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } } } } } promise.set_value(model) ; } - -void SequenceLayer::set_class(size_t i, const Matrix2D& motif) -{ // check dimensions - if(motif.get_nrow() != this->n_category) - { char msg[4096] ; - sprintf(msg, "Error! the given class model is incompatible " - "with the SequenceLayer (%zu rows instead of %zu)", - motif.get_nrow(), this->n_category) ; - throw std::invalid_argument(msg) ; - } - else if(motif.get_ncol() != this->l_model) - { char msg[4096] ; - sprintf(msg, "Error! the given class model is incompatible " - "with the SequenceLayer (%zu columns instead of %zu)", - motif.get_ncol(), this->l_model) ; - throw std::invalid_argument(msg) ; - } - - for(size_t j=0; jmodel(i,j,k) = motif(k,j) ; } - } -} diff --git a/src/Clustering/SequenceLayer.hpp b/src/Clustering/SequenceLayer.hpp index fcf3628..1a1532b 100644 --- a/src/Clustering/SequenceLayer.hpp +++ b/src/Clustering/SequenceLayer.hpp @@ -1,250 +1,228 @@ #ifndef SEQUENCELAYER_HPP #define SEQUENCELAYER_HPP #include #include #include #include // std::promise, std::future #include #include #include #include typedef std::vector vector_d ; class SequenceLayer : public DataLayer { public: /*! * \brief Computes the log-likelihood of the sub- * sequence - stored in a given row - and starting * at the offset in the given sequence matrix. * The subsequence length is determined by the model * lenght. * \param seq the sequences in integer format. * \param row the row containing the sequence of * interest. * \param col the index at which the sub-sequence * is starting (1st index inside the subsequence * of interest). * \param model_log a model containing the log * probability model. * \return the log-likelihood of the sub-sequence * given the model. * \throw std::invalid_argument if 1) the offset is * invalid, 2) the sequence and the model have * incompatible dimensions or 3) the model 2n dimension * is not 4 (A,C,G,T). 
*/ static double score_subseq(const Matrix2D& seq, size_t row, size_t col, const Matrix2D& model_log) ; public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param last_class_cst indicates that the * last class of the model is constant - * and will never be updated by calls to + * and should never be updated by calls to * update_model(). */ SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool last_class_cst) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param last_class_cst indicates that the * last class of the model is constant - * and will never be updated by calls to + * and should never be updated by calls to * update_model(). */ SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool last_class_cst) ; /*! * \brief Construct an object with the * given data and model. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. * \param model the model with the following * dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. * \param last_class_cst indicates that the * last class of the model is constant - * and will never be updated by calls to + * and should never be updated by calls to * update_model(). */ SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool last_class_cst) ; /*! 
* \brief Construct an object with the * given data and model. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. * \param model the model with the following * dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. * \param last_class_cst indicates that the * last class of the model is constant - * and will never be updated by calls to + * and should never be updated by calls to * update_model(). */ SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool last_class_cst) ; /*! * Destructor */ virtual ~SequenceLayer() override ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; - /*! - * \brief Modify the values of th given class - * with the given parameters. - * The given motif should have the same length - * as the current model classes. 
- * \param i the index of the class to modify, 0-based. - * \param motif the new parameters values. - * Its dimensions should be : - * 1st : 4 for A,C,G,T - * 2nd : the model length. - * \throw std::invalid_argument if the dimensions are not - * compatible with the current model classes. - */ - void set_class(size_t i, const Matrix2D& motif) ; - protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param model_log a vector containing the matrices with * the log values of the model for each class. * \param model_log_rev a vector containing the matrices with * the log values of the reverse strand model for each class * (the 1st position in the model becomes the last in the * reverse model and probabilities are swapped A<->T and C<->G). * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. 
* \param * \param done a promise containing the partial model * computed from the given data slice. If several routines * work together at updating the model, they need to be * summed and normalized (by the column sum) to get the * final model. */ void update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& done) const ; - /*! - * \brief A flag indicating that the last class of the model - * is constant and should not be updated when calling - * update_model(). - */ - bool last_class_cst ; - } ; #endif // SEQUENCELAYER_HPP