diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R index 9d94afc..d33a69e 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R @@ -1,83 +1,83 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(motifStack) library(TFBSTools) library(MotifDb) # functions source(file.path("scripts", "functions.R")) get_pfm_list = function(motifs, prefix_name) { pfm_list = list() for(i in 1:dim(motifs)[3]) { pfm_list[[i]] = new("pfm", mat=motifs[,,i], name=sprintf("%s %d", prefix_name, i)) } return(pfm_list) } # number of classes searched in the data -n_classes = c(10,30) +n_classes = c(10,20,30) for(n_class in n_classes) { # get MEME motifs files_meme = list.files(file.path("results", "10xgenomics_PBMC_5k_peaks_meme", sprintf("%d_motifs", n_class)), "mat", full.names=T) motifs_meme = lapply(files_meme, read.table) motifs_meme = lapply(motifs_meme, as.matrix) motifs_meme = lapply(motifs_meme, t) tmp = list() for(i in 1:length(motifs_meme)) { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") tmp[[i]] = new("pfm", mat=motifs_meme[[i]], name=sprintf("MEME %d", i)) } motifs_meme = tmp rm(tmp) # get EM discovered motifs motifs_found = get_pfm_list(read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_1", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, "EM") # colors red = brewer.pal(3, "Set1")[1] blue = brewer.pal(3, "Set1")[2] color = c(rep(blue, length(motifs_meme)), rep(red, length(motifs_found))) # draw tree # X11(height=12, width=12) png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_1", sprintf("%dclass_motif_tree.png", n_class)), units="in", res=720, width=14, height=14) motifStack(c(motifs_meme, motifs_found), layout="radialPhylog", circle=0.3, cleaves = 0.2, clabel.leaves = 0.5, col.bg=color, col.bg.alpha=0.3, col.leaves=color, col.inner.label.circle=color, inner.label.circle.width=0.05, col.outer.label.circle=color, outer.label.circle.width=0.02, circle.motif=.5, angle=350) dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R index f49336b..ba033cc 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R @@ -1,83 +1,83 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(motifStack) library(TFBSTools) library(MotifDb) # functions source(file.path("scripts", "functions.R")) get_pfm_list = function(motifs, prefix_name) { pfm_list = list() for(i in 1:dim(motifs)[3]) { pfm_list[[i]] = new("pfm", mat=motifs[,,i], name=sprintf("%s %d", prefix_name, i)) } return(pfm_list) } # number of classes searched in the data -n_classes = c(10,30) +n_classes = c(10,20,30) for(n_class in n_classes) { # get MEME motifs files_meme = list.files(file.path("results", "10xgenomics_PBMC_5k_peaks_meme", sprintf("%d_motifs", n_class)), "mat", full.names=T) motifs_meme = lapply(files_meme, read.table) motifs_meme = lapply(motifs_meme, as.matrix) motifs_meme = lapply(motifs_meme, t) tmp = list() for(i in 1:length(motifs_meme)) { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") tmp[[i]] = new("pfm", mat=motifs_meme[[i]], name=sprintf("MEME %d", i)) } motifs_meme = tmp rm(tmp) # get EM 
discovered motifs motifs_found = get_pfm_list(read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_2", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, "EM") # colors red = brewer.pal(3, "Set1")[1] blue = brewer.pal(3, "Set1")[2] color = c(rep(blue, length(motifs_meme)), rep(red, length(motifs_found))) # draw tree # X11(height=12, width=12) png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_2", sprintf("%dclass_motif_tree.png", n_class)), units="in", res=720, width=14, height=14) motifStack(c(motifs_meme, motifs_found), layout="radialPhylog", circle=0.3, cleaves = 0.2, clabel.leaves = 0.5, col.bg=color, col.bg.alpha=0.3, col.leaves=color, col.inner.label.circle=color, inner.label.circle.width=0.05, col.outer.label.circle=color, outer.label.circle.width=0.02, circle.motif=.5, angle=350) dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R index 27dcef6..1a559dd 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R @@ -1,83 +1,83 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(motifStack) library(TFBSTools) library(MotifDb) # functions source(file.path("scripts", "functions.R")) get_pfm_list = function(motifs, prefix_name) { pfm_list = list() for(i in 1:dim(motifs)[3]) { pfm_list[[i]] = new("pfm", mat=motifs[,,i], name=sprintf("%s %d", prefix_name, i)) } return(pfm_list) } # number of classes searched in the data -n_classes = c(10,30) +n_classes = c(10,20,30) for(n_class in n_classes) { # get MEME motifs files_meme = list.files(file.path("results", "10xgenomics_PBMC_5k_peaks_meme", sprintf("%d_motifs", n_class)), "mat", full.names=T) motifs_meme = lapply(files_meme, read.table) motifs_meme = lapply(motifs_meme, as.matrix) motifs_meme = lapply(motifs_meme, t) tmp = list() for(i in 1:length(motifs_meme)) { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") tmp[[i]] = new("pfm", mat=motifs_meme[[i]], name=sprintf("MEME %d", i)) } motifs_meme = tmp rm(tmp) # get EM discovered motifs motifs_found = get_pfm_list(read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_3", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, "EM") # colors red = brewer.pal(3, "Set1")[1] blue = brewer.pal(3, "Set1")[2] color = c(rep(blue, length(motifs_meme)), rep(red, length(motifs_found))) # draw tree # X11(height=12, width=12) png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_3", sprintf("%dclass_motif_tree.png", n_class)), units="in", res=720, width=14, height=14) motifStack(c(motifs_meme, motifs_found), layout="radialPhylog", circle=0.3, cleaves = 0.2, clabel.leaves = 0.5, col.bg=color, col.bg.alpha=0.3, col.leaves=color, col.inner.label.circle=color, inner.label.circle.width=0.05, col.outer.label.circle=color, outer.label.circle.width=0.02, circle.motif=.5, angle=350) dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R index 7a1904d..7aacc18 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R @@ -1,83 +1,83 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) 
library(motifStack) library(TFBSTools) library(MotifDb) # functions source(file.path("scripts", "functions.R")) get_pfm_list = function(motifs, prefix_name) { pfm_list = list() for(i in 1:dim(motifs)[3]) { pfm_list[[i]] = new("pfm", mat=motifs[,,i], name=sprintf("%s %d", prefix_name, i)) } return(pfm_list) } # number of classes searched in the data -n_classes = c(30) +n_classes = c(10,20,30) for(n_class in n_classes) { # get MEME motifs files_meme = list.files(file.path("results", "10xgenomics_PBMC_5k_peaks_meme", sprintf("%d_motifs", n_class)), "mat", full.names=T) motifs_meme = lapply(files_meme, read.table) motifs_meme = lapply(motifs_meme, as.matrix) motifs_meme = lapply(motifs_meme, t) tmp = list() for(i in 1:length(motifs_meme)) { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") tmp[[i]] = new("pfm", mat=motifs_meme[[i]], name=sprintf("MEME %d", i)) } motifs_meme = tmp rm(tmp) # get EM discovered motifs motifs_found = get_pfm_list(read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_5", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, "EM") # colors red = brewer.pal(3, "Set1")[1] blue = brewer.pal(3, "Set1")[2] color = c(rep(blue, length(motifs_meme)), rep(red, length(motifs_found))) # draw tree # X11(height=12, width=12) png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_5", sprintf("%dclass_motif_tree.png", n_class)), units="in", res=720, width=14, height=14) motifStack(c(motifs_meme, motifs_found), layout="radialPhylog", circle=0.3, cleaves = 0.2, clabel.leaves = 0.5, col.bg=color, col.bg.alpha=0.3, col.leaves=color, col.inner.label.circle=color, inner.label.circle.width=0.05, col.outer.label.circle=color, outer.label.circle.width=0.02, circle.motif=.5, angle=350) dev.off() } diff --git a/src/Applications/ClassCorrelationMatrixCreatorApplication.cpp b/src/Applications/ClassReadDataCreatorApplication.cpp similarity index 64% rename from src/Applications/ClassCorrelationMatrixCreatorApplication.cpp rename to src/Applications/ClassReadDataCreatorApplication.cpp index ad233ae..ce04d46 100644 --- a/src/Applications/ClassCorrelationMatrixCreatorApplication.cpp +++ b/src/Applications/ClassReadDataCreatorApplication.cpp @@ -1,296 +1,215 @@ -#include -#include +#include +#include #include #include #include #include #include // std::invalid_argument #include namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; -ClassCorrelationMatrixCreatorApplication::ClassCorrelationMatrixCreatorApplication(int argn, char** argv) +ClassReadDataCreatorApplication::ClassReadDataCreatorApplication(int argn, char** argv) : file_bed(""), file_bam(""), file_bai(""), file_prob(""), from(0), to(0), bin_size(0), class_k(0), method(CorrelationMatrixCreator::FRAGMENT), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } -int ClassCorrelationMatrixCreatorApplication::run() -{ if(this->runnable) - { - // load posterior prob - Matrix4D prob(this->file_prob) ; - // prob.load(this->file_prob) ; - size_t n_row = prob.get_dim()[0] ; - size_t n_class = prob.get_dim()[1] ; - size_t n_shift = prob.get_dim()[2] ; - size_t n_flip = prob.get_dim()[3] ; - bool flip = n_flip == 2 ; - - // compute class prob - std::vector prob_colsum(n_class, 0.) ; - double tot = 0. 
; - for(size_t i=0; i 0-based) - this->class_k -= 1 ; - if(this->class_k > n_class) - { char msg[4096] ; - sprintf(msg, "Error! invalid class of interest (%zu and %zu found)", - this->class_k, n_class) ; - throw std::invalid_argument(msg) ; - } - - // extend limits - int ext = n_shift - 1 ; - int ext_r = ext / 2 ; - int ext_l = ext - ext_r ; - int from2 = this->from - ext_l ; - int to2 = this->to + ext_r ; - CorrelationMatrixCreator mc(this->file_bed, - this->file_bam, - this->file_bai, - from2, - to2, - this->bin_size, - this->method) ; - Matrix2D data2 = mc.create_matrix() ; - size_t n_col2 = data2.get_ncol() ; - - // realign matrix - size_t n_col3 = this->to - this->from + 1 ; - Matrix2D data3(n_row, - n_col3, - 0.) ; - for(size_t i=0; i= to_dat2_rev; - j_dat2_rev--, j_dat3_fw++) - { data3(i,j_dat3_fw) += - (prob(i,class_k,s,1) * - data2(i,j_dat2_rev)) / - prob_colsum[class_k] ; - } - } - } - } - // clean memory - prob = Matrix4D() ; - - // convert to integer - Matrix2D data4(data3.get_nrow(), - data3.get_ncol()) ; - for(size_t i=0; i< data4.get_nrow(); i++) - { for(size_t j=0; jrunnable) + { std::cerr << "ClassReadDataCreatorApplication::run() run" << std::endl ; + ClassReadDataCreator crc(this->file_bed, + this->file_bam, + this->file_bai, + this->file_prob, + this->from, + this->to, + this->bin_size, + this->class_k, + this->method) ; + + // display integer matrix for given class + std::cout << crc.create_matrix() << std::endl ; + std::cerr << "ClassReadDataCreatorApplication::run() END" << std::endl ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } -void ClassCorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv) +void ClassReadDataCreatorApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" - "ClassCorrelationMatrixCreator is an application that retro-compute\n" + "ClassReadDataCreator is an application that retro-compute\n" "a fictive count matrix of a given class K, from a partition, assuming\n" "that the classification is true.\n" "To construct the final matrix M3, the original matrix M1 that was\n" "partitioned is computed and extended into a matrix M2. The original\n" "matrix M1 has L1 column and was partitioned allowing S shifting freedom.\n" "A number of extra columns E, overall, is added to the matrix M1 (~E/2 on\n" "each side). The final matrix M3, of L1 columns, is then computed. A row\n" "of the final matrix M3 is the weighted average of each of the S possibles\n" "slices of the corresponding row in M2. The weight used are th\n" "probabilities with which this row was assigned to class K, for each of\n" - "the S slices." ; + "the S slices.\n" + "Thus, the final result is a 2D matrix of dimensions N,L where N is the\n" + "number of regions and L the number of bins in which the regions have\n" + "divided\n." + "The resulting 2D matrix is returned through stdout in text format." ; std::string opt_help_msg = "Produces this help message." ; std::string opt_bed_msg = "The path to the BED file containing the references."; std::string opt_bam_msg = "The path to the BAM file containing the targets."; std::string opt_bai_msg = "The path to the BAI file containing the BAM file index."; std::string opt_prob_msg = "The path to the file containing the assignment probabilities\n" "(the partition)." 
; std::string opt_from_msg = "The most upstream position - in relative coordinate - of the regions\n" "in the original matrix." ; std::string opt_to_msg = "The most downstream position - in relative coordinate - of the regions\n" "in the original matrix." ; std::string opt_classk_msg = "The index (1-based) of the class of interest." ; std::string opt_binsize_msg = "The size of the bins, in the original matrix." ; char tmp[4096] ; sprintf(tmp, "How the data in the BAM file should be handled when computing\n" "the number of counts in each bin.\n" "\t\"%s\" uses each position within the reads (by default)\n" "\t\"%s\" uses only the insertion site for ATAC-seq data\n" "\t\"%s\" uses each position within the fragments\n" "\t\"%s\" uses only the fragment central positions\n", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; std::string opt_method_msg = tmp ; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string method(method_read) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) - ("prob", po::value(&(this->file_prob)), opt_bai_msg.c_str()) + ("prob", po::value(&(this->file_prob)), opt_prob_msg.c_str()) ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) ("k", po::value(&(this->class_k)), opt_classk_msg.c_str()) ("method", po::value(&(method)), opt_method_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No probability (partition) file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->class_k == 0 and (not help)) { std::string msg("Error! no class of interest was given (--k)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! 
method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) -{ ClassCorrelationMatrixCreatorApplication app(argn, argv) ; +{ ClassReadDataCreatorApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ClassCorrelationMatrixCreatorApplication.hpp b/src/Applications/ClassReadDataCreatorApplication.hpp similarity index 80% copy from src/Applications/ClassCorrelationMatrixCreatorApplication.hpp copy to src/Applications/ClassReadDataCreatorApplication.hpp index edaea0b..e54652a 100644 --- a/src/Applications/ClassCorrelationMatrixCreatorApplication.hpp +++ b/src/Applications/ClassReadDataCreatorApplication.hpp @@ -1,99 +1,104 @@ -#ifndef CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP -#define CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP +#ifndef CLASSREADDATACREATORAPPLICATION_HPP +#define CLASSREADDATACREATORAPPLICATION_HPP #include #include #include // CorrelationMatrixCreator::methods -class ClassCorrelationMatrixCreatorApplication : public ApplicationInterface +/*! + * \brief The ClassReadDataCreatorApplication is a wrapper + * around a ClassReadDataCreator that creates a matrix representing + * the data from a given class of a given partition. + */ +class ClassReadDataCreatorApplication : public ApplicationInterface { public: - ClassCorrelationMatrixCreatorApplication() = delete ; - ClassCorrelationMatrixCreatorApplication( - const ClassCorrelationMatrixCreatorApplication& app) = delete ; + ClassReadDataCreatorApplication() = delete ; + ClassReadDataCreatorApplication( + const ClassReadDataCreatorApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - ClassCorrelationMatrixCreatorApplication(int argn, char** argv) ; + ClassReadDataCreatorApplication(int argn, char** argv) ; /*! * \brief TOFO * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the bam file. */ std::string file_bam ; /*! * \brief the path to the bam index file. */ std::string file_bai ; /*! * \brief the path to the posterior probability * file (the partition). 
*/ std::string file_prob ; /*! * \brief the coordinate of the most upstream * position that was in the original matrix, in * relative coordinate. */ int from ; /*! * \brief the coordinate of the most downstream * position that was in the original matrix, in * relative coordinate. */ int to ; /*! * \brief the size of the bin that was used for * the original matrix. */ int bin_size ; /*! * \brief the class of interest (0-based). */ size_t class_k ; /*! * \brief How to consider the sequenced fragments when computing * the bin values. */ CorrelationMatrixCreator::methods method ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; -#endif // CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP +#endif // CLASSREADDATACREATORAPPLICATION_HPP diff --git a/src/Applications/ClassSequenceDataCreatorApplication.cpp b/src/Applications/ClassSequenceDataCreatorApplication.cpp new file mode 100644 index 0000000..495284f --- /dev/null +++ b/src/Applications/ClassSequenceDataCreatorApplication.cpp @@ -0,0 +1,164 @@ + +#include +#include + +#include +#include +#include +#include +#include // std::invalid_argument + +#include + +namespace po = boost::program_options ; + +ClassSequenceDataCreatorApplication::ClassSequenceDataCreatorApplication(int argn, char** argv) + : file_bed(""), file_fasta(""), file_prob(""), file_out(""), + from(0), to(0), class_k(0), + runnable(true) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} + +int ClassSequenceDataCreatorApplication::run() +{ if(this->runnable) + { + ClassSequenceDataCreator csc(this->file_bed, + this->file_fasta, + this->file_prob, + this->from, + this->to, + this->class_k) ; + + // display integer matrix for given class + csc.create_matrix().save(this->file_out) ; + + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE ; } +} + +void ClassSequenceDataCreatorApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "ClassSequenceDataCreator is an application that retro-compute\n" + "a fictive sequence matrix of a given class K, from a partition, assuming\n" + "that the classification is true.\n" + "To construct the final matrix M3, the original matrix M1 that was\n" + "partitioned is computed and extended into a matrix M2. The original\n" + "matrix M1 has L1 column and was partitioned allowing S shifting freedom.\n" + "A number of extra columns E, overall, is added to the matrix M1 (~E/2 on\n" + "each side). The final matrix M3, of L1 columns, is then computed. A row\n" + "of the final matrix M3 is the weighted average of each of the S possibles\n" + "slices of the corresponding row in M2. The weight used are the\n" + "probabilities with which this row was assigned to class K, for each of\n" + "the S slices. Because sequences cannot be averaged (for instance\n" + "0.5*AAA + 0.5*CCC), the final result is a collection of N consensus\n" + "sequences represented as a sequence probability matrix where N is\n" + "the number of original sequences in M1 (also in M2).\n" + "Thus the final result is a 3D matrix of dimensions N,L,4 where N\n" + "is the number of consensus sequences, L the length of the consensus\n" + "sequences and 4 for A,C,G,T.\n" + "The resulting 3D matrix is written to a given file in binary format." ; + std::string opt_help_msg = "Produces this help message." 
; + std::string opt_bed_msg = "The path to the BED file containing the references."; + std::string opt_fasta_msg = "The path to the fasta file containing the sequences."; + std::string opt_prob_msg = "The path to the file containing the assignment probabilities\n" + "(the partition)." ; + std::string opt_out_msg = "The path to the file in which the results will be written (3D matrix\n" + "in binary format)." ; + std::string opt_from_msg = "The most upstream position - in relative coordinate - of the regions\n" + "in the original matrix." ; + std::string opt_to_msg = "The most downstream position - in relative coordinate - of the regions\n" + "in the original matrix." ; + std::string opt_classk_msg = "The index (1-based) of the class of interest." ; + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) + ("fasta", po::value(&(this->file_fasta)), opt_fasta_msg.c_str()) + ("prob", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + + ("out", po::value(&(this->file_out)), opt_out_msg.c_str()) + + ("from", po::value(&(this->from)), opt_from_msg.c_str()) + ("to", po::value(&(this->to)), opt_to_msg.c_str()) + ("k", po::value(&(this->class_k)), opt_classk_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occured while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks unproper option settings + if(this->file_bed == "" and (not help)) + { std::string msg("Error! No BED file was given (--bed)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_fasta == "" and (not help)) + { std::string msg("Error! No fasta file was given (--fasta)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_prob == "" and (not help)) + { std::string msg("Error! No probability (partition) file was given (--prob)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_out == "" and (not help)) + { std::string msg("Error! No output file was given (--out)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from == 0 and this->to == 0 and (not help)) + { std::string msg("Error! No range given (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from >= this->to and (not help)) + { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->class_k == 0 and (not help)) + { std::string msg("Error! 
no class of interest was given (--k)!") ; + throw std::invalid_argument(msg) ; + } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + + +int main(int argn, char** argv) +{ ClassSequenceDataCreatorApplication app(argn, argv) ; + return app.run() ; +} + diff --git a/src/Applications/ClassCorrelationMatrixCreatorApplication.hpp b/src/Applications/ClassSequenceDataCreatorApplication.hpp similarity index 65% rename from src/Applications/ClassCorrelationMatrixCreatorApplication.hpp rename to src/Applications/ClassSequenceDataCreatorApplication.hpp index edaea0b..3277169 100644 --- a/src/Applications/ClassCorrelationMatrixCreatorApplication.hpp +++ b/src/Applications/ClassSequenceDataCreatorApplication.hpp @@ -1,99 +1,95 @@ -#ifndef CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP -#define CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP +#ifndef CLASSSEQUENCEDATACREATORAPPLICATION_HPP +#define CLASSSEQUENCEDATACREATORAPPLICATION_HPP #include #include -#include // CorrelationMatrixCreator::methods +#include // SequenceMatrixCreator::methods -class ClassCorrelationMatrixCreatorApplication : public ApplicationInterface +/*! + * \brief The ClassSequenceDataCreatorApplication is a wrapper + * around a ClassSequenceDataCreator that creates a matrix representing + * the data from a given class from a given partition. + */ +class ClassSequenceDataCreatorApplication : public ApplicationInterface { public: - ClassCorrelationMatrixCreatorApplication() = delete ; - ClassCorrelationMatrixCreatorApplication( - const ClassCorrelationMatrixCreatorApplication& app) = delete ; + ClassSequenceDataCreatorApplication() = delete ; + ClassSequenceDataCreatorApplication( + const ClassSequenceDataCreatorApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - ClassCorrelationMatrixCreatorApplication(int argn, char** argv) ; + ClassSequenceDataCreatorApplication(int argn, char** argv) ; /*! - * \brief TOFO + * \brief TODO * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! - * \brief the path to the bam file. + * \brief the path to the fasta file. */ - std::string file_bam ; - /*! - * \brief the path to the bam index file. - */ - std::string file_bai ; + std::string file_fasta ; /*! * \brief the path to the posterior probability * file (the partition). */ std::string file_prob ; + /*! + * \brief the path to the file in which the + * results will be written. + */ + std::string file_out ; /*! * \brief the coordinate of the most upstream * position that was in the original matrix, in * relative coordinate. */ int from ; /*! 
* \brief the coordinate of the most downstream * position that was in the original matrix, in * relative coordinate. */ int to ; - /*! - * \brief the size of the bin that was used for - * the original matrix. - */ - int bin_size ; /*! * \brief the class of interest (0-based). */ size_t class_k ; - /*! - * \brief How to consider the sequenced fragments when computing - * the bin values. - */ - CorrelationMatrixCreator::methods method ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; -#endif // CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP +#endif // CLASSSEQUENCEDATACREATORAPPLICATION_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4b0d893..294e25c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,128 +1,141 @@ # compiler options add_compile_options(-std=c++14) add_compile_options(-O3) add_compile_options(-Wall) add_compile_options(-Wextra) add_compile_options(-Werror) add_compile_options(-Wfatal-errors) add_compile_options(-pedantic) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") add_definitions (${SEQAN_DEFINITIONS}) # include file location include_directories(${Boost_INCLUDE_DIRS}) include_directories(${SEQAN_INCLUDE_DIRS}) include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/Clustering") include_directories("${scATACseq_SOURCE_DIR}/src/Random") include_directories("${scATACseq_SOURCE_DIR}/src/Parallel") include_directories("${scATACseq_SOURCE_DIR}/src/Statistics") include_directories("${scATACseq_SOURCE_DIR}/src/GUI") include_directories("${scATACseq_SOURCE_DIR}/src/Applications") include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools") include_directories("${scATACseq_SOURCE_DIR}/src/Utility") # compile modules into static libraries ## set output directory set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib") ## build instructions add_library(Clustering "Clustering/DataLayer.cpp" + "Clustering/Data2DLayer.cpp" + "Clustering/Data3DLayer.cpp" "Clustering/ReadLayer.cpp" "Clustering/SequenceLayer.cpp" + "Clustering/ConsensusSequenceLayer.cpp" "Clustering/ModelComputer.cpp" "Clustering/ReadModelComputer.cpp" "Clustering/SequenceModelComputer.cpp" "Clustering/EMBase.cpp" "Clustering/EMRead.cpp" "Clustering/EMSequence.cpp" + "Clustering/EMConsensusSequence.cpp" "Clustering/EMJoint.cpp") add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp") add_library(Parallel "Parallel/ThreadPool.cpp") add_library(Statistics "Statistics/Statistics.cpp") add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp") add_library(GenomicTools "GenomicTools/MatrixCreator.cpp" "GenomicTools/ReadMatrixCreator.cpp" "GenomicTools/CorrelationMatrixCreator.cpp" + "GenomicTools/ClassReadDataCreator.cpp" + "GenomicTools/ClassSequenceDataCreator.cpp" "GenomicTools/SequenceMatrixCreator.cpp" "GenomicTools/GenomeRegion.cpp") add_library(Utility "Utility/matrices.cpp" "Utility/dna_utility.cpp") ## resolve dependencies target_link_libraries(Utility ${SEQAN_LIBRARIES}) target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES}) target_link_libraries(Parallel Threads::Threads) target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES}) # executables ## a toy for seqan set(EXE_MAIN_TEST "main_test") add_executable(${EXE_MAIN_TEST} "main_test.cpp") target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering) 
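The help texts of ClassReadDataCreator and ClassSequenceDataCreator above describe the per-class retro-computation: each row of the final matrix M3 is the average of the S possible slices of the corresponding row of the extended matrix M2, weighted by the posterior probabilities with which that row was assigned to class K at each shift state. A minimal, self-contained sketch of this weighted averaging for a single row on the forward strand follows; all names (realign_row, row2, prob_ks) are illustrative only, and the normalisation is simplified to the row's own class mass, whereas the applications normalise over the whole partition.

// Sketch of the slice-weighted "retro-computation" described in the
// ClassReadDataCreator / ClassSequenceDataCreator help messages.
// Names are hypothetical; only the forward strand is handled.
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// row2    : one row of the extended matrix M2 (length L1 + S - 1)
// prob_ks : posterior probabilities P(class K, shift s) for this row (length S)
// returns : the corresponding row of M3 (length L1), the average of the S
//           slices of row2 weighted by prob_ks (normalised by their sum)
std::vector<double> realign_row(const std::vector<double>& row2,
                                const std::vector<double>& prob_ks)
{   std::size_t n_shift = prob_ks.size() ;
    std::size_t l1      = row2.size() - n_shift + 1 ;
    double norm = std::accumulate(prob_ks.begin(), prob_ks.end(), 0.) ;

    std::vector<double> row3(l1, 0.) ;
    for(std::size_t s=0; s<n_shift; s++)                  // each shift state
    {   for(std::size_t j=0; j<l1; j++)                   // each column of M3
        {   row3[j] += prob_ks[s] * row2[s+j] / norm ; }  // weighted slice
    }
    return row3 ;
}

int main()
{   // toy example : L1 = 4 columns, S = 3 shift states
    std::vector<double> row2 = {1., 2., 3., 4., 5., 6.} ;
    std::vector<double> prob = {0.2, 0.5, 0.3} ;          // P(K, s) for s = 0..2
    for(double v : realign_row(row2, prob))
    {   std::cout << v << " " ; }
    std::cout << std::endl ;
    return 0 ;
}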
set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a toy for correlation matrix set(EXE_MAIN_CORMAT "main_cormat") add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp") target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random) set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a matrix from BED and a BAM file set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator") add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a sequence matrix from BED and a fasta file set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator") add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMRead standalone set(EXE_EMREAD "EMRead") add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMREAD} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMSequence standalone set(EXE_EMSEQ "EMSequence") add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMJoint standalone set(EXE_EMJOINT "EMJoint") add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to compute data models from the data and the post prob of an EM classification set(EXE_PROB2REF "ProbToModel") add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options) set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_READMODELEXTENDER "ReadModelExtender") add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender") add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering 
GenomicTools Utility Boost::program_options) set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") -## an executable to extend read models from an EM classification -set(EXE_CLASSCORRELATIONMATRIXCREATOR "ClassCorrelationMatrixCreator") -add_executable(${EXE_CLASSCORRELATIONMATRIXCREATOR} "Applications/ClassCorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") -target_link_libraries(${EXE_CLASSCORRELATIONMATRIXCREATOR} Clustering GenomicTools Utility Boost::program_options) -set_target_properties(${EXE_CLASSCORRELATIONMATRIXCREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an executable to extract read data from an EM classification +set(EXE_CLASSREADDATACREATOR "ClassReadDataCreator") +add_executable(${EXE_CLASSREADDATACREATOR} "Applications/ClassReadDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_CLASSREADDATACREATOR} Clustering GenomicTools Utility Boost::program_options) +set_target_properties(${EXE_CLASSREADDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") + +## an executable to extract sequence data from an EM classification +set(EXE_CLASSSEQDATACREATOR "ClassSequenceDataCreator") +add_executable(${EXE_CLASSSEQDATACREATOR} "Applications/ClassSequenceDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_CLASSSEQDATACREATOR} Clustering GenomicTools Utility Boost::program_options) +set_target_properties(${EXE_CLASSSEQDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") + ## a test suite set(EXE_TESTS "unittests") add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp" "Unittests/unittests_genomictools.cpp") target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools) set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp index e69de29..26c8e18 100644 --- a/src/Clustering/ConsensusSequenceLayer.cpp +++ b/src/Clustering/ConsensusSequenceLayer.cpp @@ -0,0 +1,369 @@ +#include + +#include +#include // std::promise, std::future +#include // std::invalid_argument +#include // log() +#include // std::bind(), std::ref() + +#include +#include +#include +#include +#include // dna::base_composition() + +double ConsensusSequenceLayer::score_consensus_subseq(const Matrix3D& cons_seq, + size_t row, + size_t start, + const Matrix2D& model_log) +{ size_t ncol = cons_seq.get_dim()[1] ; + size_t dim3 = cons_seq.get_dim()[2] ; + + if(start > ncol - model_log.get_nrow()) + { char msg[4096] ; + sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu", + start, ncol - model_log.get_nrow()) ; + throw std::invalid_argument(msg) ; + } + else if(model_log.get_nrow() > ncol) + { char msg[4096] ; + sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", + model_log.get_nrow(), ncol) ; + throw std::invalid_argument(msg) ; + } + else if(model_log.get_ncol() != 4) + { char msg[4096] ; + sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)", + model_log.get_ncol()) ; + throw std::invalid_argument(msg) ; + } + else if(dim3 != 4) + { char msg[4096] ; + sprintf(msg, "Error! 
given data 3rd dimension is not 4 (%zu)", + dim3) ; + throw std::invalid_argument(msg) ; + } + + size_t from = start ; + size_t to = from + model_log.get_nrow() ; // will score [from, to) + + double ll = 0 ; + for(size_t i=from, j=0; i& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) + : Data3DLayer(data, n_class, n_shift, flip,bckg_class) +{ + this->n_category = 4 ; + + // initialise the empty model + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0.) ; + // background class + if(this->bckg_class) + { this->create_bckg_class() ; } +} + +ConsensusSequenceLayer::ConsensusSequenceLayer(Matrix3D&& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) + : Data3DLayer(std::move(data), n_class, n_shift, flip, bckg_class) +{ + this->n_category = 4 ; + + // initialise the empty model + this->model = Matrix3D(this->n_class, + this->l_model, + this->n_category, + 0.) ; + + // background class + if(this->bckg_class) + { this->create_bckg_class() ; } +} + +ConsensusSequenceLayer::~ConsensusSequenceLayer() +{ ; } + +void ConsensusSequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, + vector_d& loglikelihood_max, + ThreadPool* threads) const +{ + // dimension checks + this->check_loglikelihood_dim(loglikelihood) ; + this->check_loglikelihood_max_dim(loglikelihood_max) ; + + // compute the log prob model and the log prob reverse-complement model + std::vector> model_log(this->n_class, + Matrix2D(this->l_model, + this->n_category, + 0.)) ; + std::vector> model_log_rev = model_log ; + for(size_t i=0; in_class; i++) + { for(size_t j=0; jl_model; j++) + { for(size_t k=0; kn_category; k++) + { // forward + model_log[i](j,k) = log(this->model(i,j,k)) ; + // reverse + model_log_rev[i](this->l_model-j-1,this->n_category-k-1) + = log(this->model(i,j,k)) ; + } + } + } + + // don't parallelize + if(threads == nullptr) + { std::promise promise ; + std::future future = promise.get_future() ; + this->compute_loglikelihoods_routine(0, this->n_dim1, + loglikelihood, + loglikelihood_max, + model_log, + model_log_rev, + promise) ; + future.get() ; + } + // parallelize + else + { size_t n_threads = threads->getNThread() ; + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_dim1, n_threads) ; + + // get promises and futures + // the function run by the threads will simply fill the promise with + // "true" to indicate that they are done + std::vector> promises(n_threads) ; + std::vector> futures(n_threads) ; + for(size_t i=0; iaddJob(std::move( + std::bind(&ConsensusSequenceLayer::compute_loglikelihoods_routine, + this, + slice.first, + slice.second, + std::ref(loglikelihood), + std::ref(loglikelihood_max), + std::ref(model_log), + std::ref(model_log_rev), + std::ref(promises[i])))) ; + } + // wait until all threads are done working + for(auto& future : futures) + { future.get() ; } + // -------------------------- threads stop --------------------------- + } +} + +void ConsensusSequenceLayer::compute_loglikelihoods_routine(size_t from, + size_t to, + Matrix4D& loglikelihood, + vector_d& loglikelihood_max, + const std::vector>& model_log, + const std::vector>& model_log_rev, + std::promise& done) const +{ + // compute log likelihood + for(size_t i=from; i::lowest() ; + + for(size_t j=0; jn_class; j++) + { + for(size_t s=0; sn_shift; s++) + { // forward strand + { double ll_fw = score_consensus_subseq(this->data, i, s, model_log[j]) ; + // 
loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; + // apply lower bound here -> log(1e-100) + loglikelihood(i,j,s,flip_states::FORWARD) = std::max(ll_fw, + ConsensusSequenceLayer::p_min_log) ; + // keep track of max per row + if(ll_fw > loglikelihood_max[i]) + { loglikelihood_max[i] = ll_fw ; } + + } + // reverse + if(this->flip) + { double ll_rev = score_consensus_subseq(this->data, i, s, model_log_rev[j]) ; + // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; + // apply lower bound here -> log(1e-100) + loglikelihood(i,j,s,flip_states::REVERSE) = std::max(ll_rev, + ConsensusSequenceLayer::p_min_log) ; + // keep track of max per row + if(ll_rev > loglikelihood_max[i]) + { loglikelihood_max[i] = ll_rev ; } + } + } + } + } + done.set_value(true) ; +} + +void ConsensusSequenceLayer::update_model(const Matrix4D& posterior_prob, + ThreadPool* threads) +{ // don't parallelize + if(threads == nullptr) + { std::promise> promise ; + std::future> future = promise.get_future() ; + this->update_model_routine(0, + this->n_dim1, + posterior_prob, + promise) ; + // this->model = future.get() ; + auto model = future.get() ; + size_t n_class_to_update = this->n_class - this->bckg_class ; + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = model(i,j,k) ; } + } + } + } + // parallelize + else + { size_t n_threads = threads->getNThread() ; + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, this->n_dim1, n_threads) ; + + // get promises and futures + // the function run by the threads will simply fill the promise with + // "true" to indicate that they are done + std::vector>> promises(n_threads) ; + std::vector>> futures(n_threads) ; + for(size_t i=0; iaddJob(std::move( + std::bind(&ConsensusSequenceLayer::update_model_routine, + this, + slice.first, + slice.second, + std::ref(posterior_prob), + std::ref(promises[i])))) ; + } + // reinitialise the model + size_t n_class_to_update = this->n_class - this->bckg_class ; + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = 0. ; } + } + } + // wait until all threads are done working + // and update the model + for(auto& future : futures) + { Matrix3D model_part = future.get() ; + // for(size_t i=0; in_class; i++) + for(size_t i=0; il_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) += model_part(i,j,k) ; } + } + } + } + // -------------------------- threads stop --------------------------- + } + // make sure to have no 0 values + for(size_t i=0; in_class; i++) + { for(size_t j=0; jl_model; j++) + { for(size_t k=0; kn_category; k++) + { this->model(i,j,k) = + std::max(this->model(i,j,k), ConsensusSequenceLayer::p_min) ; + } + } + } + // normalize to get probs + for(size_t i=0; in_class; i++) + { for(size_t j=0; jl_model; j++) + { double sum = 0. ; + for(size_t k=0; kn_category; k++) + { sum += this->model(i,j,k) ; } + for(size_t k=0; kn_category; k++) + { double p = this->model(i,j,k) / sum ; + this->model(i,j,k) = p ; + } + } + } +} + +void ConsensusSequenceLayer::update_model_routine(size_t from, + size_t to, + const Matrix4D& posterior_prob, + std::promise>& promise) const +{ // dimension checks + this->check_posterior_prob_dim(posterior_prob) ; + + Matrix3D model(this->n_class, + this->l_model, + this->n_category, + 0.) 
; + + size_t n_class_to_update = this->n_class - this->bckg_class ; + + for(size_t k=0; k < n_class_to_update; k++) + { for(size_t s=0; sn_shift; s++) + { for(size_t j=0; jl_model; j++) + { // base prob on fw and rv strand + vector_d base_prob_fw(this->n_category, 0.) ; + vector_d base_prob_rv(this->n_category, 0.) ; + for(size_t i=from; in_dim3; base++, base_rv--) + { // --------------- forward --------------- + { base_prob_fw[base] += + this->data(i,s+j,base) * + posterior_prob(i,k,s,ConsensusSequenceLayer::FORWARD) ; + } + // --------------- reverse --------------- + if(this->flip) + { base_prob_rv[base_rv] += + this->data(i,s+j,base) * + posterior_prob(i,k,s,ConsensusSequenceLayer::REVERSE) ; + } + } + } + // update this position of the model + for(size_t i=0,i_rev=base_prob_fw.size()-1; iflip) + { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } + } + } + } + } + promise.set_value(model) ; +} + +void ConsensusSequenceLayer::create_bckg_class() +{ // sequence composition + std::vector base_comp = + dna::base_composition(this->data, + this->flip) ; + // set last class as background class + for(size_t i=0; in_category; i++) + { for(size_t j=0; jl_model; j++) + { this->model(this->n_class-1,j,i) = base_comp[i] ; } + } +} diff --git a/src/Clustering/ConsensusSequenceLayer.hpp b/src/Clustering/ConsensusSequenceLayer.hpp index f5edc48..5ebba92 100644 --- a/src/Clustering/ConsensusSequenceLayer.hpp +++ b/src/Clustering/ConsensusSequenceLayer.hpp @@ -1,93 +1,196 @@ #ifndef CONSENSUSSEQUENCELAYER_HPP #define CONSENSUSSEQUENCELAYER_HPP -#include +#include -class ConsensusSequenceLayer : public SequenceLayer +#include +#include // std::promise + +#include +#include +#include +#include + + +typedef std::vector vector_d ; + + +class ConsensusSequenceLayer : public Data3DLayer { + public: + /*! + * \brief Computes the log-likelihood of the sub- + * sequence - stored in a given row (1st dimension) - + * and starting at the offset (2nd dimension) in + * the given sequence matrix. + * The subsequence length is determined by the model + * lenght. + * \param log_cons_seq a probability matrix containing + * the consensus sequence. Its dimensions are : + * 1st the different sequences + * 2nd the sequences length + * 3rd 4 for A,C,G,T. + * \param row the row containing the sequence of + * interest. + * \param col the index at which the sub-sequence + * is starting (1st index inside the subsequence + * of interest). + * \param model_log a model containing the log + * probability model. + * \return the log-likelihood of the sub-sequence + * given the model. + * \throw std::invalid_argument if 1) the offset is + * invalid, 2) the sequence and the model have + * incompatible dimensions or 3) the model 2n dimension + * is not 4 (A,C,G,T). + */ + static double score_consensus_subseq(const Matrix3D& cons_seq, + size_t row, + size_t col, + const Matrix2D& model_log) ; public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ ConsensusSequenceLayer(const Matrix3D& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. 
* \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ - ConsensusSequenceLayer(Matrix2D&& data, + ConsensusSequenceLayer(Matrix3D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) ; /*! * \brief Destructor. */ virtual ~ConsensusSequenceLayer() ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; + protected: + /*! + * \brief The routine that effectively performs the + * loglikelihood computations. + * \param from the index of the first row of the data + * to considered. + * \param to the index of the past last row of the data + * to considered. + * \param loglikelihood a matrix to store the + * results. It should have the following dimensions : + * 1st : same as the data number of row + * 2nd : same as the model number of classes + * 3rd : same as the number of shifts + * 4th : same as the number of flip states + * \param loglikelihood_max a vector containing the + * max value for each row of log_likelihood. + * Its length should be equal to the data row number. + * \param model_log a vector containing the matrices with + * the log values of the model for each class. + * \param model_log_rev a vector containing the matrices with + * the log values of the reverse strand model for each class + * (the 1st position in the model becomes the last in the + * reverse model and probabilities are swapped A<->T and C<->G). + * \param done a promise to be filled when the routine + * is done running. + */ + void compute_loglikelihoods_routine(size_t from, + size_t to, + Matrix4D& loglikelihood, + vector_d& loglikelihood_max, + const std::vector>& model_log, + const std::vector>& model_log_rev, + std::promise& done) const ; + + /*! + * \brief The routine that effectively update the model. + * \param from the index of the first row of the + * posterior probabilities to considered. + * \param to the index of the past last row of the + * posterior probabilities to considered. + * \param posterior_prob the data assignment probabilities + * to the different classes. + * \param + * \param done a promise containing the partial model + * computed from the given data slice. If several routines + * work together at updating the model, they need to be + * summed and normalized (by the column sum) to get the + * final model. + */ + void update_model_routine(size_t from, + size_t to, + const Matrix4D& posterior_prob, + std::promise>& done) const ; + + /*! 
+ * \brief Sets the last class as background class with the + * mean base probababilities of the data at each position. + */ + void create_bckg_class() ; + protected: } ; #endif // CONSENSUSSEQUENCELAYER_HPP diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/Data2DLayer.cpp similarity index 59% copy from src/Clustering/DataLayer.cpp copy to src/Clustering/Data2DLayer.cpp index 973a695..41aeb95 100644 --- a/src/Clustering/DataLayer.cpp +++ b/src/Clustering/Data2DLayer.cpp @@ -1,192 +1,174 @@ -#include +#include + +#include // std::invalid_argument +#include // std::move() -#include // std::invalid_argument -#include // log() #include #include #include -#include -DataLayer::DataLayer() + +Data2DLayer::Data2DLayer() + : DataLayer() {} -DataLayer::DataLayer(const Matrix2D& data, - size_t n_class, - size_t n_shift, - bool flip, - bool bckg_class) - :data(data), - flip(flip), - n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), - n_class(n_class), - l_model(n_col - n_shift + 1), - n_shift(n_shift), - n_flip(flip + 1), - bckg_class(bckg_class) -{ // models cannot be initialise here +Data2DLayer::Data2DLayer(const Matrix2D& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) + : DataLayer(n_class, n_shift, flip, bckg_class), + data(data), + n_row(this->data.get_nrow()), + n_col(this->data.get_ncol()) +{ + this->l_model = this->n_col - this->n_shift + 1 ; + + // models cannot be initialise here // as the number of categories depend // on the exact class } -DataLayer::DataLayer(Matrix2D&& data, - size_t n_class, - size_t n_shift, - bool flip, - bool bckg_class) - :data(data), - flip(flip), - n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), - n_class(n_class), - l_model(n_col - n_shift + 1), - n_shift(n_shift), - n_flip(flip + 1), - bckg_class(bckg_class) -{ // models cannot be initialise here +Data2DLayer::Data2DLayer(Matrix2D&& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) + : DataLayer(n_class, n_shift, flip, bckg_class), + data(data), + n_row(this->data.get_nrow()), + n_col(this->data.get_ncol()) +{ + this->l_model = this->n_col - this->n_shift + 1 ; + + // models cannot be initialise here // as the number of categories depend // on the exact class } -DataLayer::DataLayer(const Matrix2D& data, - const Matrix3D& model, - bool flip, - bool bckg_class) - : data(data), - model(model), - flip(flip), +Data2DLayer::Data2DLayer(const Matrix2D& data, + const Matrix3D& model, + bool flip, + bool bckg_class) + : DataLayer(model, flip, bckg_class), + data(data), n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), - n_class(this->model.get_dim()[0]), - l_model(this->model.get_dim()[1]), - n_category(this->model.get_dim()[2]), - n_shift(n_col - l_model + 1), - n_flip(flip + 1), - bckg_class(bckg_class) -{ // check if model is not too long + n_col(this->data.get_ncol()) +{ + this->n_shift = this->n_col - this->l_model + 1 ; + + // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! 
model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } - this->n_shift = this->n_col - this->l_model + 1 ; } -DataLayer::DataLayer(Matrix2D&& data, - Matrix3D&& model, - bool flip, - bool bckg_class) - : data(data), - model(model), - flip(flip), +Data2DLayer::Data2DLayer(Matrix2D&& data, + Matrix3D&& model, + bool flip, + bool bckg_class) + : DataLayer(std::move(model), flip, bckg_class), + data(data), n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), - n_class(this->model.get_dim()[0]), - l_model(this->model.get_dim()[1]), - n_category(this->model.get_dim()[2]), - n_shift(n_col - l_model + 1), - n_flip(flip + 1), - bckg_class(bckg_class) -{ // check if model is not too long + n_col(this->data.get_ncol()) +{ + this->n_shift = this->n_col - this->l_model + 1 ; + + // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } - this->n_shift = this->n_col - this->l_model + 1 ; } -DataLayer::~DataLayer() +Data2DLayer::~Data2DLayer() {} -Matrix3D DataLayer::get_model() const -{ return this->model ; } - -void DataLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const +void Data2DLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const { if(loglikelihood.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 1st dimension is not " "equal to data row number : %zu / %zu", loglikelihood.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 2nd dimension is not " "equal to model class number : %zu / %zu", loglikelihood.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", loglikelihood.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", loglikelihood.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } -void DataLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const +void Data2DLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const { if(loglikelihood_max.size() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood_max length is not " "equal to data row number : %zu / %zu", - loglikelihood_max.size(), this->n_flip) ; + loglikelihood_max.size(), this->n_row) ; throw std::invalid_argument(msg) ; } } -void DataLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const +void Data2DLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const { if(posterior_prob.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 1st dimension is not " "equal to data row number : %zu / %zu", posterior_prob.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! 
posterior_prob matrix 2nd dimension is not " "equal to model class number : %zu / %zu", posterior_prob.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", posterior_prob.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", posterior_prob.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } - -const double DataLayer::p_min = 1e-100 ; -const double DataLayer::p_min_log = log(DataLayer::p_min) ; diff --git a/src/Clustering/Data2DLayer.hpp b/src/Clustering/Data2DLayer.hpp new file mode 100644 index 0000000..3de04ba --- /dev/null +++ b/src/Clustering/Data2DLayer.hpp @@ -0,0 +1,167 @@ +#ifndef DATA2DLAYER_HPP +#define DATA2DLAYER_HPP + +#include + +#include +#include +#include + +typedef std::vector vector_d ; + + +class Data2DLayer : public DataLayer +{ + public: + /*! + * \brief Constructs an empty object. + */ + Data2DLayer() ; + + /*! + * \brief Constructs an object with the + * given data. + * An empty model is not initialised yet + * as the model number of categories + * depends on the final class. + * \param data the data. + * \param n_class the number of classes + * of the model. + * \param n_shift the number of shift + * states of the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + Data2DLayer(const Matrix2D& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) ; + + /*! + * \brief Constructs an object with the + * given data. + * An empty model is not initialised yet + * as the model number of categories + * depends on the final class. + * \param data the data. + * \param n_class the number of classes + * of the model. + * \param n_shift the number of shift + * states of the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + Data2DLayer(Matrix2D&& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) ; + + /*! + * \brief Constructs an object with the + * given data and model. + * The model dimensions set the number of + * classes and the shifting freedom. + * \param data the data. + * \param the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + Data2DLayer(const Matrix2D& data, + const Matrix3D& model, + bool flip, + bool bckg_class) ; + + + /*! + * \brief Constructs an object with the + * given data and model. + * The model dimensions set the number of + * classes and the shifting freedom. + * \param data the data. + * \param the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + Data2DLayer(Matrix2D&& data, + Matrix3D&& model, + bool flip, + bool bckg_class) ; + + /*! + * \brief Destructor. + */ + virtual ~Data2DLayer() ; + + protected: + /*! + * \brief Checks the argument has compatible + * dimensions with the data and models. 
If this is + * not the case, throw a std::invalid_argument with + * a relevant message. + * \param logliklihood a matrix to store the + * results. It should have the following dimensions : + * 1st : same as the data row number + * 2nd : same as the model class number + * 3rd : same as the shift state number + * 4th : same as the flip state number + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void check_loglikelihood_dim(const Matrix4D& loglikelihood) const override; + + /*! + * \brief Checks that the argument has compatible + * dimensions with the data and models. If this is + * not the case, throw a std::invalid_argument with + * a relevant message. + * \param loglikelihood_max a vector containing the + * max value for each row of log_likelihood. + * It should have a length equal to the number of + * the data row number. + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const override ; + + /*! + * \brief Checks the argument has compatible + * dimensions with the data and models. If this is + * not the case, throw a std::invalid_argument with + * a relevant message. + * \param posterior_prob a matrix to store the + * results. It should have the following dimensions : + * 1st : same as the data row number + * 2nd : same as the model class number + * 3rd : same as the shift state number + * 4th : same as the flip state number + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void check_posterior_prob_dim(const Matrix4D& posterior_prob) const override ; + + /*! + * \brief the data. + */ + Matrix2D data ; + /*! + * \brief the number of row in the data. + */ + size_t n_row ; + /*! + * \brief the number of columns in the data. + */ + size_t n_col ; + +} ; + +#endif // DATA2DLAYER_HPP diff --git a/src/Clustering/Data3DLayer.cpp b/src/Clustering/Data3DLayer.cpp new file mode 100644 index 0000000..cbd6c1a --- /dev/null +++ b/src/Clustering/Data3DLayer.cpp @@ -0,0 +1,128 @@ +#include + +#include +#include + +Data3DLayer::Data3DLayer() + : DataLayer() +{} + +Data3DLayer::Data3DLayer(const Matrix3D& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) + : DataLayer(n_class, n_shift, flip, bckg_class), + data(data), + n_dim1(this->data.get_dim()[0]), + n_dim2(this->data.get_dim()[1]), + n_dim3(this->data.get_dim()[2]) +{ + this->l_model = this->n_dim2 - this->n_shift + 1 ; + + // models cannot be initialise here + // as the number of categories depend + // on the exact class +} + +Data3DLayer::Data3DLayer(Matrix3D&& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) + : DataLayer(n_class, n_shift, flip, bckg_class), + data(data), + n_dim1(this->data.get_dim()[0]), + n_dim2(this->data.get_dim()[1]), + n_dim3(this->data.get_dim()[2]) +{ + this->l_model = this->n_dim2 - this->n_shift + 1 ; + + // models cannot be initialise here + // as the number of categories depend + // on the exact class +} + +Data3DLayer::~Data3DLayer() +{} + +void Data3DLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const +{ if(loglikelihood.get_dim()[0] != this->n_dim1) + { char msg[4096] ; + sprintf(msg, + "Error! loglikelihood matrix 1st dimension is not " + "equal to data 1st dimension : %zu / %zu", + loglikelihood.get_dim()[0], this->n_dim1) ; + throw std::invalid_argument(msg) ; + } + else if(loglikelihood.get_dim()[1] != this->n_class) + { char msg[4096] ; + sprintf(msg, + "Error! 
loglikelihood matrix 2nd dimension is not " + "equal to model class number : %zu / %zu", + loglikelihood.get_dim()[1], this->n_class) ; + throw std::invalid_argument(msg) ; + } + else if(loglikelihood.get_dim()[2] != this->n_shift) + { char msg[4096] ; + sprintf(msg, + "Error! loglikelihood matrix 3rd dimension is not " + "equal to model shift state number : %zu / %zu", + loglikelihood.get_dim()[2], this->n_shift) ; + throw std::invalid_argument(msg) ; + } + else if(loglikelihood.get_dim()[3] != this->n_flip) + { char msg[4096] ; + sprintf(msg, + "Error! loglikelihood matrix 4th dimension is not " + "equal to model flip state number : %zu / %zu", + loglikelihood.get_dim()[3], this->n_flip) ; + throw std::invalid_argument(msg) ; + } +} + +void Data3DLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const +{ if(loglikelihood_max.size() != this->n_dim1) + { char msg[4096] ; + sprintf(msg, + "Error! loglikelihood_max length is not " + "equal to data 1st dimension : %zu / %zu", + loglikelihood_max.size(), this->n_dim1) ; + throw std::invalid_argument(msg) ; + } +} + +void Data3DLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const +{ if(posterior_prob.get_dim()[0] != this->n_dim1) + { char msg[4096] ; + sprintf(msg, + "Error! posterior_prob matrix 1st dimension is not " + "equal to data 1st dimension : %zu / %zu", + posterior_prob.get_dim()[0], this->n_dim1) ; + throw std::invalid_argument(msg) ; + } + else if(posterior_prob.get_dim()[1] != this->n_class) + { char msg[4096] ; + sprintf(msg, + "Error! posterior_prob matrix 2nd dimension is not " + "equal to model class number : %zu / %zu", + posterior_prob.get_dim()[1], this->n_class) ; + throw std::invalid_argument(msg) ; + } + else if(posterior_prob.get_dim()[2] != this->n_shift) + { char msg[4096] ; + sprintf(msg, + "Error! posterior_prob matrix 3rd dimension is not " + "equal to model shift state number : %zu / %zu", + posterior_prob.get_dim()[2], this->n_shift) ; + throw std::invalid_argument(msg) ; + } + else if(posterior_prob.get_dim()[3] != this->n_flip) + { char msg[4096] ; + sprintf(msg, + "Error! posterior_prob matrix 4th dimension is not " + "equal to model flip state number : %zu / %zu", + posterior_prob.get_dim()[3], this->n_flip) ; + throw std::invalid_argument(msg) ; + } +} diff --git a/src/Clustering/Data3DLayer.hpp b/src/Clustering/Data3DLayer.hpp new file mode 100644 index 0000000..19ffc09 --- /dev/null +++ b/src/Clustering/Data3DLayer.hpp @@ -0,0 +1,135 @@ +#ifndef DATA3DLAYER_HPP +#define DATA3DLAYER_HPP + +#include + +#include +#include + +typedef std::vector vector_d ; + +class Data3DLayer : public DataLayer +{ + public: + /*! + * \brief Constructs an empty object. + */ + Data3DLayer() ; + + /*! + * \brief Constructs an object with the + * given data. + * An empty model is not initialised yet + * as the model number of categories + * depends on the final class. + * \param data the data. + * \param n_class the number of classes + * of the model. + * \param n_shift the number of shift + * states of the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + Data3DLayer(const Matrix3D& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) ; + + /*! + * \brief Constructs an object with the + * given data. + * An empty model is not initialised yet + * as the model number of categories + * depends on the final class. + * \param data the data. 
+ * \param n_class the number of classes + * of the model. + * \param n_shift the number of shift + * states of the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + Data3DLayer(Matrix3D&& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) ; + + /*! + * \brief Destructor. + */ + virtual ~Data3DLayer() ; + + + protected: + /*! + * \brief Checks the argument has compatible + * dimensions with the data and models. If this is + * not the case, throw a std::invalid_argument with + * a relevant message. + * \param logliklihood a matrix to store the + * results. It should have the following dimensions : + * 1st : same as the data row number + * 2nd : same as the model class number + * 3rd : same as the shift state number + * 4th : same as the flip state number + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void check_loglikelihood_dim(const Matrix4D& loglikelihood) const override; + + /*! + * \brief Checks that the argument has compatible + * dimensions with the data and models. If this is + * not the case, throw a std::invalid_argument with + * a relevant message. + * \param loglikelihood_max a vector containing the + * max value for each row of log_likelihood. + * It should have a length equal to the number of + * the data row number. + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const override ; + + /*! + * \brief Checks the argument has compatible + * dimensions with the data and models. If this is + * not the case, throw a std::invalid_argument with + * a relevant message. + * \param posterior_prob a matrix to store the + * results. It should have the following dimensions : + * 1st : same as the data row number + * 2nd : same as the model class number + * 3rd : same as the shift state number + * 4th : same as the flip state number + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void check_posterior_prob_dim(const Matrix4D& posterior_prob) const override ; + + /*! + * \brief the data. + */ + Matrix3D data ; + /*! + * \brief The size of data 1st dimension. + */ + size_t n_dim1 ; + /*! + * \brief The size of data 2nd dimension. + */ + size_t n_dim2 ; + /*! + * \brief The size of data 3rd dimension. 
+ */ + size_t n_dim3 ; + +} ; + +#endif // DATA3DLAYER_HPP diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/DataLayer.cpp index 973a695..a1d8de4 100644 --- a/src/Clustering/DataLayer.cpp +++ b/src/Clustering/DataLayer.cpp @@ -1,192 +1,60 @@ #include #include // std::invalid_argument #include // log() + #include #include #include #include DataLayer::DataLayer() -{} - -DataLayer::DataLayer(const Matrix2D& data, - size_t n_class, - size_t n_shift, - bool flip, - bool bckg_class) - :data(data), - flip(flip), - n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), - n_class(n_class), - l_model(n_col - n_shift + 1), - n_shift(n_shift), - n_flip(flip + 1), - bckg_class(bckg_class) -{ // models cannot be initialise here - // as the number of categories depend - // on the exact class -} +{ ; } -DataLayer::DataLayer(Matrix2D&& data, - size_t n_class, +DataLayer::DataLayer(size_t n_class, size_t n_shift, bool flip, bool bckg_class) - :data(data), - flip(flip), - n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), - n_class(n_class), - l_model(n_col - n_shift + 1), - n_shift(n_shift), - n_flip(flip + 1), + : flip(flip), + n_class(n_class), + n_shift(n_shift), + n_flip(flip + 1), bckg_class(bckg_class) { // models cannot be initialise here // as the number of categories depend // on the exact class } -DataLayer::DataLayer(const Matrix2D& data, - const Matrix3D& model, +DataLayer::DataLayer(const Matrix3D& model, bool flip, bool bckg_class) - : data(data), - model(model), + : model(model), flip(flip), - n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), n_class(this->model.get_dim()[0]), l_model(this->model.get_dim()[1]), n_category(this->model.get_dim()[2]), - n_shift(n_col - l_model + 1), n_flip(flip + 1), bckg_class(bckg_class) -{ // check if model is not too long - if(this->n_col < this->l_model) - { char msg[4096] ; - sprintf(msg, - "Error! model is longer than data : %zu / %zu", - this->l_model, this->n_col) ; - throw std::invalid_argument(msg) ; - } - this->n_shift = this->n_col - this->l_model + 1 ; -} +{ ; } -DataLayer::DataLayer(Matrix2D&& data, - Matrix3D&& model, +DataLayer::DataLayer(Matrix3D&& model, bool flip, bool bckg_class) - : data(data), - model(model), + : model(model), flip(flip), - n_row(this->data.get_nrow()), - n_col(this->data.get_ncol()), n_class(this->model.get_dim()[0]), l_model(this->model.get_dim()[1]), n_category(this->model.get_dim()[2]), - n_shift(n_col - l_model + 1), n_flip(flip + 1), bckg_class(bckg_class) -{ // check if model is not too long - if(this->n_col < this->l_model) - { char msg[4096] ; - sprintf(msg, - "Error! model is longer than data : %zu / %zu", - this->l_model, this->n_col) ; - throw std::invalid_argument(msg) ; - } - this->n_shift = this->n_col - this->l_model + 1 ; -} +{ ; } DataLayer::~DataLayer() -{} +{ ; } Matrix3D DataLayer::get_model() const { return this->model ; } -void DataLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const -{ if(loglikelihood.get_dim()[0] != this->n_row) - { char msg[4096] ; - sprintf(msg, - "Error! loglikelihood matrix 1st dimension is not " - "equal to data row number : %zu / %zu", - loglikelihood.get_dim()[0], this->n_row) ; - throw std::invalid_argument(msg) ; - } - else if(loglikelihood.get_dim()[1] != this->n_class) - { char msg[4096] ; - sprintf(msg, - "Error! 
loglikelihood matrix 2nd dimension is not " - "equal to model class number : %zu / %zu", - loglikelihood.get_dim()[1], this->n_class) ; - throw std::invalid_argument(msg) ; - } - else if(loglikelihood.get_dim()[2] != this->n_shift) - { char msg[4096] ; - sprintf(msg, - "Error! loglikelihood matrix 3rd dimension is not " - "equal to model shift state number : %zu / %zu", - loglikelihood.get_dim()[2], this->n_shift) ; - throw std::invalid_argument(msg) ; - } - else if(loglikelihood.get_dim()[3] != this->n_flip) - { char msg[4096] ; - sprintf(msg, - "Error! loglikelihood matrix 4th dimension is not " - "equal to model flip state number : %zu / %zu", - loglikelihood.get_dim()[3], this->n_flip) ; - throw std::invalid_argument(msg) ; - } -} - -void DataLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const -{ if(loglikelihood_max.size() != this->n_row) - { char msg[4096] ; - sprintf(msg, - "Error! loglikelihood_max length is not " - "equal to data row number : %zu / %zu", - loglikelihood_max.size(), this->n_flip) ; - throw std::invalid_argument(msg) ; - } -} - -void DataLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const -{ if(posterior_prob.get_dim()[0] != this->n_row) - { char msg[4096] ; - sprintf(msg, - "Error! posterior_prob matrix 1st dimension is not " - "equal to data row number : %zu / %zu", - posterior_prob.get_dim()[0], this->n_row) ; - throw std::invalid_argument(msg) ; - } - else if(posterior_prob.get_dim()[1] != this->n_class) - { char msg[4096] ; - sprintf(msg, - "Error! posterior_prob matrix 2nd dimension is not " - "equal to model class number : %zu / %zu", - posterior_prob.get_dim()[1], this->n_class) ; - throw std::invalid_argument(msg) ; - } - else if(posterior_prob.get_dim()[2] != this->n_shift) - { char msg[4096] ; - sprintf(msg, - "Error! posterior_prob matrix 3rd dimension is not " - "equal to model shift state number : %zu / %zu", - posterior_prob.get_dim()[2], this->n_shift) ; - throw std::invalid_argument(msg) ; - } - else if(posterior_prob.get_dim()[3] != this->n_flip) - { char msg[4096] ; - sprintf(msg, - "Error! posterior_prob matrix 4th dimension is not " - "equal to model flip state number : %zu / %zu", - posterior_prob.get_dim()[3], this->n_flip) ; - throw std::invalid_argument(msg) ; - } -} - const double DataLayer::p_min = 1e-100 ; const double DataLayer::p_min_log = log(DataLayer::p_min) ; diff --git a/src/Clustering/DataLayer.hpp b/src/Clustering/DataLayer.hpp index aae52db..97bfdff 100644 --- a/src/Clustering/DataLayer.hpp +++ b/src/Clustering/DataLayer.hpp @@ -1,279 +1,237 @@ #ifndef DATALAYER_HPP #define DATALAYER_HPP -#include -#include // std::promise, std::future #include #include #include #include typedef std::vector vector_d ; /*! * \brief The DataLayer class define the basic design * to handle probabilistic models together with * their data. * A DataLayer is made of two parts : * 1) a data matrix * 2) a model * The model contains the parameters of a probabilistic * model with one or more classes that fits the data. * The data likelihood given the model can be computed * and the model can be updated given a set of * posterior probabilities representing the data * assignments to the different classes. */ class DataLayer { public: /*! * \brief the smallest acceptable probability * for computations. */ static const double p_min ; /*! * \brief the log of the smallest probability. */ static const double p_min_log ; /*! * \brief The possible flip states. */ enum flip_states{FORWARD=0, REVERSE} ; /*! 
- * \brief Default constructor. + * \brief Constructs an empty object. */ DataLayer() ; /*! * \brief Constructs an object with the - * given data. + * given model parameters. * An empty model is not initialised yet * as the model number of categories * depends on the final class. - * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ - DataLayer(const Matrix2D& data, - size_t n_class, + DataLayer(size_t n_class, size_t n_shift, bool flip, bool bckg_class) ; /*! * \brief Constructs an object with the - * given data. - * An empty model is not initialised yet - * as the model number of categories - * depends on the final class. - * \param data the data. - * \param n_class the number of classes - * of the model. - * \param n_shift the number of shift - * states of the model. - * \param flip whether flipping is allowed. - * \param bckg_class should the last class - * of the model is constant and models the - * background. - */ - DataLayer(Matrix2D&& data, - size_t n_class, - size_t n_shift, - bool flip, - bool bckg_class) ; - - /*! - * \brief Constructs an object with the - * given data and model. + * given model. * The model dimensions set the number of * classes and the shifting freedom. - * \param data the data. * \param the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ - DataLayer(const Matrix2D& data, - const Matrix3D& model, + DataLayer(const Matrix3D& model, bool flip, bool bckg_class) ; /*! * \brief Constructs an object with the - * given data and model. + * given model. * The model dimensions set the number of * classes and the shifting freedom. - * \param data the data. * \param the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ - DataLayer(Matrix2D&& data, - Matrix3D&& model, + DataLayer(Matrix3D&& model, bool flip, bool bckg_class) ; /*! * \brief Destructor. */ virtual ~DataLayer() ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const = 0 ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. 
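A minimal caller-side sketch of the contract documented above, assuming a concrete DataLayer subclass instance named layer and dimensions n_row, n_class, n_shift and n_flip taken from its data and model (these names are placeholders, not taken from the diff):

    // allocate the buffers with the dimensions the method expects
    Matrix4D<double> loglikelihood(n_row, n_class, n_shift, n_flip, 0.) ;
    vector_d loglikelihood_max(n_row, 0.) ;
    // passing nullptr runs the computation on the main thread
    layer.compute_loglikelihoods(loglikelihood, loglikelihood_max, nullptr) ;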
*/ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) = 0 ; /*! * \brief Returns a copy of the current model. * \return the current model. * 1st dim : the number of classes * 2nd dim : the model length * 3rd dim : the number of value categories. */ virtual Matrix3D get_model() const ; protected: /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ - void check_loglikelihood_dim(const Matrix4D& loglikelihood) const ; + virtual void check_loglikelihood_dim(const Matrix4D& loglikelihood) const = 0; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * It should have a length equal to the number of * the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ - void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const ; + virtual void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const = 0 ; /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_prob a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ - void check_posterior_prob_dim(const Matrix4D& posterior_prob) const ; + virtual void check_posterior_prob_dim(const Matrix4D& posterior_prob) const = 0 ; - /*! - * \brief the data. - */ - Matrix2D data ; /*! * \brief the data model. */ Matrix3D model ; /*! * \brief whether flip is enabled. */ bool flip ; - /*! - * \brief the number of row in the data. - */ - size_t n_row ; - /*! - * \brief the number of columns in the data. - */ - size_t n_col ; /*! * \brief the number of classes in the model. */ size_t n_class ; /*! * \brief the model length, its 2nd dimension. */ size_t l_model ; /*! * \brief the number of variable categories in * the data. This is also the model 3rd * dimension. * Read counts are quantitative values and * have a number of categories equal to one * whereas as DNA sequences are made of * A,C,G,T (at least) and have 4 different * categories. */ size_t n_category ; /*! * \brief the number of shift states. */ size_t n_shift ; /*! * \brief the number of flip states. */ size_t n_flip ; /*! * \brief A flag indicating that the last class of the model * is modelling the background. This class is considered constant * and should not be updated when calling update_model(). 
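To make the role of this flag concrete, here is a rough sketch of what a constant background class amounts to for a sequence model (the idea behind create_bckg_class(), not the diff's actual implementation; model, l_model and base_mean are illustrative names and the base frequencies are made up):

    // fill the last class, at every position, with the mean base composition,
    // so it never favours any particular motif and is never updated
    std::vector<double> base_mean = {0.30, 0.20, 0.20, 0.30} ; // mean A,C,G,T frequencies
    for(size_t pos = 0; pos < l_model; pos++)
    {   for(size_t base = 0; base < 4; base++)
        {   model(n_class - 1, pos, base) = base_mean[base] ; }
    }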
*/ bool bckg_class ; } ; #endif // DATALAYER_HPP diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMConsensusSequence.cpp similarity index 65% copy from src/Clustering/EMRead.cpp copy to src/Clustering/EMConsensusSequence.cpp index edad5a3..9cac9ee 100644 --- a/src/Clustering/EMRead.cpp +++ b/src/Clustering/EMConsensusSequence.cpp @@ -1,308 +1,293 @@ -#include +#include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() -#include // exp() -#include // ReadLayer -#include // getRandomNumberGenerator() -#include // ConsoleProgressBar -#include // ThreadPool +#include // SequenceLayer +#include // getRandomNumberGenerator() +#include // ConsoleProgressBar +#include // ThreadPool +#include // dna::base_composition() -EMRead::EMRead(const Matrix2D& read_matrix, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - bool bckg_class, - const std::string& seed, - size_t n_threads) - : EMBase(read_matrix.get_nrow(), - read_matrix.get_ncol(), +EMConsensusSequence::EMConsensusSequence(const Matrix3D& seq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed, + size_t n_threads) + : EMBase(seq_matrix.get_dim()[0], + seq_matrix.get_dim()[1], n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), - read_layer(nullptr) -{ this->loglikelihood_max = vector_d(n_row, 0.) ; + cseq_layer(nullptr) +{ + this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly + // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; + // data and models - this->read_layer = new ReadLayer(read_matrix, - this->n_class, - this->n_shift, - flip, - bckg_class, - this->threads) ; + this->cseq_layer = new ConsensusSequenceLayer(seq_matrix, + this->n_class, + this->n_shift, + this->flip, + bckg_class) ; + // intialise the models with the post prob - this->read_layer->update_model(this->post_prob, - this->threads) ; + this->cseq_layer->update_model(this->post_prob, + this->threads) ; } -EMRead::EMRead(Matrix2D&& read_matrix, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - bool bckg_class, - const std::string& seed, - size_t n_threads) - : EMBase(read_matrix.get_nrow(), - read_matrix.get_ncol(), +EMConsensusSequence::EMConsensusSequence(Matrix3D&& seq_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed, + size_t n_threads) + : EMBase(seq_matrix.get_dim()[0], + seq_matrix.get_dim()[1], n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), - read_layer(nullptr) -{ this->loglikelihood_max = vector_d(n_row, 0.) ; + cseq_layer(nullptr) +{ + this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly + // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models - this->read_layer = new ReadLayer(std::move(read_matrix), - this->n_class, - this->n_shift, - flip, - bckg_class, - this->threads) ; + this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix), + this->n_class, + this->n_shift, + this->flip, + bckg_class) ; + // intialise the models with the post prob - this->read_layer->update_model(this->post_prob, - this->threads) ; + this->cseq_layer->update_model(this->post_prob, + this->threads) ; } -EMRead::~EMRead() -{ if(this->read_layer == nullptr) +EMConsensusSequence::~EMConsensusSequence() +{ if(this->cseq_layer != nullptr) + { delete this->cseq_layer ; + this->cseq_layer = nullptr ; + } + if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } -Matrix3D EMRead::get_read_models() const -{ return read_layer->get_model() ; } +Matrix3D EMConsensusSequence::get_sequence_models() const +{ return this->cseq_layer->get_model() ; } -EMRead::exit_codes EMRead::classify() +EMConsensusSequence::exit_codes EMConsensusSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) - { - // E-step + { // E-step this->compute_loglikelihood() ; - // std::cerr << this->post_prob_rowsum << std::endl ; - // std::cerr << this->post_prob_colsum << std::endl ; this->compute_post_prob() ; // M-step - // std::cerr << this->post_prob_rowsum << std::endl ; - // std::cerr << this->post_prob_colsum << std::endl ; this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; - bar.update() ; } bar.update() ; std::cerr << std::endl ; - return EMRead::exit_codes::ITER_MAX ; + return EMConsensusSequence::exit_codes::ITER_MAX ; } -void EMRead::compute_loglikelihood() +void EMConsensusSequence::compute_loglikelihood() { // compute the loglikelihood - this->read_layer->compute_loglikelihoods(this->loglikelihood, - this->loglikelihood_max, - this->threads) ; - - /* - // rescale the values - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t k=0; kn_shift; k++) - { for(size_t l=0; ln_flip; l++) - { this->loglikelihood(i,j,k,l) = - (this->loglikelihood(i,j,k,l) - - this->loglikelihood_max[i]) ; - } - } - } - } - */ - + this->cseq_layer->compute_loglikelihoods(this->loglikelihood, + this->loglikelihood_max, + this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( - std::bind(&EMRead::compute_loglikelihood_routine, + std::bind(&EMConsensusSequence::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } -void EMRead::compute_loglikelihood_routine(size_t from, - size_t to, - std::promise& done) +void 
EMConsensusSequence::compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } -void EMRead::compute_post_prob() +void EMConsensusSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( - std::bind(&EMRead::compute_post_prob_routine, + std::bind(&EMConsensusSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } -void EMRead::compute_post_prob_routine(size_t from, - size_t to, - std::promise& post_prob_colsum) +void EMConsensusSequence::compute_post_prob_routine(size_t from, + size_t to, + std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } - // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { // avoid p=0. 
by rounding errors + { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], - ReadLayer::p_min) ; + ConsensusSequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } -void EMRead::update_models() -{ this->read_layer->update_model(this->post_prob, - this->post_prob_colsum, - this->threads) ; +void EMConsensusSequence::update_models() +{ this->cseq_layer->update_model(this->post_prob, + this->threads) ; } diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMConsensusSequence.hpp similarity index 57% copy from src/Clustering/EMRead.hpp copy to src/Clustering/EMConsensusSequence.hpp index ec3a896..8ba881c 100644 --- a/src/Clustering/EMRead.hpp +++ b/src/Clustering/EMConsensusSequence.hpp @@ -1,172 +1,184 @@ -#ifndef EMREAD_HPP -#define EMREAD_HPP +#ifndef EMCONSENSUSSEQUENCE_HPP +#define EMCONSENSUSSEQUENCE_HPP #include #include #include #include // std::promise #include -#include +#include typedef std::vector vector_d ; -class EMRead : public EMBase +class EMConsensusSequence : public EMBase { public: /*! * \brief Constructs an object to partition the - * region (rows) according to the shape of the signal - * with the given shifting and flipping freedom. - * \param read_matrix a matrix containing the read - * densitiy (ChIP-seq or related signal) for the - * regions of interest. + * given consensus sequences (rows) according to + * their motif content. + * The sequences models are initialised randomly. + * \param sequence_matrix a matrix containing the + * consensus sequences in a probability matrix. + * Its dimensions are : + * 1st the number of consensus sequences + * 2nd the length of the consensus sequences + * 3rd 4 for A,C,G,T + * The sums over the 1st and 2nd dimensions should + * be 1. The overall sum of the matrix values should + * be the st dimension. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. - * \param bckg_class the last class is used to model the - * background by setting all its parameters, at all - * positions, to the mean number of counts. Since - * the background is constant, this class will never - * be updated. + * \param bckg_class the last class is used to model the background + * by setting all its parameters, at all positions, to the + * background base probabilties. Since the background is constant, + * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ - EMRead(const Matrix2D& read_matrix, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - bool bckg_class, - const std::string& seed="", - size_t n_threads=0) ; - + EMConsensusSequence(const Matrix3D& sequence_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed="", + size_t n_threads=0) ; /*! * \brief Constructs an object to partition the - * region (rows) according to the shape of the signal - * with the given shifting and flipping freedom. - * \param read_matrix a matrix containing the read - * densitiy (ChIP-seq or related signal) for the - * regions of interest. + * given consensus sequences (rows) according to + * their motif + * content. 
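A hedged sketch of what such a consensus sequence probability matrix can look like when built from plain sequences by one-hot encoding, so that each (sequence, position) slice sums to 1 over A,C,G,T. The Matrix3D constructor is assumed to mirror the Matrix4D one used elsewhere in this diff, and base_index() and sequences are hypothetical helpers, not part of the code base:

    Matrix3D<double> seq_matrix(n_seq, l_seq, 4, 0.) ;
    for(size_t i = 0; i < n_seq; i++)
    {   for(size_t j = 0; j < l_seq; j++)
        {   // base_index() maps 'A','C','G','T' to 0..3
            seq_matrix(i, j, base_index(sequences[i][j])) = 1. ;
        }
    }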
+ * The sequences models are initialised randomly. + * \param sequence_matrix a matrix containing the + * consensus sequences in a probability matrix. + * Its dimensions are : + * 1st the number of consensus sequences + * 2nd the length of the consensus sequences + * 3rd 4 for A,C,G,T + * The sums over the 1st and 2nd dimensions should + * be 1. The overall sum of the matrix values should + * be the st dimension. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. - * \param bckg_class the last class is used to model the - * background by setting all its parameters, at all - * positions, to the mean number of counts. Since - * the background is constant, this class will never - * be updated. + * \param bckg_class the last class is used to model the background + * by setting all its parameters, at all positions, to the + * background base probabilties. Since the background is constant, + * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ - EMRead(Matrix2D&& read_matrix, - size_t n_class, - size_t n_iter, - size_t n_shift, - bool flip, - bool bckg_class, - const std::string& seed="", - size_t n_threads=0) ; + EMConsensusSequence(Matrix3D&& sequence_matrix, + size_t n_class, + size_t n_iter, + size_t n_shift, + bool flip, + bool bckg_class, + const std::string& seed="", + size_t n_threads=0) ; - EMRead(const EMRead& other) = delete ; + EMConsensusSequence(const EMConsensusSequence& other) = delete ; /*! * \brief Destructor. */ - virtual ~EMRead() override ; + virtual ~EMConsensusSequence() override ; /*! - * \brief Returns the class read signal model. - * \return the class read signal model. + * \brief Returns the class sequence model. + * \return the class sequence model. */ - Matrix3D get_read_models() const ; + Matrix3D get_sequence_models() const ; /*! - * \brief Runs the read signal model optimization and + * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ - virtual EMRead::exit_codes classify() override ; + virtual EMConsensusSequence::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. 
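As a side note on why the per-row rescaling performed in compute_loglikelihood_routine() matters for this routine, here is a tiny numeric sketch (the values are made up): exponentiating raw log likelihoods such as -1200 and -1210 underflows to zero, whereas subtracting the row maximum first gives exp(0) and exp(-10), and the division by the row sum below cancels the shift, so the posterior probabilities are unchanged.

    // illustrative only, not from the diff
    double ll1 = -1200., ll2 = -1210. ;
    double m   = std::max(ll1, ll2) ;   // per-row maximum
    double p1  = exp(ll1 - m) ;         // 1.0
    double p2  = exp(ll2 - m) ;         // ~4.5e-5
    double sum = p1 + p2 ;
    p1 /= sum ; p2 /= sum ;             // same posteriors as without the shift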
* \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. */ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. */ - ReadLayer* read_layer ; + ConsensusSequenceLayer* cseq_layer ; } ; -#endif // EMREAD_HPP +#endif // EMCONSENSUSSEQUENCE_HPP diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp index d3efbbf..e8a9e1d 100644 --- a/src/Clustering/EMJoint.cpp +++ b/src/Clustering/EMJoint.cpp @@ -1,623 +1,626 @@ #include +#include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() +#include // exp() #include #include #include #include #include +#include #include #include // getRandomNumberGenerator() #include // ConsoleProgressBar EMJoint::EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } } EMJoint::EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! 
Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } } EMJoint::EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; this->seq_layer->update_model(this->post_prob, this->threads) ; } EMJoint::EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! 
A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; } EMJoint::~EMJoint() { // join the threads in case // deleted by EMBase destructor if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } // read data and models for(auto& ptr : this->read_layers) { if(ptr != nullptr) { delete ptr ; ptr = nullptr ; } } // sequence data and models if(seq_layer != nullptr) { delete seq_layer ; - seq_layer = nullptr ; + this->seq_layer = nullptr ; } } std::vector> EMJoint::get_read_models() const { std::vector> models ; for(const auto& ptr : this->read_layers) { models.push_back(ptr->get_model()) ; } return models ; } Matrix3D EMJoint::get_sequence_models() const { return this->seq_layer->get_model() ; } EMJoint::exit_codes EMJoint::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMJoint::exit_codes::ITER_MAX ; } void EMJoint::compute_loglikelihood() { // compute the loglikelihood for each layer size_t i = 0 ; for(auto& ptr : this->read_layers) { ptr->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } if(this->seq_layer != nullptr) { this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } // sum the likelihood for each state, over all layers // and rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t j=0; jthreads->addJob(std::move( std::bind(&EMJoint::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[j])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } /* void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // limite value range for(size_t i=from; in_class; j++) { for(size_t k=0; 
kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } done.set_value(true) ; } */ void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // the max likelihood found per row std::vector rowmax(to-from, std::numeric_limits::lowest()) ; // sum over layers for(size_t i=from, i_rowmax=0; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { // add rescaled layer value this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } // keep track of max if(this->loglikelihood(i,j,k,l) > rowmax[i_rowmax]) { rowmax[i_rowmax] = this->loglikelihood(i,j,k,l) ; } } } } } // rescale for(size_t i=from, i_rowmax=0; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ; } } } } done.set_value(true) ; } void EMJoint::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. 
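The rewritten compute_loglikelihood_routine() above rescales in a second pass: after summing the per-layer contributions it subtracts, per row, the largest joint log likelihood found in that row. Here is a minimal sketch of the idea on a single flattened row; the real code walks the 4D class/shift/flip matrix.

#include <algorithm>
#include <vector>

// rescales one row of log likelihoods so that the largest value becomes 0,
// as compute_loglikelihood_routine() does per data row before the values are
// exponentiated in compute_post_prob(); subtracting a per-row constant keeps
// exp() away from underflow and leaves the posteriors unchanged because the
// constant cancels during row normalization
void rescale_row(std::vector<double>& loglik_row)
{   double rowmax = *std::max_element(loglik_row.begin(), loglik_row.end()) ;
    for(double& ll : loglik_row)
    {   ll -= rowmax ; }
}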
; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; // double p = std::max(exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * // this->post_state_prob(n_class,n_shift,n_flip), // ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; // double p = this->post_prob(i,n_class,n_shift,n_flip) / // this->post_prob_rowsum[i] ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMJoint::update_models() { // read data and models for(auto& ptr : this->read_layers) { ptr->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } // sequence data and models if(this->seq_layer != nullptr) { this->seq_layer->update_model(this->post_prob, this->threads) ; } } diff --git a/src/Clustering/EMJoint.hpp b/src/Clustering/EMJoint.hpp index 41e4f9f..7ed8a51 100644 --- a/src/Clustering/EMJoint.hpp +++ b/src/Clustering/EMJoint.hpp @@ -1,283 +1,313 @@ #ifndef EMJOINT_HPP #define EMJOINT_HPP #include #include #include #include #include #include #include #include +#include typedef std::vector vector_d ; class EMJoint : public EMBase { public: /*! * \brief Constructs an object to partition the * region according to all the given read densities * with the given shifting and flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related - * signal) for the regions of interest. + * signal) for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. * \param seq_matrix a matrix containing the DNA * sequences for the regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities * with the given shifting and flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related - * signal) for the regions of interest. + * signal) for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. 
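compute_post_prob_routine() above turns the rescaled log likelihoods of a row into posteriors: multiply exp(log likelihood) by the prior state probability, normalize by the row sum, and floor the result at p_min so rounding can never produce an exact zero. A minimal single-row sketch follows; the states are flattened into one index here, whereas the real code iterates classes, shifts and flips.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// computes the posterior probabilities of one data row from its rescaled log
// likelihoods and the matching prior state probabilities, then row-normalizes
// with a p_min floor, as compute_post_prob_routine() does
std::vector<double> row_posteriors(const std::vector<double>& loglik_row,
                                   const std::vector<double>& state_prob,
                                   double p_min)
{   std::vector<double> post(loglik_row.size(), 0.) ;
    double rowsum = 0. ;
    for(size_t s=0; s<loglik_row.size(); s++)
    {   post[s] = std::exp(loglik_row[s]) * state_prob[s] ;
        rowsum += post[s] ;
    }
    for(size_t s=0; s<loglik_row.size(); s++)
    {   // avoid exact 0's caused by rounding errors
        post[s] = std::max(post[s] / rowsum, p_min) ;
    }
    return post ;
}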
* \param seq_matrix a matrix containing the DNA - * sequences for the regions of interest. + * sequences for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities - * and region sequences with the given shifting and + * and sequences with the given shifting and * flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related - * signal) for the regions of interest. + * signal) for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. * \param seq_matrix a matrix containing the DNA - * sequences for the regions of interest. + * sequences for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region according to all the given read densities - * and region sequences with the given shifting and + * and sequences with the given shifting and * flipping freedom. * \param read_matrices a vector containing all * the different data densities (ChIP-seq or related - * signal) for the regions of interest. + * signal) for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. * \param seq_matrix a matrix containing the DNA - * sequences for the regions of interest. + * sequences for the regions of interest. The matrix + * dimensions are as follow: + * 1st the number of regions, + * 2nd the region length. 
* \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model * the background by setting all its parameters, at all * positions, to the background base probabilties (for * sequence models) and the mean number of counts (for * read signal models). Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; + + EMJoint(const EMJoint& other) = delete ; /*! * \brief Destructor. */ virtual ~EMJoint() override ; /*! * \brief Returns all layer read models. * The models are in the same order * as the data were given to the * constructor. * \return a vector containing the * models. */ std::vector> get_read_models() const ; /*! * \brief Returns the sequence models. * \return a vector containing the * models. */ Matrix3D get_sequence_models() const ; /*! * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMJoint::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood() that * computes the joint loglikelihood by summing the * individual loglikelihood obtained from each data layer. * At the same time, this method rescales the loglikelihood * values by substacting to each value the maximum * loglikelihood value found in the same data row, * for each layer. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the number of data layers. */ size_t n_layer ; /*! * \brief the log likelihood buffers for each individual * layer (one element per layer). */ std::vector> loglikelihood_layer ; /*! * \brief the max loglikelihood value for * each each data layer (1st dimension) * and each data row of the given layer * (2nd dimension). 
*/ std::vector loglikelihood_max ; /*! * \brief A vector containing the pointers * to the objects managing all the read - * layer data and models. + * data and models. */ std::vector read_layers ; /*! * \brief A pointer to the object managing - * the data and their model. + * the sequence data and their models. */ SequenceLayer* seq_layer ; + /*! + * \brief A pointer to the object managing + * the consensus sequences data and their + * models. + */ + ConsensusSequenceLayer* conseq_layer ; } ; #endif // EMJOINT_HPP diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp index edad5a3..a8cb150 100644 --- a/src/Clustering/EMRead.cpp +++ b/src/Clustering/EMRead.cpp @@ -1,308 +1,314 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() +#include +#include #include // ReadLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool EMRead::EMRead(const Matrix2D& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models this->read_layer = new ReadLayer(read_matrix, this->n_class, this->n_shift, flip, bckg_class, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; } EMRead::EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models this->read_layer = new ReadLayer(std::move(read_matrix), this->n_class, this->n_shift, flip, bckg_class, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; } EMRead::~EMRead() -{ if(this->read_layer == nullptr) +{ if(this->read_layer!= nullptr) + { delete this->read_layer ; + this->read_layer = nullptr ; + } + if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } Matrix3D EMRead::get_read_models() const { return read_layer->get_model() ; } EMRead::exit_codes EMRead::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; // std::cerr << this->post_prob_rowsum << std::endl ; // std::cerr << this->post_prob_colsum << std::endl ; this->compute_post_prob() ; // M-step // std::cerr << this->post_prob_rowsum << std::endl ; // std::cerr << this->post_prob_colsum << std::endl ; this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMRead::exit_codes::ITER_MAX ; } void EMRead::compute_loglikelihood() { // compute the loglikelihood this->read_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; /* // rescale the values for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } */ // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMRead::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMRead::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
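The destructor change above fixes an inverted null check: the old EMRead destructor only cleaned up the thread pool when read_layer happened to be nullptr and never released the layer itself, while the new version deletes the layer and joins the pool whenever they exist. Below is a minimal sketch of the corrected ownership pattern; DummyLayer and DummyThreadPool are placeholders standing in for ReadLayer and ThreadPool.

struct DummyLayer {} ;
struct DummyThreadPool { void join() {} } ;

// mirrors the corrected destructor logic: delete the owned layer if present,
// then join and delete the thread pool, nulling each pointer so a double
// delete cannot happen
struct Owner
{   DummyLayer*      layer   = nullptr ;
    DummyThreadPool* threads = nullptr ;

    ~Owner()
    {   if(layer != nullptr)
        {   delete layer ;
            layer = nullptr ;
        }
        if(threads != nullptr)
        {   threads->join() ;
            delete threads ;
            threads = nullptr ;
        }
    }
} ;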
; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMRead::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { // avoid p=0. by rounding errors double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMRead::update_models() { this->read_layer->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMRead.hpp index ec3a896..0c49b6a 100644 --- a/src/Clustering/EMRead.hpp +++ b/src/Clustering/EMRead.hpp @@ -1,172 +1,173 @@ #ifndef EMREAD_HPP #define EMREAD_HPP #include #include #include #include // std::promise +#include #include #include typedef std::vector vector_d ; class EMRead : public EMBase { public: /*! * \brief Constructs an object to partition the * region (rows) according to the shape of the signal * with the given shifting and flipping freedom. * \param read_matrix a matrix containing the read * densitiy (ChIP-seq or related signal) for the * regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the * background by setting all its parameters, at all * positions, to the mean number of counts. 
Since * the background is constant, this class will never * be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMRead(const Matrix2D& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region (rows) according to the shape of the signal * with the given shifting and flipping freedom. * \param read_matrix a matrix containing the read * densitiy (ChIP-seq or related signal) for the * regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the * background by setting all its parameters, at all * positions, to the mean number of counts. Since * the background is constant, this class will never * be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; EMRead(const EMRead& other) = delete ; /*! * \brief Destructor. */ virtual ~EMRead() override ; /*! * \brief Returns the class read signal model. * \return the class read signal model. */ Matrix3D get_read_models() const ; /*! * \brief Runs the read signal model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMRead::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. 
*/ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. */ ReadLayer* read_layer ; } ; #endif // EMREAD_HPP diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp index 2a182e3..bf2bb0e 100644 --- a/src/Clustering/EMSequence.cpp +++ b/src/Clustering/EMSequence.cpp @@ -1,351 +1,356 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() +#include +#include #include // SequenceLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool -#include // dna::base_composition() EMSequence::EMSequence(const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; } EMSequence::EMSequence(Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; } EMSequence::EMSequence(const Matrix2D& seq_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // data and models // background motif (if any) is the last of the given motifs this->seq_layer = new SequenceLayer(seq_matrix, motifs, this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::EMSequence(Matrix2D&& seq_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
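In the motif-seeded EMSequence constructors above, the shifting freedom is not a free parameter: it is derived as (sequence length) - (motif length) + 1, the number of start positions at which the motif fits inside a sequence. A small illustrative check follows; the function name and the 1 kb / 15 bp figures are only an example.

#include <cassert>
#include <cstddef>

// number of start positions for a motif of length l_motif inside a sequence
// of length l_seq, i.e. the n_shift passed to EMBase by the motif-seeded
// constructors
size_t n_shift_from_motif(size_t l_seq, size_t l_motif)
{   return l_seq - l_motif + 1 ; }

int main()
{   // e.g. 1 kb sequences scanned with a 15 bp motif
    assert(n_shift_from_motif(1000, 15) == 986) ;
    return 0 ;
}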
; // data and models // background motif (if any) is the last of the given motifs this->seq_layer = new SequenceLayer(std::move(seq_matrix), std::move(motifs), this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::~EMSequence() -{ if(this->seq_layer == nullptr) +{ if(this->seq_layer != nullptr) + { delete this->seq_layer ; + this->seq_layer = nullptr ; + } + if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } Matrix3D EMSequence::get_sequence_models() const { return seq_layer->get_model() ; } EMSequence::exit_codes EMSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMSequence::exit_codes::ITER_MAX ; } void EMSequence::compute_loglikelihood() { // compute the loglikelihood this->seq_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], SequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMSequence::update_models() { this->seq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/EMSequence.hpp b/src/Clustering/EMSequence.hpp index 1cec297..622532b 100644 --- a/src/Clustering/EMSequence.hpp +++ b/src/Clustering/EMSequence.hpp @@ -1,239 +1,240 @@ #ifndef EMSEQUENCE_HPP #define EMSEQUENCE_HPP #include #include #include #include // std::promise +#include #include #include typedef std::vector vector_d ; class EMSequence : public EMBase { public: /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences models are initialised randomly. * \param sequence_matrix a matrix containing the sequences * of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the background * by setting all its parameters, at all positions, to the * background base probabilties. Since the background is constant, * this class will never be updated. 
* \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(const Matrix2D& sequence_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences models are initialised randomly. * \param sequence_matrix a matrix containing the sequences * of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the background * by setting all its parameters, at all positions, to the * background base probabilties. Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(Matrix2D&& sequence_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences class models are initialised using * the given motifs. The class probabilities are * initialised uniformlly. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param sequence_matrix a matrix containing the sequences * of interest. * \param motifs a matrix containing the different initial * class models with the following dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 for A,C,G,T * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param flip whether flipping is allowed. * \param bckg_class indicates that the last class in the * given motifs is used to model the background and it * should never be updated. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(const Matrix2D& sequence_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences class models are initialised using * the given motifs. The class probabilities are * initialised uniformlly. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param sequence_matrix a matrix containing the sequences * of interest. * \param motifs a matrix containing the different initial * class models with the following dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 for A,C,G,T * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param flip whether flipping is allowed. * \param bckg_class indicates that the last class in the * given motifs is used to model the background and it * should never be updated. 
* \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(Matrix2D&& sequence_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads=0) ; EMSequence(const EMSequence& other) = delete ; /*! * \brief Destructor. */ virtual ~EMSequence() override ; /*! * \brief Returns the class sequence model. * \return the class sequence model. */ Matrix3D get_sequence_models() const ; /*! * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMSequence::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. */ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. */ SequenceLayer* seq_layer ; } ; #endif // EMSEQUENCE_HPP diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp index 79e61da..3c0e70d 100644 --- a/src/Clustering/ReadLayer.cpp +++ b/src/Clustering/ReadLayer.cpp @@ -1,492 +1,492 @@ #include + #include // std::invalid_argument #include // numeric_limits #include // log(), exp(), pow() #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() -#include // beta_pmf(), poisson_pmf() -#include // rand_real_uniform(), rand_int_uniform() +#include // poisson_pmf() #include #include #include #include #include typedef std::vector vector_d ; ReadLayer::ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class, ThreadPool* threads) - : DataLayer(data, n_class, n_shift, flip, bckg_class), + : Data2DLayer(data, n_class, n_shift, flip, bckg_class), window_means(n_row, n_shift, 0.) 
{ this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; // background class if(this->bckg_class) { this->create_bckg_class() ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class, ThreadPool* threads) - : DataLayer(std::move(data), n_class, n_shift, flip, bckg_class), + : Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; // background class if(this->bckg_class) { this->create_bckg_class() ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool bckg_class, ThreadPool* threads) - : DataLayer(data, model, flip, bckg_class), + : Data2DLayer(data, model, flip, bckg_class), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool bckg_class, ThreadPool* threads) - : DataLayer(std::move(data), std::move(model), flip, bckg_class), + : Data2DLayer(std::move(data), std::move(model), flip, bckg_class), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::~ReadLayer() {} void ReadLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, std::ref(loglikelihood), std::ref(loglikelihood_max), promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const { // normalize the models Matrix3D model_norm = this->model ; for(size_t i=0; in_class; i++) { double mean = 0. 
; for(size_t j=0; jl_model; j++) { mean += model_norm(i,j,0) ; } mean /= this->l_model ; for(size_t j=0; jl_model; j++) { model_norm(i,j,0) /= mean ; } } // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s_fw=0, s_rev=this->n_shift-1; s_fwn_shift; s_fw++, s_rev--) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood double ll_fw = 0. ; double ll_rev = 0. ; // --------------- forward --------------- size_t from_dat_fw = s_fw ; size_t to_dat_fw = from_dat_fw + this->l_model - 1 ; // --------------- reverse --------------- size_t from_dat_rev = this->n_col - 1 - s_fw ; // size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; j_dat_fwdata(i,j_dat_fw), model_norm(j,j_ref_fw,0)* this->window_means(i,s_fw))) ; // ll_fw += ll ; // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf ll_fw += std::max(ll, ReadLayer::p_min_log) ; // --------------- reverse --------------- if(this->flip) { ll = log(poisson_pmf(this->data(i,j_dat_rev), model_norm(j,j_ref_fw,0)* this->window_means(i,s_rev))) ; // ll_rev += ll ; // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf ll_rev += std::max(ll, ReadLayer::p_min_log) ; } } loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ; // keep track of the max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } if(this->flip) { loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ; // keep track of the max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // computing sum over the columns (classes) size_t n_row = posterior_prob.get_dim()[0] ; size_t n_class = posterior_prob.get_dim()[1] ; size_t n_shift = posterior_prob.get_dim()[2] ; size_t n_flip = posterior_prob.get_dim()[3] ; vector_d colsum(n_class, 0.) 
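compute_loglikelihoods_routine() above first normalizes each class profile to an average of one count, then treats (normalized profile value x window mean) as a Poisson rate for the observed count at each position, flooring every per-position log term at p_min_log so a single near-impossible count cannot drive the whole window to -inf. Here is a minimal forward-strand sketch of that scoring; log_poisson_pmf() and window_loglikelihood() are illustrative names.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// log of the Poisson pmf, P(k | lambda) = lambda^k * exp(-lambda) / k!,
// written with lgamma() to stay in log space
double log_poisson_pmf(double k, double lambda)
{   return k * std::log(lambda) - lambda - std::lgamma(k + 1.) ; }

// scores one read window against one mean-normalized class profile, as the
// forward branch of compute_loglikelihoods_routine() does: each profile
// position scaled by the window mean gives the Poisson rate for the count
// observed there, and each per-position term is floored at p_min_log
double window_loglikelihood(const std::vector<double>& counts,
                            const std::vector<double>& model_norm,
                            double window_mean,
                            double p_min_log)
{   double ll = 0. ;
    for(size_t j=0; j<model_norm.size(); j++)
    {   double term = log_poisson_pmf(counts[j], model_norm[j] * window_mean) ;
        ll += std::max(term, p_min_log) ;
    }
    return ll ;
}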
; for(size_t i=0; iupdate_model(posterior_prob, colsum, threads) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, posterior_prob_colsum, promise) ; // this->model = future.get() ; auto model = future.get() ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(posterior_prob_colsum), std::ref(promises[i])))) ; } // reinitialise the model /* this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; */ size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. ; } } } // wait until all threads are done working // and update the mode for(auto& future : futures) { Matrix3D model_part = future.get() ; // for(size_t i=0; in_class; i++) for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // avoid 0's in the model to ensure that pmf_poisson() never // return 0 for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), ReadLayer::p_min) ; } } } } void ReadLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ; // partial model Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) 
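update_model_routine() above implements the M-step as a posterior-weighted average: each model position accumulates the counts aligned under it across all rows and shift states, weighted by the corresponding posterior and divided by the class's total posterior mass. A forward-only, single-class sketch with plain nested vectors follows.

#include <cstddef>
#include <vector>

// one-class, forward-only sketch of the M-step: data is (n_row x n_col),
// posterior is (n_row x n_shift) for this class, colsum is the total
// posterior mass of the class used as normalizer, and the returned vector
// is the updated class profile of length l_model
std::vector<double> update_class_model(const std::vector<std::vector<double>>& data,
                                       const std::vector<std::vector<double>>& posterior,
                                       size_t l_model,
                                       double colsum)
{   size_t n_shift = data[0].size() - l_model + 1 ;
    std::vector<double> model(l_model, 0.) ;
    for(size_t i=0; i<data.size(); i++)
    {   for(size_t s=0; s<n_shift; s++)
        {   for(size_t j=0; j<l_model; j++)
            {   model[j] += posterior[i][s] * data[i][s + j] / colsum ; }
        }
    }
    return model ;
}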
; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t n_class=0; n_class < n_class_to_update; n_class++) { for(size_t i=from; in_shift; n_shift++) { // --------------- forward --------------- int from_dat_fw = n_shift ; int to_dat_fw = from_dat_fw + this->l_model - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::FORWARD) * this->data(i,j_dat_fw)) / posterior_prob_colsum[n_class] ; } // --------------- reverse --------------- if(this->flip) { int from_dat_rev = this->n_col - 1 - n_shift ; int to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::REVERSE) * this->data(i,j_dat_rev)) / posterior_prob_colsum[n_class] ; } } } } } promise.set_value(model) ; } void ReadLayer::compute_window_means(ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_window_means_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_window_means_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_window_means_routine(size_t from, size_t to, std::promise& done) { double l_window = double(this->l_model) ; for(size_t i=from; in_shift; from++) { double sum = 0. ; // slice is [from,to) size_t to = from + this->l_model ; for(size_t j=from; jdata(i,j) ;} this->window_means(i,from) = sum / l_window ; } } done.set_value(true) ; } void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const { if(posterior_prob_colsum.size() != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_class_prob matrix size is not " "equal to model class number : %zu / %zu", posterior_prob_colsum.size(), this->n_class) ; throw std::invalid_argument(msg) ; } } void ReadLayer::create_bckg_class() { // mean count double mean = 0. ; double n = (double)this->data.get_nrow() * (double)this->data.get_ncol() ; for(size_t i=0; idata.get_nrow(); i++) { for(size_t j=0; jdata.get_ncol(); j++) { mean += ((double)this->data(i,j)) / n ; } } // set last class as background class for(size_t j=0; jl_model; j++) { this->model(this->n_class-1,j,0) = mean ; } } diff --git a/src/Clustering/ReadLayer.hpp b/src/Clustering/ReadLayer.hpp index aa1fedc..5d52c92 100644 --- a/src/Clustering/ReadLayer.hpp +++ b/src/Clustering/ReadLayer.hpp @@ -1,266 +1,269 @@ #ifndef READLAYER_HPP #define READLAYER_HPP -#include +#include + +#include // std::promise + #include #include #include #include typedef std::vector vector_d ; -class ReadLayer : public DataLayer +class ReadLayer : public Data2DLayer { public: /*! 
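compute_window_means_routine() above precomputes, for every row and every shift, the mean count of the l_model-long window starting at that shift; these means are what scale the normalized class profiles into Poisson rates during scoring. A single-row sketch:

#include <cstddef>
#include <vector>

// for one data row, computes the mean count of each l_model-long window,
// one value per possible shift, as compute_window_means_routine() does
std::vector<double> row_window_means(const std::vector<double>& row, size_t l_model)
{   size_t n_shift = row.size() - l_model + 1 ;
    std::vector<double> means(n_shift, 0.) ;
    for(size_t s=0; s<n_shift; s++)
    {   double sum = 0. ;
        for(size_t j=s; j<s+l_model; j++)
        {   sum += row[j] ; }
        means[s] = sum / double(l_model) ;
    }
    return means ;
}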
* \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class, ThreadPool* threads = nullptr) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool bckg_class, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool bckg_class, ThreadPool* threads = nullptr) ; /*! * Destructor */ virtual ~ReadLayer() override ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * During this process, a normalized version of the * models, having a sum of signal of 1 count in average, * is used (a copy of the models is normalized, meaning * that the original models can still be retrieved the * dedicated getter). * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! 
* \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * This method does the same as the virtual method it * overloads. The only difference is that, for run time * gain, it is given the sum over the columns of the * posterior_prob matrix which is computed by the virtual * method. * \param posterior_prob the data assignment probabilities to * the different classes. * \param posterior_prob_colsum the sum over the columns * (classes) of the posterior_prob matrix. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads=nullptr) ; protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param promise a promise containing the partial model * computed from the given data slice. If several routines * work together to update the model, the promise matrices * need to be summed up to get the final model. */ void update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const ; /*! * \brief Computes the mean number of reads present in * each slice (of length l_model), in each row * of the data and store them in this->window_means. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void compute_window_means(ThreadPool* threads) ; /*! * \brief The routine that effectively computes the * window means. * \param from the index of the first row of the * data to considered. * \param to the index of the past last row of the * data to considered. 
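The parallel paths of update_model() and compute_window_means() follow one pattern: the row range is split into one slice per thread, each routine fills a std::promise when its slice is processed, and the caller blocks on the matching futures. Here is a self-contained sketch of that pattern with std::thread standing in for the project's ThreadPool; the slice split only mirrors what ThreadPool::split_range() is assumed to do.

#include <future>
#include <iostream>
#include <thread>
#include <utility>
#include <vector>

int main()
{   std::size_t n_row = 10, n_threads = 3 ;
    // split [0, n_row) into n_threads nearly equal [from, to) slices
    std::vector<std::pair<std::size_t,std::size_t>> slices ;
    std::size_t chunk = n_row / n_threads, rest = n_row % n_threads, from = 0 ;
    for(std::size_t i=0; i<n_threads; i++)
    {   std::size_t to = from + chunk + (i < rest ? 1 : 0) ;
        slices.push_back({from, to}) ;
        from = to ;
    }
    std::vector<std::promise<bool>> promises(n_threads) ;
    std::vector<std::future<bool>>  futures(n_threads) ;
    std::vector<std::thread>        workers ;
    for(std::size_t i=0; i<n_threads; i++)
    {   futures[i] = promises[i].get_future() ;
        workers.emplace_back([&, i]()
        {   // a real routine would process rows [slices[i].first, slices[i].second)
            promises[i].set_value(true) ;        // signal that this slice is done
        }) ;
    }
    for(auto& f : futures) { f.get() ; }         // wait until all threads are done
    for(auto& w : workers) { w.join() ; }
    for(const auto& s : slices)
    {   std::cout << "[" << s.first << "," << s.second << ") " ; }   // [0,4) [4,7) [7,10)
    std::cout << std::endl ;
    return 0 ;
}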
* \param done a promise to fill when the routine * is done running. */ void compute_window_means_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_class_prob a vector containing the * class probabilities. * It should have a length equal to the number of * classes. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const ; /*! * \brief Sets the last class as background class with the mean * number of counts in the data at each position. */ void create_bckg_class() ; /*! * \brief contains the data means, for * each window of size l_model. */ Matrix2D window_means ; } ; #endif // READLAYER_HPP diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp index 68834d0..88df13c 100644 --- a/src/Clustering/SequenceLayer.cpp +++ b/src/Clustering/SequenceLayer.cpp @@ -1,440 +1,423 @@ #include + #include // std::invalid_argument #include // numeric_limits #include // log(), pow() #include #include // std::max_element() +#include // std::promise, std::future -#include // beta_pmf() -#include // rand_real_uniform(), rand_int_uniform() #include #include #include -#include +#include // dna::base_composition() double SequenceLayer::score_subseq(const Matrix2D& seq, size_t row, size_t start, const Matrix2D& model_log) { if(start > seq.get_ncol() - model_log.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu", start, seq.get_ncol() - model_log.get_nrow()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_nrow() > seq.get_ncol()) { char msg[4096] ; sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", model_log.get_nrow(), seq.get_ncol()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_ncol() != 4) { char msg[4096] ; sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)", model_log.get_ncol()) ; throw std::invalid_argument(msg) ; } size_t from = start ; size_t to = from + model_log.get_nrow() ; // will score [from, to) int n_code = dna::char_to_int('N') ; double ll = 0 ; for(size_t i=from, j=0; i get max score if(base == n_code) { std::vector row = model_log.get_row(j) ; ll += *(std::max_element(std::begin(row), std::end(row))) ; } // A,C,G,T -> get its score else { ll += model_log(j,base) ; } } return ll ; } SequenceLayer::SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) - : DataLayer(data, n_class, n_shift, flip,bckg_class) + : Data2DLayer(data, n_class, n_shift, flip,bckg_class) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; // background class if(this->bckg_class) { this->create_bckg_class() ; } } SequenceLayer::SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) - : DataLayer(std::move(data), n_class, n_shift, flip, bckg_class) + : Data2DLayer(std::move(data), n_class, n_shift, flip, bckg_class) { this->n_category = 4 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) 
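score_subseq() above scores one window of a sequence against a log-probability model, adding one term per position and treating 'N' as the best-scoring base at that position. A standalone sketch with plain vectors follows; the 'N' integer code of 4 matches dna::char_to_int('N') in this code base.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

double score_subseq_sketch(const std::vector<int>& seq,                          // integer-coded sequence
                           std::size_t start,
                           const std::vector<std::vector<double>>& model_log)    // l_model x 4 log probs
{   double ll = 0. ;
    for(std::size_t i=start, j=0; j<model_log.size(); i++, j++)
    {   int base = seq[i] ;
        if(base == 4)                                                             // 'N' -> best base here
        {   ll += *std::max_element(model_log[j].begin(), model_log[j].end()) ; }
        else                                                                      // A,C,G,T -> its own score
        {   ll += model_log[j][base] ; }
    }
    return ll ;
}

int main()
{   // a model of length 2, strongly favouring A then C
    std::vector<std::vector<double>> model_log =
        { {std::log(0.7), std::log(0.1), std::log(0.1), std::log(0.1)},
          {std::log(0.1), std::log(0.7), std::log(0.1), std::log(0.1)} } ;
    std::vector<int> seq = {0, 1, 4, 3} ;                                         // A C N T
    std::cout << score_subseq_sketch(seq, 0, model_log) << std::endl ;            // log(0.7) + log(0.7)
    return 0 ;
}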
; // background class if(this->bckg_class) { this->create_bckg_class() ; } } SequenceLayer::SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool bckg_class) - : DataLayer(data, model,flip, bckg_class) + : Data2DLayer(data, model,flip, bckg_class) {} SequenceLayer::SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool bckg_class) - : DataLayer(std::move(data), std::move(model),flip, bckg_class) + : Data2DLayer(std::move(data), std::move(model),flip, bckg_class) {} SequenceLayer::~SequenceLayer() -{} +{ ; } void SequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // compute the log prob model and the log prob reverse-complement model std::vector> model_log(this->n_class, Matrix2D(this->l_model, this->n_category, 0.)) ; std::vector> model_log_rev = model_log ; - /* - Matrix3D model_log(this->n_class, - this->l_model, - this->n_category, - 0.) ; - Matrix3D model_log_rev = model_log ; - */ for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { // forward model_log[i](j,k) = log(this->model(i,j,k)) ; // reverse model_log_rev[i](this->l_model-j-1,this->n_category-k-1) = log(this->model(i,j,k)) ; } } } // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, loglikelihood, loglikelihood_max, model_log, model_log_rev, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(model_log), std::ref(model_log_rev), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void SequenceLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const { // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { // forward strand { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ; // loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; // apply lower bound here -> log(1e-100) loglikelihood(i,j,s,flip_states::FORWARD) = std::max(ll_fw, SequenceLayer::p_min_log) ; // keep track of max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } } // reverse if(this->flip) { double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ; // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; // apply lower bound here -> log(1e-100) loglikelihood(i,j,s,flip_states::REVERSE) = std::max(ll_rev, SequenceLayer::p_min_log) ; // keep track of max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } 
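The reverse-complement log model built at the start of compute_loglikelihoods() above swaps positions end-for-end and bases A<->T, C<->G while taking the log. A minimal sketch with nested vectors standing in for Matrix2D:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<std::vector<double>>
reverse_complement_log(const std::vector<std::vector<double>>& model)   // l_model x 4 probabilities
{   std::size_t l = model.size() ;
    std::vector<std::vector<double>> rev(l, std::vector<double>(4, 0.)) ;
    for(std::size_t j=0; j<l; j++)
    {   for(std::size_t k=0; k<4; k++)
        {   rev[l-j-1][4-k-1] = std::log(model[j][k]) ; }               // position and base both reversed
    }
    return rev ;
}

int main()
{   // a model of length 2 favouring A then C
    std::vector<std::vector<double>> model = { {0.7, 0.1, 0.1, 0.1},
                                               {0.1, 0.7, 0.1, 0.1} } ;
    auto rev = reverse_complement_log(model) ;
    // the A at position 0 becomes the T at position 1 of the reverse model
    std::cout << rev[1][3] << " == " << std::log(0.7) << std::endl ;
    return 0 ;
}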
} } done.set_value(true) ; } void SequenceLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, promise) ; // this->model = future.get() ; auto model = future.get() ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(promises[i])))) ; } // reinitialise the model - /* - this->model = Matrix3D(this->n_class, - this->l_model, - this->n_category, - 0.) ; - */ size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. ; } } } // wait until all threads are done working // and update the model for(auto& future : futures) { Matrix3D model_part = future.get() ; // for(size_t i=0; in_class; i++) for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // make sure to have no 0 values for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), SequenceLayer::p_min) ; } } } // normalize to get probs for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { double sum = 0. ; for(size_t k=0; kn_category; k++) { sum += this->model(i,j,k) ; } for(size_t k=0; kn_category; k++) { double p = this->model(i,j,k) / sum ; this->model(i,j,k) = p ; - /* - this->model(i,j,k) = - std::max(p, SequenceLayer::p_min) ; - */ } } } } void SequenceLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) ; // the int code of A, C, G, T, N static int a_code = dna::char_to_int('A') ; static int c_code = dna::char_to_int('C') ; static int g_code = dna::char_to_int('G') ; static int t_code = dna::char_to_int('T') ; static int n_code = dna::char_to_int('N') ; // the int code of the reverse complement of A, C, G, T static int a_code_r = dna::char_to_int('A', true) ; static int c_code_r = dna::char_to_int('C', true) ; static int g_code_r = dna::char_to_int('G', true) ; static int t_code_r = dna::char_to_int('T', true) ; size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t k=0; k < n_class_to_update; k++) - // for(size_t k=0; k < this->n_class; k++) { for(size_t s=0; sn_shift; s++) { for(size_t j=0; jl_model; j++) { // base prob on fw and rv strand vector_d base_prob_fw(this->n_category, 0.) ; vector_d base_prob_rv(this->n_category, 0.) 
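The last two loops of update_model() above first floor every model value at a small pseudo-probability p_min and then renormalise each position so that the four base probabilities sum to one. A tiny worked sketch of those two steps; the p_min value used here is only an assumption.

#include <algorithm>
#include <iostream>
#include <vector>

int main()
{   const double p_min = 1e-100 ;                        // assumed lower bound
    std::vector<double> position = {12., 0., 3., 5.} ;   // raw weighted counts for A,C,G,T
    double sum = 0. ;
    for(double& v : position)
    {   v = std::max(v, p_min) ;                         // avoid zero probabilities
        sum += v ; }
    for(double& v : position)
    {   v /= sum ; }                                     // now a proper probability vector
    for(double v : position)
    {   std::cout << v << " " ; }                        // 0.6 5e-101 0.15 0.25
    std::cout << std::endl ;
    return 0 ;
}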
; for(size_t i=from; idata(i,s+j) ; int base_rev = this->n_category - base - 1 ; // N if(base == n_code) { // --------------- forward --------------- { base_prob_fw[a_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[c_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[g_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[t_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[a_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[c_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[g_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[t_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } // A, C, G, T else - { { base_prob_fw[base] += + { // --------------- forward --------------- + { base_prob_fw[base] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[base_rev] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } } // update this position of the model for(size_t i=0,i_rev=base_prob_fw.size()-1; iflip) { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } } } } } promise.set_value(model) ; } void SequenceLayer::create_bckg_class() { // sequence composition std::vector base_comp = dna::base_composition(this->data, this->flip) ; // set last class as background class for(size_t i=0; in_category; i++) { for(size_t j=0; jl_model; j++) { this->model(this->n_class-1,j,i) = base_comp[i] ; } } } diff --git a/src/Clustering/SequenceLayer.hpp b/src/Clustering/SequenceLayer.hpp index 567c113..2238ff4 100644 --- a/src/Clustering/SequenceLayer.hpp +++ b/src/Clustering/SequenceLayer.hpp @@ -1,230 +1,231 @@ #ifndef SEQUENCELAYER_HPP #define SEQUENCELAYER_HPP -#include +#include + #include #include -#include // std::promise, std::future +#include // std::promise #include #include #include #include typedef std::vector vector_d ; -class SequenceLayer : public DataLayer +class SequenceLayer : public Data2DLayer { public: /*! * \brief Computes the log-likelihood of the sub- * sequence - stored in a given row - and starting * at the offset in the given sequence matrix. * The subsequence length is determined by the model * lenght. * \param seq the sequences in integer format. * \param row the row containing the sequence of * interest. * \param col the index at which the sub-sequence * is starting (1st index inside the subsequence * of interest). * \param model_log a model containing the log * probability model. * \return the log-likelihood of the sub-sequence * given the model. * \throw std::invalid_argument if 1) the offset is * invalid, 2) the sequence and the model have * incompatible dimensions or 3) the model 2n dimension * is not 4 (A,C,G,T). */ static double score_subseq(const Matrix2D& seq, size_t row, size_t col, const Matrix2D& model_log) ; public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. 
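create_bckg_class() above fills the last class of the model, at every position, with the global base composition of the data, so that class behaves as a flat background motif. A standalone sketch with nested vectors in place of Matrix3D and an arbitrary composition vector:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{   std::size_t n_class = 2, l_model = 3 ;
    std::vector<double> base_comp = {0.3, 0.2, 0.2, 0.3} ;           // as returned by dna::base_composition()
    std::vector<std::vector<std::vector<double>>> model(
        n_class, std::vector<std::vector<double>>(l_model, std::vector<double>(4, 0.))) ;
    // set the last class as background class
    for(std::size_t j=0; j<l_model; j++)
    {   for(std::size_t k=0; k<4; k++)
        {   model[n_class-1][j][k] = base_comp[k] ; }
    }
    std::cout << model[1][0][0] << std::endl ;                        // 0.3
    return 0 ;
}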
* \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, bool bckg_class) ; /*! * \brief Construct an object with the * given data and model. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. * \param model the model with the following * dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, bool bckg_class) ; /*! * \brief Construct an object with the * given data and model. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. * \param model the model with the following * dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. * \param bckg_class should the last class * of the model is constant and models the * background. */ SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, bool bckg_class) ; /*! * Destructor */ virtual ~SequenceLayer() override ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param model_log a vector containing the matrices with * the log values of the model for each class. 
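As a concrete instance of the shifting-freedom formula stated above: sequences of 1001 columns combined with a model of length 981 give a shifting freedom of 1001 - 981 + 1 = 21 shift states (illustrative numbers, not taken from a specific analysis in this repository).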
* \param model_log_rev a vector containing the matrices with * the log values of the reverse strand model for each class * (the 1st position in the model becomes the last in the * reverse model and probabilities are swapped A<->T and C<->G). * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param done a promise containing the partial model * computed from the given data slice. If several routines * work together at updating the model, they need to be * summed and normalized (by the column sum) to get the * final model. */ void update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& done) const ; /*! * \brief Sets the last class as background class with the * mean base probababilities of the data at each position. */ void create_bckg_class() ; } ; #endif // SEQUENCELAYER_HPP diff --git a/src/GenomicTools/ClassReadDataCreator.cpp b/src/GenomicTools/ClassReadDataCreator.cpp new file mode 100644 index 0000000..b9851b3 --- /dev/null +++ b/src/GenomicTools/ClassReadDataCreator.cpp @@ -0,0 +1,173 @@ +#include + +#include +#include +#include // std::invalid_argument + +#include +#include + +template +std::ostream& operator << (std::ostream& stream, + const std::vector& v) +{ for(const auto& x:v) + { stream << x << " " ; } + return stream ; +} + +ClassReadDataCreator::ClassReadDataCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + const std::string& prob_file_path, + int from, + int to, + int bin_size, + size_t class_k, + CorrelationMatrixCreator::methods method) + : bed_file_path(bed_file_path), + bam_file_path(bam_file_path), + bai_file_path(bai_file_path), + prob_file_path(prob_file_path), + from(from), + to(to), + bin_size(bin_size), + class_k(class_k-1), // (1-based -> 0-based) + method(method) +{ ; } + +ClassReadDataCreator::~ClassReadDataCreator() +{ ; } + + +Matrix2D ClassReadDataCreator::create_matrix() +{ + // load posterior prob + Matrix4D prob ; + prob.load(this->prob_file_path) ; + size_t n_row = prob.get_dim()[0] ; + size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; + bool flip = n_flip == 2 ; + + // compute class prob + std::vector prob_colsum(n_class, 0.) ; + double tot = 0. ; + for(size_t i=0; iclass_k > n_class) + { char msg[4096] ; + sprintf(msg, "Error! invalid class of interest (%zu and %zu found)", + this->class_k, n_class) ; + throw std::invalid_argument(msg) ; + } + + // realigning the data using the same shift as the partition + // will "trim" the matrix + // -> create a wider matrix such that the realigned matrix will + // be of the desired wide (original from and to parameters). 
+ int ext = n_shift - 1 ; + int ext_r = ext / 2 ; + int ext_l = ext - ext_r ; + int from2 = this->from - ext_l ; + int to2 = this->to + ext_r ; + + CorrelationMatrixCreator mc(this->bed_file_path, + this->bam_file_path, + this->bai_file_path, + from2, + to2, + this->bin_size, + this->method) ; + Matrix2D data2 = mc.create_matrix() ; + size_t n_col2 = data2.get_ncol() ; + + // check the compatibility with the prob matrix + if(data2.get_nrow() != prob.get_dim()[0]) + { char msg[4096] ; + sprintf(msg, "Error! the number of references in the BED file is not equal to the" + "1st dimension of the probability matrix (%zu, %zu).", + data2.get_nrow(), prob.get_dim()[0]) ; + throw(std::invalid_argument(msg)) ; + } + else if(data2.get_nrow() < prob.get_dim()[1]) + { char msg[4096] ; + sprintf(msg, "Error! the number of references in the BED file is smaller than" + "the number of classes in the probability matrix (%zu, %zu).", + data2.get_nrow(), prob.get_dim()[1]) ; + throw(std::invalid_argument(msg)) ; + + } + else if(data2.get_ncol() < prob.get_dim()[2]) + { char msg[4096] ; + sprintf(msg, "Error! the data matrix has fewer columns (bins) than the" + "the shifting freedom in the probability matrix (%zu, %zu).", + data2.get_ncol(), prob.get_dim()[2]) ; + throw(std::invalid_argument(msg)) ; + } + + // realign matrix + // size_t n_col1 = data2.get_ncol() ; + size_t n_col3 = this->to - this->from + 1 ; + Matrix2D data3(n_row, + n_col3, + 0.) ; + for(size_t i=0; i= to_dat2_rev; + j_dat2_rev--, j_dat3_fw++) + { data3(i,j_dat3_fw) += + (prob(i,class_k,s,1) * + data2(i,j_dat2_rev)) / + prob_colsum[class_k] ; + } + } + } + } + + // clean memory + data2 = Matrix2D() ; + prob = Matrix4D() ; + + // convert to integer + Matrix2D data4(data3.get_nrow(), + data3.get_ncol()) ; + for(size_t i=0; i< data4.get_nrow(); i++) + { for(size_t j=0; j() ; + + return data4 ; +} diff --git a/src/GenomicTools/ClassReadDataCreator.hpp b/src/GenomicTools/ClassReadDataCreator.hpp new file mode 100644 index 0000000..23fd0f9 --- /dev/null +++ b/src/GenomicTools/ClassReadDataCreator.hpp @@ -0,0 +1,148 @@ +#ifndef CLASSREADDATACREATOR_HPP +#define CLASSREADDATACREATOR_HPP + +#include +#include +#include + +#include // BamFileIn +#include // BedFileIn + +#include +#include + +/*! + * \brief The ClassReadDataCreator class retro-computes + * a fictive count matrix of a given class K, from a partition, assuming + * that the classification is true. + * To construct the final matrix M3, the original matrix M1 that was + * partitioned is computed and extended into a matrix M2. The original + * matrix M1 has L1 column and was partitioned allowing S shifting freedom. + * A number of extra columns E, overall, is added to the matrix M1 (~E/2 on + * each side). The final matrix M3, of L1 columns, is then computed. A row + * of the final matrix M3 is the weighted average of each of the S possibles + * slices of the corresponding row in M2. The weight used are the + * probabilities with which this row was assigned to class K, for each of + * the S slices. + */ +class ClassReadDataCreator +{ + + public: + + + ClassReadDataCreator() = delete ; + + /*! + * \brief Constructs an object to build a + * class correlation matrix from a partition. + * \param bed_file_path the path to the file containing + * the references. + * \param bam_file_path the path to the file containing + * the targets. + * \param bai_file_path the path to index file of the bam + * file containing the targets. 
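The realignment loop in create_matrix() above rebuilds each row as the average of all shift-slices of the wider row, weighted by the posterior probability of the corresponding shift for the class of interest. Below is a standalone, forward-strand-only sketch with plain vectors; for simplicity the weights are normalised per row here, whereas the code above divides by the total class probability prob_colsum[class_k].

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

std::vector<double> realign_row(const std::vector<double>& wide_row,   // l + n_shift - 1 columns
                                const std::vector<double>& shift_prob) // posterior prob of each shift
{   std::size_t n_shift = shift_prob.size() ;
    std::size_t l = wide_row.size() - n_shift + 1 ;                    // realigned length
    double norm = std::accumulate(shift_prob.begin(), shift_prob.end(), 0.) ;
    std::vector<double> row(l, 0.) ;
    for(std::size_t s=0; s<n_shift; s++)
    {   for(std::size_t j=0; j<l; j++)
        {   row[j] += shift_prob[s] * wide_row[s+j] / norm ; }         // weighted average over slices
    }
    return row ;
}

int main()
{   std::vector<double> wide = {1., 2., 3., 4.} ;   // 4 columns, 2 shift states -> 3 realigned columns
    std::vector<double> prob = {0.75, 0.25} ;       // weights of shift 0 and shift 1
    for(double v : realign_row(wide, prob))
    {   std::cout << v << " " ; }                   // 1.25 2.25 3.25
    std::cout << std::endl ;
    return 0 ;
}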
+ * \param prob_file_path the path to the file containing + * the assignment probabilities of the partition. + * It should be 4D matrix with the following dimensions : + * 1st the number of regions, should be the number of + * references in the BED file. + * 2nd the number of classes. + * 3rd the shifting freedom. + * 4th the flipping freedom (1 for no flip, 2 otherwise). + * \param from the upstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param to the dowmstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param bin_size the bin size in base pair. + * \param class_k the index (1-based) of the class of + * interest for which a matrix should be computed, + * from the partition. + * \param method how the targets should be counted. + * READ all the positions inside the reads are + * counted. + * READ_ATAC only the +4bp position of +strand reads + * and the -5bp of -strand reads are counted. It + * correspond to the insertion position in ATAC-seq + * data. + * FRAGMENT all the positions within fragments (the + * genome segment between a pair of reads, reads + * included) are counted. + * FRAGMENT_CENTER only the central position of the + * fragements (the genome segment between a pair of + * reads, reads included) are counted. + */ + ClassReadDataCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + const std::string& prob_file_path, + int from, + int to, + int bin_size, + size_t class_k, + CorrelationMatrixCreator::methods method) ; + + /*! + * Destructor. + */ + ~ClassReadDataCreator() ; + + /*! + * \brief Computes the matrix and returns it. + * \return the class count matrix. + * The weighted averages are floating values + * however, they are rounded and returned as + * integer values. + */ + Matrix2D create_matrix() ; + + + protected: + /*! + * \brief Bed file path. + */ + std::string bed_file_path ; + /*! + * \brief Bam file path. + */ + std::string bam_file_path ; + /*! + * \brief Bam index file path. + */ + std::string bai_file_path ; + /*! + * \brief the path to the posterior probability + * file (the partition). + */ + std::string prob_file_path ; + /*! + * \brief The smallest relative coordinate from the region + * center to consider (included). + */ + int from ; + /*! + * \brief The biggest relative coordinate from the region + * center to consider (not included). + */ + int to ; + /*! + * \brief The bin size. + */ + int bin_size ; + /*! + * \brief the class of interest (0-based). + */ + size_t class_k ; + /*! + * \brief How to consider the sequenced fragments when computing + * the bin values. 
+ */ + CorrelationMatrixCreator::methods method ; + +} ; + +#endif // CLASSREADDATACREATOR_HPP diff --git a/src/GenomicTools/ClassSequenceDataCreator.cpp b/src/GenomicTools/ClassSequenceDataCreator.cpp new file mode 100644 index 0000000..8df2419 --- /dev/null +++ b/src/GenomicTools/ClassSequenceDataCreator.cpp @@ -0,0 +1,149 @@ +#include + +#include +#include +#include // std::invalid_argument + +#include +#include +#include +#include // char_to_int() + +ClassSequenceDataCreator::ClassSequenceDataCreator(const std::string& bed_file_path, + const std::string& fasta_file_path, + const std::string& prob_file_path, + int from, + int to, + size_t class_k) + : bed_file_path(bed_file_path), + fasta_file_path(fasta_file_path), + prob_file_path(prob_file_path), + from(from), + to(to), + class_k(class_k-1) +{ ; } + +ClassSequenceDataCreator::~ClassSequenceDataCreator() +{ ; } + + +Matrix3D ClassSequenceDataCreator::create_matrix() +{ + // load posterior prob + Matrix4D prob ; + prob.load(this->prob_file_path) ; + // prob.load(this->file_prob) ; + size_t n_row = prob.get_dim()[0] ; + size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; + bool flip = n_flip == 2 ; + + // compute class prob + std::vector prob_colsum(n_class, 0.) ; + double tot = 0. ; + for(size_t i=0; i 0-based) + if(this->class_k > n_class) + { char msg[4096] ; + sprintf(msg, "Error! invalid class of interest (%zu and %zu found)", + this->class_k, n_class) ; + throw std::invalid_argument(msg) ; + } + + // realigning the data using the same shift as the partition + // will "trim" the matrix -> create a wider matrix such that + // the realigned matrix will be of the desired wide (original + // from and to parameters).; + int ext = n_shift - 1 ; + int ext_r = ext / 2 ; + int ext_l = ext - ext_r ; + SequenceMatrixCreator mc(this->bed_file_path, + this->fasta_file_path, + this->from - ext_l, + this->to + ext_r) ; + Matrix2D data = mc.create_matrix() ; + + // check the compatibility with the prob matrix + if(data.get_nrow() != prob.get_dim()[0]) + { char msg[4096] ; + sprintf(msg, "Error! the number of references in the BED file is not equal to the" + "1st dimension of the probability matrix (%zu, %zu).", + data.get_nrow(), prob.get_dim()[0]) ; + throw(std::invalid_argument(msg)) ; + } + else if(data.get_nrow() < prob.get_dim()[1]) + { char msg[4096] ; + sprintf(msg, "Error! the number of references in the BED file is smaller than" + "the number of classes in the probability matrix (%zu, %zu).", + data.get_nrow(), prob.get_dim()[1]) ; + throw(std::invalid_argument(msg)) ; + + } + else if(data.get_ncol() < prob.get_dim()[2]) + { char msg[4096] ; + sprintf(msg, "Error! the data matrix has fewer columns (bins) than the" + "the shifting freedom in the probability matrix (%zu, %zu).", + data.get_ncol(), prob.get_dim()[2]) ; + throw(std::invalid_argument(msg)) ; + } + + // realign matrix + size_t n_seq = data.get_nrow() ; + size_t n_col2 = this->to - this->from + 1 ; + Matrix3D data2(n_seq, n_col2, 4, 0.) 
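A worked example of the extension computed above (the same scheme is used in ClassReadDataCreator): with a shifting freedom of n_shift, ext = n_shift - 1 extra columns are added overall and split between the two sides, the upstream side receiving the extra column when ext is odd. The from/to/n_shift values below are illustrative only.

#include <iostream>

int main()
{   int from = -500, to = 500 ;       // illustrative original limits
    int n_shift = 20 ;                // illustrative shifting freedom
    int ext   = n_shift - 1 ;         // 19 extra columns overall
    int ext_r = ext / 2 ;             // 9  columns added downstream
    int ext_l = ext - ext_r ;         // 10 columns added upstream
    std::cout << "from2=" << from - ext_l        // -510
              << " to2="  << to + ext_r          //  509
              << std::endl ;
    return 0 ;
}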
; + // aggregate + for(size_t i=0; i same as doing nothing + if(base == dna::char_to_int('N')) + { continue ; } + int base_rv = 4 - base - 1 ; + // --------- forward --------- + // std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ; + data2(i,j,base) += prob(i,class_k,s,0) ; + + // --------- reverse --------- + if(flip) + { // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ; + data2(i,j,base_rv) += prob(i,class_k,s,1) ; + } + } + } + } + // clean memory + data = Matrix2D() ; + prob = Matrix4D() ; + + // normalize + for(size_t i=0; i +#include +#include + +#include // BamFileIn +#include // BedFileIn + +#include +#include +#include + +/*! + * \brief The ClassSequenceDataCreator class retro-computes + * a fictive sequence matrix of a given class K, from a partition, assuming + * that the classification is true. + * To construct the final matrix M3, the original matrix M1 that was + * partitioned is computed and extended into a matrix M2. The original + * matrix M1 has L1 column and was partitioned allowing S shifting freedom. + * A number of extra columns E, overall, is added to the matrix M1 (~E/2 on + * each side). The final matrix M3, of L1 columns, is then computed. A row + * of the final matrix M3 is the weighted average of each of the S possibles + * slices of the corresponding row in M2. The weight used are the + * probabilities with which this row was assigned to class K, for each of + * the S slices. Because sequences cannot be averaged (for instance + * 0.5*AAA + 0.5*CCC), the final result is a collection of consensus + * sequences represented as a sequence probability matrix where is + * the number of original sequences in M1 (also in M2). + * Thus the final result is a 3D matrix of dimensions N,L,4 where + * is the number of consensus sequences, L the length of the consensus + * sequences and 4 for A,C,G,T. + */ +class ClassSequenceDataCreator +{ + + public: + + + ClassSequenceDataCreator() = delete ; + + /*! + * \brief Constructs an object to build a + * class sequence matrix from a partition. + * \param bed_file_path the path to the file containing + * the references. + * \param fasta_file_path the path to the file containing + * the sequences. + * \param prob_file_path the path to the file containing + * the assignment probabilities of the partition. + * It should be 4D matrix with the following dimensions : + * 1st the number of regions, should be the number of + * references in the BED file. + * 2nd the number of classes. + * 3rd the shifting freedom. + * 4th the flipping freedom (1 for no flip, 2 otherwise). + * \param from the upstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param to the dowmstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param class_k the index (1-based) of the class of + * interest for which a matrix should be computed, + * from the partition. + */ + ClassSequenceDataCreator(const std::string& bed_file_path, + const std::string& fasta_file_path, + const std::string& prob_file_path, + int from, + int to, + size_t class_k) ; + + /*! + * Destructor. + */ + ~ClassSequenceDataCreator() ; + + /*! + * \brief Computes the matrix and returns it. + * \return the class sequence matrix. 
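The aggregation above turns each row into a consensus sequence: every shift slice contributes its posterior probability to the bases it covers, and the result is normalised into a position probability matrix. A standalone, forward-strand-only sketch follows; 'N' is coded as 4, as in dna::char_to_int().

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<std::vector<double>>
consensus_row(const std::vector<int>& seq,            // wide sequence, integer codes
              const std::vector<double>& shift_prob)  // posterior prob of each shift
{   std::size_t n_shift = shift_prob.size() ;
    std::size_t l = seq.size() - n_shift + 1 ;
    std::vector<std::vector<double>> cons(l, std::vector<double>(4, 0.)) ;
    double tot = 0. ;
    for(double p : shift_prob) { tot += p ; }
    for(std::size_t s=0; s<n_shift; s++)
    {   for(std::size_t j=0; j<l; j++)
        {   int base = seq[s+j] ;
            if(base == 4) { continue ; }               // N contributes nothing
            cons[j][base] += shift_prob[s] / tot ;
        }
    }
    return cons ;
}

int main()
{   std::vector<int> seq = {0, 0, 1, 1} ;              // A A C C
    std::vector<double> prob = {0.5, 0.5} ;            // two equally likely shifts
    auto cons = consensus_row(seq, prob) ;
    // position 0 is A in both slices; position 1 is A in one slice and C in the other
    std::cout << cons[1][0] << " " << cons[1][1] << std::endl ;   // 0.5 0.5
    return 0 ;
}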
+ * For each region, a consensus sequence is + * returned as a probability matrix. + * The matrix dimensions are : + * 1st the number of regions. + * 2nd the consensus sequence length. + * 3rd 4 for A,C,G,T + */ + Matrix3D create_matrix() ; + + protected: + /*! + * \brief Bed file path. + */ + std::string bed_file_path ; + /*! + * \brief Fasta file path. + */ + std::string fasta_file_path ; + /*! + * \brief the path to the posterior probability + * file (the partition). + */ + std::string prob_file_path ; + /*! + * \brief The smallest relative coordinate from the region + * center to consider (included). + */ + int from ; + /*! + * \brief The biggest relative coordinate from the region + * center to consider (not included). + */ + int to ; + /*! + * \brief the class of interest (0-based). + */ + size_t class_k ; + +} ; + +#endif // CLASSSEQUENCEDATACREATOR_HPP diff --git a/src/GenomicTools/CorrelationMatrixCreator.cpp b/src/GenomicTools/CorrelationMatrixCreator.cpp index 0197664..f8f758b 100644 --- a/src/GenomicTools/CorrelationMatrixCreator.cpp +++ b/src/GenomicTools/CorrelationMatrixCreator.cpp @@ -1,374 +1,374 @@ #include #include #include // std::runtime_error #include // BamFileIn #include // BedFileIn #include #include /* template std::ostream& operator << (std::ostream& stream, const std::list& l) { for(const auto& p : l) { stream << p << " " ; } return stream ; } template std::ostream& operator << (std::ostream& stream, const std::vector& v) { for(const auto& p : v) { stream << p << " " ; } return stream ; } template std::ostream& operator << (std::ostream& stream, const std::pair& p) { stream << "[" << p.first << " " << p.second << "] " ; return stream ; } template std::ostream& operator << (std::ostream& stream, const std::unordered_map& m) { for(const auto& p : m) { stream << p << " " << std::endl; } return stream ; } */ /* A lambda to sort GenomeRegion by ascending starting coordinate */ auto sortByStartPos = [](const GenomeRegion& r1, const GenomeRegion& r2) -> bool { return r1 < r2 ; } ; CorrelationMatrixCreator::CorrelationMatrixCreator(const std::string& bed_file_path, const std::string& bam_file_path, const std::string& bai_file_path, int from, int to, int bin_size, CorrelationMatrixCreator::methods method) : ReadMatrixCreator(bed_file_path, bam_file_path, bai_file_path, from, to, bin_size, method), target_list_fw(), target_list_rv() { seqan::BedRecord bed_line ; // compute coordinates relative to each region this->compute_relative_bin_coord() ; size_t n_col = this->relative_bin_coord.size() ; // compute number of regions and get valid chromosomes names this->open_bed_file() ; this->open_bam_file() ; seqan::BamHeader header ; seqan::readHeader(header, bam_file) ; size_t n_row = 0 ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; std::string chrom_name = seqan::toCString(bed_line.ref) ; // new chromosome if(this->chrom_map_names.find(chrom_name) == this->chrom_map_names.end()) { int chrom_idx = -1 ; seqan::getIdByName(chrom_idx, seqan::contigNamesCache(seqan::context(this->bam_file)), chrom_name) ; this->chrom_map_names[chrom_name] = chrom_idx ; } n_row++ ; } this->close_bed_file() ; this->close_bam_file() ; // create the count matrix - this->matrix_counts = Matrix2D(n_row, n_col, 0.) ; + this->matrix = Matrix2D(n_row, n_col, 0.) 
; // create the region matrix this->matrix_bins = std::vector> (n_row,std::vector(n_col)) ; this->open_bed_file() ; this->open_bam_file() ; size_t i = 0 ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; // find the region limits std::string region_chr = seqan::toCString(bed_line.ref) ; // int region_len = bed_line.endPos - bed_line.beginPos ; // int region_mid = bed_line.beginPos + (region_len / 2) ; int region_mid = CorrelationMatrixCreator::get_center_pos(bed_line) ; // compute the absolute bins coordinates for this region // and create the bins in this region for(size_t j=0; jrelative_bin_coord[j] ; this->matrix_bins[i][j] = GenomeRegion(region_chr, this->chrom_map_names[region_chr], region_mid + relative_coord.first, region_mid + relative_coord.second) ; } i++ ; } this->close_bed_file() ; this->close_bam_file() ; } CorrelationMatrixCreator::~CorrelationMatrixCreator() { this->close_bam_file() ; // bed file is closed in ~MatrixCreator() } Matrix2D CorrelationMatrixCreator::create_matrix() { this->open_bam_file() ; this->open_bai_file() ; // read BAM header seqan::BamHeader bam_header ; seqan::readHeader(bam_header, this->bam_file) ; - for(size_t i=0; imatrix_counts.get_nrow(); i++) + for(size_t i=0; imatrix.get_nrow(); i++) { const auto& row = this->matrix_bins[i] ; GenomeRegion region(row.front().chromosome, row.front().chromosome_idx, row.front().start, row.back().end) ; bool jump = this->jump_upstream(region, 600) ; if(not jump) { continue ; } // read all relevant targets this->to_downstream_target(region) ; // update count matrix row this->update_count_matrix(i) ; // clean buffers this->clear_target_lists() ; } this->close_bam_file() ; - return this->matrix_counts ; + return this->matrix ; } bool CorrelationMatrixCreator::jump_upstream(const GenomeRegion& region, int margin) { bool has_alignment = false ; int rID = -10 ; if(this->chrom_map_names.find(region.chromosome) != this->chrom_map_names.end()) { rID = this->chrom_map_names[region.chromosome] ; } else { char msg[4096] ; sprintf(msg, "Error! chromosome %s is not linked with a valid ID in BAM file", region.chromosome.c_str()) ; std::cerr << msg << std::endl ; return false ; } int start = std::max(0, region.start - margin) ; int end = start + 1 ; bool jump = seqan::jumpToRegion(this->bam_file, has_alignment, rID, start, end, this->bai_file) ; return jump ; } void CorrelationMatrixCreator::to_downstream_target(const GenomeRegion& region) { if(this->method == CorrelationMatrixCreator::methods::READ or this->method == CorrelationMatrixCreator::methods::READ_ATAC) { this->to_downstream_read(region) ; } else { this->to_downstream_fragment(region) ; } } void CorrelationMatrixCreator::to_downstream_read(const GenomeRegion& region) { bool done = false ; seqan::BamAlignmentRecord record ; while(not seqan::atEnd(this->bam_file) and not done) { // QC check and transform record seqan::readRecord(record, this->bam_file) ; if(not CorrelationMatrixCreator::is_good_read(record) or not this->is_valid_chromosome(record)) { continue ; } GenomeRegion target ; try { if(this->method == CorrelationMatrixCreator::methods::READ) { target = GenomeRegion::constructRead(record, this->bam_file) ; } else { target = GenomeRegion::constructReadATAC(record, this->bam_file) ; } } catch(std::invalid_argument& e) { // connect to cerr to write in SAM seqan::BamFileOut samFileOut(seqan::context(this->bam_file), std::cerr, seqan::Sam()) ; std::cerr << "std::invalid_argument caught! 
could not use " "this record as read: " << std::endl ; writeRecord(samFileOut, record) ; std::cerr << "message was : " << e.what() << std::endl << std::endl ; continue ; } // upstream -> continue if(target < region) { continue ; } // overlap -> store else if(target | region) { if(not seqan::hasFlagRC(record)) { this->target_list_fw.push_back(target) ; } else { this->target_list_rv.push_back(target) ; } } // downstream -> stop else { done = true ; } } } void CorrelationMatrixCreator::to_downstream_fragment(const GenomeRegion& region) { bool done = false ; seqan::BamAlignmentRecord record ; while(not seqan::atEnd(this->bam_file) and not done) { // QC check and transform record seqan::readRecord(record, this->bam_file) ; if(not CorrelationMatrixCreator::is_good_pair(record) or not this->is_valid_chromosome(record)) { continue ; } GenomeRegion target ; try { target = GenomeRegion::constructFragment(record, this->bam_file) ; } catch(std::invalid_argument& e) { // connect to cerr to write in SAM seqan::BamFileOut samFileOut(seqan::context(this->bam_file), std::cerr, seqan::Sam()) ; std::cerr << "std::invalid_argument caught! could not use " "this record as fragment: " << std::endl ; writeRecord(samFileOut, record) ; std::cerr << "message was : " << e.what() << std::endl << std::endl ; continue ; } // upstream -> continue if(target < region) { continue ; } // overlap -> store else if(target | region) { if(this->method == CorrelationMatrixCreator::methods::FRAGMENT_CENTER) { target = GenomeRegion::constructFragmentCenter(record, this->bam_file) ; if(target | region) { this->target_list_fw.push_back(target) ; } } else { this->target_list_fw.push_back(target) ; } } // downstream -> stop else if(target > region) { // std::cerr << std::endl ; done = true ; } } // std::cerr << "to_downstream_fragment END" << std::endl ; } void CorrelationMatrixCreator::clear_target_lists() { this->target_list_fw.clear() ; this->target_list_rv.clear() ; } /* void CorrelationMatrixCreator::remove_upstream_targets(const GenomeRegion& region) { // forward targets auto iter_fw = this->target_list_fw.cbegin() ; while(iter_fw != this->target_list_fw.end()) { // remove upstream reads if(*iter_fw < region) { iter_fw = this->target_list_fw.erase(iter_fw) ; } // keep overlapping reads, don't stop here else if(*iter_fw | region) { iter_fw++ ; } // stop at first read downstream else { break ; } } // reverse targets auto iter_rv = this->target_list_rv.cbegin() ; while(iter_rv != this->target_list_rv.end()) { // remove upstream reads if(*iter_rv < region) { iter_rv = this->target_list_rv.erase(iter_rv) ; } // keep overlapping reads else if(*iter_rv | region) { iter_rv++ ; } // stop at first read downstream else { break ; } } } */ void CorrelationMatrixCreator::update_count_matrix(size_t row_index) { // forward targets for(const auto& iter : this->target_list_fw) { auto bin_start_end = CorrelationMatrixCreator:: get_bin_indices(iter, this->matrix_bins[row_index]) ; for(int j=bin_start_end.first; jmatrix_counts(row_index,j) += - iter.overlap_len(this->matrix_bins[row_index][j]) ; + { this->matrix(row_index,j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; } } // reverse targets for(const auto& iter : this->target_list_rv) { auto bin_start_end = CorrelationMatrixCreator:: get_bin_indices(iter, this->matrix_bins[row_index]) ; for(int j=bin_start_end.first; jmatrix_counts(row_index,j) += - iter.overlap_len(this->matrix_bins[row_index][j]) ; + { this->matrix(row_index,j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; } 
} } /* void CorrelationMatrixCreator::update_count_matrix_naive(size_t row_index) { // forward targets for(const auto& iter : target_list_fw) - { for(size_t j=0; jmatrix_counts[0].size(); j++) - { this->matrix_counts[row_index][j] += + { for(size_t j=0; jmatrix[0].size(); j++) + { this->matrix[row_index][j] += iter.overlap_len(this->matrix_bins[row_index][j]) ; } } // reverse targets for(const auto& iter : target_list_rv) - { for(size_t j=0; jmatrix_counts[0].size(); j++) - { this->matrix_counts[row_index][j] += + { for(size_t j=0; jmatrix[0].size(); j++) + { this->matrix[row_index][j] += iter.overlap_len(this->matrix_bins[row_index][j]) ; } } } */ diff --git a/src/GenomicTools/MatrixCreator.cpp b/src/GenomicTools/MatrixCreator.cpp index b0090cc..5390847 100644 --- a/src/GenomicTools/MatrixCreator.cpp +++ b/src/GenomicTools/MatrixCreator.cpp @@ -1,41 +1,41 @@ #include #include #include // BedFileIn #include #include #include MatrixCreator::MatrixCreator(const std::string& bed_file_path, int from, int to) : bed_path(bed_file_path), bed_file(), from(from), to(to), - matrix_counts() + matrix() {} int MatrixCreator::get_center_pos(const seqan::BedRecord& bed_line) { int region_len = bed_line.endPos - bed_line.beginPos ; int region_mid = bed_line.beginPos + (region_len / 2) ; return region_mid ; } MatrixCreator::~MatrixCreator() { this->close_bed_file() ; } void MatrixCreator::open_bed_file() { if(not seqan::open(this->bed_file, this->bed_path.c_str())) { char msg[4096] ; sprintf(msg, "cannot open %s", this->bed_path.c_str()) ; throw std::runtime_error(msg) ; } } void MatrixCreator::close_bed_file() { seqan::close(this->bed_file) ; } diff --git a/src/GenomicTools/MatrixCreator.hpp b/src/GenomicTools/MatrixCreator.hpp index 746a586..fd02131 100644 --- a/src/GenomicTools/MatrixCreator.hpp +++ b/src/GenomicTools/MatrixCreator.hpp @@ -1,103 +1,103 @@ #ifndef MATRIXCREATOR_HPP #define MATRIXCREATOR_HPP #include #include // BedFileIn, BedRecord #include #include /*! * \brief The MatrixCreator class is a base class * to be derived by classes that are dedicated to * construct data matrices which rows contains * a signal at different positions (columns) in * this given region. */ class MatrixCreator { public: /*! * \brief Returns the central position of a bed region. * \param bed_line the region of interest. * \return the position of the center. */ static int get_center_pos(const seqan::BedRecord& bed_line) ; public: /*! * \brief Constructs an object. * \param bed_file_path the path to the bed file * containing the coordinates of the regions of * interest. * \param from the downstream most position * to consider, relative to a set of genomic * positions. * \param to the upstream most position to * consider, relative to a set of genomic * positions */ MatrixCreator(const std::string& bed_file_path, int from, int to) ; /*! * Destructor. */ virtual ~MatrixCreator() ; /*! * \brief Creates and return the count matrix. * \return the count matrix. */ virtual Matrix2D create_matrix() = 0 ; protected: /*! * \brief Opens the bed file. * \throw std::runtime_error if the file cannot * be open. */ void open_bed_file() ; /*! * \brief Closes the bed file. * Does nothing if already closed. */ void close_bed_file() ; /*! * \brief Bed file path. */ std::string bed_path ; /*! * \brief An input stream to the * bed file. * Use open_bed_file() to open the stream * and close_bed_file() to close it. */ seqan::BedFileIn bed_file ; /*! * \brief The smallest relative coordinate from the region * center to consider (included). 
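As a worked example of get_center_pos() above: a BED region with beginPos = 100 and endPos = 250 has a length of 150, so the returned center is 100 + 150/2 = 175 (integer division).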
*/ int from ; /*! * \brief The biggest relative coordinate from the region * center to consider (not included). */ int to ; /*! - * \brief A matrix containing the number of targets + * \brief A matrix containing the data * found at each position around each reference. * This is the data structure to fill. */ - Matrix2D matrix_counts ; + Matrix2D matrix ; } ; #endif // MATRIXCREATOR_HPP diff --git a/src/GenomicTools/SequenceMatrixCreator.cpp b/src/GenomicTools/SequenceMatrixCreator.cpp index 837cf73..142b41a 100644 --- a/src/GenomicTools/SequenceMatrixCreator.cpp +++ b/src/GenomicTools/SequenceMatrixCreator.cpp @@ -1,113 +1,113 @@ #include #include #include // std::invalid_argument, std::runtime_error #include // std::make_pair(), std::move() #include #include // BedFileIn, BedRecord #include // seqan::SeqFileIn #include #include SequenceMatrixCreator::SequenceMatrixCreator(const std::string& bed_file_path, const std::string& fasta_file_path, int from, int to) : MatrixCreator(bed_file_path, from, to), fasta_path(fasta_file_path), fasta_file() { seqan::BedRecord bed_line ; // compute number of regions this->open_bed_file() ; size_t n_row = 0 ; size_t n_col = to - from + 1 ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; n_row++ ; } this->close_bed_file() ; // create the count matrix // init to 'N' because if a part of the matrix // cannot be filled, it wil contain stretches of // 'N' - this->matrix_counts = Matrix2D(n_row, n_col, dna::char_to_int('N')) ; + this->matrix = Matrix2D(n_row, n_col, dna::char_to_int('N')) ; } SequenceMatrixCreator::~SequenceMatrixCreator() { this->close_fasta_file() ; // bed file closed in ~MatrixCreator() } Matrix2D SequenceMatrixCreator::create_matrix() { std::unordered_map seq_map ; // read the fasta file and store all the sequences this->open_fasta_file() ; while(not seqan::atEnd(this->fasta_file)) { seqan::CharString record_id ; seqan::Dna5String record_seq ; seqan::readRecord(record_id, record_seq, this->fasta_file) ; std::string id = seqan::toCString(record_id) ; // store it if(seq_map.find(id) == seq_map.end()) { seq_map.insert(std::make_pair(std::move(id), std::move(record_seq))) ; } else { char msg[4096] ; sprintf(msg, "Error! header %s found several times in %s", id.c_str(), this->fasta_path.c_str()) ; throw std::runtime_error(msg) ; } } this->close_fasta_file() ; // fill the matrix this->open_bed_file() ; size_t i=0 ; seqan::BedRecord bed_line ; while(not seqan::atEnd(this->bed_file)) { seqan::readRecord(bed_line, this->bed_file) ; std::string region_chr = seqan::toCString(bed_line.ref) ; // get sequence [from, to) int region_mid = MatrixCreator::get_center_pos(bed_line) ; int region_start = std::max(0, region_mid + from) ; int region_end = region_mid + to + 1 ; auto iter = seq_map.find(region_chr) ; if(iter == seq_map.end()) { char msg[4096] ; sprintf(msg, "Error! 
%s sequence cannot be found in %s", region_chr.c_str(), this->fasta_path.c_str()) ; throw std::runtime_error(msg) ; } else { // auto& seq_name = iter->first ; auto& seq = iter->second ; for(int j_seq=region_start, j_mat=0; j_seqsecond); j_seq++, j_mat++) - { this->matrix_counts(i,j_mat) = dna::char_to_int(seq[j_seq]) ; } + { this->matrix(i,j_mat) = dna::char_to_int(seq[j_seq]) ; } } i++ ; } this->close_bed_file() ; - return this->matrix_counts ; + return this->matrix ; } void SequenceMatrixCreator::open_fasta_file() { if(not seqan::open(this->fasta_file, this->fasta_path.c_str())) { char msg[4096] ; sprintf(msg, "cannot open %s", this->fasta_path.c_str()) ; throw std::runtime_error(msg) ; } } void SequenceMatrixCreator::close_fasta_file() { seqan::close(this->fasta_file) ; } diff --git a/src/GenomicTools/SequenceMatrixCreator.hpp b/src/GenomicTools/SequenceMatrixCreator.hpp index e71dc88..8c22062 100644 --- a/src/GenomicTools/SequenceMatrixCreator.hpp +++ b/src/GenomicTools/SequenceMatrixCreator.hpp @@ -1,65 +1,80 @@ #ifndef SEQUENCEMATRIXCREATOR_HPP #define SEQUENCEMATRIXCREATOR_HPP #include #include // seqan::SeqFileIn #include #include class SequenceMatrixCreator : public MatrixCreator { public: - + /*! + * \brief Constructs an object to build a + * sequence matrix from a partition. + * \param bed_file_path the path to the file containing + * the references. + * \param fasta_file_path the path to the file containing + * the sequences. + * \param from the upstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param to the dowmstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + */ SequenceMatrixCreator(const std::string& bed_file_path, const std::string& fasta_file_path, int from, int to) ; /*! * \brief Destructor */ virtual ~SequenceMatrixCreator() ; /*! * \brief Computes the matrix and returns it. * \return the sequence matrix. * \throw std::runtime_error if two sequences * have the same header in the fasta file or * if a sequence/chromosome name present * in the bed cannot be found as sequence * header in the fasta file. */ virtual Matrix2D create_matrix() override ; protected: /*! * \brief Opens the fasta file. * \throw std::runtime_error if the file cannot * be open. */ void open_fasta_file() ; /*! * \brief Closes the fasta file. * \throw std::runtime_error if the file cannot * be open. */ void close_fasta_file() ; /*! * \brief Fasta file path. */ std::string fasta_path ; /*! * \brief An input stream to the * fasta file. * Use open_fasta_file() to open the stream * and close_fasta_file() to close it. 
*/ seqan::SeqFileIn fasta_file ; } ; #endif // SEQUENCEMATRIXCREATOR_HPP diff --git a/src/Utility/dna_utility.cpp b/src/Utility/dna_utility.cpp index 11af0e0..a2a170f 100644 --- a/src/Utility/dna_utility.cpp +++ b/src/Utility/dna_utility.cpp @@ -1,142 +1,176 @@ #include #include #include +#include // std::invalid_argument #include +#include #include // seqan::SeqFileIn int dna::map(char base, bool rev_compl) { static bool init = false ; static std::unordered_map hash_map ; static std::unordered_map hash_map_rev ; if(not init) { hash_map['A'] = 0 ; hash_map['a'] = 0 ; hash_map['C'] = 1 ; hash_map['c'] = 1 ; hash_map['G'] = 2 ; hash_map['g'] = 2 ; hash_map['T'] = 3 ; hash_map['t'] = 3 ; hash_map['N'] = 4 ; hash_map['n'] = 4 ; hash_map_rev['A'] = hash_map['T'] ; hash_map_rev['a'] = hash_map['t'] ; hash_map_rev['C'] = hash_map['G'] ; hash_map_rev['c'] = hash_map['g'] ; hash_map_rev['G'] = hash_map['C'] ; hash_map_rev['g'] = hash_map['c'] ; hash_map_rev['T'] = hash_map['A'] ; hash_map_rev['t'] = hash_map['a'] ; hash_map_rev['N'] = hash_map['N'] ; hash_map_rev['n'] = hash_map['n'] ; init = true ; } try { if(rev_compl) { return hash_map_rev.at(base) ; } else { return hash_map.at(base) ; } } // key could not be found catch(std::out_of_range& e) { char msg[256] ; sprintf(msg, "Error! Invalid DNA base : %c", base) ; throw std::invalid_argument(msg) ; } } char dna::map(int base, bool rev_compl) { static bool init = false ; static std::unordered_map hash_map ; static std::unordered_map hash_map_rev ; if(not init) { hash_map[0] = 'A' ; hash_map[1] = 'C' ; hash_map[2] = 'G' ; hash_map[3] = 'T' ; hash_map[4] = 'N' ; hash_map_rev[4] = hash_map[4] ; hash_map_rev[3] = hash_map[0] ; hash_map_rev[2] = hash_map[1] ; hash_map_rev[1] = hash_map[2] ; hash_map_rev[0] = hash_map[3] ; init = true ; } try { if(rev_compl) { return hash_map_rev.at(base) ; } else { return hash_map.at(base) ; } } // key could not be found catch(std::out_of_range& e) { char msg[256] ; sprintf(msg, "Error! Invalid DNA code : %i", base) ; throw std::invalid_argument(msg) ; } } int dna::char_to_int(char c, bool rev_compl) { return dna::map(c, rev_compl) ; } Matrix2D dna::char_to_int(const Matrix2D& matrix) { size_t n_row = matrix.get_nrow() ; size_t n_col = matrix.get_ncol() ; Matrix2D data_int(n_row, n_col) ; for(size_t i=0; i dna::base_composition(const Matrix2D& sequences, bool both_strands) { double total = 0. ; std::vector base_comp(4,0.) ; int base_N = dna::map('N') ; for(size_t i=0; i adding a count to each // is equivalent to not changing anything if(base == base_N) { continue ; } else { base_comp[base] += 1; total += 1. ; } // reverse complement strand if(both_strands) { // size_t c_hash_rev = dna::hash(c, true) ; base_comp[4-base-1] += 1. ; total += 1. ; } } } // normalize for(auto& i : base_comp) { i /= total ; } return base_comp ; } + +std::vector dna::base_composition(const Matrix3D& consensus_sequences, bool both_strands) +{ + if(consensus_sequences.get_dim()[2] != 4) + { char msg[4096] ; + sprintf(msg, "Error! consensus sequences 3rd dimension not equal to 4 (%zu)", + consensus_sequences.get_dim()[2]) ; + throw std::invalid_argument(msg) ; + } + double total = 0. ; + std::vector base_comp(4,0.) ; + + for(size_t i=0; i +#include namespace dna { /*! * \brief Contains the mapping to convert * DNA characters to integer codes. * Lower and capital characters are accepted. * \param base the character of interest. * \param rev_compl whether the reverse * complement of the character is of interest. * \return the corresponding DNA code. 
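dna::base_composition() above counts each non-'N' base and, when both strands are requested, also counts its complement, before normalising to frequencies. A standalone sketch on integer-coded sequences; code 4 stands for 'N' and the complement of code b is 4-b-1, as in the mapping defined above.

#include <iostream>
#include <vector>

std::vector<double> base_composition_sketch(const std::vector<std::vector<int>>& seqs,
                                            bool both_strands)
{   std::vector<double> comp(4, 0.) ;
    double total = 0. ;
    for(const auto& row : seqs)
    {   for(int base : row)
        {   if(base == 4) { continue ; }               // N -> ignored
            comp[base] += 1. ; total += 1. ;
            if(both_strands)                           // complement on the other strand
            {   comp[4-base-1] += 1. ; total += 1. ; }
        }
    }
    for(double& c : comp) { c /= total ; }
    return comp ;
}

int main()
{   std::vector<std::vector<int>> seqs = { {0, 0, 1, 4} } ;   // A A C N
    for(double c : base_composition_sketch(seqs, true))
    {   std::cout << c << " " ; }                              // 0.333 0.167 0.167 0.333
    std::cout << std::endl ;
    return 0 ;
}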
     */
    int map(char base, bool rev_compl=false) ;

    /*!
     * \brief Contains the mapping to convert
     * DNA code to characters.
     * Only capital characters are returned.
     * \param base the code of interest.
     * \param rev_compl whether the reverse
     * complement of the code is of interest.
     * \return the corresponding DNA character.
     */
    char map(int base, bool rev_compl=false) ;

    /*!
     * \brief Converts a DNA character (A, C,
     * G, T) to an integer.
     * \param c the DNA character of interest.
     * \return the character integer code.
     * \throw std::invalid_argument if the
     * given character is not a valid DNA
     * character.
     */
    int char_to_int(char c, bool rev_compl=false) ;

    /*!
     * \brief Converts a DNA character matrix (A, C,
     * G, T) to an integer matrix.
     * The DNA characters are converted using
     * SequenceLayer::char_to_int(char).
     * \param matrix the DNA character matrix to convert.
     * \return the corresponding int matrix.
     */
    Matrix2D<int> char_to_int(const Matrix2D<char>& matrix) ;

    /*!
     * \brief Converts an int DNA code to
     * a regular DNA character (A, C, G, T).
     * This method is the reverse method of
     * char_to_int(char).
     * \param n the DNA code of interest.
     * \return the DNA character.
     * \throw std::invalid_argument if the
     * given int is not a valid DNA
     * code.
     */
    char int_to_char(int n, bool rev_compl=false) ;

    /*!
     * \brief Computes the base composition of a set of
     * sequences, in integer format, contained in a matrix.
     * \param sequences a matrix containing the sequences
     * of interest.
     * \param both_strands also accounts for the reverse
     * complement of the sequences.
     * \throw std::invalid_argument if a non-supported
     * character is found in the matrix.
     * \return a vector of 4 values corresponding to the
     * frequencies of A, C, G and T
     * respectively.
     */
-   std::vector<double> base_composition(const Matrix2D<int>& sequences, bool both_strands) ;
+   std::vector<double> base_composition(const Matrix2D<int>& sequences,
+                                        bool both_strands) ;
+
+
+   /*!
+    * \brief Computes the base composition of a set of
+    * consensus sequences, contained in a probability
+    * matrix with the following dimensions:
+    * 1st the number of sequences
+    * 2nd the length of the sequences
+    * 3rd 4 for A, C, G, T
+    * The sum over the 2nd and 3rd dimensions should be 1
+    * for each sequence, so the overall sum of the matrix
+    * values should be equal to the 1st dimension.
+    * \param consensus_sequences a matrix containing the
+    * consensus sequences of interest.
+    * \param both_strands also accounts for the reverse
+    * complement of the sequences.
+    * \throw std::invalid_argument if the 3rd dimension
+    * of the consensus sequence matrix is not equal to 4.
+    * \return a vector of 4 values corresponding to the
+    * frequencies of A, C, G and T
+    * respectively.
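+    *
+    * Illustrative example (an assumption, mirroring the strand
+    * handling of the sequence-based version above): for a single
+    * consensus position with probabilities (0.7, 0.1, 0.1, 0.1)
+    * for A,C,G,T and both_strands = true, the reverse strand
+    * contributes the complementary bases and the returned
+    * frequencies are (0.4, 0.1, 0.1, 0.4).
+    * \code
+    * std::vector<double> comp = dna::base_composition(consensus_sequences, true) ;
+    * \endcode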
+ */ + std::vector base_composition(const Matrix3D& consensus_sequences, + bool both_strands) ; } #endif // DNA_UTILITY_HPP diff --git a/src/main_test.cpp b/src/main_test.cpp index 2efa423..a6f2b20 100644 --- a/src/main_test.cpp +++ b/src/main_test.cpp @@ -1,182 +1,207 @@ #include #include #include #include #include #include #include #include using namespace std ; template std::ostream& operator << (std::ostream& stream, const std::vector& v) { for(const auto& x : v) { stream << x << " " ; } return stream ; } +Matrix3D compute_consensus_sequences(const Matrix2D& seq, + const Matrix4D& prob, + size_t class_k) +{ + size_t n_seq = seq.get_nrow() ; + size_t l_seq = seq.get_ncol() ; + // size_t n_class = prob.get_dim()[1] ; + size_t n_shift = prob.get_dim()[2] ; + size_t n_flip = prob.get_dim()[3] ; + bool flip = n_flip == 2 ; + size_t l_model = l_seq - n_shift + 1 ; + + Matrix3D cons_seq(n_seq, l_model, 4, 0.) ; + + // aggregate + for(size_t i=0; i same as doing nothing + if(base == dna::char_to_int('N')) + { continue ; } + int base_rv = 4 - base - 1 ; + // --------- forward --------- + // std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ; + cons_seq(i,j,base) += prob(i,class_k,s,0) ; + tot += prob(i,class_k,s,0) ; + + // --------- reverse --------- + if(flip) + { // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ; + cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ; + tot += prob(i,class_k,s,1) ; + } + } + } + } + + // normalize + for(size_t i=0; i prob(prob_file) ; - size_t n_row = prob.get_dim()[0] ; + // size_t n_row = prob.get_dim()[0] ; // size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; - size_t n_flip = prob.get_dim()[3] ; - bool flip = n_flip == 2 ; + // size_t n_flip = prob.get_dim()[3] ; + // bool flip = n_flip == 2 ; std::cerr << "posterior prob " << prob.get_dim() << std::endl ; // these are from3 and to3 int from1 = -500 ; int to1 = 500 ; // class of interest size_t class_k = 17 ; class_k -= 1 ; // get a wider matrix to realign it std::cerr << "creating data2 matrix" << std::endl ; // extend limits int ext = n_shift - 1 ; int ext_r = ext / 2 ; int ext_l = ext - ext_r ; int from2 = from1 - ext_l ; int to2 = to1 + ext_r ; SequenceMatrixCreator mc(bed_file, seq_file, from2, to2) ; Matrix2D data2 = mc.create_matrix() ; - size_t n_col2 = data2.get_ncol() ; + data2.save("test/data2.mat2d") ; std::cerr << "data2 matrix " << data2.get_dim() << std::endl ; + Matrix3D cons_seq= compute_consensus_sequences(data2, + prob, + class_k) ; + data2.save("test/consensus_matrix.mat3d") ; + */ - // realign matrix - std::cerr << "computing data3 matrix" << std::endl ; - size_t n_col3 = to1 - from1 + 1 ; - Matrix3D data3(n_row, - n_col3, - 4, // A, C, G, T, N - 0.) 
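// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the aggregation idea behind
// compute_consensus_sequences() above, rewritten with std::vector instead of
// the project's Matrix2D/Matrix3D/Matrix4D classes. For one class, every
// window of a sequence is weighted by the posterior probability of the
// corresponding shift and the per-base weights are accumulated, then each
// sequence slice is normalized. The reverse strand (flip) is omitted here and
// all names below are made up for the example.
#include <cstddef>
#include <vector>

// seq  : n_seq x l_seq integer coded sequences (0..3, 4 for N)
// post : n_seq x n_shift posterior probabilities of each shift, for one class
// returns an n_seq x l_model x 4 consensus matrix, l_model = l_seq - n_shift + 1
std::vector<std::vector<std::vector<double>>>
consensus_sketch(const std::vector<std::vector<int>>& seq,
                 const std::vector<std::vector<double>>& post)
{   std::size_t n_seq   = seq.size() ;
    std::size_t l_seq   = seq.front().size() ;
    std::size_t n_shift = post.front().size() ;
    std::size_t l_model = l_seq - n_shift + 1 ;

    std::vector<std::vector<std::vector<double>>>
        cons(n_seq, std::vector<std::vector<double>>(
                        l_model, std::vector<double>(4, 0.))) ;

    for(std::size_t i=0; i<n_seq; i++)
    {   double tot = 0. ;
        // aggregate : each shift contributes its window, weighted by P(shift)
        for(std::size_t s=0; s<n_shift; s++)
        {   for(std::size_t j=0; j<l_model; j++)
            {   int base = seq[i][s+j] ;
                if(base == 4)             // N contributes nothing
                {   continue ; }
                cons[i][j][base] += post[i][s] ;
                tot              += post[i][s] ;
            }
        }
        // normalize so that each sequence slice sums to 1
        for(std::size_t j=0; j<l_model; j++)
        {   for(std::size_t b=0; b<4; b++)
            {   cons[i][j][b] /= tot ; }
        }
    }
    return cons ;
}
// ---------------------------------------------------------------------------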
; - - // the int code of A, C, G, T, N - // static int a_code = dna::char_to_int('A') ; - // static int c_code = dna::char_to_int('C') ; - // static int g_code = dna::char_to_int('G') ; - // static int t_code = dna::char_to_int('T') ; - static int n_code = dna::char_to_int('N') ; - // the int code of the reverse complement of A, C, G, T - // static int a_code_r = dna::char_to_int('A', true) ; - // static int c_code_r = dna::char_to_int('C', true) ; - // static int g_code_r = dna::char_to_int('G', true) ; - // static int t_code_r = dna::char_to_int('T', true) ; - - std::cerr << "data3 matrix " << data3.get_dim() << std::endl ; - for(size_t i=0; i prob(prob_file) ; + std::cout << prob.get_dim() << std::endl ; + Matrix2D data2 ; + data2.load("test/data2.mat2d") ; + std::cout << data2.get_dim() << std::endl ; + + size_t class_k = 17 ; + class_k-- ; + Matrix3D cons_seq = compute_consensus_sequences(data2, + prob, + class_k) ; + cons_seq.save("test/consensus_sequences.mat3d") ; - } - // --------------- reverse --------------- - if(flip) - { int from_dat2_rev = n_col2 - 1 - s ; - int to_dat2_rev = from_dat2_rev - (n_col3 - 1) ; - for(int j_dat2_rev=from_dat2_rev, j_dat3_fw=0; - j_dat2_rev >= to_dat2_rev; - j_dat2_rev--, j_dat3_fw++) - { int base = data2(i,j_dat2_rev) ; - int base_rev = 4 - base - 1 ; - // exclude N's - if(base_rev == n_code) - { continue ; } - double p = prob(i,class_k,s,1) ; - data3(i,j_dat3_fw, base_rev) += p ; - tot += p ; - } - } - } - // normalize - for(size_t s=0; s() ; - - for(size_t j=0; j<10; j++) - // for(size_t j=0; j data4(data3.get_dim()[0], - data3.get_dim()[1]) ; - for(size_t i=0; i< data4.get_nrow(); i++) - { for(size_t j=0; j::lowest() ; - size_t b_max = 0 ; - for(size_t base=0; base p_max) - { p_max = data3(i,j,base) ; - b_max = base ; - } + // sequences + Matrix2D seq(4, 7, 0) ; + seq(0,0) = 0 ; seq(0,1) = 3 ; seq(0,2) = 3 ; seq(0,3) = 3 ; seq(0,4) = 1 ; seq(0,5) = 1 ; seq(0,6) = 1 ; + seq(1,0) = 1 ; seq(1,1) = 0 ; seq(1,2) = 3 ; seq(1,3) = 3 ; seq(1,4) = 3 ; seq(1,5) = 1 ; seq(1,6) = 1 ; + seq(2,0) = 1 ; seq(2,1) = 1 ; seq(2,2) = 3 ; seq(2,3) = 0 ; seq(2,4) = 0 ; seq(2,5) = 0 ; seq(2,6) = 1 ; + seq(3,0) = 1 ; seq(3,1) = 1 ; seq(3,2) = 1 ; seq(3,3) = 3 ; seq(3,4) = 0 ; seq(3,5) = 0 ; seq(3,6) = 0 ; + + // probabilities + Matrix4D prob(4, 1, 4, 2) ; + // row 1, cluster 1 + prob(0,0,0,0) = 999 ; prob(0,0,0,1) = .01 ; // shift 1, both flips + prob(0,0,1,0) = .01 ; prob(0,0,1,1) = .01 ; // shift 2, both flips + prob(0,0,2,0) = .01 ; prob(0,0,2,1) = .01 ; // shift 3, both flips + prob(0,0,3,0) = .01 ; prob(0,0,3,1) = .01 ; // shift 4, both flips + // row 2, cluster 1 + prob(1,0,0,0) = .01 ; prob(1,0,0,1) = .01 ; // shift 1, both flips + prob(1,0,1,0) = 999 ; prob(1,0,1,1) = .01 ; // shift 2, both flips + prob(1,0,2,0) = .01 ; prob(1,0,2,1) = .01 ; // shift 3, both flips + prob(1,0,3,0) = .01 ; prob(1,0,3,1) = .01 ; // shift 4, both flips + // row 3, cluster 1 + prob(2,0,0,0) = .01 ; prob(2,0,0,1) = .01 ; // shift 1, both flips + prob(2,0,1,0) = .01 ; prob(2,0,1,1) = .01 ; // shift 2, both flips + prob(2,0,2,0) = .01 ; prob(2,0,2,1) = 999 ; // shift 3, both flips + prob(2,0,3,0) = .01 ; prob(2,0,3,1) = .01 ; // shift 4, both flips + // row 4, cluster 1 + prob(3,0,0,0) = .01 ; prob(3,0,0,1) = .01 ; // shift 1, both flips + prob(3,0,1,0) = .01 ; prob(3,0,1,1) = .01 ; // shift 2, both flips + prob(3,0,2,0) = .01 ; prob(3,0,2,1) = .01 ; // shift 3, both flips + prob(3,0,3,0) = .01 ; prob(3,0,3,1) = 999 ; // shift 4, both flips + //normalize + for(size_t i=0; i data4(4, 
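// ---------------------------------------------------------------------------
// Illustrative note (not part of the patch): the toy posterior matrix filled
// above is normalized per sequence, under the assumption that the 4 shifts x
// 2 flips of each row must form a probability distribution. With the values
// used above (one cell at 999, the seven others at 0.01) the dominant
// shift/flip combination ends up at 999 / (999 + 7*0.01), i.e. ~0.9999.
// A plain-array sketch of that normalization step:
void normalize_row(double p[4][2])
{   double tot = 0. ;
    for(int s=0; s<4; s++)
    {   for(int f=0; f<2; f++)
        {   tot += p[s][f] ; }
    }
    for(int s=0; s<4; s++)
    {   for(int f=0; f<2; f++)
        {   p[s][f] /= tot ; }
    }
}
// ---------------------------------------------------------------------------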
data3.get_dim()[1], 0.) ; - for(size_t j=0; j tot(4,0.) ; - - for(size_t i=0; i cons_seq = compute_consensus_sequences(seq, + prob, + class_k) ; + // print + for(size_t i=0; i