Page MenuHomec4science

EMJoint.hpp
No OneTemporary

File Metadata

Created
Sun, Apr 28, 13:39

EMJoint.hpp

#ifndef EMJOINT_HPP
#define EMJOINT_HPP
#include <EMBase.hpp>
#include <vector>
#include <string>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ReadLayer.hpp>
#include <SequenceLayer.hpp>
#include <ConsensusSequenceLayer.hpp>
typedef std::vector<double> vector_d ;
class EMJoint : public EMBase
{
public:
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* with the given shifting and flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilties (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* with the given shifting and flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilties (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* and sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilties (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(const std::vector<Matrix2D<int>>& read_matrices,
const Matrix2D<int>& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
/*!
* \brief Constructs an object to partition the
* region according to all the given read densities
* and sequences with the given shifting and
* flipping freedom.
* \param read_matrices a vector containing all
* the different data densities (ChIP-seq or related
* signal) for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param seq_matrix a matrix containing the DNA
* sequences for the regions of interest. The matrix
* dimensions are as follow:
* 1st the number of regions,
* 2nd the region length.
* \param n_class the number of region classes
* to search.
* \param n_iter the number of optimization iterations.
* \param n_shift the number of shift states allowed.
* \param flip whether flipping is allowed.
* \param bckg_class the last class is used to model
* the background by setting all its parameters, at all
* positions, to the background base probabilties (for
* sequence models) and the mean number of counts (for
* read signal models). Since the background is constant,
* this class will never be updated.
* \param seed a seed to initialise the random number
* generator.
* \param n_threads the number of parallel threads
* to run the computations. 0 means no parallel
* computing, everything is run on the main thread.
*/
EMJoint(std::vector<Matrix2D<int>>&& read_matrices,
Matrix2D<int>&& seq_matrix,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed="",
size_t n_threads=0) ;
EMJoint(const EMJoint& other) = delete ;
/*!
* \brief Destructor.
*/
virtual ~EMJoint() override ;
/*!
* \brief Returns all layer read models.
* The models are in the same order
* as the data were given to the
* constructor.
* \return a vector containing the
* models.
*/
std::vector<Matrix3D<double>> get_read_models() const ;
/*!
* \brief Returns the sequence models.
* \return a vector containing the
* models.
*/
Matrix3D<double> get_sequence_models() const ;
/*!
* \brief Runs the sequence model optimization and
* the data classification.
* \return a code indicating how the optimization
* ended.
*/
virtual EMJoint::exit_codes classify() override ;
private:
/*!
* \brief Computes the data log likelihood given the
* current models, for all layers and the joint
* likelihood for each state (the sum of the layer
* likelihoods for all layers, for a given state).
*/
virtual void compute_loglikelihood() override ;
/*!
* \brief This is a routine of compute_loglikelihood() that
* computes the joint loglikelihood by summing the
* individual loglikelihood obtained from each data layer.
* At the same time, this method rescales the loglikelihood
* values by substacting to each value the maximum
* loglikelihood value found in the same data row,
* for each layer.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done a promise to fill when the method
* is done.
*/
void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise<bool>& done) ;
/*!
* \brief Computes the data posterior probabilties.
*/
virtual void compute_post_prob() override ;
/*!
* \brief The routine that effectively computes
* the posterior probabilties.
* \param from the index of the first row
* in the data to consider.
* \param to the index of the past last row
* in the data to consider.
* \param done the partial column (over the classes)
* sum of posterior probabilities. If several routines
* are running together, the colsums are retrieved by
* summing up the vectors together.
*/
void compute_post_prob_routine(size_t from,
size_t to,
std::promise<vector_d>& post_prob_colsum) ;
/*!
* \brief Update the data models for all layers, given
* the current posterior and class probabilities.
*/
virtual void update_models() override ;
/*!
* \brief the number of data layers.
*/
size_t n_layer ;
/*!
* \brief the log likelihood buffers for each individual
* layer (one element per layer).
*/
std::vector<Matrix4D<double>> loglikelihood_layer ;
/*!
* \brief the max loglikelihood value for
* each each data layer (1st dimension)
* and each data row of the given layer
* (2nd dimension).
*/
std::vector<vector_d> loglikelihood_max ;
/*!
* \brief A vector containing the pointers
* to the objects managing all the read
* data and models.
*/
std::vector<ReadLayer*> read_layers ;
/*!
* \brief A pointer to the object managing
* the sequence data and their models.
*/
SequenceLayer* seq_layer ;
/*!
* \brief A pointer to the object managing
* the consensus sequences data and their
* models.
*/
ConsensusSequenceLayer* conseq_layer ;
} ;
#endif // EMJOINT_HPP

Event Timeline