EMSequence.hpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Aug 4, 08:49

EMSequence.hpp
View Options

	#ifndef EMSEQUENCE_HPP
	#define EMSEQUENCE_HPP

	#include <EMBase.hpp>

	#include <vector>
	#include <string>
	#include <future> // std::promise

	#include <Matrix3D.hpp>
	#include <SequenceLayer.hpp>


	typedef std::vector<double> vector_d ;


	class EMSequence : public EMBase
	{ public:
	/*!
	* \brief Constructs an object to partition the
	* given sequences (rows) according to their motif
	* content.
	* The sequences models are initialised randomly.
	* \param sequence_matrix a matrix containing the sequences
	* of interest.
	* \param n_class the number of region classes
	* to search.
	* \param n_iter the number of optimization iterations.
	* \param n_shift the number of shift states allowed.
	* \param flip whether flipping is allowed.
	* \param bckg_class the last class is used to model the background
	* by setting all its parameters, at all positions, to the
	* background base probabilties. Since the background is constant,
	* this class will never be updated.
	* \param seed a seed to initialise the random number
	* generator.
	* \param n_threads the number of parallel threads
	* to run the computations. 0 means no parallel
	* computing, everything is run on the main thread.
	*/
	EMSequence(const Matrix2D<int>& sequence_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed="",
	size_t n_threads=0) ;
	/*!
	* \brief Constructs an object to partition the
	* given sequences (rows) according to their motif
	* content.
	* The sequences models are initialised randomly.
	* \param sequence_matrix a matrix containing the sequences
	* of interest.
	* \param n_class the number of region classes
	* to search.
	* \param n_iter the number of optimization iterations.
	* \param n_shift the number of shift states allowed.
	* \param flip whether flipping is allowed.
	* \param bckg_class the last class is used to model the background
	* by setting all its parameters, at all positions, to the
	* background base probabilties. Since the background is constant,
	* this class will never be updated.
	* \param seed a seed to initialise the random number
	* generator.
	* \param n_threads the number of parallel threads
	* to run the computations. 0 means no parallel
	* computing, everything is run on the main thread.
	*/
	EMSequence(Matrix2D<int>&& sequence_matrix,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed="",
	size_t n_threads=0) ;

	/*!
	* \brief Constructs an object to partition the
	* given sequences (rows) according to their motif
	* content.
	* The sequences class models are initialised using
	* the given motifs. The class probabilities are
	* initialised uniformlly.
	* The shifting freedom is set to (data number
	* of columns) - (the model 2nd dimension)
	* + 1.
	* \param sequence_matrix a matrix containing the sequences
	* of interest.
	* \param motifs a matrix containing the different initial
	* class models with the following dimensions :
	* dim1 the number of classes
	* dim2 the model length
	* dim3 4 for A,C,G,T
	* \param n_class the number of region classes
	* to search.
	* \param n_iter the number of optimization iterations.
	* \param flip whether flipping is allowed.
	* \param bckg_class indicates that the last class in the
	* given motifs is used to model the background and it
	* should never be updated.
	* \param n_threads the number of parallel threads
	* to run the computations. 0 means no parallel
	* computing, everything is run on the main thread.
	*/
	EMSequence(const Matrix2D<int>& sequence_matrix,
	const Matrix3D<double>& motifs,
	size_t n_iter,
	bool flip,
	bool bckg_class,
	size_t n_threads=0) ;

	/*!
	* \brief Constructs an object to partition the
	* given sequences (rows) according to their motif
	* content.
	* The sequences class models are initialised using
	* the given motifs. The class probabilities are
	* initialised uniformlly.
	* The shifting freedom is set to (data number
	* of columns) - (the model 2nd dimension)
	* + 1.
	* \param sequence_matrix a matrix containing the sequences
	* of interest.
	* \param motifs a matrix containing the different initial
	* class models with the following dimensions :
	* dim1 the number of classes
	* dim2 the model length
	* dim3 4 for A,C,G,T
	* \param n_class the number of region classes
	* to search.
	* \param n_iter the number of optimization iterations.
	* \param flip whether flipping is allowed.
	* \param bckg_class indicates that the last class in the
	* given motifs is used to model the background and it
	* should never be updated.
	* \param n_threads the number of parallel threads
	* to run the computations. 0 means no parallel
	* computing, everything is run on the main thread.
	*/
	EMSequence(Matrix2D<int>&& sequence_matrix,
	Matrix3D<double>&& motifs,
	size_t n_iter,
	bool flip,
	bool bckg_class,
	size_t n_threads=0) ;

	EMSequence(const EMSequence& other) = delete ;

	/*!
	* \brief Destructor.
	*/
	virtual ~EMSequence() override ;

	/*!
	* \brief Returns the class sequence model.
	* \return the class sequence model.
	*/
	Matrix3D<double> get_sequence_models() const ;

	/*!
	* \brief Runs the sequence model optimization and
	* the data classification.
	* \return a code indicating how the optimization
	* ended.
	*/
	virtual EMSequence::exit_codes classify() override ;

	private:

	/*!
	* \brief Computes the data log likelihood given the
	* current models, for all layers and the joint
	* likelihood for each state (the sum of the layer
	* likelihoods for all layers, for a given state).
	*/
	virtual void compute_loglikelihood() override ;

	/*!
	* \brief This is a routine of compute_loglikelihood().
	* This method rescales the loglikelihood values by
	* substacting to each value the maximum loglikelihood
	* value found in the same data row.
	* This method
	* \param from the index of the first row
	* in the data to consider.
	* \param to the index of the past last row
	* in the data to consider.
	* \param done a promise to fill when the method
	* is done.
	*/
	void compute_loglikelihood_routine(size_t from,
	size_t to,
	std::promise<bool>& done) ;

	/*!
	* \brief Computes the data posterior probabilties.
	*/
	virtual void compute_post_prob() override ;

	/*!
	* \brief The routine that effectively computes
	* the posterior probabilties.
	* \param from the index of the first row
	* in the data to consider.
	* \param to the index of the past last row
	* in the data to consider.
	* \param done the partial column (over the classes)
	* sum of posterior probabilities. If several routines
	* are running together, the colsums are retrieved by
	* summing up the vectors together.
	*/
	void compute_post_prob_routine(size_t from,
	size_t to,
	std::promise<vector_d>& post_prob_colsum) ;

	/*!
	* \brief Update the data models for all layers, given
	* the current posterior and class probabilities.
	*/
	virtual void update_models() override ;

	/*!
	* \brief the max loglikelihood value for
	* each data row.
	*/
	std::vector<double> loglikelihood_max ;
	/*!
	* \brief A pointer to the object managing
	* the data and their model.
	*/
	SequenceLayer* seq_layer ;

	} ;

	#endif // EMSEQUENCE_HPP

EMSequence.hppNo OneTemporaryActions

File Metadata

EMSequence.hppView Options

Event Timeline

EMSequence.hpp
No OneTemporary
Actions

EMSequence.hpp
View Options