Page MenuHomec4science

MatrixCreatorParallel.hpp
No OneTemporary

File Metadata

Created
Sat, Oct 12, 19:03

MatrixCreatorParallel.hpp

#ifndef MATRIXCREATOR_HPP
#define MATRIXCREATOR_HPP
#include <vector>
#include <utility> // std::pair, std::make_pair()
#include <string>
#include <unordered_map>
#include <seqan/bed_io.h> // BedFileIn
#include <seqan/bam_io.h> // BamFileIn, BamAlignmentRecord
#include <ThreadPool.hpp>
#include <Matrix2D.hpp>
#include <GenomeRegion.hpp>
#include <typedef.hpp>
class MatrixCreator
{
public:
/*!
* \brief A list of values indicating how the data
* should be handled when counting the number of
* fragments mapped in a given bin.
*
* FRAGMENT : all positions within a fragment are
* accounted for and attributed to the
* corresponding bins :
* bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- pair of reads
* ||||| |||||| scoring positions
* bin1 gets a score of 5 and bin2 a
* score of 6.
*
* FRAGMENT_CENTER : only the central position
* within a fragment is accounted for and
* attributed to the corresponding bin :
* * bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- pair of reads
* | | scoring positions
* bin1 gets a score of 1 and bin2 also.
*
* READ : all positions within a read are
* accounted for and attributed to the
* corresponding bins :
* bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- reads
* | ||| ||| ||| scoring positions
* bin1 gets a score of 4 and bin2 a
* score of 6.
*
* READ_ATAC : only the shifted start
* of the reads are used. Additionally, the
* start position is shifted by +4bp(towards
* the right) for reads on the + strand and
* -5bp for reads on the - strand (towards the
* left). These positions indicate the insertion
* position in ATAC-seq data.
* bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- reads
* | | | scoring positions
* bin1 gets a score of 1 and bin2 a
* score of 2.
*/
enum methods {FRAGMENT=0,
FRAGMENT_CENTER,
READ,
READ_ATAC} ;
public:
/*!
* \brief Computes which bins (from a contiguous
* range of bins) are overlapped by a given target
* and returns two indices corresponding to :
* i) the index of the 1st bin overlapped by the
* target
* ii) the index of the past last bin overlapepd
* by the target.
* If the target does not overlapp any bin (it is
* located upstream the 1st bin, downstream the
* last bin or on a different chromosome), the
* index pair 0,0 is returned.
* Thus, in any case, a loop of the type
* for(i=first,i<second,i++) can always be used
* on the result.
* \param target t
* \param bins
* \return
*/
static std::pair<int, int> get_bin_indices(const GenomeRegion& target,
const std::vector<GenomeRegion>& bins) ;
/*!
* \brief Checks that the read is i) is mapped
* , ii) passes QC and iii) is not a duplicate,
* based on the flag value.
* \param read the read of interest.
* \return whether the read passes the above tests.
*/
bool is_good_read(const seqan::BamAlignmentRecord& read) ;
/*!
* \brief Checks that the read is i) a good read, ii)
* a paired read, iii) proplery aligned, iv) the 1st
* of the pair based on the flag values and that
* v) they forms a proper fragment with its mate mate
* (both read should point toward one other).
* \param read the read of interest.
* \return whether the read and its mate form a proper
* fragment.
*/
bool is_good_pair(const seqan::BamAlignmentRecord& read) ;
public:
MatrixCreator() = delete ;
/*!
* \brief Constructs an object to create
* a genomic count matrix.
* \param bed_file_path the path to the file containing
* the references.
* \param bam_file_path the path to the file containing
* the targets.
* \param bai_file_path the path to index file of the bam
* file containing the targets.
* \param from the downstream most position
* to consider, relative to a set of genomic
* positions.
* \param to the upstream most position to
* consider, relative to a set of genomic
* positions
* \param bin_size the size of the bins in
* which the regions encompassing the set
* of genomic positions will be broken
* into.
* \param method how the sequenced fragments
* should be consider when assigning counts
* to the bins.
* \param n_threads the number of working
* threads.
*/
MatrixCreator(const std::string& bed_file_path,
const std::string& bam_file_path,
const std::string& bai_file_path,
int from,
int to,
int bin_size,
MatrixCreator::methods method,
size_t n_threads) ;
~MatrixCreator() = default ;
/*!
* \brief Creates and return the count matrix.
* \return the count matrix.
*/
virtual Matrix2D<int> create_matrix() = 0 ;
protected:
/*!
* \brief Binarize the given range [from,to] into
* equal sized bins having the specified size.
* The bin coordinates are stored in bin_coord as
* pairs of [start,end) coordinates. One bin is
* centered on +/- 0.
*
*/
void compute_relative_bin_coord() ;
/*!
* \brief Checks whether a record has a valid chromosome,
* that is whether this chromosome has been found in the
* bed file has well.
* \param record a record from the bam file.
* \param thread_idx the thread index/number. This specifies
* which stream in the vector streams to use.
* \return whether the record chromosome is valid.
*/
bool is_valid_chromosome(const seqan::BamAlignmentRecord& record,
size_t thread_idx) ;
/*!
* \brief Opens the bam files.
* \throw std::runtime_error if the file cannot
* be open.
*/
void open_bam_files() ;
/*!
* \brief Opens the bam index files.
* \throw std::runtime_error if the file cannot
* be open.
*/
void open_bai_files() ;
/*!
* \brief Opens the bed files.
* \throw std::runtime_error if the file cannot
* be open.
*/
void open_bed_files() ;
/*!
* \brief Closes the bam files.
* Does nothing if already closed.
*/
void close_bam_files() ;
/*!
* \brief Closes the bed files.
* Does nothing if already closed.
*/
void close_bed_files() ;
/*!
* \brief The smallest relative coordinate for the bins (included).
*/
int from ;
/*!
* \brief The biggest relative coordinate for the bins (not included).
*/
int to ;
/*!
* \brief The bin size.
*/
int bin_size ;
/*!
* \brief How to consider the sequenced fragments when computing
* the bin values.
*/
MatrixCreator::methods method ;
/*!
* \brief The relative bin coordinates, compared to a given
* position. Each bin has a pair [from,to) where <from> is the
* 1st position within the bin and <to> is the 1st position
* after the bin. One bin is centered on +/- 0.
*/
v_pair relative_bin_coord ;
/*!
* \brief Bed file path.
*/
std::string bed_path ;
/*!
* \brief Bam file path.
*/
std::string bam_path ;
/*!
* \brief Bam index file path.
*/
std::string bai_path ;
/*!
* \brief A vector of input stream to the
* same bed file.
* The purpose is to allow multiple threads
* to read the same file at the same time.
* Use open_bed_files() to open the streams
* and close_bed_files() to close them.
*/
std::vector<seqan::BedFileIn> bed_files ;
/*!
* \brief A vector of input stream to the
* same bam file.
* The purpose is to allow multiple threads
* to read the same file at the same time.
* Use open_bam_files() to open the streams
* and close_bam_files() to close them.
*/
std::vector<seqan::BamFileIn> bam_files;
/*!
* \brief A vector of input stream to the
* same bam index file.
* The purpose is to allow multiple threads
* to read the same file at the same time.
* Use open_bai_files() to open the streams
* and close_bai_files() to close them.
*/
std::vector<seqan::BamIndex<seqan::Bai>> bai_files ;
/*!
* \brief A map containing the valid chromsome
* names as keys and their indices (as found
* in the BAM header) as values.
*/
std::unordered_map<std::string, int> chrom_map_names ;
/*!
* \brief A matrix containing the number of targets
* found at each position around each reference.
* This is the data structure to fill.
*/
Matrix2D<int> matrix_counts ;
/*!
* \brief A vector containing containing,
* for each reference, the coordinates of
* the genomic region covered by the bins.
*/
std::vector<std::vector<GenomeRegion>> matrix_bins ;
/*!
* \brief The number of working threads.
*/
size_t n_threads ;
ThreadPool threads ;
} ;
#endif // MATRIXCREATOR_HPP

Event Timeline