Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F87465951
MatrixCreator.hpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 12, 20:19
Size
9 KB
Mime Type
text/x-c
Expires
Mon, Oct 14, 20:19 (2 d)
Engine
blob
Format
Raw Data
Handle
21599795
Attached To
R8820 scATAC-seq
MatrixCreator.hpp
View Options
#ifndef MATRIXCREATOR_HPP
#define MATRIXCREATOR_HPP
#include <vector>
#include <utility> // std::pair, std::make_pair()
#include <string>
#include <unordered_map>
#include <seqan/bed_io.h> // BedFileIn
#include <seqan/bam_io.h> // BamFileIn, BamAlignmentRecord
#include <Matrix2D.hpp>
#include <GenomeRegion.hpp>
#include <typedef.hpp>
class MatrixCreator
{
public:
/*!
* \brief A list of values indicating how the data
* should be handled when counting the number of
* fragments mapped in a given bin.
*
* FRAGMENT : all positions within a fragment are
* accounted for and attributed to the
* corresponding bins :
* bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- pair of reads
* ||||| |||||| scoring positions
* bin1 gets a score of 5 and bin2 a
* score of 6.
*
* FRAGMENT_CENTER : only the central position
* within a fragment is accounted for and
* attributed to the corresponding bin :
* * bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- pair of reads
* | | scoring positions
* bin1 gets a score of 1 and bin2 also.
*
* READ : all positions within a read are
* accounted for and attributed to the
* corresponding bins :
* bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- reads
* | ||| ||| ||| scoring positions
* bin1 gets a score of 4 and bin2 a
* score of 6.
*
* READ_ATAC : only the shifted start
* of the reads are used. Additionally, the
* start position is shifted by +4bp(towards
* the right) for reads on the + strand and
* -5bp for reads on the - strand (towards the
* left). These positions indicate the insertion
* position in ATAC-seq data.
* bin1 bin2
* ----|-------|-------|------------> genome
* ------- ------- fragments
* --> <-- --> <-- reads
* | | | scoring positions
* bin1 gets a score of 1 and bin2 a
* score of 2.
*/
enum methods {FRAGMENT=0,
FRAGMENT_CENTER,
READ,
READ_ATAC} ;
public:
/*!
* \brief Computes which bins (from a contiguous
* range of bins) are overlapped by a given target
* and returns two indices corresponding to :
* i) the index of the 1st bin overlapped by the
* target
* ii) the index of the past last bin overlapepd
* by the target.
* If the target does not overlapp any bin (it is
* located upstream the 1st bin, downstream the
* last bin or on a different chromosome), the
* index pair 0,0 is returned.
* Thus, in any case, a loop of the type
* for(i=first,i<second,i++) can always be used
* on the result.
* \param target t
* \param bins
* \return
*/
static std::pair<int, int> get_bin_indices(const GenomeRegion& target,
const std::vector<GenomeRegion>& bins) ;
/*!
* \brief Checks that the read is i) is mapped
* , ii) passes QC and iii) is not a duplicate,
* based on the flag value.
* \param read the read of interest.
* \return whether the read passes the above tests.
*/
bool is_good_read(const seqan::BamAlignmentRecord& read) ;
/*!
* \brief Checks that the read is i) a good read, ii)
* a paired read, iii) proplery aligned, iv) the 1st
* of the pair based on the flag values and that
* v) they forms a proper fragment with its mate mate
* (both read should point toward one other).
* \param read the read of interest.
* \return whether the read and its mate form a proper
* fragment.
*/
bool is_good_pair(const seqan::BamAlignmentRecord& read) ;
public:
MatrixCreator() = delete ;
/*!
* \brief Constructs an object to create
* a genomic count matrix.
* \param bed_file_path the path to the file containing
* the references.
* \param bam_file_path the path to the file containing
* the targets.
* \param bai_file_path the path to index file of the bam
* file containing the targets.
* \param from the downstream most position
* to consider, relative to a set of genomic
* positions.
* \param to the upstream most position to
* consider, relative to a set of genomic
* positions
* \param bin_size the size of the bins in
* which the regions encompassing the set
* of genomic positions will be broken
* into.
* \param method how the sequenced fragments
* should be consider when assigning counts
* to the bins.
*/
MatrixCreator(const std::string& bed_file_path,
const std::string& bam_file_path,
const std::string& bai_file_path,
int from,
int to,
int bin_size,
MatrixCreator::methods method) ;
~MatrixCreator() = default ;
/*!
* \brief Creates and return the count matrix.
* \return the count matrix.
*/
virtual Matrix2D<int> create_matrix() = 0 ;
protected:
/*!
* \brief Binarize the given range [from,to] into
* equal sized bins having the specified size.
* The bin coordinates are stored in bin_coord as
* pairs of [start,end) coordinates. One bin is
* centered on +/- 0.
*
*/
void compute_relative_bin_coord() ;
/*!
* \brief Checks whether a record has a valid chromosome,
* that is whether this chromosome has been found in the
* bed file has well.
* \param record a record from the bam file.
* \return whether the record chromosome is valid.
*/
bool is_valid_chromosome(const seqan::BamAlignmentRecord& record) ;
/*!
* \brief Opens the bam file.
* \throw std::runtime_error if the file cannot
* be open.
*/
void open_bam_file() ;
/*!
* \brief Opens the bam index file.
* \throw std::runtime_error if the file cannot
* be open.
*/
void open_bai_file() ;
/*!
* \brief Opens the bed file.
* \throw std::runtime_error if the file cannot
* be open.
*/
void open_bed_file() ;
/*!
* \brief Closes the bam file.
* Does nothing if already closed.
*/
void close_bam_file() ;
/*!
* \brief Closes the bed file.
* Does nothing if already closed.
*/
void close_bed_file() ;
/*!
* \brief The smallest relative coordinate for the bins (included).
*/
int from ;
/*!
* \brief The biggest relative coordinate for the bins (not included).
*/
int to ;
/*!
* \brief The bin size.
*/
int bin_size ;
/*!
* \brief How to consider the sequenced fragments when computing
* the bin values.
*/
MatrixCreator::methods method ;
/*!
* \brief The relative bin coordinates, compared to a given
* position. Each bin has a pair [from,to) where <from> is the
* 1st position within the bin and <to> is the 1st position
* after the bin. One bin is centered on +/- 0.
*/
v_pair relative_bin_coord ;
/*!
* \brief Bed file path.
*/
std::string bed_path ;
/*!
* \brief Bam file path.
*/
std::string bam_path ;
/*!
* \brief Bam index file path.
*/
std::string bai_path ;
/*!
* \brief An input stream to the
* bed file.
* Use open_bed_file() to open the stream
* and close_bed_file() to close it.
*/
seqan::BedFileIn bed_file ;
/*!
* \brief An input stream to the
* bam file.
* Use open_bam_file() to open the stream
* and close_bam_file() to close it.
*/
seqan::BamFileIn bam_file;
/*!
* \brief An input stream to the
* bam index file.
* Use open_bai_file() to open the stream
* and close_bai_file() to close it.
*/
seqan::BamIndex<seqan::Bai> bai_file ;
/*!
* \brief A map containing the valid chromsome
* names as keys and their indices (as found
* in the BAM header) as values.
*/
std::unordered_map<std::string, int> chrom_map_names ;
/*!
* \brief A matrix containing the number of targets
* found at each position around each reference.
* This is the data structure to fill.
*/
Matrix2D<int> matrix_counts ;
/*!
* \brief A vector containing containing,
* for each reference, the coordinates of
* the genomic region covered by the bins.
*/
std::vector<std::vector<GenomeRegion>> matrix_bins ;
} ;
#endif // MATRIXCREATOR_HPP
Event Timeline
Log In to Comment