diff --git a/.gitignore b/.gitignore index 0e81523..6a4e9ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,45 +1,44 @@ # This file is used to ignore files which are generated # ---------------------------------------------------------------------------- - *~ *.autosave *.a *.core *.moc *.o *.obj *.orig *.rej *.so *.so.* *_pch.h.cpp *_resource.rc *.qm .#* *.*# core !core/ tags .DS_Store .directory *.debug Makefile* *.prl *.app moc_*.cpp ui_*.h qrc_*.cpp Thumbs.db *.res *.rc .Rhistory .RData /.qmake.cache /.qmake.stash bin/ CMakeFiles/ data/ results/ +lib/ CMakeCache.txt cmake_install.cmake - diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f1eca7..0827422 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,37 +1,34 @@ -# project -project(scATACseq) -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.0.0) -# static libraries +# project name and it is C++ only +project(scATACseq CXX) + + +# libraries ## boost library set(BOOST_INCLUDEDIR "/usr/local/include/boost/") set(BOOST_LIBRARYDIR "/usr/local/lib/boost") find_package(Boost 1.65 COMPONENTS program_options REQUIRED) ## UnitTest++ library ## TODO write a FindUnitTest++.cmake file to use find_package() find_library(UNITTEST_LIB NAMES "UnitTest++" PATHS "/usr/local/lib/UnitTest++") find_path(UNITTEST_INCLUDE NAMES "UnitTest++.h" PATHS "/usr/local/include/UnitTest++/") include_directories(${UNITTEST_INCLUDE}) # link_directories(${UNITTEST_LIB}) -## threads -find_package(Threads REQUIRED) +## zlib (for seqan Bam I/O) +find_package(ZLIB REQUIRED) +## SeqAn +find_package (SeqAn REQUIRED) -# compiler options -add_compile_options(-std=c++11) -add_compile_options(-O3) -add_compile_options(-Wall) -add_compile_options(-Wextra) -add_compile_options(-Werror) -add_compile_options(-Wfatal-errors) -add_compile_options(-pedantic) - +## threads +find_package(Threads REQUIRED) add_subdirectory(src) diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..13d7c28 --- /dev/null +++ b/build.sh @@ 
-0,0 +1 @@ +cmake3 -DCMAKE_MODULE_PATH="/local/groux/scATAC-seq/lib/seqan/util/cmake/" -DSEQAN_INCLUDE_PATH="/local/groux/scATAC-seq/lib/seqan/include/" . && make diff --git a/scripts/10xgenomics_PBMC_5k/.10xgenomics.sh.swp b/scripts/10xgenomics_PBMC_5k/.10xgenomics.sh.swp new file mode 100644 index 0000000..0688813 Binary files /dev/null and b/scripts/10xgenomics_PBMC_5k/.10xgenomics.sh.swp differ diff --git a/scripts/10xgenomics_PBMC_5k/.idea/encodings.xml b/scripts/10xgenomics_PBMC_5k/.idea/encodings.xml new file mode 100644 index 0000000..15a15b2 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k/.idea/libraries/R_User_Library.xml b/scripts/10xgenomics_PBMC_5k/.idea/libraries/R_User_Library.xml new file mode 100644 index 0000000..71f5ff7 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k/.idea/libraries/R_User_Library.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k/.idea/misc.xml b/scripts/10xgenomics_PBMC_5k/.idea/misc.xml new file mode 100644 index 0000000..65531ca --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k/.idea/modules.xml b/scripts/10xgenomics_PBMC_5k/.idea/modules.xml new file mode 100644 index 0000000..bb83e26 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k/.idea/scripts.iml b/scripts/10xgenomics_PBMC_5k/.idea/scripts.iml new file mode 100644 index 0000000..3a4807d --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k/.idea/scripts.iml @@ -0,0 +1,13 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k/.idea/workspace.xml b/scripts/10xgenomics_PBMC_5k/.idea/workspace.xml new file mode 100644 index 0000000..fc338d5 --- /dev/null +++ 
b/scripts/10xgenomics_PBMC_5k/.idea/workspace.xml @@ -0,0 +1,128 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1549034112423 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/scripts/bam_tools/filter_bam.py b/scripts/bam_tools/filter_bam.py new file mode 100644 index 0000000..b6a1dba --- /dev/null +++ b/scripts/bam_tools/filter_bam.py @@ -0,0 +1,83 @@ +import optparse +import sys +import os +import typing as tp +import pysam + +def construct_value_set(file: str) -> tp.Set[str]: + """ + Constructs a set containing the values listed in the given file. + The file should contain on value per line. + :param file: the file of interest. + :return: a set with the values listed in the file. + """ + s = set() + with open(file, "rt") as f: + for line in f: + s.add(line.rstrip()) + return s + +def filter_bam(file_in: str, file_out: str, tag: str, values: tp.Set[str]) -> None: + """ + Filters the reads in the bam file. + :param file_in: + :param file_out: + :param values: + :return: + """ + + # read bam file and dispatch the reads + bam_in = pysam.AlignmentFile(file_in) + bam_out = pysam.AlignmentFile(file_out, template=bam_in, mode="wb") + for read in bam_in: + if read.has_tag(tag): + value = read.get_tag(tag) + # check if read has the good tag + if value in values: + bam_out.write(read) + bam_in.close() + bam_out.close() + +if __name__ == "__main__": + # parse options + usage = "usage: %s [options]" % os.path.basename(__file__) + epilog = "This program reads a bam file and filters out any read that is not associated with one of the given " \ + "tag values." 
\
+             " Written by Romain Groux, February 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="The addresse of the bam file to filter.")
+    parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+                      help="The addresse of the output file.")
+    parser.add_option("--values", dest="file_values", default=None, type="string", action="store",
+                      help="The address of the file listing the accepted values of the tag, one value per line.")
+    parser.add_option("--tag", dest="tag", default=None, type="string", action="store",
+                      help="The tag which values will be used for the filtering.")
+    (options, args) = parser.parse_args()
+
+    file_in = options.file_in
+    file_out = options.file_out
+    file_values = options.file_values
+    tag = options.tag
+
+    # check options, any failure is reported on stderr and ends the program with status 1
+    if file_in is None:
+        print("Error! No input file given (-i)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_in):
+        print("Error! %s does not exist!" % file_in, file=sys.stderr)
+        sys.exit(1)
+    elif file_out is None:
+        sys.exit("Error! no output file given (-o)!")  # sys.exit(str) prints on stderr, status is 1
+    elif file_values is None:
+        print("Error! No value file given (--values)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_values):
+        print("Error! %s does not exist!" % file_values, file=sys.stderr)
+        sys.exit(1)
+    elif tag is None:
+        sys.exit("Error! no tag was given (--tag)!")  # sys.exit(str) prints on stderr, status is 1
+
+    value_set = construct_value_set(file_values)
+    filter_bam(file_in, file_out, tag, value_set)
+
diff --git a/scripts/bam_tools/head_bam.py b/scripts/bam_tools/head_bam.py
new file mode 100644
index 0000000..7febfcb
--- /dev/null
+++ b/scripts/bam_tools/head_bam.py
@@ -0,0 +1,57 @@
+
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+
+def get_file_subset(file_in: str, file_out: str, n: int) -> None:
+    """
+    Copies the first n reads of a bam file into a new bam file.
+    :param file_in: the address of the bam file to read.
+    :param file_out: the address of the bam file to write, it is created using the input file as template.
+    :param n: the number of reads to copy.
+    """
+    # the context managers close both files, even if an error occurs while copying
+    with pysam.AlignmentFile(file_in) as f_in, \
+            pysam.AlignmentFile(file_out, template=f_in, mode="wb") as f_out:
+        for i, read in enumerate(f_in):
+            if i >= n:
+                break
+            f_out.write(read)
+
+
+if __name__ == "__main__":
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and writes the first reads to another file.\n" \
+             "Written by Romain Groux, February 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="the addresse of the bam file to split.")
+    parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+                      help="the addresse of the output file.")
+    parser.add_option("-n", "--nlines", dest="nlines", default=10, type="int", action="store",
+                      help="the number of reads to write to the output file (10 by default).")
+
+    (options, args) = parser.parse_args()
+    file_in = options.file_in
+    file_out = options.file_out
+    nlines = options.nlines
+
+
+    # check options
+    if file_in is None:
+        print("Error! No input file given (-i)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_in):
+        print("Error! %s does not exist!" % file_in, file=sys.stderr)
+        sys.exit(1)
+    elif file_out is None:
+        print("Error! No output file given (-o)!", file=sys.stderr)
+        sys.exit(1)
+    elif nlines <= 0:
+        print("Error!
number of lines <= 0 (-n)!", file=sys.stderr)
+        sys.exit(1)
+    get_file_subset(file_in, file_out, nlines)
diff --git a/scripts/bam_tools/split_bam.py b/scripts/bam_tools/split_bam.py
new file mode 100644
index 0000000..8719a90
--- /dev/null
+++ b/scripts/bam_tools/split_bam.py
@@ -0,0 +1,105 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+
+def construct_dict_files(f_values: str, f_prefix: str) -> tp.Dict[str, str]:
+    """
+    Reads a file containing a list of tag values (one value per line) and constructs a dictionary
+    of values (key) and file addresses (values) to later dispatch the reads in.
+    :param f_values: the address of the file containing the tag values
+    :param f_prefix: a common prefix for the addresses of all the file addresses in the dictionary.
+    :return: a dictionary with tag values and their associated file addresses.
+    """
+    d = dict()
+    with open(f_values, "rt") as f:
+        for line in f:
+            value = line.rstrip()
+            # a value listed several times is registered only once
+            if value not in d:
+                d[value] = "%s%s.sam" % (f_prefix, value)
+    return d
+
+
+def split_bam(f_bam: str, tag: str, d_files: tp.Dict[str, str]) -> None:
+    """
+    Splits the bam file according to the given tag values.
+    The bam file is read and each read is checked for the given tag value. If the value is listed in the given file
+    dictionary, then the read is written to the corresponding file.
+    :param f_bam: the address of the bam file to split.
+    :param tag: the tag which should be used for splitting.
+    :param d_files: a dictionary containing the accepted values for sorting (key) and the addresses of the
+                    corresponding files in which the reads should be dispatched.
+    """
+
+    # Create all files and a 2nd dictionary telling whether header has already been written, the key is still the
+    # value. Don't write the sam file headers now. If a file is given no read, then it will be empty, not only
+    # containing a header
+    d_header = dict()
+    for key in d_files.keys():
+        # opening a file in "wt" mode creates it, or empties it if it already exists
+        open(d_files[key], "wt").close()
+        d_header[key] = False
+
+    # read bam file and dispatch the reads, the bam file is closed even if an error occurs
+    with pysam.AlignmentFile(f_bam) as bam:
+        for read in bam:
+            if read.has_tag(tag):
+                value = read.get_tag(tag)
+                # only treat value present in the list
+                if d_files.get(value, None) is not None:
+                    # cannot keep all files open, raises an OS Error if too many are open at the same time
+                    with open(d_files[value], "at") as f:
+                        # write header if file has not been written before
+                        if d_header[value] is False:
+                            f.write(str(read.header))
+                            d_header[value] = True
+                        f.write("%s\n" % read.to_string())
+
+
+if __name__ == "__main__":
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and dispatches the reads into separated sam files according to the " \
+             "values associated with a specified tag. The accepted values should be listed into a text file. The " \
+             "output files will be located in the current directory.\n" \
+             "Written by Romain Groux, January 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="the addresse of the bam file to split.")
+    parser.add_option("-p", "--prefix", dest="prefix", default="", type="string", action="store",
+                      help="a name prefix for the files in which the reads will be dispatched.")
+    parser.add_option("--values", dest="file_values", default=None, type="string", action="store",
+                      help="the address of the file containing the associated tag values relevant for the splitting.")
+    parser.add_option("--tag", dest="tag", default=None, type="string", action="store",
+                      help="The tag which values will be used for the splitting.")
+    (options, args) = parser.parse_args()
+    file_in = options.file_in
+    file_values = options.file_values
+    prefix = options.prefix
+    tag = options.tag
+
+    # check options, any failure is reported on stderr and ends the program with status 1
+    if file_in is None:
+        print("Error! No input file given (-i)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_in):
+        print("Error! %s does not exist!" % file_in, file=sys.stderr)
+        sys.exit(1)
+    elif file_values is None:
+        print("Error! No value file given (--values)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_values):
+        print("Error! %s does not exist!" % file_values, file=sys.stderr)
+        sys.exit(1)
+    elif tag is None:
+        print("Error!
no tag was given (--tag)!", file=sys.stderr)
+        sys.exit(1)
+    if prefix != "":
+        prefix = "%s_" % prefix
+
+    # split bam file
+    dict_files = construct_dict_files(file_values, prefix)
+    split_bam(file_in, tag, dict_files)
diff --git a/scripts/bam_tools/split_by_length.py b/scripts/bam_tools/split_by_length.py
new file mode 100644
index 0000000..216b3ab
--- /dev/null
+++ b/scripts/bam_tools/split_by_length.py
@@ -0,0 +1,85 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+
+def parse_lengths(lengths_str: str) -> tp.Tuple[int, int]:
+    """
+    Parses a range of fragment lengths given as "from-to", for instance "1-200".
+    :param lengths_str: the range to parse, both values should be integers and from should be smaller than to.
+    :return: a tuple (from, to).
+    :raise RuntimeError: if the given string is not a valid range.
+    """
+    try:
+        # anything but exactly two '-' separated values cannot be unpacked, None (option not given) has no split()
+        str_from, str_to = lengths_str.split('-')
+        lengths = (int(str_from), int(str_to))
+    except (AttributeError, ValueError) as e:
+        print(e, file=sys.stderr)
+        raise RuntimeError("invalid list of fragment lengths : %s" % lengths_str) from e
+
+    # to <= from
+    if lengths[1] <= lengths[0]:
+        raise RuntimeError("invalid list of fragment lengths : %s" % lengths_str)
+
+    return lengths
+
+
+def split_bam(file_bam, file_out, lengths):
+    """
+    Writes the reads which fragment length is within the given range to a new bam file.
+    :param file_bam: the address of the bam file to filter.
+    :param file_out: the address of the bam file to write, it is created using the input file as template.
+    :param lengths: a pair (from, to) of fragment lengths, both boundaries are included.
+    """
+    with pysam.AlignmentFile(file_bam) as bam_in, \
+            pysam.AlignmentFile(file_out, template=bam_in, mode="wb") as bam_out:
+        for read in bam_in:
+            # the template length is the 9th field (TLEN) of the sam representation of the read
+            # frag. with 1st read on reverse have negative length
+            read_l = abs(read.template_length)
+            if lengths[0] <= read_l <= lengths[1]:
+                bam_out.write(read)
+
+
+if __name__ == "__main__":
+
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and filters out any read which fragment length is not within the given " \
+             "range.\n" \
+             "Written by Romain Groux, February 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="the addresse of the bam file to filter.")
+    parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+                      help="The addresse of the output file.")
+    parser.add_option("--length", dest="lengths", default=None, type="string", action="store",
+                      help="A pair of non-overlapping [from,to] values that will be used to "
+                           "filter (including the boundaries) the fragments, for instance --length 1-200.")
+    (options, args) = parser.parse_args()
+
+    file_in = options.file_in
+    file_out = options.file_out
+
+    # check options, any failure is reported on stderr and ends the program with status 1
+    if file_in is None:
+        print("Error! No input file given (-i)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_in):
+        print("Error! %s does not exist!" % file_in, file=sys.stderr)
+        sys.exit(1)
+    elif file_out is None:
+        sys.exit("Error! no output file given (-o)!")
+    elif options.lengths is None:
+        sys.exit("Error! no fragment length range given (--length)!")
+
+    # the range is parsed only once the other options have been checked
+    try:
+        from_to = parse_lengths(options.lengths)
+    except RuntimeError as e:
+        sys.exit("Error! %s" % e)
+
+    split_bam(file_in, file_out, from_to)
diff --git a/scripts/bam_tools/split_in_two.py b/scripts/bam_tools/split_in_two.py
new file mode 100644
index 0000000..8ae2c37
--- /dev/null
+++ b/scripts/bam_tools/split_in_two.py
@@ -0,0 +1,175 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+# sam flags of the created reads, all of them are paired (0x1) and in a proper pair (0x2)
+FLAG_FW_FIRST = 99    # mate is rev (0x20), 1st in pair (0x40)
+FLAG_RV_FIRST = 83    # read is rev (0x10), 1st in pair (0x40)
+FLAG_FW_SECOND = 163  # mate is rev (0x20), 2nd in pair (0x80)
+FLAG_RV_SECOND = 147  # read is rev (0x10), 2nd in pair (0x80)
+
+
+def make_read(name, flag, ref_id, start, mate_start, tlen, cell):
+    """
+    Creates a read of length 1 (its sequence is 'N').
+    :param name: the name of the read.
+    :param flag: the sam flag of the read.
+    :param ref_id: the id of the reference on which the read and its mate are located.
+    :param start: the start position of the read.
+    :param mate_start: the start position of the mate.
+    :param tlen: the signed length of the fragment delimited by the read and its mate.
+    :param cell: the cell barcode, stored in the CB tag.
+    :return: the read.
+    """
+    new_read = pysam.AlignedSegment()
+    new_read.query_name = name
+    new_read.flag = flag
+    new_read.reference_id = ref_id
+    new_read.reference_start = start
+    new_read.next_reference_id = ref_id
+    new_read.next_reference_start = mate_start
+    new_read.tags = (("CB", cell),)
+    new_read.template_length = tlen
+    new_read.query_sequence = 'N'
+    return new_read
+
+
+def make_pair(name_fw, name_rv, flag_fw, flag_rv, ref_id, fw_start, rv_start, cell):
+    """
+    Creates the two reads delimiting a fragment : a forward read at its start and a reverse read at its end.
+    :param name_fw: the name of the forward read.
+    :param name_rv: the name of the reverse read.
+    :param flag_fw: the sam flag of the forward read.
+    :param flag_rv: the sam flag of the reverse read.
+    :param ref_id: the id of the reference on which the fragment is located.
+    :param fw_start: the start position of the forward read.
+    :param rv_start: the start position of the reverse read.
+    :param cell: the cell barcode, stored in the CB tag.
+    :return: a tuple (forward read, reverse read).
+    """
+    # the forward read gets a positive fragment length, the reverse read a negative one
+    frag_len = rv_start - fw_start + 1
+    read_fw = make_read(name_fw, flag_fw, ref_id, fw_start, rv_start, frag_len, cell)
+    read_rv = make_read(name_rv, flag_rv, ref_id, rv_start, fw_start, -frag_len, cell)
+    return read_fw, read_rv
+
+
+def split_bam(file_in, file_out):
+    """
+    Splits each fragment of a paired-end bam file in two fragments of (almost) equal lengths.
+    Only the pairs with a proper orientation (--> <--) are treated. Each fragment is treated once, when
+    its first read is met, and each half is written as a pair of reads of length 1 carrying the cell
+    barcode (CB tag) of the original fragment.
+    :param file_in: the address of the bam file to read.
+    :param file_out: the address of the bam file to write, it is created using the input file as template.
+    """
+    with pysam.AlignmentFile(file_in) as bam_in, \
+            pysam.AlignmentFile(file_out, template=bam_in, mode="wb") as bam_out:
+        for read in bam_in:
+            # use the pysam properties, indexing the binary string of the flag raises an IndexError
+            # for the small flag values
+            read_is_rev = read.is_reverse
+            mate_is_rev = read.mate_is_reverse
+            read_start = read.reference_start
+            mate_start = read.next_reference_start
+            frag_len = abs(read.template_length)
+
+            # check that read and fragment are OK
+            # qc
+            if read.is_qcfail:
+                continue
+            # check pair
+            elif not read.is_paired:
+                continue
+            # --> --> or <-- <--
+            elif read_is_rev == mate_is_rev:
+                continue
+            # <-- -->
+            elif read_is_rev and (read_start < mate_start):
+                continue
+            # <-- -->
+            elif (not read_is_rev) and (read_start > mate_start):
+                continue
+            # a fragment is split once, when its first read is met
+            elif not read.is_read1:
+                continue
+            # the cell barcode is needed to label the new reads
+            elif not read.has_tag("CB"):
+                continue
+            # cannot split a fragment shorter than 2bp (this also skips the reads with a length of 0)
+            elif frag_len < 2:
+                continue
+
+            # Split the fragment in two equally long fragments.
+            # Each read has a length of 1.
+            #   fw          fw
+            #  --->        --->
+            #  |---------|----------|
+            #       <---        <---
+            #        rv          rv
+            # if the first read is fw, the fragment starts with it, otherwise it starts with its mate
+            frag_start = mate_start if read_is_rev else read_start
+            frag_end = frag_start + frag_len - 1
+            frag_mid = frag_start + (frag_len // 2)
+            frag_ref_id = read.reference_id
+            frag_cell = read.get_tag("CB")
+            read_name = read.query_name
+
+            # read starts (inclusive) delimiting the left and the right halves
+            # NOTE(review): the right-most read is located at frag_end - 1 and not at frag_end,
+            # confirm that this is intended
+            left_fw, left_rv = frag_start, frag_mid - 1
+            right_fw, right_rv = frag_mid, frag_end - 1
+
+            if not read_is_rev:
+                # r1 is fw : r1 and r4 delimit the left half, r3 and r2 the right half
+                read1, read4 = make_pair("%s_r1" % read_name, "%s_r4" % read_name,
+                                         FLAG_FW_FIRST, FLAG_RV_SECOND,
+                                         frag_ref_id, left_fw, left_rv, frag_cell)
+                read3, read2 = make_pair("%s_r3" % read_name, "%s_r2" % read_name,
+                                         FLAG_FW_FIRST, FLAG_RV_SECOND,
+                                         frag_ref_id, right_fw, right_rv, frag_cell)
+            else:
+                # r1 is rv : r4 and r1 delimit the right half, r2 and r3 the left half
+                read4, read1 = make_pair("%s_r4" % read_name, "%s_r1" % read_name,
+                                         FLAG_FW_SECOND, FLAG_RV_FIRST,
+                                         frag_ref_id, right_fw, right_rv, frag_cell)
+                read2, read3 = make_pair("%s_r2" % read_name, "%s_r3" % read_name,
+                                         FLAG_FW_SECOND, FLAG_RV_FIRST,
+                                         frag_ref_id, left_fw, left_rv, frag_cell)
+
+            # write
+            for new_read in (read1, read2, read3, read4):
+                bam_out.write(new_read)
+
+
+if __name__ == "__main__":
+
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and split the fragments within in two fragments " \
+             "of equal size."
\
+             " Written by Romain Groux, June 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="the addresse of the bam file to filter.")
+    parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+                      help="The addresse of the output file.")
+    (options, args) = parser.parse_args()
+
+    file_in = options.file_in
+    file_out = options.file_out
+
+    # check options, any failure is reported on stderr and ends the program with status 1
+    if file_in is None:
+        print("Error! No input file given (-i)!", file=sys.stderr)
+        sys.exit(1)
+    elif not os.path.isfile(file_in):
+        print("Error! %s does not exist!" % file_in, file=sys.stderr)
+        sys.exit(1)
+    elif file_out is None:
+        sys.exit("Error! no output file given (-o)!")  # sys.exit(str) prints on stderr, status is 1
+
+    split_bam(file_in, file_out)
diff --git a/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R b/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R
new file mode 100755
index 0000000..7377bed
--- /dev/null
+++ b/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R
@@ -0,0 +1,138 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+# data
+data.1 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_1class_ref.mat"))
+ref.1 = data.1$references
+prob.1 = data.1$prob
+aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_1class_aic.txt")))
+data.1 = NULL
+
+data.2 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class_ref.mat"))
+ref.2 = data.2$references
+prob.2 = data.2$prob
+aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class_aic.txt")))
+data.2 = NULL
+
+data.3 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class_ref.mat"))
+ref.3 = data.3$references
+prob.3 = data.3$prob
+aic.3 = as.matrix(read.table(file.path("results", "bulk_sequencing",
"ctcf_dnase_k562_3class_aic.txt"))) +data.3 = NULL + +data.4 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class_ref.mat")) +ref.4 = data.4$references +prob.4 = data.4$prob +aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class_aic.txt"))) +data.4 = NULL + +data.5 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class_ref.mat")) +ref.5 = data.5$references +prob.5 = data.5$prob +aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class_aic.txt"))) +data.5 = NULL + +data.6 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class_ref.mat")) +ref.6 = data.6$references +prob.6 = data.6$prob +aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class_aic.txt"))) +data.6 = NULL + +data.7 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_7class_ref.mat")) +ref.7 = data.7$references +prob.7 = data.7$prob +aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_7class_aic.txt"))) +data.7 = NULL + +data.8 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_8class_ref.mat")) +ref.8 = data.8$references +prob.8 = data.8$prob +aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_8class_aic.txt"))) +data.8 = NULL + +data.9 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_9class_ref.mat")) +ref.9 = data.9$references +prob.9 = data.9$prob +aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_9class_aic.txt"))) +data.9 = NULL + +data.10 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_10class_ref.mat")) +ref.10 = data.10$references +prob.10 = data.10$prob +aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_10class_aic.txt"))) +data.10 = NULL + +data.11 = 
read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_11class_ref.mat")) +ref.11 = data.11$references +prob.11 = data.11$prob +aic.11 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_11class_aic.txt"))) +data.11 = NULL + +data.12 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_12class_ref.mat")) +ref.12 = data.12$references +prob.12 = data.12$prob +aic.12 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_12class_aic.txt"))) +data.12 = NULL + +data.13 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_13class_ref.mat")) +ref.13 = data.13$references +prob.13 = data.13$prob +aic.13 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_13class_aic.txt"))) +data.13 = NULL + +data.14 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_14class_ref.mat")) +ref.14 = data.14$references +prob.14 = data.14$prob +aic.14 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_14class_aic.txt"))) +data.14 = NULL + +data.15 = read.references(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_15class_ref.mat")) +ref.15 = data.15$references +prob.15 = data.15$prob +aic.15 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_15class_aic.txt"))) +data.15 = NULL + +ref = list(ref.15, ref.14, ref.13, ref.12, ref.11, ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1) +prob = list(prob.15, prob.14, prob.13, prob.12, prob.11, prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2,prob.1) +aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1) + +# number of runs +n_run = length(ref) +# number of different classes overall +n_class_tot = sum(unlist(lapply(ref, nrow))) +# max value of K +n_class_max = max(unlist(lapply(ref, nrow))) + +# some colors +colors = 
rep(brewer.pal(9, "Set1")[1], n_class_max) + +# construct a matrix with all discovered references on the rows +references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) +run_value = vector(length=n_class_tot) +k_value = vector(length=n_class_tot) +probabilities = vector(length=n_class_tot) +k = 1 +for(i in 1:n_run) +{ + for(j in 1:nrow(ref[[i]])) + { references[k,] = ref[[i]][j,] + probabilities[k] = prob[[i]][j] + run_value[k] = i + k_value[k] = j + k = k + 1 + } +} + +# distance matrix between all references +distances = distance.ref(references) +rownames(distances) = 1:nrow(distances) +colnames(distances) = 1:ncol(distances) + +plot.references(file.path("results","bulk_sequencing", "ctcf_dnase.png"), + references, probabilities, colors, aic, distances, n_run, run_value, n_class_max) + diff --git a/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R b/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R index bb60a9b..20bc1dd 100755 --- a/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R +++ b/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R @@ -1,197 +1,138 @@ - - -# functions - -#' Compute the euclidean distance between two references. -#' It also check if a reference is in reverse orientation -#' and returns the smallest distance value. -#' \param ref1 a vector containing the first reference. -#' \param ref2 a vector containing the second reference. -#' \return the euclidean distance. -eucl.dist.ref = function(ref1, ref2) -{ - return(min(sqrt(sum(((ref1 - ref2 ) ^ 2))), - sqrt(sum(((ref1 - rev(ref2)) ^ 2))))) -} - - -#' Compute the correlation distance between two references. -#' It also check if a reference is in reverse orientation -#' and returns the smallest distance value. -#' \param ref1 a vector containing the first reference. -#' \param ref2 a vector containing the second reference. -#' \return the euclidean distance. 
-cor.dist.ref= function(ref1, ref2) -{ - return(1 - min(cor(ref1, ref2 ), - cor(ref1, rev(ref2)))) -} - - -#' Computes the distance matrix, using the euclidean distance, for all -#' the references aggregations given. As some references may be in reverse -#' orientation compared to others, the distance in both orientation is -#' computed, for each pair, and the best is returned. -distance.ref = function(references) -{ n = nrow(references) - d = matrix(nrow=n, ncol=n, data=0) - - for(i in 1:n) - { for(j in 1:i) - { x = eucl.dist.ref(references[i,], references[j,]) - d[i,j] = x - d[j,i] = x - } - } - return(d) -} - - -get_matches = function(distances, run_value) -{ - matches = matrix(nrow=0, ncol=4) - - # references of run i on the row -> y coord - # references of run j on the col -> x coord - - # run labels - run_i = 1 - # run_j = 2 - - for(run_j in setdiff(unique(run_value), run_i)) - { - # number of references in each run - n_i = length(which(run_value == run_i)) - n_j = length(which(run_value == run_j)) - - index_i = which(run_value == run_i) # rows of run i - index_j = which(run_value == run_j) # columns of run j - - i_taken = c() # classes of i already plotted -> rows to ignore - j_taken = c() # classes of j already plotted -> columns to ignore - - # while not all classes in j have been plotted - row_n = 1 - while(length(j_taken) < n_j) - { if(length(i_taken) == 0 && - length(j_taken) == 0) - { distances_tmp = distances[index_i, index_j] - coord = which(distances_tmp == min(distances_tmp), arr.ind=T) - coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) - coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) - coord = c(coord_i, coord_j) - } else { - rows = setdiff(index_i, i_taken) - cols = setdiff(index_j, j_taken) - distances_tmp = distances[rows, cols, drop=F] - coord = which(distances_tmp == min(distances_tmp), arr.ind=T) - coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) - coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) - coord = 
c(coord_i, coord_j) - } - coord = c(coord, row_n, run_j) - i_taken = c(i_taken, coord[1]) - j_taken = c(j_taken, coord[2]) - matches = rbind(matches, coord) - row_n = row_n + 1 - } - } - return(matches) -} - - -plot.references = function(references, distances, n_run, run_value, n_class_max) -{ - colors = brewer.pal(6, "Set1") - - # compute the best matches between all references to 1st run references - matches = get_matches(distances, run_value) - - # make a matrix for layout with good plot numbers - plots.lab = matrix(nrow=n_class_max, ncol=n_run) - plots.lab[,1] = 1:n_class_max # for run with max number of classes - z = n_class_max + 1 - for(i in 1:nrow(matches)) - { coord = matches[i,] - # plots.lab[coord[3], coord[4]] = z - plots.lab[coord[1], coord[4]] = z - z = z + 1 - } - # these will be the empty plots - for(i in 1:nrow(plots.lab)) - { for(j in 1:ncol(plots.lab)) - { if(is.na(plots.lab[i,j])) - { plots.lab[i,j] = z - z = z + 1 - } - } - } - - # plot - X11(height=12, width=10) - # a grid - m = layout(mat = plots.lab) - # layout.show(m) - x = 1:ncol(references) - - # plot run 1 references - for(i in 1:n_class_max) - { plot(x=x, y=references[i,], lwd=3, type='l', col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads") } - - # plot others - for(i in 1:nrow(matches)) - { ref_index = matches[i,2] - col_index = matches[i,3] - plot(x=x, y=references[ref_index,], lwd=3, type='l', col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads") - } -} - - - - - -library(RColorBrewer) - setwd(file.path("/", "local", "groux", "scATAC-seq")) -data.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class.mat"))) -data.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class.mat"))) -data.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class.mat"))) -data.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class.mat"))) -data.6 = 
as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class.mat"))) -data = list(data.6, data.5, data.4, data.3, data.2) +# libraries +library(RColorBrewer) -# some colors -colors = brewer.pal(6, "Set1") +# functions +source(file.path("scripts", "functions.R")) + +# data +data.1 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_ref.mat")) +ref.1 = data.1$references +prob.1 = data.1$prob +aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_aic.txt"))) +data.1 = NULL + +data.2 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_ref.mat")) +ref.2 = data.2$references +prob.2 = data.2$prob +aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_aic.txt"))) +data.2 = NULL + +data.3 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_ref.mat")) +ref.3 = data.3$references +prob.3 = data.3$prob +aic.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_aic.txt"))) +data.3 = NULL + +data.4 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_ref.mat")) +ref.4 = data.4$references +prob.4 = data.4$prob +aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_aic.txt"))) +data.4 = NULL + +data.5 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_ref.mat")) +ref.5 = data.5$references +prob.5 = data.5$prob +aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_aic.txt"))) +data.5 = NULL + +data.6 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_ref.mat")) +ref.6 = data.6$references +prob.6 = data.6$prob +aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_aic.txt"))) +data.6 = NULL + +data.7 = read.references(file.path("results", 
"bulk_sequencing", "ctcf_mnase_k562_7class_ref.mat")) +ref.7 = data.7$references +prob.7 = data.7$prob +aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_aic.txt"))) +data.7 = NULL + +data.8 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_ref.mat")) +ref.8 = data.8$references +prob.8 = data.8$prob +aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_aic.txt"))) +data.8 = NULL + +data.9 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_ref.mat")) +ref.9 = data.9$references +prob.9 = data.9$prob +aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_aic.txt"))) +data.9 = NULL + +data.10 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_ref.mat")) +ref.10 = data.10$references +prob.10 = data.10$prob +aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_aic.txt"))) +data.10 = NULL + +data.11 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_11class_ref.mat")) +ref.11 = data.11$references +prob.11 = data.11$prob +aic.11 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_11class_aic.txt"))) +data.11 = NULL + +data.12 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_12class_ref.mat")) +ref.12 = data.12$references +prob.12 = data.12$prob +aic.12 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_12class_aic.txt"))) +data.12 = NULL + +data.13 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_13class_ref.mat")) +ref.13 = data.13$references +prob.13 = data.13$prob +aic.13 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_13class_aic.txt"))) +data.13 = NULL + +data.14 = read.references(file.path("results", "bulk_sequencing", 
"ctcf_mnase_k562_14class_ref.mat")) +ref.14 = data.14$references +prob.14 = data.14$prob +aic.14 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_14class_aic.txt"))) +data.14 = NULL + +data.15 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_15class_ref.mat")) +ref.15 = data.15$references +prob.15 = data.15$prob +aic.15 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_15class_aic.txt"))) +data.15 = NULL + +ref = list(ref.15, ref.14, ref.13, ref.12, ref.11, ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1) +prob = list(prob.15, prob.14, prob.13, prob.12, prob.11, prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2,prob.1) +aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1) # number of runs -n_run = length(data) +n_run = length(ref) # number of different classes overall -n_class_tot = sum(unlist(lapply(data, nrow))) +n_class_tot = sum(unlist(lapply(ref, nrow))) # max value of K -n_class_max = max(unlist(lapply(data, nrow))) +n_class_max = max(unlist(lapply(ref, nrow))) + +# some colors +colors = rep(brewer.pal(9, "Set1")[2], n_class_max) # construct a matrix with all discovered references on the rows -references = matrix(nrow=n_class_tot, ncol=ncol(data[[1]])) -run_value = vector(length=n_class_tot) -k_value = vector(length=n_class_tot) +references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) +run_value = vector(length=n_class_tot) +k_value = vector(length=n_class_tot) +probabilities = vector(length=n_class_tot) k = 1 for(i in 1:n_run) -{ for(j in 1:nrow(data[[i]])) - { references[k,] = data[[i]][j,] +{ + for(j in 1:nrow(ref[[i]])) + { references[k,] = ref[[i]][j,] + probabilities[k] = prob[[i]][j] run_value[k] = i k_value[k] = j k = k + 1 } } # distance matrix between all references -distances = distance.ref(references) +distances = distance.ref(references) 
rownames(distances) = 1:nrow(distances) colnames(distances) = 1:ncol(distances) - -plot.references(references, distances, n_run, run_value, n_class_max) -savePlot("tmp_dnase.png") - +plot.references(file.path("results","bulk_sequencing", "ctcf_mnase.png"), + references, probabilities, colors, aic, distances, n_run, run_value, n_class_max) diff --git a/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh b/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh index 7c28f17..4414100 100755 --- a/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh +++ b/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh @@ -1,20 +1,23 @@ results_dir='results/bulk_sequencing' data_dir='data/bulk_sequencing/' mkdir -p $results_dir file_mnase=$data_dir'/ctcf_dnase_k562.mat' +file_seed=$results_dir'/ctcf_dnase_k562_seed.txt' n_iter='20' n_shift='21' -seed='12345678' seeding='random' +n_core=5 -for k in 2 3 4 5 6 +for k in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_dnase_k562_'$k'class_prob.mat4d' file_ref=$results_dir/'ctcf_dnase_k562_'$k'class_ref.mat' - file_aic=$results_dir/'ctcf_dnase_k562_'$k'class_aic.mat' - bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed -p 6 > $file_prob - bin/probToRef --data $file_mnase --prob $file_prob 1> $file_ref 2> $file_aic + file_aic=$results_dir/'ctcf_dnase_k562_'$k'class_aic.txt' + echo "$file_prob $seed" >> $file_seed + bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob + bin/probToRef --data $file_mnase --prob $file_prob --parallel $n_core 1> $file_ref 2> $file_aic done diff --git a/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh b/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh index 1f3a3fd..29779c0 100755 --- a/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh +++ 
b/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh @@ -1,20 +1,23 @@ results_dir='results/bulk_sequencing' data_dir='data/bulk_sequencing/' mkdir -p $results_dir file_mnase=$data_dir'/ctcf_mnase_k562.mat' +file_seed=$results_dir'/ctcf_mnase_k562_seed.txt' n_iter='20' n_shift='21' -seed='12345678' seeding='random' +n_core=5 -for k in 2 3 4 5 6 +for k in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_mnase_k562_'$k'class_prob.mat4d' file_ref=$results_dir/'ctcf_mnase_k562_'$k'class_ref.mat' - file_aic=$results_dir/'ctcf_mnase_k562_'$k'class_aic.mat' - bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed -p 6 > $file_prob - bin/probToRef --data $file_mnase --prob $file_prob 1> $file_ref 2> $file_aic + file_aic=$results_dir/'ctcf_mnase_k562_'$k'class_aic.txt' + echo "$file_prob $seed" >> $file_seed + bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob + bin/probToRef --data $file_mnase --prob $file_prob --parallel $n_core 1> $file_ref 2> $file_aic done diff --git a/scripts/bulk_sequencing/functions.R b/scripts/bulk_sequencing/functions.R new file mode 100644 index 0000000..453b404 --- /dev/null +++ b/scripts/bulk_sequencing/functions.R @@ -0,0 +1,221 @@ + +#' Reads a reference file and returns a list +#' with the class references and the associated +#' class probabilities. +#' \param file the path to the file of interest. +#' \return a list of two elements : "references" +#' a matrix with the references on each row and +#' "prob" the associated class probabilities. +#' +read.references = function(file) +{ ref = as.matrix(read.table(file), drop=F) + prob = ref[,1] + ref = ref[,-1, drop=F] + return(list(references=ref, prob=prob)) +} + +#' Compute the euclidean distance between two references. 
+#' It also check if a reference is in reverse orientation +#' and returns the smallest distance value. +#' \param ref1 a vector containing the first reference. +#' \param ref2 a vector containing the second reference. +#' \return the euclidean distance. +eucl.dist.ref = function(ref1, ref2) +{ + return(min(sqrt(sum(((ref1 - ref2 ) ^ 2))), + sqrt(sum(((ref1 - rev(ref2)) ^ 2))))) +} + + +#' Compute the correlation distance between two references. +#' It also check if a reference is in reverse orientation +#' and returns the smallest distance value. +#' \param ref1 a vector containing the first reference. +#' \param ref2 a vector containing the second reference. +#' \return the euclidean distance. +cor.dist.ref= function(ref1, ref2) +{ + return(1 - min(cor(ref1, ref2 ), + cor(ref1, rev(ref2)))) +} + + +#' Computes the (eucliden) distance matrix for all the given +#' the references As some references may be in reverse +#' orientation compared to others, the distance in both +#' orientation is computed, for each pair, and the best is +#' returned. +#' \param references a matrix with the references on each row. +#' \return a matrix containing the distances between each reference. 
+distance.ref = function(references) +{ n = nrow(references) + d = matrix(nrow=n, ncol=n, data=0) + + for(i in 1:n) + { for(j in 1:i) + { x = eucl.dist.ref(references[i,], references[j,]) + d[i,j] = x + d[j,i] = x + } + } + return(d) +} + + +get_matches = function(distances, run_value) +{ + matches = matrix(nrow=0, ncol=4) + + # references of run i on the row -> y coord + # references of run j on the col -> x coord + + # run labels + run_i = 1 + # run_j = 2 + + for(run_j in setdiff(unique(run_value), run_i)) + { + # number of references in each run + n_i = length(which(run_value == run_i)) + n_j = length(which(run_value == run_j)) + + index_i = which(run_value == run_i) # rows of run i + index_j = which(run_value == run_j) # columns of run j + + i_taken = c() # classes of i already matched -> rows to ignore + j_taken = c() # classes of j already matched -> columns to ignore + + # while not all classes in j have been assigned a best match + row_n = 1 + while(length(j_taken) < n_j) + { if(length(i_taken) == 0 && + length(j_taken) == 0) + { distances_tmp = distances[index_i, index_j] + coord = which(distances_tmp == min(distances_tmp), arr.ind=T) + coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) + coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) + coord = c(coord_i, coord_j) + } else { + rows = setdiff(index_i, i_taken) + cols = setdiff(index_j, j_taken) + distances_tmp = distances[rows, cols, drop=F] + coord = which(distances_tmp == min(distances_tmp), arr.ind=T) + coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) + coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) + coord = c(coord_i, coord_j) + } + coord = c(coord, row_n, run_j) + i_taken = c(i_taken, coord[1]) + j_taken = c(j_taken, coord[2]) + matches = rbind(matches, coord) + row_n = row_n + 1 + } + } + return(matches) +} + + + +#'Creates a composite figure in which several class references from +#'several partitions, with different numbers of classes, are plotted. 
+#'The figure is composed of a matrix of rows and +#'columns where is the highest number of classes in all +#'partitions and the number of different partition. T +#'The first column will contain the references of the +#'partition with classes. The next columns will contain the +#'references of the partition with the second biggest number of +#'classes (and so on). In a given column, except the 1st one, +#'the references are ordered (over the rows) such that the +#'overall similarity (euclidean distance) with the 1st column +#'references are maximized. +#'\file the file name where the image will be saved. +#'\param references a matrix with the different references to draw on +#'each row. +#'\param a vector containing the class probability (or weight) associated +#'to each corresponding reference (row) in matrix. +#'\param a vector of values that will be displayed atop of each +#'column of plots. +#'\param distances a distance matrix containing the distance between all +#'references. The row and column labels have to be the row and column +#'number (1, 2, 3, ...)! +#'\param n_run the total number of different partitions to which all +#'references belong. +#'\param run_value a vector indicating to which partition each reference +#'(row of references) belong to. 
It should be a simple vector of integers, +#'for instance 1,1,1,1,2,2,2,3,3 +#'\param n_class_max, the highest number of classes searches in all partitions () +plot.references = function(file, references, probabilities, col.titles, distances, n_run, run_value, n_class_max) +{ + colors = brewer.pal(6, "Set1") + + # compute the best matches between all references to 1st run references + matches = get_matches(distances, run_value) + + # make a matrix for layout with good plot numbers + plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) + plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) + plots.lab[-1,1] = 1:n_class_max # for run with max number of classes + z = n_class_max + 1 + for(i in 1:nrow(matches)) + { coord = matches[i,] + # plots.lab[coord[3], coord[4]] = z + plots.lab[coord[1]+1, coord[4]] = z + z = z + 1 + } + # these will be the empty plots + for(i in 1:nrow(plots.lab)) + { for(j in 1:ncol(plots.lab)) + { if(is.na(plots.lab[i,j])) + { plots.lab[i,j] = z + z = z + 1 + } + } + } + + # plot + # X11(height=24, width=20) + png(filename=file, width=20, height=24, unit="in", res=720) + # a grid + m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) + # layout.show(m) + x = 1:ncol(references) + + # plot references of partition with highest number of classes + for(i in 1:n_class_max) + { plot(x=x, y=references[i,], lwd=3, type='l', ylim=c(0, 1.2*max(references[i,])), + col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads") + # prob + x_ = 0.85*length(references[i,]) + y_ = max(references[i,]) + lab = round(probabilities[i],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # plot others + for(i in 1:nrow(matches)) + { ref_index = matches[i,2] + col_index = matches[i,3] + plot(x=x, y=references[ref_index,], lwd=3, type='l', ylim=c(0, 1.2*max(references[ref_index,])), + col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads") + # prob + x_ = 
0.85*length(references[ref_index,]) + y_ = max(references[ref_index,]) + lab = round(probabilities[ref_index],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # empty plots + for(i in (length(run_value)+1):(n_run*n_class_max)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } + + # col titles + p = par(mar=c(0,0,0,0)) + for(i in 1:length(col.titles)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") + text(1,1, labels=col.titles[i], cex=2) + } + par(p) + dev.off() +} + + diff --git a/scripts/bulk_sequencing/prepare_data.R b/scripts/bulk_sequencing/prepare_data.R index ad4b79a..b7ee0ab 100755 --- a/scripts/bulk_sequencing/prepare_data.R +++ b/scripts/bulk_sequencing/prepare_data.R @@ -1,13 +1,21 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) -dnase1 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat"))) -dnase2 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat"))) -dnase3 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep3.mat"))) +# DNaseI around CTCF +ctcf.dnase1 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat"))) +ctcf.dnase2 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat"))) +ctcf.dnase3 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep3.mat"))) + +# DNaseI around TSS +tss.dnase1 = as.matrix(read.table(file.path("data", "bulk_sequencing", "tss_dnase_k562_rep1.mat"))) +tss.dnase2 = as.matrix(read.table(file.path("data", "bulk_sequencing", "tss_dnase_k562_rep1.mat"))) +tss.dnase3 = as.matrix(read.table(file.path("data", "bulk_sequencing", "tss_dnase_k562_rep3.mat"))) # sum everything to increase coverage -dnase = dnase1 + dnase2 + dnase3 +ctcf.dnase = ctcf.dnase1 + ctcf.dnase2 + ctcf.dnase3 +tss.dnase = tss.dnase1 + tss.dnase2 + tss.dnase3 # write the new tables -write.table(dnase, 
file=file.path("data", "bulk_sequencing", "ctcf_dnase_k562.mat"), col.names=F, row.names=F, quote=F, eol='\n', sep='\t') +write.table(ctcf.dnase, file=file.path("data", "bulk_sequencing", "ctcf_dnase_k562.mat"), col.names=F, row.names=F, quote=F, eol='\n', sep='\t') +write.table(tss.dnase, file=file.path("data", "bulk_sequencing", "tss_dnase_k562.mat"), col.names=F, row.names=F, quote=F, eol='\n', sep='\t') diff --git a/scripts/functions.R b/scripts/functions.R new file mode 100644 index 0000000..d981bc0 --- /dev/null +++ b/scripts/functions.R @@ -0,0 +1,511 @@ + +#' Reads a reference file and returns a list +#' with the class references and the associated +#' class probabilities. +#' \param file the path to the file of interest. +#' \return a list of two elements : "references" +#' a matrix with the references on each row and +#' "prob" the associated class probabilities. +#' +read.references = function(file) +{ ref = as.matrix(read.table(file), drop=F) + prob = ref[,1] + ref = ref[,-1, drop=F] + return(list(references=ref, prob=prob)) +} + +#' Compute the euclidean distance between two references. +#' It also check if a reference is in reverse orientation +#' and returns the smallest distance value. +#' \param ref1 a vector containing the first reference. +#' \param ref2 a vector containing the second reference. +#' \return the euclidean distance. +eucl.dist.ref = function(ref1, ref2) +{ + return(min(sqrt(sum(((ref1 - ref2 ) ^ 2))), + sqrt(sum(((ref1 - rev(ref2)) ^ 2))))) +} + + +#' Compute the correlation distance between two references. +#' It also check if a reference is in reverse orientation +#' and returns the smallest distance value. +#' \param ref1 a vector containing the first reference. +#' \param ref2 a vector containing the second reference. +#' \return the euclidean distance. 
+cor.dist.ref= function(ref1, ref2) +{ + return(1 - min(cor(ref1, ref2 ), + cor(ref1, rev(ref2)))) +} + + +#' Computes the (eucliden) distance matrix for all the given +#' the references As some references may be in reverse +#' orientation compared to others, the distance in both +#' orientation is computed, for each pair, and the best is +#' returned. +#' \param references a matrix with the references on each row. +#' \return a matrix containing the distances between each reference. +distance.ref = function(references) +{ n = nrow(references) + d = matrix(nrow=n, ncol=n, data=0) + + for(i in 1:n) + { for(j in 1:i) + { x = eucl.dist.ref(references[i,], references[j,]) + d[i,j] = x + d[j,i] = x + } + } + return(d) +} + + +get_matches = function(distances, run_value) +{ + matches = matrix(nrow=0, ncol=4) + + # references of run i on the row -> y coord + # references of run j on the col -> x coord + + # run labels + run_i = 1 + # run_j = 2 + + for(run_j in setdiff(unique(run_value), run_i)) + { + # number of references in each run + n_i = length(which(run_value == run_i)) + n_j = length(which(run_value == run_j)) + + index_i = which(run_value == run_i) # rows of run i + index_j = which(run_value == run_j) # columns of run j + + i_taken = c() # classes of i already matched -> rows to ignore + j_taken = c() # classes of j already matched -> columns to ignore + + # while not all classes in j have been assigned a best match + row_n = 1 + while(length(j_taken) < n_j) + { if(length(i_taken) == 0 && + length(j_taken) == 0) + { distances_tmp = distances[index_i, index_j, drop=F] + coord = which(distances_tmp == min(distances_tmp), arr.ind=T) + coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) + coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) + coord = c(coord_i, coord_j) + } else { + rows = setdiff(index_i, i_taken) + cols = setdiff(index_j, j_taken) + distances_tmp = distances[rows, cols, drop=F] + coord = which(distances_tmp == min(distances_tmp), arr.ind=T) + 
coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) + coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) + coord = c(coord_i, coord_j) + } + coord = c(coord, row_n, run_j) + i_taken = c(i_taken, coord[1]) + j_taken = c(j_taken, coord[2]) + matches = rbind(matches, coord) + row_n = row_n + 1 + } + } + return(matches) +} + + + +#'Creates a composite figure in which several class references from +#'several partitions, with different numbers of classes, are plotted. +#'The figure is composed of a matrix of rows and +#'columns where is the highest number of classes in all +#'partitions and the number of different partition. T +#'The first column will contain the references of the +#'partition with classes. The next columns will contain the +#'references of the partition with the second biggest number of +#'classes (and so on). In a given column, except the 1st one, +#'the references are ordered (over the rows) such that the +#'overall similarity (euclidean distance) with the 1st column +#'references are maximized. +#'\param file the file name where the image will be saved. +#'\param references a matrix with the different references to draw on +#'each row. +#'\param references a vector containing the class probability (or weight) associated +#'to each corresponding reference (row) in matrix. +#'\param probabilities a vector of values that will be displayed atop of each +#'column of plots. +#'\param colors a vector of colors to draw the class profiles. There should +#'be colors, they can be the same. +#'\param distances a distance matrix containing the distance between all +#'references. The row and column labels have to be the row and column +#'number (1, 2, 3, ...)! +#'\param n_run the total number of different partitions to which all +#'references belong. +#'\param run_value a vector indicating to which partition each reference +#'(row of references) belong to. 
It should be a simple vector of integers, +#'for instance 1,1,1,1,2,2,2,3,3 +#'\param n_class_max, the highest number of classes searches in all partitions () +plot.references = function(file, + references, + probabilities, + colors, + col.titles, + distances, + n_run, + run_value, + n_class_max, + width=15, + height=18) +{ + # compute the best matches between all references to 1st run references + matches = get_matches(distances, run_value) + + # make a matrix for layout with good plot numbers + plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) + plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) + plots.lab[-1,1] = 1:n_class_max # for run with max number of classes + z = n_class_max + 1 + for(i in 1:nrow(matches)) + { coord = matches[i,] + # plots.lab[coord[3], coord[4]] = z + plots.lab[coord[1]+1, coord[4]] = z + z = z + 1 + } + # these will be the empty plots + for(i in 1:nrow(plots.lab)) + { for(j in 1:ncol(plots.lab)) + { if(is.na(plots.lab[i,j])) + { plots.lab[i,j] = z + z = z + 1 + } + } + } + + + # plot + png(filename=file, width=width, height=height, unit="in", res=720) + # a grid + m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) + layout.show(m) + x = 1:ncol(references) + + # plot references of partition with highest number of classes + for(i in 1:n_class_max) + { plot(x=x, y=references[i,], lwd=2, type='l', ylim=c(0, 1.2*max(references[i,])), + col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads") + # prob + x_ = 0.85*length(references[i,]) + y_ = max(references[i,]) + lab = round(probabilities[i],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # plot others + for(i in 1:nrow(matches)) + { + ref_index = matches[i,2] + col_index = matches[i,3] + + + plot(x=x, y=references[ref_index,], lwd=2, type='l', ylim=c(0, 1.2*max(references[ref_index,])), + col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads") + # prob + x_ = 
0.85*length(references[ref_index,]) + y_ = max(references[ref_index,]) + lab = round(probabilities[ref_index],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # empty plots + for(i in (length(run_value)+1):(n_run*n_class_max)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } + + # col titles + p = par(mar=c(0,0,0,0)) + for(i in 1:length(col.titles)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") + text(1,1, labels=col.titles[i], cex=2) + } + par(p) + dev.off() +} + + + + +plot.references.2 = function(file, + references, + probabilities, + colors, + col.titles, + distances, + n_run, + run_value, + n_class_max, + width=15, + height=18) +{ + # compute the best matches between all references to 1st run references + matches = get_matches(distances, run_value) + + # make a matrix for layout with good plot numbers + plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) + plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) + plots.lab[-1,1] = 1:n_class_max # for run with max number of classes + z = n_class_max + 1 + for(i in 1:nrow(matches)) + { coord = matches[i,] + # plots.lab[coord[3], coord[4]] = z + plots.lab[coord[1]+1, coord[4]] = z + z = z + 1 + } + # these will be the empty plots + for(i in 1:nrow(plots.lab)) + { for(j in 1:ncol(plots.lab)) + { if(is.na(plots.lab[i,j])) + { plots.lab[i,j] = z + z = z + 1 + } + } + } + + # plot + if(is.null(file)) + { X11(width=width, height=height) } + else + { png(filename=file, width=width, height=height, unit="in", res=720) } + # a grid + m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) + # layout.show(m) + x = 1:ncol(references[[1]]) + + # plot references of partition with highest number of classes + for(i in 1:n_class_max) + { for(j in 1:length(references)) + { + ylim = c(0, 1.2) + if(j == 1) + { plot(x=x, y=references[[j]][i,]/max(references[[j]][i,]), + 
lwd=2, type='l', ylim=ylim, + col=colors[j], main="", xlab="pos [bp]", ylab="Nb reads") + } + else + { lines(x=x, y=references[[j]][i,]/max(references[[j]][i,]), + lwd=2, type='l', col=colors[j]) + } + } + + # prob + x_ = 0.85*length(references[[1]][i,]) + # y_ = max(references[[1]][i,]) + y_ = 0.85 + lab = round(probabilities[i],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # plot others + for(i in 1:nrow(matches)) + { ref_index = matches[i,2] + col_index = matches[i,3] + for(j in 1:length(references)) + { ylim = c(0, 1.2) + if(j == 1) + { plot(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), + lwd=2, type='l', ylim=ylim, + col=colors[j], main="", xlab="pos [bp]", ylab="Nb reads") + } + else + { lines(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), + lwd=2, col=colors[j]) + } + } + # prob + x_ = 0.85*length(references[[1]][ref_index,]) + # y_ = max(references[[1]][ref_index,]) + y_ = 0.85 + lab = round(probabilities[ref_index],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # empty plots + for(i in (length(run_value)+1):(n_run*n_class_max)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } + + # col titles + p = par(mar=c(0,0,0,0)) + for(i in 1:length(col.titles)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") + text(1,1, labels=col.titles[i], cex=2) + } + par(p) + if(!is.null(file)) + { dev.off() } +} + + +plot.references.3 = function(file, + references, + probabilities, + colors, + col.titles, + distances, + n_run, + run_value, + n_class_max, + width=15, + height=18) +{ + # compute the best matches between all references to 1st run references + matches = get_matches(distances, run_value) + + # make a matrix for layout with good plot numbers + plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) + plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) + plots.lab[-1,1] = 
1:n_class_max # for run with max number of classes + z = n_class_max + 1 + for(i in 1:nrow(matches)) + { coord = matches[i,] + # plots.lab[coord[3], coord[4]] = z + plots.lab[coord[1]+1, coord[4]] = z + z = z + 1 + } + # these will be the empty plots + for(i in 1:nrow(plots.lab)) + { for(j in 1:ncol(plots.lab)) + { if(is.na(plots.lab[i,j])) + { plots.lab[i,j] = z + z = z + 1 + } + } + } + + # plot + if(is.null(file)) + { X11(width=width, height=height) } + else + { png(filename=file, width=width, height=height, unit="in", res=720) } + + p = par(mar=c(0,0,0,0)) + + # a grid + m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) + # layout.show(m) + x = 1:ncol(references[[1]]) + + # plot references of partition with highest number of classes + for(i in 1:n_class_max) + { for(j in 1:length(references)) + { + ylim = c(0, 1.2) + if(j == 1) + { plot(x=x, y=references[[j]][i,]/max(references[[j]][i,]), + lwd=2, type='l', ylim=ylim, + col=colors[j], main='', xlab='', ylab='', + xaxt='n', yaxt='n') + } + else + { lines(x=x, y=references[[j]][i,]/max(references[[j]][i,]), + lwd=2, type='l', col=colors[j]) + } + } + + # prob + x_ = 0.85*length(references[[1]][i,]) + # y_ = max(references[[1]][i,]) + y_ = 0.85 + lab = round(probabilities[i],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # plot others + for(i in 1:nrow(matches)) + { ref_index = matches[i,2] + col_index = matches[i,3] + for(j in 1:length(references)) + { ylim = c(0, 1.2) + if(j == 1) + { plot(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), + lwd=2, type='l', ylim=ylim, + col=colors[j], main='', xlab='', ylab='', + xaxt='n', yaxt='n') + } + else + { lines(x=x, y=references[[j]][ref_index,]/max(references[[j]][ref_index,]), + lwd=2, col=colors[j]) + } + } + # prob + x_ = 0.85*length(references[[1]][ref_index,]) + # y_ = max(references[[1]][ref_index,]) + y_ = 0.85 + lab = round(probabilities[ref_index],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + # empty plots + 
for(i in (length(run_value)+1):(n_run*n_class_max)) + { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } + + # col titles + for(i in 1:length(col.titles)) + { plot(1,1, xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") + text(1,1, labels=col.titles[i], cex=2) + } + par(p) + if(!is.null(file)) + { dev.off() } +} + + +plot.references.4 = function(file, + references, + probabilities, + colors, + width=15, + height=18) +{ + n_class = nrow(references[[1]]) + n_col = ncol(references[[1]]) + mat = matrix(nrow=n_class, ncol=1, data=1:n_class) + + # plot + if(is.null(file)) + { X11(width=width, height=height) } + else + { png(filename=file, width=width, height=height, unit="in", res=720) } + + p = par(mar=c(0,0,0,0)) + + # a grid + m = layout(mat = mat) + # layout.show(m) + x = 1:n_col + + for(i in 1:n_class) + { for(j in 1:length(references)) + { + ylim = c(0, 1.2) + if(j == 1) + { plot(x=x, y=references[[j]][i,]/max(references[[j]][i,]), + lwd=2, type='l', ylim=ylim, + col=colors[j], main='', xlab='', ylab='', + xaxt='n', yaxt='n') + } + else + { lines(x=x, y=references[[j]][i,]/max(references[[j]][i,]), + lwd=2, type='l', col=colors[j]) + } + } + # prob + x_ = 0.85*length(references[[1]][i,]) + # y_ = max(references[[1]][i,]) + y_ = 0.85 + lab = round(probabilities[i],3) + text(x=x_, y=y_, labels=lab, cex=1.2) + } + + if(!is.null(file)) + { dev.off() } +} + diff --git a/scripts/install_libraries/install_libSeqAn.sh b/scripts/install_libraries/install_libSeqAn.sh new file mode 100644 index 0000000..1bdd88f --- /dev/null +++ b/scripts/install_libraries/install_libSeqAn.sh @@ -0,0 +1,14 @@ +# install the header only SeqAn library + +library_dir='lib/seqan' + +# clone git +git clone https://github.com/seqan/seqan.git +cd seqan +#install +mkdir -p ../$library_dir +## header files +mv * ../$library_dir +cd .. 
+# clean +rm -rf seqan diff --git a/scripts/install_libraries/run_all.sh b/scripts/install_libraries/run_all.sh new file mode 100644 index 0000000..6251f3d --- /dev/null +++ b/scripts/install_libraries/run_all.sh @@ -0,0 +1,5 @@ +mkdir lib/ +mkdir lib/include + +scripts/install_libraries/install_libStatGen.sh + diff --git a/scripts/install_programs/install_deeptools.sh b/scripts/install_programs/install_deeptools.sh new file mode 100644 index 0000000..9df2c69 --- /dev/null +++ b/scripts/install_programs/install_deeptools.sh @@ -0,0 +1,8 @@ + +# make sure that pip is installed for python3.6 +# curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +# sudo python3.6 get-pip.py --force-reinstall + +# install deeptools for python3.6 +sudo pip3.6 install deeptools + diff --git a/scripts/install_programs/run_all.sh b/scripts/install_programs/run_all.sh new file mode 100644 index 0000000..fd23b47 --- /dev/null +++ b/scripts/install_programs/run_all.sh @@ -0,0 +1 @@ +scripts/install_programs/install_deeptools.sh diff --git a/scripts/plot_references.R b/scripts/plot_references.R deleted file mode 100644 index 1c7c10a..0000000 --- a/scripts/plot_references.R +++ /dev/null @@ -1,27 +0,0 @@ -data_rand = as.matrix(read.table("mnase_random.txt", h=F)) -data_rand_new = as.matrix(read.table("mnase_random_new.txt", h=F)) -data_sampling = as.matrix(read.table("mnase_sampling.txt", h=F)) -data_rand_r = as.matrix(read.table("mnase_R.txt", h=F)) - -par(mfrow=c(2,2)) - -x = 1:ncol(data_rand_r) -plot(x, data_rand_r[1,], type='l', lwd=3, ylim=c(min(data_rand_r), max(data_rand_r)), main="R Random seeding") -for(i in 2:nrow(data_rand_r)) -{lines(x, data_rand_r[i,], lwd=3, col=i) } - -x = 1:ncol(data_rand) -plot(x, data_rand[1,], type='l', lwd=3, ylim=c(min(data_rand), max(data_rand)), main="C++ Random seeding") -for(i in 2:nrow(data_rand)) -{lines(x, data_rand[i,], lwd=3, col=i) } - -x = 1:ncol(data_rand_new) -plot(x, data_rand_new[1,], type='l', lwd=3, ylim=c(min(data_rand_new), 
max(data_rand_new)), main="C++ New random seeding") -for(i in 2:nrow(data_rand_new)) -{lines(x, data_rand_new[i,], lwd=3, col=i) } - -x = 1:ncol(data_sampling) -plot(x, data_sampling[1,], type='l', lwd=3, ylim=c(min(data_sampling), max(data_sampling)), main="C++ Sampling seeding") -for(i in 2:nrow(data_sampling)) -{lines(x, data_sampling[i,], lwd=3, col=i) } - diff --git a/scripts/run_all.sh b/scripts/run_all.sh index 55b002e..a698e65 100755 --- a/scripts/run_all.sh +++ b/scripts/run_all.sh @@ -1,2 +1,11 @@ -scripts/simulate_chipseq_data/run_all.sh +# install programs +scripts/install_programs/run_all.sh + +# install libraries +scripts/install_libraries/run_all.sh + +# simulate data for testing purposes (the generators live in scripts/toy_data) +scripts/toy_data/run_all.sh + + diff --git a/scripts/simulate_chipseq_data/run_all.sh b/scripts/simulate_chipseq_data/run_all.sh deleted file mode 100755 index 716b134..0000000 --- a/scripts/simulate_chipseq_data/run_all.sh +++ /dev/null @@ -1,2 +0,0 @@ -Rscript scripts/simulate_chipseq_data/simulate_data_chipseq.R - diff --git a/scripts/toy_data/generate_bam_file.py b/scripts/toy_data/generate_bam_file.py new file mode 100644 index 0000000..13c7419 --- /dev/null +++ b/scripts/toy_data/generate_bam_file.py @@ -0,0 +1,238 @@ + +# This toy dataset contains 2 chromosomes with exactly the same fragments +# on each one of them. Each fragment belongs to 1 cell. For each chromosome, the situation is +# the following. +# In the following picture, the fragments are depicted as [from,to) interval. Thus the fragments +# contain the from position but do NOT contain the end position. Each fragment is present 2x, +# once the fw read is the 1st read of the pair, once the rv read is the 1st read of the pair. +# +# Each fragment is composed of two reads of 35bp each. The fw read is always the leftmost read of the +# pair and the rv read the rightmost. 
+# +# <--------------------------> +# AAAAAAA +# ------> <------ +# TTTTTTT +# read fw (35bp) read rv (35bp) +# +# The genome is only made of A on the fw strand and T on the rv strand. +# +# 550 650 750 850 950 1050 1150 1250 1350 1450 +# | | | | | | | | | | +# --------------------------------------------------------------------------------------------------> chrom +# 400 480 | | | | | | | | | | +# cell 0 <-----> | | | | | | | | | | +# 480 550 | | | | | | | | | +# cell 1 <----->| | | | | | | | | | +# | 560 | | 800 | | | | | | | +# cell 2 | <------------------> | | | | | | | +# | 560 640| | | | | | | | | +# cell 3 | <----->| | | | | | | | | +# | 610| 690 | | | | | | | | +# cell 4 | <-----> | | | | | | | | +# | |670 750 | | | | | | | +# cell 5 | | <-----> | | | | | | | +# | | 730 810 | | | | | | | +# cell 6 | | <-----> | | | | | | | +# | | | 770 850 | | | | | | +# cell 7 | | | <-----> | | | | | | +# | | | | 950 | 1150 | | | +# cell 8 | | | | <-----------------> | | | +# | | | | |960 1040 | | | | +# cell 9 | | | | | <----->| | | | | +# | | | | | 1010 1090 | | | | +# cell 10 | | | | | <-----> | | | | +# | | | | | |1060 1140 | | | +# cell 11 | | | | | | <----->| | | | +# | | | | | | 1070 1150 | | | +# cell 12 | | | | | | <-----> | | | +# | | | | | | | | 1350 1430| +# cell 13 | | | | | | | | <-----> | +# | | | | | | | | |1360 1440 +# cell 14 | | | | | | | | | <----->| +# | | | | | | | | | 1410 | 1490 +# cell 15 | | | | | | | | | <-----> +# | | | | | | | | | | 1500 1600 +# cell 16 | | | | | | | | | | <------------> +# | | | | | | | | | | 1600 1700 +# cell 17 | | | | | | | | | | <------------> + +import pysam +import os + +def create_read_pair(frag_start, ref_id, frag_len): + + # Creates 2 pairs of read representing twice + # the same fragment. In one pair, the 1st + # read is fw and the 2n is rev, in the other + # pair, the 1st read is rev and the 2nd is fw. 
+ # ---------------------------------> reference + # read_fw1 read_rv1 + # --------> <-------- + # |start end| + # |-----------frag len-----------| + # |end start| + # read_fw2 read_rv2 + # --------> <-------- + + n = create_read_pair.counter + + # the reads + read_fw1 = pysam.AlignedSegment() + read_rv1 = pysam.AlignedSegment() + read_fw2 = pysam.AlignedSegment() + read_rv2 = pysam.AlignedSegment() + + # the start of the reverse mate (rightmost pos) + # start_rv = frag_start + frag_len - 1 + + # the start of the reverse mate (leftmost pos) + start_rv = frag_start + frag_len - 35 + print("%d %d" % (frag_start, start_rv)) + + # flags + flag_fw1 = 99 # paired read, read mapped in proper pair + # mate rev strand, first in pair + flag_rv1 = 147 # paired read, read mapped in proper pair + # read in rev strand, second in pair + + flag_fw2 = 163 # paired read, read mapped in proper pair + # mate rev strand, second in pair + flag_rv2 = 83 # paired read, read mapped in proper pair + # read in rev strand, first in pair + + # optional field tags + extra_tags = (("NM", 1), # edit distance with ref + ("RG", "L1"), # read group + ("CB", "cell_%d" % n)) # cell barcode + + # pair 1 : 1st read fw, 2nd read rev + ## fw read + read_fw1.query_name = "read_fw1_%d" % n + read_fw1.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + read_fw1.flag = flag_fw1 + read_fw1.reference_id = ref_id + read_fw1.reference_start = frag_start + read_fw1.mapping_quality = 20 + # read_fw1.cigar = ((0,10), (2,1), (0,25)) + read_fw1.next_reference_id = ref_id + read_fw1.next_reference_start = start_rv + read_fw1.template_length = frag_len + read_fw1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") + read_fw1.tags = extra_tags + + ## rev read + read_rv1.query_name = "read_rv1_%d" % n + read_rv1.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" + read_rv1.flag = flag_rv1 + read_rv1.reference_id = ref_id + read_rv1.reference_start = start_rv + 
read_rv1.mapping_quality = 20 + # read_rv1.cigar = ((0,10), (2,1), (0,25)) + read_rv1.next_reference_id = ref_id + read_rv1.next_reference_start = frag_start + read_rv1.template_length = -frag_len + read_rv1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") + read_rv1.tags = extra_tags + create_read_pair.counter += 1 + + # pair 2 : 1st read rev, 2nd read fw + ## fw read + read_fw2.query_name = "read_fw2_%d" % n + read_fw2.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + read_fw2.flag = flag_fw2 + read_fw2.reference_id = ref_id + read_fw2.reference_start = frag_start + read_fw2.mapping_quality = 20 + # read_fw2.cigar = ((0,10), (2,1), (0,25)) + read_fw2.next_reference_id = ref_id + read_fw2.next_reference_start = start_rv + read_fw2.template_length = frag_len + read_fw2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") + read_fw2.tags = extra_tags + + ## rev read + read_rv2.query_name = "read_rv2_%d" % n + read_rv2.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" + read_rv2.flag = flag_rv2 + read_rv2.reference_id = ref_id + read_rv2.reference_start = start_rv + read_rv2.mapping_quality = 20 + # read_rv2.cigar = ((0,10), (2,1), (0,25)) + read_rv2.next_reference_id = ref_id + read_rv2.next_reference_start = frag_start + read_rv2.template_length = -frag_len + read_rv2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") + read_rv2.tags = extra_tags + create_read_pair.counter += 1 + + return ((read_fw1, read_rv1), (read_fw2, read_rv2)) + +create_read_pair.counter = 0 + +if __name__ == "__main__": + # file header, the genome will have 2 chromosomes + header = { 'HD': {'VN': '1.0', + 'SO': 'unsorted'}, + 'SQ': [{'LN': 2000, 'SN': 'chr1'}, # chrom index 0 + {'LN': 2000, 'SN': 'chr2'}] } # chrom index 1 + + file_out = os.path.join("data", "toy_data", "sc_reads.bam") + f_out = pysam.AlignmentFile(file_out, header=header, mode="wb") + + chromosomes = [0, 1] + + 
read_fw_starts = (400, 470, 560, 560, 610, 670, 730, 770, 950, 960, \ + 1010, 1060, 1070, 1350, 1360, 1410, 1500, 1600) + + frag_lengths = (80, 80, 240, 80, 80, 80, 80, 80, 200, 80, \ + 80, 80, 80, 80, 80, 80, 100, 100) + + for chrom in chromosomes: + for i in range(0, len(read_fw_starts), 1): + read_fw_start = read_fw_starts[i] + frag_len = frag_lengths[i] + reads = create_read_pair(read_fw_start, chrom, frag_len) + + f_out.write(reads[0][0]) + f_out.write(reads[0][1]) + f_out.write(reads[1][0]) + f_out.write(reads[1][1]) + + f_out.close() + +# read_fw, read_rev = create_read_pair(0, 0, 100) +# f_out.write(read_fw) +# f_out.write(read_rev) +# f_out.close() + + +# read_fw = pysam.AlignedSegment() +# read_fw.query_name = "read_1" +# read_fw.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" +# read_fw.flag = 99 +# read_fw.reference_id = 0 +# read_fw.reference_start = 32 +# read_fw.mapping_quality = 20 +# # read_fw.cigar = ((0,10), (2,1), (0,25)) +# read_fw.next_reference_id = 0 +# read_fw.next_reference_start=199 +# read_fw.template_length=167 +# read_fw.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") +# read_fw.tags = (("NM", 1),("RG", "L1")) +# f_out.write(read_fw) + +# read_rv = pysam.AlignedSegment() +# read_rv.query_name = "read_1" +# read_rv.query_sequence= "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" +# read_rv.flag = 147 +# read_rv.reference_id = 0 +# read_rv.reference_start = 199 +# read_rv.mapping_quality = 20 +# read_fw.cigar = ((0,10), (2,1), (0,25)) +# read_rv.next_reference_id = 0 +# read_rv.next_reference_start=32 +# read_rv.template_length=167 +# read_rv.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") +# read_rv.tags = (("NM", 1),("RG", "L1")) +# f_out.write(read_rv) diff --git a/scripts/toy_data/generate_bed_file.py b/scripts/toy_data/generate_bed_file.py new file mode 100755 index 0000000..135e1f8 --- /dev/null +++ b/scripts/toy_data/generate_bed_file.py @@ -0,0 +1,11 @@ +# 
simply generate a bed file with two region on two chromosomes + +import os + +file_bed = os.path.join("data", "toy_data", "peaks.bed") + +# write the cell barcodes +# add an extra barcode associated with no reads -> empty cell +with open(file_bed, "wt") as f_out: + f_out.write("chr1\t990\t1010\n") + f_out.write("chr2\t990\t1010\n") diff --git a/scripts/simulate_chipseq_data/simulate_data_chipseq.R b/scripts/toy_data/generate_matrix_data_chipseq.R similarity index 98% rename from scripts/simulate_chipseq_data/simulate_data_chipseq.R rename to scripts/toy_data/generate_matrix_data_chipseq.R index 082cf0a..c6447e1 100755 --- a/scripts/simulate_chipseq_data/simulate_data_chipseq.R +++ b/scripts/toy_data/generate_matrix_data_chipseq.R @@ -1,133 +1,133 @@ #' \brief This function #' generate_data_chipseq = function(n_col, shift_max, p_flip, p_noise, coverage, shape_list, nrows_list) { # number of datum to generate n_row = sum(unlist(nrows_list)) # data structure to store results data = matrix(0, nrow=n_row, ncol=n_col) shifts = vector(length=n_row, mode="numeric") flips = vector(length=n_row, mode="numeric") classes = vector(length=n_row, mode="numeric") shapes = matrix(0, nrow=length(shape_list), ncol=n_col) # the proportion of reads which are signal p_signal = 1 - p_noise # noise : a uniform distribution x = 1:n_col shape_noise = dunif(x, min=min(x), max=max(x)) i_tot = 1 for(k in 1:length(shape_list)) { l = length(shape_list[[k]]) from_s = floor((n_col-l)/2) to_s = from_s + l - 1 for(i in 1:nrows_list[[k]]) { # shift shifts[i_tot] = ceiling(runif(1,1,shift_max)) # flip flips[i_tot] = rbinom(1, 1, p_flip) # class classes[i_tot] = k from = shifts[i_tot] to = shifts[i_tot] + length(shape_list[[k]]) - 1 # construct shape given shift and flip # ensure min value equal to min in shape shape = rep(min(shape_list[[k]]), n_col) # flat, only lowest possible value if(flips[i_tot]) { shape[from:to] = rev(shape_list[[k]]) tmp = from from = to to = tmp } else { shape[from:to] = 
shape_list[[k]] } shape = shape*coverage*p_signal + shape_noise*coverage*p_noise # scale to right coverage and add noise # sample reads from shape data[i_tot,] = rpois(n_col, shape) shapes[k,from_s:to_s] = shapes[k,from_s:to_s] + ( data[i_tot, from:to] / nrows_list[[k]]) i_tot = i_tot + 1 } } return(list("data"=data, "shifts"=shifts, "flips"=flips, "classes"=classes, "shapes"=shapes)) } setwd(file.path("/", "local", "groux", "scATAC-seq")) seed = 20170426 -dir_data = file.path("data", "simulated_chipseq_data") +dir_data = file.path("data", "toy_data") dir.create(dir_data, showWarnings=F) if(!file.exists(dir_data)) { dir.create(dir_data) } # general parameter n_samples = 1000 n_col = 2001 # the length of a signal vector shift_max = 100 # the maximum possible shift p_flip = 0.3 # the prob of having a flipped signal # class 1 : a simple gaussian class1_n = 600 class1_m = ceiling(n_col/2) - ceiling(shift_max/2) # class 1 mean, mean will be in average in the middle of the data vector class1_s = 40 # class 1 sd # the signal shape shape1 = dnorm(1:(n_col-shift_max+1), class1_m, class1_s) # class 2 : half a gaussian class2_n = n_samples - class1_n class2_m = floor(n_col/2) - floor(shift_max/2) # class 2 mean, mean will be in average in the middle of the data vector class2_s = 40 # class 2 sd # the signal shape shape2 = dnorm(1:(n_col-shift_max+1), class2_m, class2_s) shape2[class2_m:length(shape2)] = min(shape2) # class 3 : a uniform class3_n = 333 class3_from = floor(n_col/2) - floor(shift_max/2) -120 # class 3 from, mean will be in average in the middle of the data vector class3_to = floor(n_col/2) - floor(shift_max/2) +120 # class 3 to, mean will be in average in the middle of the data vector # the signal shape shape3 = dunif(1:(n_col-shift_max+1), class3_from, class3_to) # normalize shape1 = shape1 / sum(shape1) shape2 = shape2 / sum(shape2) -shape2 = shape2 / sum(shape2) +shape3 = shape3 / sum(shape3) # sequencing coverage # the mean number of read per sample coverages = c(1, 10, 100) # noise 
proportion # in the end, the noise is added EVERYWHERE (also on the signal core) # _ # /\ | # | - | proportion of signal # | - - | # |_____- -__________ _ # | | proportion of noise # ----------------------> _ noises = c(0) for(p_noise in noises) # the proportion of reads which are noise { for(coverage in coverages) { # ----------------------------------------------------- data with 3 classes ----------------------------------------------------- set.seed(seed) data = generate_data_chipseq(n_col, shift_max, p_flip, p_noise, coverage, list(shape1, shape2, shape3), list(class3_n, class3_n, class3_n+1)) # save write.table(data$data, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F) write.table(data$shifts, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_shifts_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F) write.table(data$flips, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_flips_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F) write.table(data$classes, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_classes_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F) write.table(data$shapes, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_shapes_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F) # clean data = shifts = flips = classes = shapes = NULL } } diff --git a/scripts/toy_data/run_all.sh b/scripts/toy_data/run_all.sh new file mode 100755 index 0000000..c8adfba --- /dev/null +++ b/scripts/toy_data/run_all.sh @@ -0,0 +1,10 @@ +mkdir -p data/toy_data + +Rscript scripts/toy_data/generate_matrix_data_chipseq.R +# generate peaks +python3.6 scripts/toy_data/generate_bed_file.py +# generate reads +python3.6 scripts/toy_data/generate_bam_file.py +samtools sort data/toy_data/sc_reads.bam > data/toy_data/sc_reads_sort.bam 
+mv data/toy_data/sc_reads_sort.bam data/toy_data/sc_reads.bam +samtools index data/toy_data/sc_reads.bam diff --git a/src/Applications/CorrelationMatrixCreatorApplication.cpp b/src/Applications/CorrelationMatrixCreatorApplication.cpp new file mode 100644 index 0000000..e9fab0a --- /dev/null +++ b/src/Applications/CorrelationMatrixCreatorApplication.cpp @@ -0,0 +1,186 @@ + +#include +#include // MatrixCreator::methods +#include + +#include +#include +#include +#include // std::invalid_argument + + +namespace po = boost::program_options ; + +// the valid values for --method option +std::string method_read = "read" ; +std::string method_read_atac = "read_atac" ; +std::string method_fragment = "fragment" ; +std::string method_fragment_center = "fragment_center" ; + + +CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv) + : file_bed(""), file_bam(""), from(0), to(0), bin_size(0), + method(MatrixCreator::FRAGMENT), runnable(true) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} + +int CorrelationMatrixCreatorApplication::run() +{ if(this->runnable) + { CorrelationMatrixCreator mc(this->file_bed, + this->file_bam, + this->file_bai, + this->from, + this->to, + this->bin_size, + this->method) ; + + std::cout << mc.create_matrix() << std::endl ; + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE ; } +} + +void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "CorrelationMatrixCreator is an application that creates a " + "count matrix from a BED file and a BAM file and returnes it " + "through stdout.\n" + "The matrix contains one row per reference region present in the " + "BED file. 
The region center is computed and then a region covering the " + "interval [from,to] is build around the middle and divided into " + "equally sized bins. Finally, each bin is assigned the number of " + "target present in the BAM file that are mapped at that position.\n\n" ; + std::string opt_help_msg = "Produces this help message." ; + std::string opt_bed_msg = "The path to the BED file containing the references"; + std::string opt_bam_msg = "The path to the BAM file containing the targets"; + std::string opt_bai_msg = "The path to the BAM index file of the BAM file containing the targets"; + std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build " + "around each reference center." ; + std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build " + "around each reference center." ; + std::string opt_binsize_msg = "The size of the bins." ; + char tmp[4096] ; + sprintf(tmp, + "How the data in the BAM file should be handled when computing " + "the number of counts in each bin.\n" + "\t\"%s\" uses each position within the reads (by default)\n" + "\t\"%s\" uses only the insertion site for ATAC-seq data\n" + "\t\"%s\" uses each position within the fragments\n" + "\t\"%s\" uses only the fragment central positions\n", + method_read.c_str(), + method_read_atac.c_str(), + method_fragment.c_str(), + method_fragment_center.c_str()) ; + + std::string opt_method_msg = tmp ; + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + std::string method(method_read) ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) + ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) + ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) + + ("from,f", po::value(&(this->from)), opt_from_msg.c_str()) + ("to,t", po::value(&(this->to)), opt_to_msg.c_str()) + 
("binSize,b", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) + ("method,m", po::value(&(method)), opt_method_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occured while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks unproper option settings + if(this->file_bed == "" and (not help)) + { std::string msg("Error! No BED file was given (--bed)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_bam == "" and (not help)) + { std::string msg("Error! No BAM file was given (--bam)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_bam == "" and (not help)) + { std::string msg("Error! No BAM index file was given (--bai)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from == 0 and this->to == 0 and (not help)) + { std::string msg("Error! No range given (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from >= this->to and (not help)) + { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->bin_size <= 0 and (not help)) + { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; + throw std::invalid_argument(msg) ; + } + else if(method != method_read and + method != method_read_atac and + method != method_fragment and + method != method_fragment_center) + { char msg[4096] ; + sprintf(msg, "Error! 
method should be %s, %s, %s or %s (--method)", + method_read.c_str(), + method_read_atac.c_str(), + method_fragment.c_str(), + method_fragment_center.c_str()) ; + throw std::invalid_argument(msg) ; + } + + // set method + if(method == method_read) + { this->method = MatrixCreator::READ ; } + else if(method == method_read_atac) + { this->method = MatrixCreator::READ_ATAC ; } + else if(method == method_fragment) + { this->method = MatrixCreator::FRAGMENT ; } + else if(method == method_fragment_center) + { this->method = MatrixCreator::FRAGMENT_CENTER ; } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + + +int main(int argn, char** argv) +{ CorrelationMatrixCreatorApplication app(argn, argv) ; + return app.run() ; +} + diff --git a/src/Applications/CorrelationMatrixCreatorApplication.hpp b/src/Applications/CorrelationMatrixCreatorApplication.hpp new file mode 100644 index 0000000..9a5cd1a --- /dev/null +++ b/src/Applications/CorrelationMatrixCreatorApplication.hpp @@ -0,0 +1,100 @@ +#ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP +#define CORRELATIONMATRIXCREATORAPPLICATION_HPP + +#include +#include // MatrixCreator::methods + +#include + +/*! + * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a + * CorrelationMatrixCreator instance creating an autonomous application to + * compute a count matrix from a BAM file by directly passing all the options + * and parameters from the command line. + */ +class CorrelationMatrixCreatorApplication: public ApplicationInterface +{ + public: + CorrelationMatrixCreatorApplication() = delete ; + CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ; + /*! + * \brief Constructs an object from the command line + * options. + * \param argn the number of options passed to the + * main() function. 
+ * \param argv the vector of options passed to the + * main() function. + */ + CorrelationMatrixCreatorApplication(int argn, char** argv) ; + + /*! + * \brief Runs the application. If the object is runnable, + * a CorrelationMatrixCreator is built from the stored + * settings and the matrix returned by create_matrix() is + * written to stdout. + * \return EXIT_SUCCESS if the matrix was written, + * EXIT_FAILURE if the object is not runnable (e.g. the + * help was requested). + */ + virtual int run() override ; + + private: + /*! + * \brief Parses the program command line options and + * sets the object fields accordingly. + * If the help option is detected, the "runnable" + * field is set to false and subsequent calls to + * run() will only return EXIT_FAILURE. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + * \throw std::invalid_argument if an error is found + * in the program options. + */ + void parseOptions(int argn, char** argv) ; + + /*! + * \brief the path to the bed file. + */ + std::string file_bed ; + /*! + * \brief the path to the bam file. + */ + std::string file_bam ; + /*! + * \brief the path to the bam index file. + */ + std::string file_bai ; + /*! + * \brief a relative coordinate indicating the + * most upstream position to consider around + * each region in the bed file. + */ + int from ; + /*! + * \brief a relative coordinate indicating the + * most downstream position to consider around + * each region in the bed file. + */ + int to ; + /*! + * \brief the size of the bin that will be used + * to bin the signal in the regions [from,to] around + * each region in the bed file. + */ + int bin_size ; + /*! + * \brief How to consider the sequenced fragments when computing + * the bin values. + */ + MatrixCreator::methods method ; + /*! + * \brief a flag indicating whether the core of run() can be + * run or not. 
+ */ + bool runnable ; +} ; + + +#endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP diff --git a/src/Applications/CorrelationMatrixCreatorApplicationParallel.cpp b/src/Applications/CorrelationMatrixCreatorApplicationParallel.cpp new file mode 100644 index 0000000..804d785 --- /dev/null +++ b/src/Applications/CorrelationMatrixCreatorApplicationParallel.cpp @@ -0,0 +1,193 @@ + +#include +#include // MatrixCreator::methods +#include + +#include +#include +#include +#include // std::invalid_argument + + +namespace po = boost::program_options ; + +// the valid values for --method option +std::string method_read = "read" ; +std::string method_read_atac = "read_atac" ; +std::string method_fragment = "fragment" ; +std::string method_fragment_center = "fragment_center" ; + + +CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv) + : file_bed(""), file_bam(""), from(0), to(0), bin_size(0), + method(MatrixCreator::FRAGMENT), runnable(true), n_threads(1) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} + +int CorrelationMatrixCreatorApplication::run() +{ if(this->runnable) + { CorrelationMatrixCreator mc(this->file_bed, + this->file_bam, + this->file_bai, + this->from, + this->to, + this->bin_size, + this->method, + this->n_threads) ; + + std::cout << mc.create_matrix() << std::endl ; + return EXIT_SUCCESS ; + } + else + { return EXIT_FAILURE ; } +} + +void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv) +{ + // no option to parse + if(argv == nullptr) + { std::string message = "no options to parse!" ; + throw std::invalid_argument(message) ; + } + + // help messages + std::string desc_msg = "\n" + "CorrelationMatrixCreator is an application that creates a " + "count matrix from a BED file and a BAM file and returnes it " + "through stdout.\n" + "The matrix contains one row per reference region present in the " + "BED file. 
The region center is computed and then a region covering the " + "interval [from,to] is build around the middle and divided into " + "equally sized bins. Finally, each bin is assigned the number of " + "target present in the BAM file that are mapped at that position.\n\n" ; + std::string opt_help_msg = "Produces this help message." ; + std::string opt_bed_msg = "The path to the BED file containing the references"; + std::string opt_bam_msg = "The path to the BAM file containing the targets"; + std::string opt_bai_msg = "The path to the BAM index file of the BAM file containing the targets"; + std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build " + "around each reference center." ; + std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build " + "around each reference center." ; + std::string opt_thread_msg = "The number of threads to use." ; + std::string opt_binsize_msg = "The size of the bins." ; + char tmp[4096] ; + sprintf(tmp, + "How the data in the BAM file should be handled when computing " + "the number of counts in each bin.\n" + "\t\"%s\" uses each position within the reads (by default)\n" + "\t\"%s\" uses only the insertion site for ATAC-seq data\n" + "\t\"%s\" uses each position within the fragments\n" + "\t\"%s\" uses only the fragment central positions\n", + method_read.c_str(), + method_read_atac.c_str(), + method_fragment.c_str(), + method_fragment_center.c_str()) ; + + std::string opt_method_msg = tmp ; + + // option parser + boost::program_options::variables_map vm ; + boost::program_options::options_description desc(desc_msg) ; + + std::string method(method_read) ; + + desc.add_options() + ("help,h", opt_help_msg.c_str()) + + ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) + ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) + ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) + + ("from,f", po::value(&(this->from)), opt_from_msg.c_str()) 
+ ("to,t", po::value(&(this->to)), opt_to_msg.c_str()) + ("binSize,b", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) + ("method,m", po::value(&(method)), opt_method_msg.c_str()) + ("parallel,p", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + + // parse + try + { po::store(po::parse_command_line(argn, argv, desc), vm) ; + po::notify(vm) ; + } + catch(std::invalid_argument& e) + { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; + throw std::invalid_argument(msg) ; + } + catch(...) + { throw std::invalid_argument("An unknown error occured while parsing the options") ; } + + bool help = vm.count("help") ; + + // checks unproper option settings + if(this->file_bed == "" and (not help)) + { std::string msg("Error! No BED file was given (--bed)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_bam == "" and (not help)) + { std::string msg("Error! No BAM file was given (--bam)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->file_bam == "" and (not help)) + { std::string msg("Error! No BAM index file was given (--bai)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from == 0 and this->to == 0 and (not help)) + { std::string msg("Error! No range given (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->from >= this->to and (not help)) + { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; + throw std::invalid_argument(msg) ; + } + else if(this->bin_size <= 0 and (not help)) + { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; + throw std::invalid_argument(msg) ; + } + else if(method != method_read and + method != method_read_atac and + method != method_fragment and + method != method_fragment_center) + { char msg[4096] ; + sprintf(msg, "Error! 
method should be %s, %s, %s or %s (--method)", + method_read.c_str(), + method_read_atac.c_str(), + method_fragment.c_str(), + method_fragment_center.c_str()) ; + throw std::invalid_argument(msg) ; + } + else if(this->n_threads == 0) + { std::string msg("Error! at least one thread should be used (--parallel)!") ; + throw std::invalid_argument(msg) ; + } + + // set method + if(method == method_read) + { this->method = MatrixCreator::READ ; } + else if(method == method_read_atac) + { this->method = MatrixCreator::READ_ATAC ; } + else if(method == method_fragment) + { this->method = MatrixCreator::FRAGMENT ; } + else if(method == method_fragment_center) + { this->method = MatrixCreator::FRAGMENT_CENTER ; } + + // help invoked, run() cannot be invoked + if(help) + { std::cout << desc << std::endl ; + this->runnable = false ; + return ; + } + // everything fine, run() can be called + else + { this->runnable = true ; + return ; + } +} + + +int main(int argn, char** argv) +{ CorrelationMatrixCreatorApplication app(argn, argv) ; + return app.run() ; +} + diff --git a/src/Applications/CorrelationMatrixCreatorApplicationParallel.hpp b/src/Applications/CorrelationMatrixCreatorApplicationParallel.hpp new file mode 100644 index 0000000..a362db7 --- /dev/null +++ b/src/Applications/CorrelationMatrixCreatorApplicationParallel.hpp @@ -0,0 +1,104 @@ +#ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP +#define CORRELATIONMATRIXCREATORAPPLICATION_HPP + +#include +#include // MatrixCreator::methods + +#include + +/*! + * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a + * RegionMatrixCreator instance creating an autonomous application to + * compute a count matrix from a BAM file by directly passing all the options + * and parameters from the command line. 
+ */ +class CorrelationMatrixCreatorApplication: public ApplicationInterface +{ + public: + CorrelationMatrixCreatorApplication() = delete ; + CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ; + /*! + * \brief Constructs an object from the command line + * options. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + */ + CorrelationMatrixCreatorApplication(int argn, char** argv) ; + + /*! + * \brief Runs the application. A count matrix is + * created from the BED and BAM files using the given + * settings and is printed to stdout. If the object + * is not runnable (e.g. help was requested), nothing + * is computed and EXIT_FAILURE is returned. + * \return an exit code EXIT_SUCCESS or EXIT_FAILURE + * to return to the OS. + */ + virtual int run() override ; + + private: + /*! + * \brief Parses the program command line options and + * sets the object fields accordingly. + * If the help option is detected, the "runnable" + * field is set to false and subsequent calls to + * run() will produce nothing. + * \param argn the number of options passed to the + * main() function. + * \param argv the vector of options passed to the + * main() function. + * \throw std::invalid_argument if an error is found + * in the program options. + */ + void parseOptions(int argn, char** argv) ; + + /*! + * \brief the path to the bed file. + */ + std::string file_bed ; + /*! + * \brief the path to the bam file. + */ + std::string file_bam ; + /*! + * \brief the path to the bam index file. + */ + std::string file_bai ; + /*! + * \brief a relative coordinate indicating the + * most upstream position to consider around + * each region in the bed file. + */ + int from ; + /*! + * \brief a relative coordinate indicating the + * most downstream position to consider around + * each region in the bed file. + */ + int to ; + /*! 
+ * \brief the size of the bin that will be used + * to bin the signal in the regions [from,to] around + * each region in the bed file. + */ + int bin_size ; + /*! + * \brief How to consider the sequenced fragments when computing + * the bin values. + */ + MatrixCreator::methods method ; + /*! + * \brief a flag indicating whether the core of run() can be + * run or not. + */ + bool runnable ; + /*! + * \brief the number of threads to use. + */ + size_t n_threads ; +} ; + + +#endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index daa0e79..78a92e2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,53 +1,86 @@ -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +# compiler options +add_compile_options(-std=c++14) +add_compile_options(-O3) +add_compile_options(-Wall) +add_compile_options(-Wextra) +add_compile_options(-Werror) +add_compile_options(-Wfatal-errors) +add_compile_options(-pedantic) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") +add_definitions (${SEQAN_DEFINITIONS}) # include file location +include_directories (${SEQAN_INCLUDE_DIRS}) include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/Clustering") include_directories("${scATACseq_SOURCE_DIR}/src/Random") include_directories("${scATACseq_SOURCE_DIR}/src/Parallel") include_directories("${scATACseq_SOURCE_DIR}/src/Statistics") include_directories("${scATACseq_SOURCE_DIR}/src/GUI") include_directories("${scATACseq_SOURCE_DIR}/src/Applications") include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") +include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools") # compile modules into static libraries -add_library(Clustering "Clustering/ClusteringEngine.cpp" "Clustering/EMEngine.cpp" "Clustering/ReferenceComputer.cpp") -add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp") -add_library(Parallel "Parallel/ThreadPool.cpp") 
-add_library(Statistics "Statistics/Statistics.cpp") -add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp") +## set output directory +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib") +## build instructions +add_library(Clustering "Clustering/ClusteringEngine.cpp" + "Clustering/EMEngine.cpp" + "Clustering/ReferenceComputer.cpp") +add_library(Random "Random/Random.cpp" + "Random/RandomNumberGenerator.cpp") +add_library(Parallel "Parallel/ThreadPool.cpp") +add_library(Statistics "Statistics/Statistics.cpp") +add_library(GUI "GUI/ConsoleProgressBar.cpp" + "GUI/Diplayable.cpp" + "GUI/Updatable.cpp") +add_library(GenomicTools "GenomicTools/MatrixCreator.cpp" + "GenomicTools/CorrelationMatrixCreator.cpp" + "GenomicTools/GenomeRegion.cpp") -link_directories("${scATACseq_SOURCE_DIR}/src/Clustering") -link_directories("${scATACseq_SOURCE_DIR}/src/Random") -link_directories("${scATACseq_SOURCE_DIR}/src/Statistics") -link_directories("${scATACseq_SOURCE_DIR}/src/GUI") -link_directories("${scATACseq_SOURCE_DIR}/src/Parallel") - - -# linking modules to resolve dependencies +## resolve dependencies target_link_libraries(Clustering Random Statistics GUI Parallel) target_link_libraries(Parallel Threads::Threads) +target_link_libraries(GenomicTools ${SEQAN_LIBRARIES}) # executables -## a toy -set(EXE_MAIN "main") -add_executable(${EXE_MAIN} "main.cpp") -target_link_libraries(${EXE_MAIN} Clustering) -set_target_properties(${EXE_MAIN} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## a toy for SeqAn usage +set(EXE_MAIN_SEQAN "main_seqan") +add_executable(${EXE_MAIN_SEQAN} "main_seqan.cpp") +target_link_libraries(${EXE_MAIN_SEQAN} ${SEQAN_LIBRARIES} GenomicTools Clustering) +set_target_properties(${EXE_MAIN_SEQAN} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## a toy for correlation matrix +set(EXE_MAIN_CORMAT "main_cormat") +add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp") 
+target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} GenomicTools) +set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## a toy for EM usage +set(EXE_MAIN_EM "main_em") +add_executable(${EXE_MAIN_EM} "main_em.cpp") +target_link_libraries(${EXE_MAIN_EM} Clustering) +set_target_properties(${EXE_MAIN_EM} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an application to create a matrix from BED and a BAM file +set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator") +add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Boost::program_options) +set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an ChIPPartitioning standalone set(EXE_CHIPPART "ChIPPartitioning") add_executable(${EXE_CHIPPART} "Applications/ChIPPartitioningApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_CHIPPART} Clustering Boost::program_options) set_target_properties(${EXE_CHIPPART} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to compute classes references from the data and the post prob of ChIPPartitioning set(EXE_PROB2REF "probToRef") add_executable(${EXE_PROB2REF} "Applications/ProbToRefApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_PROB2REF} Clustering Boost::program_options) set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a test suite set(EXE_TESTS "unittests") -add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp") -target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB}) +add_executable(${EXE_TESTS} "unittests.cpp" + "Unittests/unittests_matrix.cpp" + "Unittests/unittests_genomictools.cpp") +target_link_libraries(${EXE_TESTS} 
${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools) set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") diff --git a/src/Clustering/EMEngine.cpp b/src/Clustering/EMEngine.cpp index c10ce53..bacf3e6 100644 --- a/src/Clustering/EMEngine.cpp +++ b/src/Clustering/EMEngine.cpp @@ -1,769 +1,808 @@ #include #include #include #include #include #include // rand_int_uniform() #include // getRandomNumberGenerator() #include // poisson_pmf(), normal_pmf(), sd() #include // ConsoleProgressBar #include // ThreadPool #include // log(), exp(), pow() #include #include // numeric_limits #include // uniform_real, variate_generator #include // future, promise #include // move() #include // bind(), ref() EMEngine::EMEngine(const Matrix2D& data, size_t n_class, size_t n_iter, size_t n_shift, bool flip, EMEngine::seeding_codes seeding, const std::string& seed, size_t n_threads) : flip(flip), n_iter(n_iter), n_shift(n_shift), n_flip(flip+1), n_class(n_class), n_row(data.get_nrow()), n_col(data.get_ncol()), l_slice(n_col - n_shift + 1), seeding_method(seeding), n_threads(n_threads), threads(n_threads) { // initialise random number generator getRandomGenerator(seed) ; // copy the data this->data = matrix2d_i(this->n_row, v_i(this->n_col)) ; for(size_t i=0; in_row; i++) { for(size_t j=0; jn_col; j++) { this->data[i][j] = data(i,j) ; } } } EMEngine::~EMEngine() { this->threads.join() ; } Matrix2D EMEngine::get_references() const { Matrix2D references(this->n_class, this->l_slice, 0.) ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_slice; j++) { references(i,j) = this->references[i][j] ; } } return references ; } Matrix4D EMEngine::get_posterior_prob() const { Matrix4D post_prob(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) 
; for(size_t i=0; in_row; i++) { for(size_t k=0; kn_class; k++) { for(size_t s=0; sn_shift; s++) { for(size_t f=0; fn_flip; f++) { post_prob(i,k,s,f) = this->post_prob[i][k][s][f] ; } } } } return post_prob ; } - -double EMEngine::get_loglikelihood() const +/* +// this is the naive way, it is exact but results in Nan, -Nan, -Inf, +Inf +// sometimes... +double EMEngine::get_loglikelihood0() const { double ll = 0 ; - for(size_t i=0; in_row; i++) { double p_tmp = 0. ; for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood // --------------- forward --------------- double lp_fw = 0. ; int from_dat_fw = s ; int to_dat_fw = from_dat_fw + this->l_slice - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fwdata[i][j_dat_fw], + double lp = std::max(log(poisson_pmf(this->data[i][j_dat_fw], this->references[j][j_ref_fw]* - this->window_mean[i][s]) ; - + this->window_mean[i][s])), + EMEngine::p_min_log) ; lp_fw += lp ; - } + p_tmp += exp(lp_fw) * this->class_prob[j][s][flip_states::FORWARD] ; // --------------- reverse --------------- if(this->flip) { double lp_rev = 0. 
; int from_dat_rev = this->n_col - 1 - s ; int to_dat_rev = from_dat_rev - (this->l_slice - 1) ; int shift_rev = this->n_shift - s - 1 ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) - { double lp = log(poisson_pmf(this->data[i][j_dat_rev], + { double lp = std::max(log(poisson_pmf(this->data[i][j_dat_rev], this->references[j][j_ref_fw]* - this->window_mean[i][shift_rev])) ; + this->window_mean[i][shift_rev])), + EMEngine::p_min_log) ; lp_rev += lp ; } + p_tmp += exp(lp_rev) * this->class_prob[j][s][flip_states::REVERSE] ; } } } ll += log(p_tmp) ; } return ll ; } +*/ -double EMEngine::get_aic() const -{ double ll = this->get_loglikelihood() ; - double n_param = ((double) this->n_class * - (double)this->l_slice) + - ((double)this->n_shift * - (double)this->flip+1. * - (double)this->n_class) - 1. ; - return (2.*n_param) - (2.*ll) ; -} - -/* double EMEngine::get_loglikelihood() const { double ll = 0. ; + // compute all terms needed for(size_t i=0; in_row; i++) - { double l = 0. ; + { double prob_tmp = 0 ; for(size_t j=0; jn_class; j++) - { for(size_t s_fw=0, s_rev=this->n_shift-1; - s_fwn_shift; s_fw++, s_rev--) - { // slice is [from_fw,to) + { std::vector> v3 ; + for(size_t s=0; sn_shift; s++) + { + // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev + // log likelihood // --------------- forward --------------- - size_t from_dat_fw = s_fw ; - size_t to_dat_fw = from_dat_fw + this->l_slice - 1 ; - // --------------- reverse --------------- - size_t from_dat_rev = this->n_col - 1 - s_fw ; - // size_t to_dat_rev = from_dat_rev - (this->l_slice - 1) ; - - double ll_tmp = 0. ; + double lp_fw = 0. 
; + int from_dat_fw = s ; + int to_dat_fw = from_dat_fw + this->l_slice - 1 ; + for(int j_dat_fw=from_dat_fw, j_ref_fw=0; + j_dat_fwdata[i][j_dat_fw], + this->references[j][j_ref_fw]* + this->window_mean[i][s]), + EMEngine::p_min)) ; + lp_fw += lp ; + } + double p_fw = this->class_prob[j][s][flip_states::FORWARD] ; + v3.push_back(std::make_pair(lp_fw, p_fw)) ; - for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; - j_dat_fwdata[i][j_dat_fw], - this->references[j][j_ref_fw]* - this->window_mean[i][s_fw]); - ll_tmp += log(std::max(p, EMEngine::p_min) * - this->class_prob[j][s_fw][flip_states::FORWARD]) ; - // --------------- reverse --------------- - if(this->flip) - { double p = poisson_pmf(this->data[i][j_dat_rev], - this->references[j][j_ref_fw]* - this->window_mean[i][s_rev]) ; - ll_tmp += log(std::max(p, EMEngine::p_min) * - this->class_prob[j][s_fw][flip_states::REVERSE]) ; + // --------------- reverse --------------- + if(this->flip) + { double lp_rev = 0. ; + int from_dat_rev = this->n_col - 1 - s ; + int to_dat_rev = from_dat_rev - (this->l_slice - 1) ; + int shift_rev = this->n_shift - s - 1 ; + for(int j_dat_rev=from_dat_rev, j_ref_fw=0; + j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) + { double lp = log(std::max(poisson_pmf(this->data[i][j_dat_rev], + this->references[j][j_ref_fw]* + this->window_mean[i][shift_rev]), + EMEngine::p_min)) ; + lp_rev += lp ; } + double p_rev = this->class_prob[j][s][flip_states::REVERSE] ; + v3.push_back(std::make_pair(lp_rev, p_rev)) ; } - l += ll_tmp ; } + prob_tmp += sum_exp(v3) ; } - ll += l ; + ll += log(prob_tmp) ; } return ll ; } -*/ + +double EMEngine::get_aic() const +{ double ll = this->get_loglikelihood() ; + double n_param = ((double) this->n_class * + (double)this->l_slice) + + ((double)this->n_shift * + ((double)this->flip + 1.) * + (double)this->n_class) - 1. 
; + // std::cerr << "AIC = " << (2.*n_param) << " - " << ll << std::endl ; + return (2.*n_param) - (2.*ll) ; +} ClusteringEngine::exit_codes EMEngine::cluster() { size_t bar_update_n = this->n_iter + 1 ; ConsoleProgressBar bar(std::cerr, bar_update_n, 70, "clustering") ; // construct all other required data structures // mean number of reads per window this->window_mean = matrix2d_d(this->n_row, v_d(this->n_shift, 0.)) ; this->compute_window_means() ; // the references this->references = matrix2d_d(this->n_class, v_d(this->l_slice, 0.)) ; // log loglikelihood this->loglikelihood = matrix4d_d(this->n_row, matrix3d_d(this->n_class, matrix2d_d(this->n_shift, - v_d(this->n_flip, 0.)))) ; + v_d(this->n_flip, 9.)))) ; this->loglikelihood_max = v_d(this->n_row, 0.) ; // posterior prob this->post_prob = matrix4d_d(this->n_row, matrix3d_d(this->n_class, matrix2d_d(this->n_shift, v_d(this->n_flip, 0.)))) ; this->class_prob = matrix3d_d(this->n_class, matrix2d_d(this->n_shift, v_d(this->n_flip, 0.))) ; this->class_prob_tot = v_d(this->n_class, 0.) ; this->post_prob_row = v_d(this->n_row, 0.) ; this->post_prob_class = v_d(this->n_class, 0.) ; this->post_prob_tot = 0. ; // seeding this->seeding(this->seeding_method) ; bar.update() ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // normalize the references such thjat the mean value, on each // row, is 1 this->normalize_references() ; // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->compute_references() ; this->center_shifts() ; + bar.update() ; } bar.update() ; std::cerr << std::endl ; return ClusteringEngine::exit_codes::SUCCESS ; } void EMEngine::normalize_references() { for(size_t i=0; in_class; i++) { double mean = 0. 
; for(size_t j=0; jl_slice; j++) { mean += this->references[i][j] ; } mean /= this->l_slice ; for(size_t j=0; jl_slice; j++) { this->references[i][j] /= mean ; } } } void EMEngine::seeding(EMEngine::seeding_codes seeding) { if(seeding == EMEngine::seeding_codes::RANDOM) { this->seeding_random() ; } else if(seeding == EMEngine::seeding_codes::SAMPLING) { this->seeding_sampling() ; } else if(seeding == EMEngine::seeding_codes::TOY) { this->seeding_toy() ; } } void EMEngine::seeding_random() { // get random values from a beta distribution cannot be done using boost so // i) generate random number [0,1] x // ii) compute f(x) where f is beta distribution matrix2d_d prob(this->n_row, v_d(this->n_class, 0.)) ; v_d prob_class(this->n_class, 0.) ; double tot_sum = 0. ; // sample the prob // beta distribution parameters double alpha = pow(this->n_row, -0.5) ; double beta = 1. ; for(size_t i=0; in_row; i++) { double row_sum = 0. ; for(size_t j=0; jn_class; j++) { double x = rand_real_uniform(0., 1.0) ; double p = std::max(EMEngine::p_min, beta_pmf(x, alpha, beta)) ; prob[i][j] = p ; prob_class[j] += p ; tot_sum += p ; row_sum += p ; } // normalize for(size_t j=0; jn_class; j++) { prob[i][j] /= row_sum ; } } // class prob for(auto& p : prob_class) { p /= tot_sum ; } // compute the refererences for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++) { this->references[j][j_ref] += (this->data[i][j_dat] * prob[i][j]) ; } } } // normalize for(size_t i=0; in_class; i++) { for(size_t j=0; jl_slice; j++) { this->references[i][j] ; } } // set the class probabilities to a uniform distribution double sum = this->n_class * this->n_shift * this->n_flip ; for(size_t i=0; in_class; i++) { for(size_t j=0; jn_shift; j++) { for(size_t k=0; kn_flip; k++) { this->class_prob[i][j][k] = 1./sum ; } } } } void EMEngine::seeding_sampling() { // sample data to initialise the references std::vector choosen(this->n_row, 
false) ; for(size_t i=0; in_class; ) { size_t index = rand_int_uniform(size_t(0), size_t(this->n_row-1)) ; // already choose if(choosen[index]) { ; } // not yet choosen as reference else { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++) { this->references[i][j_ref] = this->data[index][j_dat] ; } choosen[index] = true ; i++ ; } } // set the class probabilities to a uniform distribution double sum = this->n_class * this->n_shift * this->n_flip ; for(size_t i=0; in_class; i++) { for(size_t j=0; jn_shift; j++) { for(size_t k=0; kn_flip; k++) { this->class_prob[i][j][k] = 1. / sum ; } } } } void EMEngine::seeding_toy() { // sample data to initialise the references std::vector choosen(this->n_row, false) ; for(size_t i=0; in_class; ) { size_t index = i ; // already choose if(choosen[index]) { ; } // not yet choosen as reference else { for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++) { this->references[i][j_ref] = this->data[index][j_dat] ; } choosen[index] = true ; i++ ; } } // set the class probabilities to a uniform distribution double sum = this->n_class * this->n_shift * this->n_flip ; for(size_t i=0; in_class; i++) { for(size_t j=0; jn_shift; j++) { for(size_t k=0; kn_flip; k++) { this->class_prob[i][j][k] = 1./sum ; } } } } void EMEngine::compute_window_means() { // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, this->n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(this->n_threads) ; std::vector> futures(this->n_threads) ; for(size_t i=0; in_threads; i++) { futures[i] = promises[i].get_future() ; } // distribute work to threads // -------------------------- threads start -------------------------- for(size_t i=0; in_threads; i++) { auto slice = slices[i] ; this->threads.addJob(std::move( 
std::bind(&EMEngine::compute_window_means_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } void EMEngine::compute_window_means_routine(size_t from, size_t to, std::promise& done) { double l_slice = double(this->l_slice) ; for(size_t i=from; in_shift; from++) { double sum = 0. ; // slice is [from,to) size_t to = from + this->l_slice ; for(size_t j=from; jdata[i][j] ;} this->window_mean[i][from] = sum / l_slice ; } } done.set_value(true) ; } void EMEngine::compute_loglikelihood() { // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, this->n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(this->n_threads) ; std::vector> futures(this->n_threads) ; for(size_t i=0; in_threads; i++) { futures[i] = promises[i].get_future() ; } // distribute work to threads // -------------------------- threads start -------------------------- for(size_t i=0; in_threads; i++) { auto slice = slices[i] ; this->threads.addJob(std::move( std::bind(&EMEngine::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } void EMEngine::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // access in writing // this->loglikelihood -> only access the i-th which belong [from,to) // this->loglikelihood_max -> only access the i-th which belong [from,to) for(size_t i=from; iloglikelihood_max[i] = std::numeric_limits::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s_fw=0, s_rev=this->n_shift-1; s_fwn_shift; s_fw++, 
s_rev--) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood double ll_fw = 0. ; double ll_rev = 0. ; // --------------- forward --------------- size_t from_dat_fw = s_fw ; size_t to_dat_fw = from_dat_fw + this->l_slice - 1 ; // --------------- reverse --------------- size_t from_dat_rev = this->n_col - 1 - s_fw ; // size_t to_dat_rev = from_dat_rev - (this->l_slice - 1) ; for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; j_dat_fwdata[i][j_dat_fw], this->references[j][j_ref_fw]* this->window_mean[i][s_fw])) ; ll_fw += std::max(ll, EMEngine::p_min_log) ; // --------------- reverse --------------- if(this->flip) { ll = log(poisson_pmf(this->data[i][j_dat_rev], this->references[j][j_ref_fw]* this->window_mean[i][s_rev])) ; ll_rev += std::max(ll, EMEngine::p_min_log) ; } } this->loglikelihood[i][j][from_dat_fw][flip_states::FORWARD] = ll_fw ; // keep track of the max per row if(ll_fw > this->loglikelihood_max[i]) { this->loglikelihood_max[i] = ll_fw ; } if(this->flip) { this->loglikelihood[i][j][from_dat_fw][flip_states::REVERSE] = ll_rev ; // keep track of the max per row if(ll_rev > this->loglikelihood_max[i]) { this->loglikelihood_max[i] = ll_rev ; } } } } } // fill the promise to indicate that the function exited done.set_value(true) ; } void EMEngine::compute_post_prob() { // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, this->n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(this->n_threads) ; std::vector> 
futures(this->n_threads) ; for(size_t i=0; in_threads; i++) { futures[i] = promises[i].get_future() ; } // distribute work to threads // -------------------------- threads start -------------------------- for(size_t i=0; in_threads; i++) { auto slice = slices[i] ; this->threads.addJob(std::move( std::bind(&EMEngine::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_class = v_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_class[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } void EMEngine::compute_post_prob_routine(size_t from, size_t to, std::promise& done) { // this->post_prob_row -> only access the i-th which belong [from,to) // this->post_prob -> only access the i-th which belong [from,to) // some values that needs to be returned // the total of the posterior prob for this slice of the data // the total per class of posterior prob for this slice of the data v_d post_prob_class(this->n_class, 0.) ; for(size_t i=from; ipost_prob_row[i] = 0. 
; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) - { double p = exp(this->loglikelihood[i][n_class][n_shift][n_flip] - + { /* + double p = exp(this->loglikelihood[i][n_class][n_shift][n_flip] - this->loglikelihood_max[i]) * this->class_prob[n_class][n_shift][n_flip] ; + */ + double p = std::max(exp(this->loglikelihood[i][n_class][n_shift][n_flip] - + this->loglikelihood_max[i]) * + this->class_prob[n_class][n_shift][n_flip], + EMEngine::p_min) ; this->post_prob[i][n_class][n_shift][n_flip] = p ; this->post_prob_row[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { this->post_prob[i][n_class][n_shift][n_flip] /= this->post_prob_row[i] ; double p = this->post_prob[i][n_class][n_shift][n_flip] ; post_prob_class[n_class] += p ; } } } } done.set_value(post_prob_class) ; } void EMEngine::compute_class_prob() { for(size_t n_class=0; n_classn_class; n_class++) { // reset total this->class_prob_tot[n_class] = 0. ; for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t flip=0; flipn_flip; flip++) { // sum this->class_prob[n_class][n_shift][flip] = 0. 
; for(size_t i=0; in_row; i++) { this->class_prob[n_class][n_shift][flip] += this->post_prob[i][n_class][n_shift][flip] ; } // normalize this->class_prob[n_class][n_shift][flip] /= this->post_prob_tot ; this->class_prob_tot[n_class] += this->class_prob[n_class][n_shift][flip] ; } } } } void EMEngine::compute_references() { // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, this->n_threads) ; // get promises and futures // the function run by the threads will compute // the reference from the given slice std::vector> promises(this->n_threads) ; std::vector> futures(this->n_threads) ; for(size_t i=0; in_threads; i++) { futures[i] = promises[i].get_future() ; } // distribute work to threads // -------------------------- threads start -------------------------- for(size_t i=0; in_threads; i++) { auto& slice = slices[i] ; this->threads.addJob(std::move( std::bind(&EMEngine::compute_references_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // while threads are working, reset the references for(size_t i=0; in_class; i++) { for(size_t j=0; jl_slice; j++) { this->references[i][j] = 0. 
; } } // wait until all threads are done working // sum the partial class references to get the complete ones for(size_t n=0; nn_threads; n++) { matrix2d_d reference = futures[n].get() ; for(size_t i=0; in_class; i++) { for(size_t j=0; jl_slice; j++) { this->references[i][j] += reference[i][j] ; } } } // -------------------------- threads stop --------------------------- } void EMEngine::compute_references_routine(size_t from, size_t to, std::promise& references) { // the empty references matrix2d_d ref(this->n_class, v_d(this->l_slice, 0.)) ; for(size_t n_class=0; n_class < this->n_class; n_class++) { for(size_t i=from; in_shift; n_shift++) { // --------------- forward --------------- int from_dat_fw = n_shift ; int to_dat_fw = from_dat_fw + this->l_slice - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) { ref[n_class][j_ref_fw] += (this->post_prob[i][n_class][n_shift][flip_states::FORWARD] * this->data[i][j_dat_fw]) / this->post_prob_class[n_class] ; } // --------------- reverse --------------- if(this->flip) { int from_dat_rev = this->n_col - 1 - n_shift ; int to_dat_rev = from_dat_rev - (this->l_slice - 1) ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) { ref[n_class][j_ref_fw] += (this->post_prob[i][n_class][n_shift][flip_states::REVERSE] * this->data[i][j_dat_rev]) / this->post_prob_class[n_class] ; } } } } } references.set_value(ref) ; } void EMEngine::center_shifts() { if(this->n_shift == 1) { return ; } // the possible shift states std::vector shifts(this->n_shift) ; std::iota(shifts.begin(), shifts.end(), 1.) ; // the shift probabilities and the class probabilies (no need to norm., class_prob sums to 1) double shifts_prob_measured_tot = 0. 
; std::vector shifts_prob_measured(this->n_shift) ; for(size_t s=0; sn_shift; s++) { for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { shifts_prob_measured[s] += this->class_prob[k][s][f] ; shifts_prob_measured_tot += this->class_prob[k][s][f] ; } } } // the shift mean and (biased) standard deviation double shifts_sd = sd(shifts, shifts_prob_measured, false) ; // the shift probabilities under the assumption that is distributed as a gaussian centered on // the central shift state with sd and mean as in the data // sd as the data std::vector shifts_prob_centered(shifts.size(), 0.) ; double shifts_prob_centered_tot = 0. ; for(size_t i=0; in_shift/2)+1, shifts_sd) ; shifts_prob_centered_tot += shifts_prob_centered[i] ; } for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { for(size_t s=0; sn_shift; s++) { this->class_prob[k][s][f] = this->class_prob_tot[k] * shifts_prob_centered[s] / (this->n_flip * shifts_prob_centered_tot) ; } } } // shifts_prob_measured_tot = 0. ; shifts_prob_measured.clear() ; shifts_prob_measured.resize(this->n_shift) ; for(size_t s=0; sn_shift; s++) { for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { shifts_prob_measured[s] += this->class_prob[k][s][f] ; } } } } const double EMEngine::p_min = 1e-100 ; const double EMEngine::p_min_log = log(EMEngine::p_min) ; + +#include + +double sum_exp(const std::vector>& v) +{ + double result = 0. 
; + // double max = *std::max_element(lp.begin(), lp.end()) ; + + double max = std::numeric_limits::lowest() ; + for(const auto& i : v) + { if(i.first > max) + { max = i.first ; } + } + + // sum + for(const auto& i : v) + { result += (exp(i.first - max))*i.second ; } + result *= exp(max) ; + + return result ; +} diff --git a/src/Clustering/EMEngine.hpp b/src/Clustering/EMEngine.hpp index fa586d1..d4087cf 100644 --- a/src/Clustering/EMEngine.hpp +++ b/src/Clustering/EMEngine.hpp @@ -1,362 +1,363 @@ #ifndef EMENGINE_HPP #define EMENGINE_HPP #include #include #include #include #include #include #include // promise, future // some typdef #include /*! * \brief This class implements the iterative expectation * maximization classification procedure described in Nair * et al. 2014, Bioinformatics. * The classification procedure performs a probabilistic * partitioning of genomic regions, based on the distribution * of the reads over the regions. * To mitigate a miss-alignment of the signal in the different * regions - that is a same signal strech is present in two * regions but at different offsets - the classification * procedure can search protypic signals shorter than a whole * region, at each possible offset over the region (named * shift). * To mitigate an inversion of the signal in the different regions * - that is a same signal strech is present in two regions but in * reverse orientation - the classification procedure can search * protypic signals in both orientation. */ class EMEngine : public ClusteringEngine { static const double p_min ; static const double p_min_log ; public: /*! * \brief The possible seeding strategies. */ enum seeding_codes {RANDOM=0, SAMPLING, TOY} ; /*! * \brief The possible flip states. */ enum flip_states{FORWARD=0, REVERSE} ; public: /*! * \brief Constructs an object. * \param data the data to classify. * \param n_class the number of signal classes to search. * \param n_iter the number of iterations. * \param n_shift the shifting freedom. 
1 means no shift. * \param flip whether flipping is allowed. * \param n_threads the number of threads dedicated to the * computations. */ EMEngine(const Matrix2D& data, size_t n_class, size_t n_iter, size_t n_shift, bool flip, seeding_codes seeding, const std::string& seed=std::string(""), size_t n_threads=1) ; /*! * \brief Destructor. */ virtual ~EMEngine() override ; /*! * \brief Returns a matrix with the class class references * (protypic signal), on each row. * \return a matrix containing the class references, on * each row. */ virtual Matrix2D get_references() const ; /*! * \brief Returns a matrix with the posterior probabilies * with the dimensions representing the data, classes, shifts * and flips respectively. * \return a matrix containing the posterior probabilities. */ virtual Matrix4D get_posterior_prob() const ; /*! * \brief Returns the likelihood of the partition. * \return the likelihood of the partition. */ virtual double get_loglikelihood() const ; /*! * \brief Returns the Akaike Information Criterion (AIC) * for the given partition. * The AIC is 2n - 2LL where is the number of * free parameters in the model and LL the log * likelihood of the partition. * \return the partition AIC. */ virtual double get_aic() const ; /*! * \brief Runs the data clustering. * \return */ virtual ClusteringEngine::exit_codes cluster() override ; - protected: /*! * \brief Default constructor. */ EMEngine() = default ; /*! * \brief Sets each class protypic signal to 1 count, * in average. */ virtual void normalize_references() ; /*! * \brief Initialises the references using the corresponding * method. * \param seeding the method to use. */ virtual void seeding(seeding_codes seeding) ; /*! * \brief Initialises the references randomly. * Generates the initial references by randomly assigning * the data to the classes using a beta distribution and * all classes are set equally likely. */ virtual void seeding_random() ; /*! 
* \brief Initialises the K references by randomly * sampling K rows in the data. The class are set * equally probable. */ virtual void seeding_sampling() ; /*! * \brief Initialises the K references using the first K * rows in data. The class are set equally probable. */ virtual void seeding_toy() ; /*! * \brief Computes the mean number of reads present in * each slice (of length ncol - shift + 1), in each row * of the data and store them in this->window_mean. */ virtual void compute_window_means() ; /*! * \brief The routine that effectively computes the mean * number of reads present in each slice, for the range * [from,to) of rows in the data. * This function is thread safe only as long as different * [from,to) slices are given to the different threads. * \param from the index of the first row to treat. * \param to the index of the past last row to treat. * \param done a promise filled when the function is done * working. This allows to synchronize threads. */ virtual void compute_window_means_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data log likelihood given the * current class protypic signals. */ virtual void compute_loglikelihood() ; /*! * \brief The routine that effectively computes the * log likelihoods for the range [from,to) of rows * in the data. This function is used to distribute * the log likelihood computations over several threads. * This function is thread safe only as long as * different [from,to) slices are given to the different * threads. * \param from the index of the first row to treat. * \param to the index of the past last row to treat. * \param done a promise filled when the function is * done working. This allows to synchronize threads. */ virtual void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() ; /*! 
* \brief The routine that effectively computes the * posterior probabilities for the range [from,to) of * rows in the data. This function is used to distribute * the posterior probability computations over several * threads. This function is thread safe only as long * as different [from,to) slices are given to the * differentthreads. * \param from the index of the first row to treat. * \param to the index of the past last row to treat. * \param probs a promise containing a vector with the * sum of the posterior probability, for each class, * computed for the given slice. */ virtual void compute_post_prob_routine(size_t from, size_t to, std::promise& probs) ; /*! * \brief Computes the class probabilities from the * posterior probabilities. */ virtual void compute_class_prob() ; /*! * \brief Computes the class aggregations given the * posterior probabilities. */ virtual void compute_references() ; /*! * \brief A routine that computes the partial class * references for the range [from,to) of rows in the * data. To obtain the full class references, it is * required to 1) run this routine on the whole data * at once or 2) run it on different slices and * sum up the partial references obtained. This function * is used to distribute the posterior probability * computations over several threads. This function is * thread safe only as long as different [from,to) slices * are given to the different threads. * \param from the index of the first row to treat. * \param to the index of the past last row to treat. * \param class_ref a promise containing a matrix with the * partial class references on each row. */ virtual void compute_references_routine(size_t from, size_t to, std::promise& class_ref) ; /*! * \brief Modifies the class probabilities in such a * way that the shift probabilities are then normaly * distributed, centered on the middle shift state. * However, the overall class probabilities remain * unchanged. */ virtual void center_shifts() ; protected: /*! 
* \brief whether flip is enabled. */ bool flip ; /*! * \brief the number of iterations. */ size_t n_iter ; /*! * \brief the number of shift states. */ size_t n_shift ; /*! * \brief the number of flip states. */ size_t n_flip ; /*! * \brief the number of classes. */ size_t n_class ; /*! * \brief the data. */ matrix2d_i data ; /*! * \brief the mean number of reads per window in the * data. */ matrix2d_d window_mean ; /*! * \brief the class aggregation signal. */ matrix2d_d references ; /*! * \brief the log likelihoods. */ matrix4d_d loglikelihood ; /*! * \brief the max log likelihood value for each row. */ v_d loglikelihood_max ; /*! * \brief the posterior probabilities. */ matrix4d_d post_prob ; /*! * \brief the class probabilities. */ matrix3d_d class_prob ; /*! * \brief the total prob per class. */ v_d class_prob_tot ; /*! * \brief the sum per row of post_prob. */ v_d post_prob_row ; /*! * \brief the sum per class of post_prob. */ v_d post_prob_class ; /*! * \brief the total of post_prob. */ double post_prob_tot ; /*! * \brief the number of rows in data. */ size_t n_row ; /*! * \brief the number of columns in data. */ size_t n_col ; /*! * \brief the size of the pattern search and of * the scanning window in the data. */ size_t l_slice ; /*! * \brief the seeding method to use. */ EMEngine::seeding_codes seeding_method ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief the threads. 
*/ ThreadPool threads ; } ; +double sum_exp(const std::vector>& v) ; + #endif // EMENGINE_HPP diff --git a/src/Clustering/ReferenceComputer.cpp b/src/Clustering/ReferenceComputer.cpp index bde0ad0..352da39 100644 --- a/src/Clustering/ReferenceComputer.cpp +++ b/src/Clustering/ReferenceComputer.cpp @@ -1,84 +1,79 @@ #include #include #include // some typdef #include -template -std::ostream& operator << (std::ostream& stream, const std::vector& v) -{ for(const auto& x : v) - { stream << x << " " ; } - stream << std::endl ; - return stream ; -} ReferenceComputer::ReferenceComputer(const Matrix2D& data, const Matrix4D& posterior_prob, size_t n_threads) : EMEngine(data, posterior_prob.get_dim()[1], 1, posterior_prob.get_dim()[2], posterior_prob.get_dim()[3] == 2, EMEngine::seeding_codes::RANDOM, "", n_threads) { - // copy the data this->data = matrix2d_i(this->n_row, v_i(this->n_col)) ; for(size_t i=0; in_row; i++) { for(size_t j=0; jn_col; j++) { this->data[i][j] = data(i,j) ; } } + // compute window means + this->window_mean = matrix2d_d(this->n_row, v_d(this->n_shift, 0.)) ; + this->compute_window_means() ; + // initialise, copy and compute probs this->post_prob = matrix4d_d(this->n_row, matrix3d_d(this->n_class, matrix2d_d(this->n_shift, v_d(this->n_flip, 0.)))) ; this->class_prob = matrix3d_d(this->n_class, matrix2d_d(this->n_shift, v_d(this->n_flip, 0.))) ; this->class_prob_tot = v_d(this->n_class, 0.) ; this->post_prob_class = v_d(this->n_class, 0.) 
; for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { for(size_t f=0; fn_flip; f++) { double p = posterior_prob(i,j,s,f) ; this->post_prob[i][j][s][f] = p ; this->post_prob_class[j] += p ; this->post_prob_tot += p ; } } } } this->compute_class_prob() ; // compute the references this->references = matrix2d_d(this->n_class, v_d(this->l_slice, 0.)) ; this->compute_references() ; - } ReferenceComputer::~ReferenceComputer() { ; } Matrix2D ReferenceComputer::get_references() const { // add a 1st column with the class probabilities Matrix2D references(this->n_class, this->l_slice+1, 0.) ; for(size_t i=0; in_class; i++) { // class prob references(i,0) = this->class_prob_tot[i] ; // signal for(size_t j=0; jl_slice; j++) { references(i,j+1) = this->references[i][j] ; } } return references ; } diff --git a/src/Clustering/typedef.hpp b/src/Clustering/typedef.hpp index 231fd50..4d3e91a 100644 --- a/src/Clustering/typedef.hpp +++ b/src/Clustering/typedef.hpp @@ -1,11 +1,16 @@ #ifndef TYPEDEFCLUSTERING_HPP #define TYPEDEFCLUSTERING_HPP +#include // std::vector +#include // std::pair + typedef std::vector v_i ; typedef std::vector v_d ; typedef std::vector matrix2d_i ; typedef std::vector matrix2d_d ; typedef std::vector matrix3d_d ; typedef std::vector matrix4d_d ; +typedef std::vector> v_pair ; + #endif // TYPEDEFCLUSTERING_HPP diff --git a/src/GenomicTools/CellMatrixCreator.cpp b/src/GenomicTools/CellMatrixCreator.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/GenomicTools/CellMatrixCreator.hpp b/src/GenomicTools/CellMatrixCreator.hpp new file mode 100644 index 0000000..e69de29 diff --git a/src/GenomicTools/CorrelationMatrixCreator.cpp b/src/GenomicTools/CorrelationMatrixCreator.cpp new file mode 100644 index 0000000..fbf6dfa --- /dev/null +++ b/src/GenomicTools/CorrelationMatrixCreator.cpp @@ -0,0 +1,373 @@ +#include +#include +#include // std::runtime_error + +#include // BamFileIn +#include // BedFileIn + 
+#include +#include + + +template +std::ostream& operator << (std::ostream& stream, const std::list& l) +{ + for(const auto& p : l) + { stream << p << " " ; } + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ + for(const auto& p : v) + { stream << p << " " ; } + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::pair& p) +{ + stream << "[" << p.first << " " << p.second << "] " ; + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::unordered_map& m) +{ + for(const auto& p : m) + { stream << p << " " << std::endl; } + return stream ; +} + + +/* A lambda to sort GenomeRegion by ascending starting coordinate + */ +auto sortByStartPos = [](const GenomeRegion& r1, const GenomeRegion& r2) -> bool +{ return r1 < r2 ; +} ; + +CorrelationMatrixCreator::CorrelationMatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method) + : MatrixCreator(bed_file_path, + bam_file_path, + bai_file_path, + from, + to, + bin_size, + method), + target_list_fw(), + target_list_rv() +{ + seqan::BedRecord bed_line ; + + // compute coordinates relative to each region + this->compute_relative_bin_coord() ; + size_t n_col = this->relative_bin_coord.size() ; + + // compute number of regions and get valid chromosomes names + this->open_bed_file() ; + this->open_bam_file() ; + seqan::BamHeader header ; + seqan::readHeader(header, bam_file) ; + size_t n_row = 0 ; + while(not seqan::atEnd(this->bed_file)) + { seqan::readRecord(bed_line, this->bed_file) ; + std::string chrom_name = seqan::toCString(bed_line.ref) ; + // new chromosome + if(this->chrom_map_names.find(chrom_name) == + this->chrom_map_names.end()) + { int chrom_idx = -1 ; + seqan::getIdByName(chrom_idx, + seqan::contigNamesCache(seqan::context(this->bam_file)), + chrom_name) ; 
+ this->chrom_map_names[chrom_name] = chrom_idx ; + } + n_row++ ; + } + this->close_bed_file() ; + this->close_bam_file() ; + + // create the count matrix + this->matrix_counts = Matrix2D(n_row, n_col, 0) ; + // create the region matrix + this->matrix_bins = + std::vector> + (n_row,std::vector(n_col)) ; + this->open_bed_file() ; + this->open_bam_file() ; + size_t i = 0 ; + while(not seqan::atEnd(this->bed_file)) + { seqan::readRecord(bed_line, this->bed_file) ; + // find the region limits + std::string region_chr = seqan::toCString(bed_line.ref) ; + int region_len = bed_line.endPos - bed_line.beginPos ; + int region_mid = bed_line.beginPos + (region_len / 2) ; + + // compute the absolute bins coordinates for this region + // and create the bins in this region + for(size_t j=0; jrelative_bin_coord[j] ; + this->matrix_bins[i][j] = + GenomeRegion(region_chr, + this->chrom_map_names[region_chr], + region_mid + relative_coord.first, + region_mid + relative_coord.second) ; + } + i++ ; + } + this->close_bed_file() ; + this->close_bam_file() ; +} + +CorrelationMatrixCreator::~CorrelationMatrixCreator() +{ this->close_bam_file() ; + this->close_bed_file() ; +} + +Matrix2D CorrelationMatrixCreator::create_matrix() +{ + this->open_bam_file() ; + this->open_bai_file() ; + + // read BAM header + seqan::BamHeader bam_header ; + seqan::readHeader(bam_header, this->bam_file) ; + + for(size_t i=0; imatrix_counts.get_nrow(); i++) + { + const auto& row = this->matrix_bins[i] ; + GenomeRegion region(row.front().chromosome, + row.front().chromosome_idx, + row.front().start, + row.back().end) ; + + bool jump = this->jump_upstream(region, 600) ; + if(not jump) + { continue ; } + // read all relevant targets + this->to_downstream_target(region) ; + // update count matrix row + this->update_count_matrix(i) ; + // clean buffers + this->clear_target_lists() ; + } + this->close_bam_file() ; + return this->matrix_counts ; +} + +bool CorrelationMatrixCreator::jump_upstream(const GenomeRegion& 
region, + int margin) +{ bool has_alignment = false ; + int rID = -10 ; + if(this->chrom_map_names.find(region.chromosome) != + this->chrom_map_names.end()) + { rID = this->chrom_map_names[region.chromosome] ; } + else + { char msg[4096] ; + sprintf(msg, "Error! chromosome %s is not linked with a valid ID in BAM file", + region.chromosome.c_str()) ; + std::cerr << msg << std::endl ; + return false ; + } + + int start = std::max(0, region.start - margin) ; + int end = start + 1 ; + bool jump = seqan::jumpToRegion(this->bam_file, + has_alignment, + rID, + start, + end, + this->bai_file) ; + return jump ; +} + +void CorrelationMatrixCreator::to_downstream_target(const GenomeRegion& region) +{ if(this->method == CorrelationMatrixCreator::methods::READ or + this->method == CorrelationMatrixCreator::methods::READ_ATAC) + { this->to_downstream_read(region) ; } + else + { this->to_downstream_fragment(region) ; } +} + +void CorrelationMatrixCreator::to_downstream_read(const GenomeRegion& region) +{ bool done = false ; + + seqan::BamAlignmentRecord record ; + + while(not seqan::atEnd(this->bam_file) and + not done) + { // QC check and transform record + seqan::readRecord(record, this->bam_file) ; + if(not CorrelationMatrixCreator::is_good_read(record) or + not this->is_valid_chromosome(record)) + { continue ; } + + GenomeRegion target ; + try + { if(this->method == CorrelationMatrixCreator::methods::READ) + { target = GenomeRegion::constructRead(record, this->bam_file) ; } + else + { target = GenomeRegion::constructReadATAC(record, this->bam_file) ; } + } + catch(std::invalid_argument& e) + { // connect to cerr to write in SAM + seqan::BamFileOut samFileOut(seqan::context(this->bam_file), + std::cerr, + seqan::Sam()) ; + std::cerr << "std::invalid_argument caught! 
could not use " + "this record as read: " << std::endl ; + writeRecord(samFileOut, record) ; + std::cerr << "message was : " << e.what() << std::endl << std::endl ; + continue ; + } + + // upstream -> continue + if(target < region) + { continue ; } + // overlap -> store + else if(target | region) + { if(not seqan::hasFlagRC(record)) + { this->target_list_fw.push_back(target) ; } + else + { this->target_list_rv.push_back(target) ; } + } + // downstream -> stop + else + { done = true ; } + } +} + +void CorrelationMatrixCreator::to_downstream_fragment(const GenomeRegion& region) +{ + bool done = false ; + + seqan::BamAlignmentRecord record ; + + while(not seqan::atEnd(this->bam_file) and + not done) + { // QC check and transform record + seqan::readRecord(record, this->bam_file) ; + if(not CorrelationMatrixCreator::is_good_pair(record) or + not this->is_valid_chromosome(record)) + { continue ; } + + GenomeRegion target ; + try + { target = GenomeRegion::constructFragment(record, this->bam_file) ; } + catch(std::invalid_argument& e) + { // connect to cerr to write in SAM + seqan::BamFileOut samFileOut(seqan::context(this->bam_file), + std::cerr, + seqan::Sam()) ; + std::cerr << "std::invalid_argument caught! 
could not use " + "this record as fragment: " << std::endl ; + writeRecord(samFileOut, record) ; + std::cerr << "message was : " << e.what() << std::endl << std::endl ; + continue ; + } + + // upstream -> continue + if(target < region) + { continue ; } + // overlap -> store + else if(target | region) + { if(this->method == CorrelationMatrixCreator::methods::FRAGMENT_CENTER) + { target = GenomeRegion::constructFragmentCenter(record, + this->bam_file) ; + if(target | region) + { this->target_list_fw.push_back(target) ; } + } + else + { this->target_list_fw.push_back(target) ; } + } + // downstream -> stop + else if(target > region) + { // std::cerr << std::endl ; + done = true ; + } + } + // std::cerr << "to_downstream_fragment END" << std::endl ; +} + +void CorrelationMatrixCreator::clear_target_lists() +{ this->target_list_fw.clear() ; + this->target_list_rv.clear() ; +} + +/* +void CorrelationMatrixCreator::remove_upstream_targets(const GenomeRegion& region) +{ // forward targets + auto iter_fw = this->target_list_fw.cbegin() ; + while(iter_fw != this->target_list_fw.end()) + { // remove upstream reads + if(*iter_fw < region) + { iter_fw = this->target_list_fw.erase(iter_fw) ; } + // keep overlapping reads, don't stop here + else if(*iter_fw | region) + { iter_fw++ ; } + // stop at first read downstream + else + { break ; } + } + // reverse targets + auto iter_rv = this->target_list_rv.cbegin() ; + while(iter_rv != this->target_list_rv.end()) + { // remove upstream reads + if(*iter_rv < region) + { iter_rv = this->target_list_rv.erase(iter_rv) ; } + // keep overlapping reads + else if(*iter_rv | region) + { iter_rv++ ; } + // stop at first read downstream + else + { break ; } + } +} +*/ + +void CorrelationMatrixCreator::update_count_matrix(size_t row_index) +{ + // forward targets + for(const auto& iter : this->target_list_fw) + { auto bin_start_end = CorrelationMatrixCreator:: + get_bin_indices(iter, this->matrix_bins[row_index]) ; + for(int 
j=bin_start_end.first; jmatrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } + // reverse targets + for(const auto& iter : this->target_list_rv) + { auto bin_start_end = CorrelationMatrixCreator:: + get_bin_indices(iter, this->matrix_bins[row_index]) ; + for(int j=bin_start_end.first; jmatrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } +} + +/* +void CorrelationMatrixCreator::update_count_matrix_naive(size_t row_index) +{ // forward targets + for(const auto& iter : target_list_fw) + { for(size_t j=0; jmatrix_counts.get_ncol(); j++) + { this->matrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } + // reverse targets + for(const auto& iter : target_list_rv) + { for(size_t j=0; jmatrix_counts.get_ncol(); j++) + { this->matrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } +} +*/ diff --git a/src/GenomicTools/CorrelationMatrixCreator.hpp b/src/GenomicTools/CorrelationMatrixCreator.hpp new file mode 100644 index 0000000..e6043bc --- /dev/null +++ b/src/GenomicTools/CorrelationMatrixCreator.hpp @@ -0,0 +1,187 @@ +#ifndef CORRELATIONMATRIXCREATOR_HPP +#define CORRELATIONMATRIXCREATOR_HPP + +#include +#include +#include + +#include // BamFileIn +#include // BedFileIn + +#include +#include + +/*! + * \brief The CorrelationMatrixCreator class allows + * to create correlation matrices. + * A correlation matrix contains the number of target + * mapped at different positions around a set of + * reference positions. + * This class will read the reference positions from + * a BED file and the targets from a BAM file. For each + * reference, the region center is computed and then a + * region covering the interval [from,to] is build + * around the middle and divided into equally sized + * bins. Finally, each bin is assigned the number of + * target present in the BAM file that are mapped at + * that position. 
+ * The final matrix contains one row per reference, + * with the number of targets counted at each possible + * position (bin). relative to this reference. + */ +class CorrelationMatrixCreator: public MatrixCreator +{ + public: + + CorrelationMatrixCreator() = delete ; + + /*! + * \brief Constructs an object to build a + * correlation matrix. + * \param bed_file_path the path to the file containing + * the references. + * \param bam_file_path the path to the file containing + * the targets. + * \param bai_file_path the path to index file of the bam + * file containing the targets. + * \param from the upstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param to the dowmstream most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param bin_size the bin size in base pair. + * \param method how the targets should be counted. + * READ all the positions inside the reads are + * counted. + * READ_ATAC only the +4bp position of +strand reads + * and the -5bp of -strand reads are counted. It + * correspond to the insertion position in ATAC-seq + * data. + * FRAGMENT all the positions within fragments (the + * genome segment between a pair of reads, reads + * included) are counted. + * FRAGMENT_CENTER only the central position of the + * fragements (the genome segment between a pair of + * reads, reads included) are counted. + */ + CorrelationMatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method) ; + /*! + * Destructor. + */ + ~CorrelationMatrixCreator() ; + + /*! + * \brief Computes the matrix and returns it. + * \return the count matrix. + */ + virtual Matrix2D create_matrix() override ; + + protected: + /*! 
+ * \brief Seek in the BAM file right before the last + * record upstream the given region. The margin + * parameters allows to modify the region start + * value. + * To read a record within the region, a read + * operation is required to get ride of the + * record right + * \param region the region in front of which the + * pointer is desired. + * \param margin + * which streams in the stream vectors to use. + * \return whether the reading pointer could be moved + * to the desired position. + */ + bool jump_upstream(const GenomeRegion& region, + int margin) ; + + /*! + * \brief A generic routine that reads the following records + * until finding the first one located downstream the region + * of interest (the definition of the first target downstream + * the region of interest depends if READ/READ_ATAC/FRAGMENT + * or FRAGMENT_CENTER is set as method). + * All record overlapping the region of interest are stored + * in the target lists. + * The reading pointer is supposed to be located + * upstream the region of interest. If this is note the case, + * the method will read records until reaching the end of + * the file. + * \param region the region of interest. + */ + void to_downstream_target(const GenomeRegion& region) ; + + /*! + * \brief The routine that reads the following records + * until finding the first one located downstream the region + * of interest if READ or READ_ATAC is set as method. + * All record overlapping the region of interest are stored + * in the target lists. + * The reading pointer is supposed to be located + * upstream the region of interest. If this is note the case, + * the method will read records until reaching the end of + * the file. + * \param region the region of interest. + */ + void to_downstream_read(const GenomeRegion& region) ; + + /*! + * \brief The routine that reads the following records + * until finding the first one located downstream the region + * of interest if FRAGMENT or FRAGMENT_CENTER is set as + * method. 
+ * All records overlapping the region of interest are stored + * in the target lists. + * The reading pointer is supposed to be located + * upstream the region of interest. If this is not the case, + * the method will read records until reaching the end of + * the file. + * \param region the region of interest. + */ + void to_downstream_fragment(const GenomeRegion& region) ; + + /*! + * \brief Clear the content of the target lists. + */ + void clear_target_lists() ; + + /*! + * \brief Parses the target lists and removes any target + * located upstream of the given region. + * \param region the region of interest. + */ + // void remove_upstream_targets(const GenomeRegion& region) ; + + /*! + * \brief Update the given row of the count matrix with + * the content of the target lists. + * \param row_index the index of the row, in the + * count matrix. + */ + void update_count_matrix(size_t row_index) ; + + /*! + * \brief A buffer containing the + * targets mapped on the forward strand. + * Targets without strand (fragments) + * are also stored in this list. + */ + std::list target_list_fw ; + /*! + * \brief A buffer containing the + * targets mapped on the reverse strand.
+ */ + std::list target_list_rv ; + +} ; + +#endif // CORRELATIONMATRIXCREATOR_HPP diff --git a/src/GenomicTools/CorrelationMatrixCreatorParallel.cpp b/src/GenomicTools/CorrelationMatrixCreatorParallel.cpp new file mode 100644 index 0000000..9e775b0 --- /dev/null +++ b/src/GenomicTools/CorrelationMatrixCreatorParallel.cpp @@ -0,0 +1,483 @@ +#include +#include +#include +#include // std::runtime_error +#include // std::pair, std::make_pair(), std::move() +#include // std::ref(), std::bind() + +#include // BamFileIn, BamAlignmentRecord + +#include +#include +#include + + +template +std::ostream& operator << (std::ostream& stream, const std::list& l) +{ + for(const auto& p : l) + { stream << p << " " ; } + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ + for(const auto& p : v) + { stream << p << " " ; } + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::pair& p) +{ + stream << "[" << p.first << " " << p.second << "] " ; + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::unordered_map& m) +{ + for(const auto& p : m) + { stream << p << " " << std::endl; } + return stream ; +} + + +/* A lambda to sort GenomeRegion by ascending starting coordinate + */ +auto sortByStartPos = [](const GenomeRegion& r1, const GenomeRegion& r2) -> bool +{ return r1 < r2 ; +} ; + +CorrelationMatrixCreator::CorrelationMatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method, + size_t n_threads) + : MatrixCreator(bed_file_path, + bam_file_path, + bai_file_path, + from, + to, + bin_size, + method, + n_threads), + target_lists_fw(n_threads), + target_lists_rv(n_threads) +{ + seqan::BedRecord bed_line ; + + // compute coordinates relative to each region + this->compute_relative_bin_coord() ; + size_t n_col = 
this->relative_bin_coord.size() ; + + // compute number of regions and get valid chromosomes names + this->open_bed_files() ; + this->open_bam_files() ; + seqan::BamHeader header ; + seqan::readHeader(header, this->bam_files[0]) ; + size_t n_row = 0 ; + while(not seqan::atEnd(this->bed_files[0])) + { seqan::readRecord(bed_line, this->bed_files[0]) ; + std::string chrom_name = seqan::toCString(bed_line.ref) ; + // new chromosome + if(this->chrom_map_names.find(chrom_name) == + this->chrom_map_names.end()) + { int chrom_idx = -1 ; + seqan::getIdByName(chrom_idx, + seqan::contigNamesCache(seqan::context(this->bam_files[0])), + chrom_name) ; + this->chrom_map_names[chrom_name] = chrom_idx ; + } + n_row++ ; + } + this->close_bed_files() ; + + // create the count matrix + this->matrix_counts = Matrix2D(n_row, n_col, 0) ; + // create the region matrix + this->matrix_bins = + std::vector> + (n_row,std::vector(n_col)) ; + this->open_bed_files() ; + size_t i = 0 ; + while(not seqan::atEnd(this->bed_files[0])) + { seqan::readRecord(bed_line, this->bed_files[0]) ; + // find the region limits + std::string region_chr = seqan::toCString(bed_line.ref) ; + int region_len = bed_line.endPos - bed_line.beginPos ; + int region_mid = bed_line.beginPos + (region_len / 2) ; + + // compute the absolute bins coordinates for this region + // and create the bins in this region + for(size_t j=0; jrelative_bin_coord[j] ; + this->matrix_bins[i][j] = + GenomeRegion(region_chr, + this->chrom_map_names[region_chr], + region_mid + relative_coord.first, + region_mid + relative_coord.second) ; + } + i++ ; + } + this->close_bed_files() ; + this->close_bam_files() ; +} + +CorrelationMatrixCreator::~CorrelationMatrixCreator() +{ this->threads.join() ; + this->close_bam_files() ; + this->close_bed_files() ; +} + +/* +Matrix2D CorrelationMatrixCreator::create_matrix() +{ + this->open_bam_files() ; + this->open_bai_files() ; + + // read BAM header + seqan::BamHeader bam_header ; + 
seqan::readHeader(bam_header, this->bam_files) ; + + for(size_t i=0; imatrix_counts.get_nrow(); i++) + { + const auto& row = this->matrix_bins[i] ; + GenomeRegion region(row.front().chromosome, + row.front().chromosome_idx, + row.front().start, + row.back().end) ; + + bool jump = this->jump_upstream(region, 600) ; + if(not jump) + { continue ; } + // read all relevant targets + this->to_downstream_target(region) ; + // update count matrix row + this->update_count_matrix(i) ; + // clean buffers + this->clear_target_lists() ; + } + this->close_bam_files() ; + return this->matrix_counts ; +}*/ + +/* +Matrix2D CorrelationMatrixCreator::create_matrix() +{ + // compute the slices on which each thread will work + std::vector> slices = + ThreadPool::split_range(0, + this->matrix_counts.get_nrow(), + this->n_threads) ; + + // prepare the futures and promises + std::vector> promises(this->n_threads) ; + std::vector> futures(this->n_threads) ; + for(size_t i=0; in_threads; i++) + { futures[i] = promises[i].get_future() ; } + + // open all streams + this->open_bam_files() ; + this->open_bai_files() ; + + // ----------------------- threads start ----------------------- + for(size_t i=0; in_threads; i++) + { auto slice = slices[i] ; + std::cerr << "CorrelationMatrixCreator::create_matrix " << i << " [" << slice.first << " " << slice.second << ")" << std::endl ; + this->threads.addJob(std::move( + std::bind(&CorrelationMatrixCreator::create_matrix_routine, + this, + slice.first, + slice.second, + i, + std::ref(promises[i])))) ; + } + // wait for all threads to be done + for(auto& future : futures) + { future.get() ; } + // ----------------------- threads stop ------------------------ + + // close all streams + this->close_bam_files() ; + return this->matrix_counts ; +} +*/ + +Matrix2D CorrelationMatrixCreator::create_matrix() +{ + + // prepare the futures and promises + std::promise promise ; + std::future future = promise.get_future() ; + + // open all streams + 
this->open_bam_files() ; + this->open_bai_files() ; + + this->threads.addJob(std::move( + std::bind(&CorrelationMatrixCreator::create_matrix_routine, + this, + 0, + this->matrix_counts.get_nrow(), + 0, + std::ref(promise)))) ; + // wait for all threads to be done + future.get() ; + // ----------------------- threads stop ------------------------ + + // close all streams + this->close_bam_files() ; + return this->matrix_counts ; +} + +void CorrelationMatrixCreator::create_matrix_routine(size_t row_from, + size_t row_to, + size_t thread_idx, + std::promise& done) +{ std::cerr << "CorrelationMatrixCreator::create_matrix_routine " << thread_idx << std::endl ; + // read BAM header + seqan::BamHeader bam_header ; + seqan::readHeader(bam_header, this->bam_files[thread_idx]) ; + + for(size_t i=row_from; imatrix_bins[i] ; + GenomeRegion region(row.front().chromosome, + row.front().chromosome_idx, + row.front().start, + row.back().end) ; + + bool jump = this->jump_upstream(region, 600, thread_idx) ; + if(not jump) + { continue ; } + // read all relevant targets + this->to_downstream_target(region, thread_idx) ; + // update count matrix row + this->update_count_matrix(i, thread_idx) ; + // clean buffers + this->clear_target_lists(thread_idx) ; + } + // signal done working + done.set_value(true) ; +} + +bool CorrelationMatrixCreator::jump_upstream(const GenomeRegion& region, + int margin, + size_t thread_idx) +{ bool has_alignment = false ; + int rID = -10 ; + if(this->chrom_map_names.find(region.chromosome) != + this->chrom_map_names.end()) + { rID = this->chrom_map_names[region.chromosome] ; } + else + { char msg[4096] ; + sprintf(msg, "Error! 
chromosome %s is not linked with a valid ID in BAM file", + region.chromosome.c_str()) ; + std::cerr << msg << std::endl ; + return false ; + } + + int start = std::max(0, region.start - margin) ; + int end = start + 1 ; + bool jump = seqan::jumpToRegion(this->bam_files[thread_idx], + has_alignment, + rID, + start, + end, + this->bai_files[thread_idx]) ; + return jump ; +} + +void CorrelationMatrixCreator::to_downstream_target(const GenomeRegion& region, + size_t thread_idx) +{ if(this->method == CorrelationMatrixCreator::methods::READ or + this->method == CorrelationMatrixCreator::methods::READ_ATAC) + { this->to_downstream_read(region, thread_idx) ; } + else + { this->to_downstream_fragment(region, thread_idx) ; } +} + +void CorrelationMatrixCreator::to_downstream_read(const GenomeRegion& region, + size_t thread_idx) +{ bool done = false ; + + seqan::BamAlignmentRecord record ; + + while(not seqan::atEnd(this->bam_files[thread_idx]) and + not done) + { // QC check and transform record + seqan::readRecord(record, this->bam_files[thread_idx]) ; + if(not CorrelationMatrixCreator::is_good_read(record) or + not this->is_valid_chromosome(record, thread_idx)) + { continue ; } + + GenomeRegion target ; + try + { if(this->method == CorrelationMatrixCreator::methods::READ) + { target = GenomeRegion::constructRead(record, this->bam_files[thread_idx]) ; } + else + { target = GenomeRegion::constructReadATAC(record, this->bam_files[thread_idx]) ; } + } + catch(std::invalid_argument& e) + { // connect to cerr to write in SAM + seqan::BamFileOut samFileOut(seqan::context(this->bam_files[thread_idx]), + std::cerr, + seqan::Sam()) ; + std::cerr << "std::invalid_argument caught! 
could not use " + "this record as read: " << std::endl ; + writeRecord(samFileOut, record) ; + std::cerr << "message was : " << e.what() << std::endl << std::endl ; + continue ; + } + + // upstream -> continue + if(target < region) + { continue ; } + // overlap -> store + else if(target | region) + { if(not seqan::hasFlagRC(record)) + { this->target_lists_fw[thread_idx].push_back(target) ; } + else + { this->target_lists_rv[thread_idx].push_back(target) ; } + } + // downstream -> stop + else + { done = true ; } + } +} + +void CorrelationMatrixCreator::to_downstream_fragment(const GenomeRegion& region, + size_t thread_idx) +{ + bool done = false ; + + seqan::BamAlignmentRecord record ; + + while(not seqan::atEnd(this->bam_files[thread_idx]) and + not done) + { // QC check and transform record + seqan::readRecord(record, this->bam_files[thread_idx]) ; + if(not CorrelationMatrixCreator::is_good_pair(record) or + not this->is_valid_chromosome(record, thread_idx)) + { continue ; } + + GenomeRegion target ; + try + { target = GenomeRegion::constructFragment(record, this->bam_files[thread_idx]) ; } + catch(std::invalid_argument& e) + { // connect to cerr to write in SAM + seqan::BamFileOut samFileOut(seqan::context(this->bam_files[thread_idx]), + std::cerr, + seqan::Sam()) ; + std::cerr << "std::invalid_argument caught! 
could not use " + "this record as fragment: " << std::endl ; + writeRecord(samFileOut, record) ; + std::cerr << "message was : " << e.what() << std::endl << std::endl ; + continue ; + } + + // upstream -> continue + if(target < region) + { continue ; } + // overlap -> store + else if(target | region) + { if(this->method == CorrelationMatrixCreator::methods::FRAGMENT_CENTER) + { target = GenomeRegion::constructFragmentCenter(record, + this->bam_files[thread_idx]) ; + if(target | region) + { this->target_lists_fw[thread_idx].push_back(target) ; } + } + else + { this->target_lists_fw[thread_idx].push_back(target) ; } + } + // downstream -> stop + else if(target > region) + { // std::cerr << std::endl ; + done = true ; + } + } + // std::cerr << "to_downstream_fragment END" << std::endl ; +} + +void CorrelationMatrixCreator::clear_target_lists(size_t thread_idx) +{ this->target_lists_fw[thread_idx].clear() ; + this->target_lists_rv[thread_idx].clear() ; +} + +/* +void CorrelationMatrixCreator::remove_upstream_targets(const GenomeRegion& region) +{ // forward targets + auto iter_fw = this->target_list_fw.cbegin() ; + while(iter_fw != this->target_list_fw.end()) + { // remove upstream reads + if(*iter_fw < region) + { iter_fw = this->target_list_fw.erase(iter_fw) ; } + // keep overlapping reads, don't stop here + else if(*iter_fw | region) + { iter_fw++ ; } + // stop at first read downstream + else + { break ; } + } + // reverse targets + auto iter_rv = this->target_list_rv.cbegin() ; + while(iter_rv != this->target_list_rv.end()) + { // remove upstream reads + if(*iter_rv < region) + { iter_rv = this->target_list_rv.erase(iter_rv) ; } + // keep overlapping reads + else if(*iter_rv | region) + { iter_rv++ ; } + // stop at first read downstream + else + { break ; } + } +} +*/ + +void CorrelationMatrixCreator::update_count_matrix(size_t row_index, + size_t thread_idx) +{ + // forward targets + for(const auto& iter : this->target_lists_fw[thread_idx]) + { auto bin_start_end 
= CorrelationMatrixCreator:: + get_bin_indices(iter, this->matrix_bins[row_index]) ; + for(int j=bin_start_end.first; jmatrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } + // reverse targets + for(const auto& iter : this->target_lists_rv[thread_idx]) + { auto bin_start_end = CorrelationMatrixCreator:: + get_bin_indices(iter, this->matrix_bins[row_index]) ; + for(int j=bin_start_end.first; jmatrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } +} + +/* +void CorrelationMatrixCreator::update_count_matrix_naive(size_t row_index) +{ // forward targets + for(const auto& iter : target_list_fw) + { for(size_t j=0; jmatrix_counts.get_ncol(); j++) + { this->matrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } + // reverse targets + for(const auto& iter : target_list_rv) + { for(size_t j=0; jmatrix_counts.get_ncol(); j++) + { this->matrix_counts(row_index, j) += + iter.overlap_len(this->matrix_bins[row_index][j]) ; + } + } +} +*/ diff --git a/src/GenomicTools/CorrelationMatrixCreatorParallel.hpp b/src/GenomicTools/CorrelationMatrixCreatorParallel.hpp new file mode 100644 index 0000000..62aa481 --- /dev/null +++ b/src/GenomicTools/CorrelationMatrixCreatorParallel.hpp @@ -0,0 +1,222 @@ +#ifndef CORRELATIONMATRIXCREATOR_HPP +#define CORRELATIONMATRIXCREATOR_HPP + +#include +#include +#include + +#include // BamFileIn +#include // BedFileIn + +#include +#include + +/*! + * \brief The CorrelationMatrixCreator class allows + * to create correlation matrices. + * A correlation matrix contains the number of target + * mapped at different positions around a set of + * reference positions. + * This class will read the reference positions from + * a BED file and the targets from a BAM file. For each + * reference, the region center is computed and then a + * region covering the interval [from,to] is build + * around the middle and divided into equally sized + * bins. 
Finally, each bin is assigned the number of + * targets present in the BAM file that are mapped at + * that position. + * The final matrix contains one row per reference, + * with the number of targets counted at each possible + * position (bin) relative to this reference. + */ +class CorrelationMatrixCreator: public MatrixCreator +{ + public: + + CorrelationMatrixCreator() = delete ; + + /*! + * \brief Constructs an object to build a + * correlation matrix. + * \param bed_file_path the path to the file containing + * the references. + * \param bam_file_path the path to the file containing + * the targets. + * \param bai_file_path the path to the index file of the bam + * file containing the targets. + * \param from the upstream-most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param to the downstream-most relative position + * to consider around the references. It may + * be changed to make sure that the central bin + * is centered on +/- 0. + * \param bin_size the bin size in base pairs. + * \param method how the targets should be counted. + * READ all the positions inside the reads are + * counted. + * READ_ATAC only the +4bp position of +strand reads + * and the -5bp of -strand reads are counted. It + * corresponds to the insertion position in ATAC-seq + * data. + * FRAGMENT all the positions within fragments (the + * genome segment between a pair of reads, reads + * included) are counted. + * FRAGMENT_CENTER only the central positions of the + * fragments (the genome segment between a pair of + * reads, reads included) are counted. + * \param n_threads the number of working threads. + */ + CorrelationMatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method, + size_t n_threads=1) ; + /*! + * Destructor.
+ */ + ~CorrelationMatrixCreator() ; + + /*! + * \brief Computes the matrix and returns it. + * \return the count matrix. + */ + virtual Matrix2D create_matrix() override ; + + protected: + + void create_matrix_routine(size_t row_from, + size_t row_to, + size_t thread_idx, + std::promise& done) ; + + /*! + * \brief Seek in the BAM file right before the last + * record upstream the given region. The margin + * parameter allows modifying the region start + * value. + * To read a record within the region, a read + * operation is required to get rid of the + * record right upstream of it. + * \param region the region in front of which the + * pointer is desired. + * \param margin the margin subtracted from the region start (floored at 0) before seeking. + * \param thread_idx the thread index/number. This + * specifies + * which streams in the stream vectors to use. + * \return whether the reading pointer could be moved + * to the desired position. + */ + bool jump_upstream(const GenomeRegion& region, + int margin, + size_t thread_idx) ; + + /*! + * \brief A generic routine that reads the following records + * until finding the first one located downstream the region + * of interest (the definition of the first target downstream + * the region of interest depends on whether READ/READ_ATAC/FRAGMENT + * or FRAGMENT_CENTER is set as method). + * All records overlapping the region of interest are stored + * in the target lists. + * The reading pointer is supposed to be located + * upstream the region of interest. If this is not the case, + * the method will read records until reaching the end of + * the file. + * \param region the region of interest. + * \param thread_idx the thread index/number. This specifies + * which target lists in the list vectors and which + * streams in the stream vector to use. + */ + void to_downstream_target(const GenomeRegion& region, + size_t thread_idx) ; + + /*! + * \brief The routine that reads the following records + * until finding the first one located downstream the region + * of interest if READ or READ_ATAC is set as method.
+ * + * All records overlapping the region of interest are stored + * in the target lists. + * The reading pointer is supposed to be located + * upstream the region of interest. If this is not the case, + * the method will read records until reaching the end of + * the file. + * \param region the region of interest. + * \param thread_idx the thread index/number. This specifies + * which target lists in the list vectors and which + * streams in the stream vector to use. + */ + void to_downstream_read(const GenomeRegion& region, + size_t thread_idx) ; + + /*! + * \brief The routine that reads the following records + * until finding the first one located downstream the region + * of interest if FRAGMENT or FRAGMENT_CENTER is set as + * method. + * All records overlapping the region of interest are stored + * in the target lists. + * The reading pointer is supposed to be located + * upstream the region of interest. If this is not the case, + * the method will read records until reaching the end of + * the file. + * \param region the region of interest. + * \param thread_idx the thread index/number. This specifies + * which target lists in the list vectors and which + * streams in the stream vector to use. + */ + void to_downstream_fragment(const GenomeRegion& region, + size_t thread_idx) ; + + /*! + * \brief Clear the content of the target lists. + * \param thread_idx the thread index/number. This specifies + * which lists in the list vectors to clear. + */ + void clear_target_lists(size_t thread_idx) ; + + /*! + * \brief Parses the target lists and removes any target + * located upstream of the given region. + * \param region the region of interest. + */ + // void remove_upstream_targets(const GenomeRegion& region) ; + + /*! + * \brief Update the given row of the count matrix with + * the content of the target lists. + * \param row_index the index of the row, in the + * count matrix.
+ * \param thread_idx the thread index/number. This specifies + * which target lists in the list vectors to use. + */ + void update_count_matrix(size_t row_index, + size_t thread_idx) ; + + /*! + * \brief A vector of buffers containing the + * target mapped on the forward strand. + * Target without strand (fragments) + * are also stored in this list. + * The purpose is to allow multiple threads + * to work at the same time on their own + * dedicated buffer. + */ + std::vector> target_lists_fw ; + /*! + * \brief A vector of buffers containing the + * target mapped on the reverse strand. + * The purpose is to allow multiple threads + * to work at the same time on their own + * dedicated buffer. + */ + std::vector> target_lists_rv ; + +} ; + +#endif // CORRELATIONMATRIXCREATOR_HPP diff --git a/src/GenomicTools/GenomeRegion.cpp b/src/GenomicTools/GenomeRegion.cpp new file mode 100644 index 0000000..3a81442 --- /dev/null +++ b/src/GenomicTools/GenomeRegion.cpp @@ -0,0 +1,371 @@ +#include + +#include +#include // abs() +#include // std::invalid_argument +#include + +/* +GenomeRegion GenomeRegion::constructRead(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ + GenomeRegion read ; + + read.chromosome = + seqan::toCString( + seqan::getContigName(record, bam_file)) ; + read.chromosome_idx = record.rID ; + + // read is on + strand + // |-----------> + // record.beginPos + if(not seqan::hasFlagRC(record)) + { read.start = record.beginPos ; + read.length = seqan::endPosition(record.seq) ; + read.end = read.start + read.length ; + } + // read is on - strand + // <-----------| + // record.beginPos + else + { read.length = seqan::endPosition(record.seq) ; + read.end = record.beginPos + 1 ; + read.start = read.end - read.length ; + } + + if(read.start < 0 or read.end < 0) + { char msg[4096] ; + sprintf(msg, "Error! 
invalide coordinate (<0) : [%s/%d %d %d)]", + read.chromosome.c_str(), read.chromosome_idx, + read.start, read.end) ; + throw std::invalid_argument(msg) ; + } + else if(read.start >= read.end) + { char msg[4096] ; + sprintf(msg, "Error! start >= end : [%s/%d %d %d)]", + read.chromosome.c_str(), read.chromosome_idx, + read.start, read.end) ; + throw std::invalid_argument(msg) ; + } + return read ; +} +*/ + +// new +GenomeRegion GenomeRegion::constructRead(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ + GenomeRegion read ; + + read.chromosome = + seqan::toCString( + seqan::getContigName(record, bam_file)) ; + read.chromosome_idx = record.rID ; + + read.start = record.beginPos ; + read.length = seqan::endPosition(record.seq) ; + read.end = read.start + read.length ; + + if(read.start < 0 or read.end < 0) + { char msg[4096] ; + sprintf(msg, "Error! invalide coordinate (<0) : [%s/%d %d %d)]", + read.chromosome.c_str(), read.chromosome_idx, + read.start, read.end) ; + throw std::invalid_argument(msg) ; + } + else if(read.start >= read.end) + { char msg[4096] ; + sprintf(msg, "Error! 
start >= end : [%s/%d %d %d)]", + read.chromosome.c_str(), read.chromosome_idx, + read.start, read.end) ; + throw std::invalid_argument(msg) ; + } + return read ; +} + +/* +GenomeRegion GenomeRegion::constructReadATAC(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ GenomeRegion read ; + read.chromosome = + seqan::toCString( + seqan::getContigName(record, bam_file)) ; + read.chromosome_idx = record.rID ; + + // read is on + strand + if(not seqan::hasFlagRC(record)) + { read.start = record.beginPos + 4 ; } + // read is on - strand + else + { read.start = record.beginPos - 5 ; } + read.end = read.start + 1 ; + read.length = 1 ; + return read ; +} +*/ + +// new +GenomeRegion GenomeRegion::constructReadATAC(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ GenomeRegion read = GenomeRegion::constructRead(record, bam_file); + if(not seqan::hasFlagRC(record)) + { read.start += 4 ; + read.end = read.start + 1 ; + read.length = 1 ; + } + else + { read.start = read.end - 1 - 5 ; + read.end = read.start + 1 ; + read.length = 1 ; + } + return read ; +} + +GenomeRegion GenomeRegion::constructReadEdge(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ + GenomeRegion read = GenomeRegion::constructRead(record, bam_file); + if(not seqan::hasFlagRC(record)) + { read.end = read.start + 1 ; + read.length = 1 ; + } + else + { read.start = read.end - 1 ; + read.length = 1 ; + } + return read ; +} + +// old +/* +GenomeRegion GenomeRegion::constructFragment(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ GenomeRegion frag ; + + frag.chromosome = + seqan::toCString( + seqan::getContigName(record, bam_file)) ; + frag.chromosome_idx = record.rID ; + + + // read is on + strand + // record + // |-----> <-----| + // record.beginPos + if(not seqan::hasFlagRC(record)) + { frag.start = record.beginPos ; + frag.length = record.tLen ; + frag.end = frag.start + frag.length ; + } + 
// read is on - strand + // record + // |-----> <-----| + // record.beginPos + else + { // frag.end = seqan::endPosition(record.seq) + 1 ; + // frag.length = abs(record.tLen) ; + // frag.start = frag.end - frag.length ; + frag.end = record.beginPos + 1 ; + frag.length = abs(record.tLen) ; + frag.start = frag.end - frag.length ; + + } + + if(frag.start < 0 or frag.end < 0) + { char msg[4096] ; + sprintf(msg, "Error! invalide coordinate (<0) : [%s/%d %d %d)]", + frag.chromosome.c_str(), frag.chromosome_idx, + frag.start, frag.end) ; + throw std::invalid_argument(msg) ; + } + else if(frag.start >= frag.end) + { char msg[4096] ; + sprintf(msg, "Error! start >= end : [%s/%d %d %d)]", + frag.chromosome.c_str(), frag.chromosome_idx, + frag.start, frag.end) ; + throw std::invalid_argument(msg) ; + } + return frag ; +} +*/ + +GenomeRegion GenomeRegion::constructFragment(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ GenomeRegion frag ; + + frag.chromosome = + seqan::toCString( + seqan::getContigName(record, bam_file)) ; + frag.chromosome_idx = record.rID ; + + + // read is on + strand + // record + // |-----> <-----| + if(not seqan::hasFlagRC(record)) + { frag.start = record.beginPos ; + frag.length = record.tLen ; + frag.end = frag.start + frag.length ; + } + // read is on - strand + // record + // |-----> <-----| + else + { // frag.end = seqan::endPosition(record.seq) + 1 ; + // frag.length = abs(record.tLen) ; + // frag.start = frag.end - frag.length ; + frag.length = abs(record.tLen) ; + frag.start = record.pNext ; + frag.end = frag.start + frag.length ; + } + + if(frag.start < 0 or frag.end < 0) + { char msg[4096] ; + sprintf(msg, "Error! invalide coordinate (<0) : [%s/%d %d %d)]", + frag.chromosome.c_str(), frag.chromosome_idx, + frag.start, frag.end) ; + throw std::invalid_argument(msg) ; + } + else if(frag.start >= frag.end) + { char msg[4096] ; + sprintf(msg, "Error! 
start >= end : [%s/%d %d %d)]", + frag.chromosome.c_str(), frag.chromosome_idx, + frag.start, frag.end) ; + throw std::invalid_argument(msg) ; + } + return frag ; +} + +GenomeRegion GenomeRegion::constructFragmentCenter(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) +{ GenomeRegion frag = GenomeRegion::constructFragment(record, bam_file) ; + int mid = frag.start + (frag.length / 2) ; + frag.start = mid ; + frag.end = mid + 1; + frag.length = 1 ; + return frag ; +} + +GenomeRegion::GenomeRegion(const GenomeRegion& other) + : chromosome(other.chromosome), + chromosome_idx(other.chromosome_idx), + start(other.start), + end(other.end), + length(other.length) +{} + +GenomeRegion::GenomeRegion(const std::string& chromosome, + int chromosome_idx, + int start, + int end) + : chromosome(chromosome), + chromosome_idx(chromosome_idx), + start(start), + end(end), + length(end - start) +{ if(this->start < 0 or this->end < 0) + { char msg[4096] ; + sprintf(msg, "Error! invalide coordinate (<0) : [%s/%d %d %d)]", + this->chromosome.c_str(), this->chromosome_idx, + this->start, this->end) ; + throw std::invalid_argument(msg) ; + } + else if(start >= end) + { char msg[4096] ; + sprintf(msg, "Error! 
start >= end : [%s/%d %d %d)]", + this->chromosome.c_str(), this->chromosome_idx, + this->start, this->end) ; + throw std::invalid_argument(msg) ; + } +} + +int GenomeRegion::overlap_len(const GenomeRegion& other) const +{ int len = 0 ; + if((*this) | other) + { // this is contained in other or overlap perfectly other + if(this->start >= other.start and this->end <= other.end) + { len = this->length ; } + // start of this overlaps end other + else if((other.start < this->start) and (other.end-1 >= this->start)) + { len = other.end - this->start ; } + // other contained in this (perect overlap is handled in first case) + else if(other.start >= this->start and other.end <= this->end) + { len = other.length ; } + // end of this overlaps start of other (only case left) + else + { len = this->end - other.start ; } + } + return len ; +} + +GenomeRegion& GenomeRegion::operator = (const GenomeRegion& rhs) +{ if(this == &rhs) + { return *this ; } + this->start = rhs.start ; + this->end = rhs.end ; + this->length = rhs.length ; + this->chromosome = rhs.chromosome ; + this->chromosome_idx = rhs.chromosome_idx ; + return *this ; +} + +GenomeRegion& GenomeRegion::operator = (GenomeRegion&& rhs) +{ if(this == &rhs) + { return *this ; } + this->start = rhs.start ; + this->end = rhs.end ; + this->length = rhs.length ; + this->chromosome = rhs.chromosome ; + this->chromosome_idx = rhs.chromosome_idx ; + return *this ; +} + +bool GenomeRegion::operator == (const GenomeRegion& rhs) const +{ if(this == &rhs) + { return true ; } + if(this->chromosome_idx == rhs.chromosome_idx and + this->start == rhs.start and + this->end == rhs.end and + this->length == rhs.length) + { return true ; } + return false ; +} + +bool GenomeRegion::operator | (const GenomeRegion& rhs) const +{ + + if((this->chromosome_idx != rhs.chromosome_idx) or // on diff chromosomes + (rhs.end-1 < this->start) or // rhs upstream this + (this->end-1 < rhs.start)) // rhs downstream this + { return false ; } + return true 
; +} + +bool GenomeRegion::operator < (const GenomeRegion& rhs) const +{ + if(this->chromosome_idx < rhs.chromosome_idx) + { return true ; } + else if((this->chromosome_idx == rhs.chromosome_idx) and + (this->end-1 < rhs.start)) + { return true ; } + return false ; +} + +bool GenomeRegion::operator > (const GenomeRegion& rhs) const +{ if(this->chromosome_idx > rhs.chromosome_idx) + { return true ; } + if((this->chromosome_idx == rhs.chromosome_idx) and + (rhs.end-1 < this->start)) + { return true ; } + return false ; +} + +std::ostream& operator << (std::ostream& stream, const GenomeRegion& region) +{ stream << "( " + << region.chromosome << "/" + << region.chromosome_idx << " " + << region.start << " " + << region.end << " " + << region.length << " ) " ; + return stream ; +} diff --git a/src/GenomicTools/GenomeRegion.hpp b/src/GenomicTools/GenomeRegion.hpp new file mode 100644 index 0000000..b33aeaf --- /dev/null +++ b/src/GenomicTools/GenomeRegion.hpp @@ -0,0 +1,253 @@ +#ifndef GENOMEREGION_HPP +#define GENOMEREGION_HPP + +#include +#include +#include + +/*! + * \brief The GenomeRegion class models a segment along + * a genome. It encompasses one or more base pairs. + * It is characterized by a pair of start/end positions + * defining an open range [start,end) with start being the + * 1st position within the region and end the 1st position + * outside. All coordinates are expected to be 0-based. + * A genome region is unoriented - no strand information - and + * as such, the starting coordinate should always be smaller + * than the end coordinate. + */ +class GenomeRegion +{ + public: + /*! + * \brief Returns an object corresponding to + * the sequenced read. + * The read is assumed to be of proper quality. No check + * is performed. + * \param record an alignment read from a BAM file. + * \param bam_file an open stream to the corresponding + * BAM file. + * \return a region covering the sequenced read alignment. 
+ * \throw std::invalid_argument if a position + * is negative or if end is smaller or equal + * to start. + */ + static GenomeRegion constructRead(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) ; + + /*! + * \brief Returns an object corresponding to the + * transposition site for ATAC-seq data. It corresponds + * to the primary read start position shifted by +4bp (to the + * right) for reads on the + strand and -5bp (to the left) + * for reads on the - strand. + * The read is assumed to be of proper quality. No check + * is performed. + * \param record an alignment read from a BAM file. + * \param bam_file an open stream to the corresponding + * BAM file. + * \return a region corresponding to the transposition + * site for ATAC-seq reads. + */ + static GenomeRegion constructReadATAC(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) ; + + /*! + * \brief Returns an object corresponding to the + * starting edge of the read. + * The read is assumed to be of proper quality. No check + * is performed. + * \param record an alignment read from a BAM file. + * \param bam_file an open stream to the corresponding + * BAM file. + * \return a region corresponding to the edge of the + * read. + */ + static GenomeRegion constructReadEdge(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) ; + + /*! + * \brief Returns an object corresponding to + * the fragment contained by the two sequenced + * reads of the pair (the BAM file should contain + * paired-end reads). + * The read is assumed to be properly paired with + * another read and no check is performed. + * \param record an alignment read from a BAM file. + * \param bam_file an open stream to the corresponding + * BAM file. + * \return a region covering the sequenced fragment + * alignment. + * \throw std::invalid_argument if a position + * is negative or if end is smaller or equal + * to start. 
+ */ + static GenomeRegion constructFragment(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) ; + + /*! + * \brief Returns an object corresponding to + * the central position of the fragment contained + * by the two sequenced reads of the pair (the BAM + * file should contain paired-end reads). + * The read is assumed to be properly paired with + * another read and no check is performed. + * \param record an alignment read from a BAM file. + * \param bam_file an open stream to the corresponding + * BAM file. + * \return a region covering the central position of the + * sequenced fragment alignment. + * \throw std::invalid_argument if a position + * is negative or if end is smaller or equal + * to start. + */ + static GenomeRegion constructFragmentCenter(const seqan::BamAlignmentRecord& record, + const seqan::BamFileIn& bam_file) ; + public: + /*! + * Constructs an empty object. + */ + GenomeRegion() = default ; + + /*! + * \brief Copy constructor. + * \param other the other genomic region + */ + GenomeRegion(const GenomeRegion& other) ; + + /*! + * \brief Creates a genomic region at the + * given position. + * \param chromosome the name of the + * chromosome on which the region is. + * \param start the 1st position, 0 based, + * in the region. + * \param end the 1st position, 0 based, + * after the region. + * \param chromosome_idx the index of the + * chromosome on which the region is. + * \throw std::invalid_argument if a position + * is negative or if end is smaller or equal + * to start. + */ + GenomeRegion(const std::string& chromosome, + int chromosome_idx, + int start, + int end) ; + + /*! + * \brief Returns whether both regions + * overlap by at least one position. + * \param other the second region. + * \return whether there is an overlap. + */ + bool overlap(const GenomeRegion& other) const ; + + /*! + * \brief Returns the length of the overlap + * between both regions. + * \param other the second region. 
+ * \return the length of the overlap + * between both regions. + */ + int overlap_len(const GenomeRegion& other) const ; + + /*! + * \brief Assignment operator. + * \param rhs the other region. + * \return a reference to the current object. + */ + GenomeRegion& operator = (const GenomeRegion& rhs) ; + + /*! + * \brief Move assignment operator. + * \param rhs the other region. + * \return a reference to the current object. + */ + GenomeRegion& operator = (GenomeRegion&& rhs) ; + + /*! + * \brief Checks equality. + * \param rhs the other object. + * \return whether both objects are the + * same. + */ + bool operator == (const GenomeRegion& rhs) const ; + + /*! + * \brief Overlap operator. + * Returns whether both regions + * overlap by at least one position. + * \param rhs the other region. + * \return whether there is an overlap. + */ + bool operator | (const GenomeRegion& rhs) const ; + + /*! + * \brief Is upstream operator. + * Checks if the current object is + * located strictly upstream (no overlap) + * of the other region. + * The chromosome index values are also + * tested. A smaller chromosome value is + * interpreted as upstream. + * \param rhs the other region. + * \return if the current region is upstream, + * without overlapping the other region. + */ + bool operator < (const GenomeRegion& rhs) const ; + + /*! + * \brief Is downstream operator. + * Checks if the current object is + * located strictly downstream (no overlap) + * of the other region. + * The chromosome index values are also + * tested. A smaller chromosome value is + * interpreted as upstream. + * \param rhs the other region. + * \return if the current region is downstream, + * without overlapping the other region. + */ + bool operator > (const GenomeRegion& rhs) const ; + + /*! + * \brief The chromosome name on which the + * region is. This field is present for + * representations aesthetic only and is + * never used otherwise. + */ + std::string chromosome ; + /*! 
+ * \brief The index of the chromosome on which the + * region is. Indexes are used for sorting per + * position. Chromosomes with smaller indexes + * are considered upstream to chromosomes with + * higher indexes. + */ + int chromosome_idx ; + /*! + * \brief The 1st position, 0 based, within the region. + */ + int start ; + /*! + * \brief The 1st position, 0 based, after the region. + */ + int end ; + + /*! + * \brief The number of positions within the region. + */ + int length ; +} ; + +/*! + * \brief Sends a representation of the region to the given + * stream. + * \param stream the stream of interest. + * \param region the region of interest. + * \return a reference to the stream. + */ +std::ostream& operator << (std::ostream& stream, const GenomeRegion& region) ; + +#endif // GENOMEREGION_HPP diff --git a/src/GenomicTools/MatrixCreator.cpp b/src/GenomicTools/MatrixCreator.cpp new file mode 100644 index 0000000..8615563 --- /dev/null +++ b/src/GenomicTools/MatrixCreator.cpp @@ -0,0 +1,209 @@ +#include +#include // std::pair, std::make_pair() +#include +#include + +#include // BedFileIn +#include // BamFileIn, BamAlignmentRecord + +#include +#include +#include + + +std::pair MatrixCreator::get_bin_indices(const GenomeRegion& target, + const std::vector& bins) +{ // the bin range and chromosome + GenomeRegion range(bins.front().chromosome, + bins.front().chromosome_idx, + bins.front().start, + bins.back().end) ; + // no overlap + if(not (target | range)) + { return std::make_pair(0,0) ; } + // overlap + else + { // target goes over all bins + if(target.start <= range.start and + target.end >= range.end) + { return std::make_pair(0, bins.size()) ; } + // partial overlap + else + { int bin_start = -1 ; + int bin_end = -1 ; + int bin_size = bins.front().end - bins.front().start ; + + // start + if(target.start <= range.start) + { bin_start = 0 ; } + else + { bin_start = (target.start - range.start) / bin_size ; } + + // end + if(target.end >= range.end) + { bin_end 
= bins.size() ; } + else + { bin_end = ((target.end - 1 - range.start) / bin_size) + 1 ; } + return std::make_pair(bin_start, bin_end) ; + } + } +} + + +bool MatrixCreator::is_good_read(const seqan::BamAlignmentRecord& record) +{ + if(seqan::hasFlagUnmapped(record) or // read unmapped flag + seqan::hasFlagQCNoPass(record) or // not passing QC flag + seqan::hasFlagDuplicate(record)) // PCR duplicate flag + { return false ; } + return true ; +} + +bool MatrixCreator::is_good_pair(const seqan::BamAlignmentRecord& record) +{ + if((not seqan::hasFlagMultiple(record)) or // is paired flag + (not seqan::hasFlagAllProper(record))) // each read properly aligned flag + { return false ; } + + if((not seqan::hasFlagFirst(record)) or // read 1st in pair flag + seqan::hasFlagLast(record)) // mate 1st in pair flag + { return false ; } + + // read info + bool read_is_rev = seqan::hasFlagRC(record) ; // read is rev flag + int read_start = record.beginPos ; + // mate info + bool mate_is_rev = seqan::hasFlagNextRC(record) ; // mate is rev flag + int mate_start = record.pNext ; + + // qc + if((not this->is_good_read(record)) or + // --> --> + (not read_is_rev and not mate_is_rev) or + // <-- <-- + (read_is_rev and mate_is_rev) or + // <-- --> 1/2 + ((read_is_rev and not mate_is_rev) and (read_start < mate_start)) or + // <-- --> 2/2 + ((not read_is_rev and mate_is_rev) and (read_start > mate_start))) + { return false ; } + return true ; +} + +MatrixCreator::MatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method) + : from(from), to(to), bin_size(bin_size), + method(method), + relative_bin_coord(), + bed_path(bed_file_path), + bam_path(bam_file_path), + bai_path(bai_file_path), + bed_file(), + bam_file(), + bai_file(), + chrom_map_names(), + matrix_counts(), + matrix_bins() + +{ if(this->method != MatrixCreator::methods::FRAGMENT and + this->method != 
MatrixCreator::methods::FRAGMENT_CENTER and + this->method != MatrixCreator::methods::READ and + this->method != MatrixCreator::methods::READ_ATAC) + { throw std::invalid_argument("Error! Unrecognized method!") ; } + +} + + +/* Initialize Histogram (table) */ +/* The windows or bins are placed such that one window will be + centered at pos 0 (odd window size), -0.5 even (window size). + The whole range [$from,$to] will be shortened to an integer + number of window sizes. + + Example: $from = -20, $to = 20, $ win =5; + Windows: [-17,-13], [-12,-8], [-7,-3], [-2,2], [3,7], [8,12], [13,17] + New range: $from = -17, $to =17 +*/ +void MatrixCreator::compute_relative_bin_coord() +{ + int l5_p = 0 ; + int l3_p = 0 ; + + /* begin (xb), end (xe), and center position (xe) of window near 0 */ + int xb = -this->bin_size/2; ; + int xe = xb + this->bin_size - 1 ; + // int xc = (xb + xe)/2 ; // unused + + if (this->from > xb) + { l5_p = (this->from - xb)/this->bin_size + 1 ; } + else + { l5_p = -(xb - this->from)/this->bin_size ; } + if (this->to >= xe) + { l3_p = (this->to - xe)/this->bin_size ; } + else + { l3_p = -(xe - this->to)/this->bin_size + 1 ; } + + /* New range */ + this->from = xb + l5_p * this->bin_size; + this->to = xe + l3_p * this->bin_size; + + // contains the bin coordinate limits [from,to) + // from is the 1st position within the bin and to the + // first position after the bin. 
+ size_t n_bin = ((this->to-this->from)/this->bin_size) + 1 ; + + this->relative_bin_coord = v_pair(n_bin) ; + + int inf = this->from ; + int sup = inf + this->bin_size - 1 ; + for(size_t i=0; inf<=to; inf+=this->bin_size, sup+=this->bin_size, i++) + { this->relative_bin_coord[i] = std::make_pair(inf, sup+1) ; } +} + +bool MatrixCreator::is_valid_chromosome(const seqan::BamAlignmentRecord& record) +{ + std::string name = seqan::toCString( + seqan::getContigName( + record, this->bam_file)) ; + + if(this->chrom_map_names.find(name) == this->chrom_map_names.end()) + { return false ; } + return true ; +} + +void MatrixCreator::open_bam_file() +{ if(not seqan::open(this->bam_file, this->bam_path.c_str())) + { char msg[4096] ; + sprintf(msg, "cannot open %s", this->bam_path.c_str()) ; + throw std::runtime_error(msg) ; + } +} + +void MatrixCreator::open_bai_file() +{ if(not seqan::open(this->bai_file, this->bai_path.c_str())) + { char msg[4096] ; + sprintf(msg, "cannot open %s", this->bai_path.c_str()) ; + throw std::runtime_error(msg) ; + } +} + +void MatrixCreator::open_bed_file() +{ if(not seqan::open(this->bed_file, this->bed_path.c_str())) + { char msg[4096] ; + sprintf(msg, "cannot open %s", this->bed_path.c_str()) ; + throw std::runtime_error(msg) ; + } +} + +void MatrixCreator::close_bam_file() +{ seqan::close(this->bam_file) ; +} + +void MatrixCreator::close_bed_file() +{ seqan::close(this->bed_file) ; +} diff --git a/src/GenomicTools/MatrixCreator.hpp b/src/GenomicTools/MatrixCreator.hpp new file mode 100644 index 0000000..2e8947c --- /dev/null +++ b/src/GenomicTools/MatrixCreator.hpp @@ -0,0 +1,297 @@ +#ifndef MATRIXCREATOR_HPP +#define MATRIXCREATOR_HPP + +#include +#include // std::pair, std::make_pair() +#include +#include + +#include // BedFileIn +#include // BamFileIn, BamAlignmentRecord + +#include +#include +#include + + +class MatrixCreator +{ + public: + /*! 
+ * \brief A list of values indicating how the data + * should be handled when counting the number of + * fragments mapped in a given bin. + * + * FRAGMENT : all positions within a fragment are + * accounted for and attributed to the + * corresponding bins : + * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- pair of reads + * ||||| |||||| scoring positions + * bin1 gets a score of 5 and bin2 a + * score of 6. + * + * FRAGMENT_CENTER : only the central position + * within a fragment is accounted for and + * attributed to the corresponding bin : + * * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- pair of reads + * | | scoring positions + * bin1 gets a score of 1 and bin2 also. + * + * READ : all positions within a read are + * accounted for and attributed to the + * corresponding bins : + * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- reads + * | ||| ||| ||| scoring positions + * bin1 gets a score of 4 and bin2 a + * score of 6. + * + * READ_ATAC : only the shifted start + * of the reads are used. Additionally, the + * start position is shifted by +4bp(towards + * the right) for reads on the + strand and + * -5bp for reads on the - strand (towards the + * left). These positions indicate the insertion + * position in ATAC-seq data. + * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- reads + * | | | scoring positions + * bin1 gets a score of 1 and bin2 a + * score of 2. + */ + enum methods {FRAGMENT=0, + FRAGMENT_CENTER, + READ, + READ_ATAC} ; + + public: + /*! + * \brief Computes which bins (from a contiguous + * range of bins) are overlapped by a given target + * and returns two indices corresponding to : + * i) the index of the 1st bin overlapped by the + * target + * ii) the index of the past last bin overlapepd + * by the target. 
+ * If the target does not overlap any bin (it is + * located upstream the 1st bin, downstream the + * last bin or on a different chromosome), the + * index pair 0,0 is returned. + * Thus, in any case, a loop of the type + * for(i=first,i get_bin_indices(const GenomeRegion& target, + const std::vector& bins) ; + + /*! + * \brief Checks that the read i) is mapped + * , ii) passes QC and iii) is not a duplicate, + * based on the flag value. + * \param read the read of interest. + * \return whether the read passes the above tests. + */ + bool is_good_read(const seqan::BamAlignmentRecord& read) ; + + /*! + * \brief Checks that the read is i) a good read, ii) + * a paired read, iii) properly aligned, iv) the 1st + * of the pair based on the flag values and that + * v) it forms a proper fragment with its mate + * (both reads should point toward one another). + * \param read the read of interest. + * \return whether the read and its mate form a proper + * fragment. + */ + bool is_good_pair(const seqan::BamAlignmentRecord& read) ; + + public: + + MatrixCreator() = delete ; + + /*! + * \brief Constructs an object to create + * a genomic count matrix. + * \param bed_file_path the path to the file containing + * the references. + * \param bam_file_path the path to the file containing + * the targets. + * \param bai_file_path the path to index file of the bam + * file containing the targets. + * \param from the downstream most position + * to consider, relative to a set of genomic + * positions. + * \param to the upstream most position to + * consider, relative to a set of genomic + * positions + * \param bin_size the size of the bins in + * which the regions encompassing the set + * of genomic positions will be broken + * into. + * \param method how the sequenced fragments + * should be considered when assigning counts + * to the bins. 
+ */ + MatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method) ; + + ~MatrixCreator() = default ; + + /*! + * \brief Creates and return the count matrix. + * \return the count matrix. + */ + virtual Matrix2D create_matrix() = 0 ; + + protected: + /*! + * \brief Binarize the given range [from,to] into + * equal sized bins having the specified size. + * The bin coordinates are stored in bin_coord as + * pairs of [start,end) coordinates. One bin is + * centered on +/- 0. + * + */ + void compute_relative_bin_coord() ; + + /*! + * \brief Checks whether a record has a valid chromosome, + * that is whether this chromosome has been found in the + * bed file has well. + * \param record a record from the bam file. + * \return whether the record chromosome is valid. + */ + bool is_valid_chromosome(const seqan::BamAlignmentRecord& record) ; + + /*! + * \brief Opens the bam file. + * \throw std::runtime_error if the file cannot + * be open. + */ + void open_bam_file() ; + + /*! + * \brief Opens the bam index file. + * \throw std::runtime_error if the file cannot + * be open. + */ + void open_bai_file() ; + + /*! + * \brief Opens the bed file. + * \throw std::runtime_error if the file cannot + * be open. + */ + void open_bed_file() ; + + /*! + * \brief Closes the bam file. + * Does nothing if already closed. + */ + void close_bam_file() ; + + /*! + * \brief Closes the bed file. + * Does nothing if already closed. + */ + void close_bed_file() ; + + /*! + * \brief The smallest relative coordinate for the bins (included). + */ + int from ; + /*! + * \brief The biggest relative coordinate for the bins (not included). + */ + int to ; + /*! + * \brief The bin size. + */ + int bin_size ; + /*! + * \brief How to consider the sequenced fragments when computing + * the bin values. + */ + MatrixCreator::methods method ; + /*! 
+ * \brief The relative bin coordinates, compared to a given + * position. Each bin has a pair [from,to) where from is the + * 1st position within the bin and to is the 1st position + * after the bin. One bin is centered on +/- 0. + */ + v_pair relative_bin_coord ; + /*! + * \brief Bed file path. + */ + std::string bed_path ; + /*! + * \brief Bam file path. + */ + std::string bam_path ; + /*! + * \brief Bam index file path. + */ + std::string bai_path ; + /*! + * \brief An input stream to the + * bed file. + * Use open_bed_file() to open the stream + * and close_bed_file() to close it. + */ + seqan::BedFileIn bed_file ; + /*! + * \brief An input stream to the + * bam file. + * Use open_bam_file() to open the stream + * and close_bam_file() to close it. + */ + seqan::BamFileIn bam_file; + /*! + * \brief An input stream to the + * bam index file. + * Use open_bai_file() to open the stream + * and close_bai_file() to close it. + */ + seqan::BamIndex bai_file ; + /*! + * \brief A map containing the valid chromosome + * names as keys and their indices (as found + * in the BAM header) as values. + */ + std::unordered_map chrom_map_names ; + /*! + * \brief A matrix containing the number of targets + * found at each position around each reference. + * This is the data structure to fill. + */ + Matrix2D matrix_counts ; + /*! + * \brief A vector containing, + * for each reference, the coordinates of + * the genomic region covered by the bins. 
/*!
 * \brief Finds which bins, in a run of bins, are overlapped by a
 * target region.
 * The run is summarised as one region going from the start of the
 * first bin to the end of the last bin, on the chromosome of the
 * first bin. The width of the first bin is used as the width of
 * every bin.
 * \param target the region to locate.
 * \param bins the bins. front() and back() are called without any
 * emptiness check, so the vector must not be empty. Presumably the
 * bins are expected to be sorted, contiguous and of equal width -
 * TODO confirm against the callers.
 * \return a pair (index of the 1st overlapped bin, index one past
 * the last overlapped bin), or the pair 0,0 if the target does not
 * overlap the run.
 *
 * NOTE(review): the template arguments of std::pair and std::vector
 * are missing from this signature as it appears here, presumably
 * lost when the patch text was extracted - confirm against the
 * header before compiling.
 */
std::pair MatrixCreator::get_bin_indices(const GenomeRegion& target,
                                         const std::vector& bins)
{   // the bin range and chromosome
    GenomeRegion range(bins.front().chromosome,
                       bins.front().chromosome_idx,
                       bins.front().start,
                       bins.back().end) ;
    // no overlap
    if(not (target | range))
    { return std::make_pair(0,0) ; }
    // overlap
    else
    {   // target goes over all bins
        if(target.start <= range.start and
           target.end >= range.end)
        { return std::make_pair(0, bins.size()) ; }
        // partial overlap
        else
        {   int bin_start = -1 ;
            int bin_end = -1 ;
            // width of the first bin, applied to all the bins
            int bin_size = bins.front().end - bins.front().start ;

            // start : the target begins at or before the 1st bin, or
            // inside the bin found by integer division
            if(target.start <= range.start)
            { bin_start = 0 ; }
            else
            { bin_start = (target.start - range.start) / bin_size ; }

            // end : target.end-1 is the last position of the target,
            // +1 turns the index of its bin into a past-the-end index
            if(target.end >= range.end)
            { bin_end = bins.size() ; }
            else
            { bin_end = ((target.end - 1 - range.start) / bin_size) + 1 ; }
            return std::make_pair(bin_start, bin_end) ;
        }
    }
}
seqan::hasFlagFirst(record)) or // read 1st in pair flag + seqan::hasFlagLast(record)) // mate 1st in pair flag + { return false ; } + + // read info + bool read_is_rev = seqan::hasFlagRC(record) ; // read is rev flag + int read_start = record.beginPos ; + // mate info + bool mate_is_rev = seqan::hasFlagNextRC(record) ; // mate is rev flag + int mate_start = record.pNext ; + + // qc + if((not this->is_good_read(record)) or + // --> --> + (not read_is_rev and not mate_is_rev) or + // <-- <-- + (read_is_rev and mate_is_rev) or + // <-- --> 1/2 + ((read_is_rev and not mate_is_rev) and (read_start < mate_start)) or + // <-- --> 2/2 + ((not read_is_rev and mate_is_rev) and (read_start > mate_start))) + { return false ; } + return true ; +} + +MatrixCreator::MatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method, + size_t n_threads) + : from(from), to(to), bin_size(bin_size), + method(method), + relative_bin_coord(), + bed_path(bed_file_path), + bam_path(bam_file_path), + bai_path(bai_file_path), + bed_files(n_threads), + bam_files(n_threads), + bai_files(n_threads), + chrom_map_names(), + matrix_counts(), + matrix_bins(), + n_threads(n_threads), + threads(n_threads) + +{ if(this->method != MatrixCreator::methods::FRAGMENT and + this->method != MatrixCreator::methods::FRAGMENT_CENTER and + this->method != MatrixCreator::methods::READ and + this->method != MatrixCreator::methods::READ_ATAC) + { throw std::invalid_argument("Error! Unrecognized method!") ; } + +} + + +/* Initialize Histogram (table) */ +/* The windows or bins are placed such that one window will be + centered at pos 0 (odd window size), -0.5 even (window size). + The whole range [$from,$to] will be shortened to an integer + number of window sizes. 
+ + Example: $from = -20, $to = 20, $ win =5; + Windows: [-17,-13], [-12,-8], [-7,-3], [-2,2], [3,7], [8,12], [13,17] + New range: $from = -17, $to =17 +*/ +void MatrixCreator::compute_relative_bin_coord() +{ + int l5_p = 0 ; + int l3_p = 0 ; + + /* begin (xb), end (xe), and center position (xe) of window near 0 */ + int xb = -this->bin_size/2; ; + int xe = xb + this->bin_size - 1 ; + // int xc = (xb + xe)/2 ; // unused + + if (this->from > xb) + { l5_p = (this->from - xb)/this->bin_size + 1 ; } + else + { l5_p = -(xb - this->from)/this->bin_size ; } + if (this->to >= xe) + { l3_p = (this->to - xe)/this->bin_size ; } + else + { l3_p = -(xe - this->to)/this->bin_size + 1 ; } + + /* New range */ + this->from = xb + l5_p * this->bin_size; + this->to = xe + l3_p * this->bin_size; + + // contains the bin coordinate limits [from,to) + // from is the 1st position within the bin and to the + // first position after the bin. + size_t n_bin = ((this->to-this->from)/this->bin_size) + 1 ; + + this->relative_bin_coord = v_pair(n_bin) ; + + int inf = this->from ; + int sup = inf + this->bin_size - 1 ; + for(size_t i=0; inf<=to; inf+=this->bin_size, sup+=this->bin_size, i++) + { this->relative_bin_coord[i] = std::make_pair(inf, sup+1) ; } +} + +bool MatrixCreator::is_valid_chromosome(const seqan::BamAlignmentRecord& record, + size_t thread_idx) +{ + std::string name = seqan::toCString( + seqan::getContigName( + record, this->bam_files[thread_idx])) ; + + if(this->chrom_map_names.find(name) == this->chrom_map_names.end()) + { return false ; } + return true ; +} + +void MatrixCreator::open_bam_files() +{ for(auto& bam_file : this->bam_files) + { if(not seqan::open(bam_file, this->bam_path.c_str())) + { char msg[4096] ; + sprintf(msg, "cannot open %s", this->bam_path.c_str()) ; + throw std::runtime_error(msg) ; + } + } +} + +void MatrixCreator::open_bai_files() +{ for(auto& bai_file : this->bai_files) + { if(not seqan::open(bai_file, this->bai_path.c_str())) + { char msg[4096] ; + 
sprintf(msg, "cannot open %s", this->bai_path.c_str()) ; + throw std::runtime_error(msg) ; + } + } +} + +void MatrixCreator::open_bed_files() +{ for(auto& bed_file : this->bed_files) + { if(not seqan::open(bed_file, this->bed_path.c_str())) + { char msg[4096] ; + sprintf(msg, "cannot open %s", this->bed_path.c_str()) ; + throw std::runtime_error(msg) ; + } + } +} + +void MatrixCreator::close_bam_files() +{ for(auto& bam_file : this->bam_files) + { seqan::close(bam_file) ; } +} + +void MatrixCreator::close_bed_files() +{ for(auto& bed_file : this->bed_files) + { seqan::close(bed_file) ; } +} diff --git a/src/GenomicTools/MatrixCreatorParallel.hpp b/src/GenomicTools/MatrixCreatorParallel.hpp new file mode 100644 index 0000000..690fe5a --- /dev/null +++ b/src/GenomicTools/MatrixCreatorParallel.hpp @@ -0,0 +1,316 @@ +#ifndef MATRIXCREATOR_HPP +#define MATRIXCREATOR_HPP + +#include +#include // std::pair, std::make_pair() +#include +#include + +#include // BedFileIn +#include // BamFileIn, BamAlignmentRecord + +#include +#include +#include +#include + + +class MatrixCreator +{ + public: + /*! + * \brief A list of values indicating how the data + * should be handled when counting the number of + * fragments mapped in a given bin. + * + * FRAGMENT : all positions within a fragment are + * accounted for and attributed to the + * corresponding bins : + * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- pair of reads + * ||||| |||||| scoring positions + * bin1 gets a score of 5 and bin2 a + * score of 6. + * + * FRAGMENT_CENTER : only the central position + * within a fragment is accounted for and + * attributed to the corresponding bin : + * * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- pair of reads + * | | scoring positions + * bin1 gets a score of 1 and bin2 also. 
+ * + * READ : all positions within a read are + * accounted for and attributed to the + * corresponding bins : + * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- reads + * | ||| ||| ||| scoring positions + * bin1 gets a score of 4 and bin2 a + * score of 6. + * + * READ_ATAC : only the shifted start + * of the reads are used. Additionally, the + * start position is shifted by +4bp(towards + * the right) for reads on the + strand and + * -5bp for reads on the - strand (towards the + * left). These positions indicate the insertion + * position in ATAC-seq data. + * bin1 bin2 + * ----|-------|-------|------------> genome + * ------- ------- fragments + * --> <-- --> <-- reads + * | | | scoring positions + * bin1 gets a score of 1 and bin2 a + * score of 2. + */ + enum methods {FRAGMENT=0, + FRAGMENT_CENTER, + READ, + READ_ATAC} ; + + public: + /*! + * \brief Computes which bins (from a contiguous + * range of bins) are overlapped by a given target + * and returns two indices corresponding to : + * i) the index of the 1st bin overlapped by the + * target + * ii) the index of the past last bin overlapepd + * by the target. + * If the target does not overlapp any bin (it is + * located upstream the 1st bin, downstream the + * last bin or on a different chromosome), the + * index pair 0,0 is returned. + * Thus, in any case, a loop of the type + * for(i=first,i get_bin_indices(const GenomeRegion& target, + const std::vector& bins) ; + + /*! + * \brief Checks that the read is i) is mapped + * , ii) passes QC and iii) is not a duplicate, + * based on the flag value. + * \param read the read of interest. + * \return whether the read passes the above tests. + */ + bool is_good_read(const seqan::BamAlignmentRecord& read) ; + + /*! 
+ * \brief Checks that the read is i) a good read, ii) + * a paired read, iii) proplery aligned, iv) the 1st + * of the pair based on the flag values and that + * v) they forms a proper fragment with its mate mate + * (both read should point toward one other). + * \param read the read of interest. + * \return whether the read and its mate form a proper + * fragment. + */ + bool is_good_pair(const seqan::BamAlignmentRecord& read) ; + + public: + + MatrixCreator() = delete ; + + /*! + * \brief Constructs an object to create + * a genomic count matrix. + * \param bed_file_path the path to the file containing + * the references. + * \param bam_file_path the path to the file containing + * the targets. + * \param bai_file_path the path to index file of the bam + * file containing the targets. + * \param from the downstream most position + * to consider, relative to a set of genomic + * positions. + * \param to the upstream most position to + * consider, relative to a set of genomic + * positions + * \param bin_size the size of the bins in + * which the regions encompassing the set + * of genomic positions will be broken + * into. + * \param method how the sequenced fragments + * should be consider when assigning counts + * to the bins. + * \param n_threads the number of working + * threads. + */ + MatrixCreator(const std::string& bed_file_path, + const std::string& bam_file_path, + const std::string& bai_file_path, + int from, + int to, + int bin_size, + MatrixCreator::methods method, + size_t n_threads) ; + + ~MatrixCreator() = default ; + + /*! + * \brief Creates and return the count matrix. + * \return the count matrix. + */ + virtual Matrix2D create_matrix() = 0 ; + + protected: + /*! + * \brief Binarize the given range [from,to] into + * equal sized bins having the specified size. + * The bin coordinates are stored in bin_coord as + * pairs of [start,end) coordinates. One bin is + * centered on +/- 0. + * + */ + void compute_relative_bin_coord() ; + + /*! 
+ * \brief Checks whether a record has a valid chromosome, + * that is whether this chromosome has been found in the + * bed file has well. + * \param record a record from the bam file. + * \param thread_idx the thread index/number. This specifies + * which stream in the vector streams to use. + * \return whether the record chromosome is valid. + */ + bool is_valid_chromosome(const seqan::BamAlignmentRecord& record, + size_t thread_idx) ; + + /*! + * \brief Opens the bam files. + * \throw std::runtime_error if the file cannot + * be open. + */ + void open_bam_files() ; + + /*! + * \brief Opens the bam index files. + * \throw std::runtime_error if the file cannot + * be open. + */ + void open_bai_files() ; + + /*! + * \brief Opens the bed files. + * \throw std::runtime_error if the file cannot + * be open. + */ + void open_bed_files() ; + + /*! + * \brief Closes the bam files. + * Does nothing if already closed. + */ + void close_bam_files() ; + + /*! + * \brief Closes the bed files. + * Does nothing if already closed. + */ + void close_bed_files() ; + + /*! + * \brief The smallest relative coordinate for the bins (included). + */ + int from ; + /*! + * \brief The biggest relative coordinate for the bins (not included). + */ + int to ; + /*! + * \brief The bin size. + */ + int bin_size ; + /*! + * \brief How to consider the sequenced fragments when computing + * the bin values. + */ + MatrixCreator::methods method ; + /*! + * \brief The relative bin coordinates, compared to a given + * position. Each bin has a pair [from,to) where is the + * 1st position within the bin and is the 1st position + * after the bin. One bin is centered on +/- 0. + */ + v_pair relative_bin_coord ; + /*! + * \brief Bed file path. + */ + std::string bed_path ; + /*! + * \brief Bam file path. + */ + std::string bam_path ; + /*! + * \brief Bam index file path. + */ + std::string bai_path ; + /*! + * \brief A vector of input stream to the + * same bed file. 
+ * The purpose is to allow multiple threads + * to read the same file at the same time. + * Use open_bed_files() to open the streams + * and close_bed_files() to close them. + */ + std::vector bed_files ; + /*! + * \brief A vector of input stream to the + * same bam file. + * The purpose is to allow multiple threads + * to read the same file at the same time. + * Use open_bam_files() to open the streams + * and close_bam_files() to close them. + */ + std::vector bam_files; + /*! + * \brief A vector of input stream to the + * same bam index file. + * The purpose is to allow multiple threads + * to read the same file at the same time. + * Use open_bai_files() to open the streams + * and close_bai_files() to close them. + */ + std::vector> bai_files ; + /*! + * \brief A map containing the valid chromsome + * names as keys and their indices (as found + * in the BAM header) as values. + */ + std::unordered_map chrom_map_names ; + /*! + * \brief A matrix containing the number of targets + * found at each position around each reference. + * This is the data structure to fill. + */ + Matrix2D matrix_counts ; + /*! + * \brief A vector containing containing, + * for each reference, the coordinates of + * the genomic region covered by the bins. + */ + std::vector> matrix_bins ; + /*! + * \brief The number of working threads. 
+ */ + size_t n_threads ; + + ThreadPool threads ; +} ; + + +#endif // MATRIXCREATOR_HPP + + diff --git a/src/GenomicTools/typedef.hpp b/src/GenomicTools/typedef.hpp new file mode 100644 index 0000000..de1ef36 --- /dev/null +++ b/src/GenomicTools/typedef.hpp @@ -0,0 +1,9 @@ +#ifndef TYPEDEFGENOMICTOOL_HPP +#define TYPEDEFGENOMICTOOL_HPP + +#include // std::vector +#include // std::pair + +typedef std::vector> v_pair ; + +#endif // TYPEDEFGENOMICTOOL_HPP diff --git a/src/Matrix/Matrix.hpp b/src/Matrix/Matrix.hpp index 4f37a92..fa13945 100644 --- a/src/Matrix/Matrix.hpp +++ b/src/Matrix/Matrix.hpp @@ -1,654 +1,654 @@ #ifndef MATRIX_HPP #define MATRIX_HPP #include #include // accumulate() #include #include // setw(), setprecision(), fixed #include // out_of_range, invalid_argument #include // swap()f /*! * \brief The Matrix class is a generic class to store data in a matrix. * The matrix dimensionality can be any value : 1 is a vector, 2 is a regular * 2D matrix, 3 is a 3D matrix, etc. * * In order to store the data properly and to perform all operations smoothly, the * internal representation format differs from the "usual format". That is : the user * provides coordinates as (x,y,z,...) where x referes to the row number, y to * the column number, z the the z slice, etc. * Internally however, x corresponds to the column number and y to the row number. * Every other dimension has the same meaning. * * Internal representation : * * Here is an example of a 2x3 matrix (2D) * * {0,1,2,3,4,5} vector is turned to * X * ----------> * 0 1 2 | * 3 4 5 | Y * \|/ * * dimensions are stored as {nx, ny} which corresponds to {ncol, nrow}. Coordinates * are given using the universal format coord=(x,y) which are interpreted as {row, col}. * Thus a simple swap(coord[0],coord[1]) should be performed to ensurethat the user given * coordinates can be used in this referencial. 
* * * Here is an example of a 2x3x2x2 matrix(4D) * {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23} is turned to * * X * -----------> | | * 0 1 2 | | | * 3 4 5 | Y | | * \|/ | Z | * 6 7 8 | | | * 9 10 11 | Y | | * \|/ \|/ | * | A * 12 13 14 | | | * 15 16 17 | Y | | * \|/ | Z | * 18 19 20 | | | * 21 22 23 | Y | | * \|/ \|/ \|/ * * dimensions are stored as {nx, ny, nz, na} which corredponds to {ncol, nrow, nz, na}. * Coordinates are given using the universal format coord=(x,y,z,a) which are interpreted * as {row, col, z, a}. Thus a simple swap(coord[0],coord[1]) should be performed to ensure * that the user given coordinates can be used in this referencial. * * */ template class Matrix { public: // constructors Matrix() = default ; /*! * \brief Constructs an matrix with the given dimension with * 0 values. * \param dim the dimensions. */ Matrix(const std::vector& dim) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim the dimensions. * \param value the value to initialize the matrix content * with. */ Matrix(const std::vector& dim, T value) ; /*! * \brief Copy constructor. * \param other the matrix to copy. */ Matrix (const Matrix& other) ; /*! * \brief Destructor. */ virtual ~Matrix() = default ; // methods /*! * \brief Gets the element at the given offset. * \param offset the offset of the element to get. * \throw std::out_of_range exception if the offset * is out of range. * \return the element. */ - T get(size_t offset) const throw(std::out_of_range) ; + T get(size_t offset) const ; /*! * \brief Gets the element at the given coordinates. * \param coord the coordinates of the element to get. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ - T get(const std::vector& coord) const throw(std::out_of_range) ; + T get(const std::vector& coord) const ; /*! * \brief Sets the element at the given offset * to the given value. 
* \param offset the offset of the element to set. * \param value the new value. * \throw std::out_of_range exception if the offset * is out of range. */ - void set(size_t offset, T value) throw(std::out_of_range) ; + void set(size_t offset, T value) ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param coord the coordinates of the element to set. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ - void set(const std::vector& coord, T value) throw(std::out_of_range) ; + void set(const std::vector& coord, T value) ; /*! * \brief Gets the matrix dimensions. * \return the dimensions. */ std::vector get_dim() const ; /*! * \brief Gets the data vector. * \return a a vector containing the data. */ std::vector get_data() ; /*! * \brief Gets the number of dimensions (the length * of the dimension vector). * \return the number of dimensions */ size_t get_dim_size() const ; /*! * \brief Gets the number of elements contained in the * matrix. * \return the number of element contained in the * matrix. */ size_t get_data_size() const ; /*! * \brief Returns the partial products of the dimensions. * \return the partial products of the dimensions. */ std::vector get_dim_product() const ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stram, size_t precision=4, size_t width=8, char sep=' ') const ; // operator /*! * \brief Assignment operator. * \param other an other matrix to copy the values from. * \return a reference to the current instance. */ Matrix& operator = (const Matrix& other) ; /*! * \brief Adds value to each element. * \param value the value to add. * \return a reference to the instance. */ Matrix& operator += (T value) ; /*! 
* \brief Substracts value to each element. * \param value the value to substract. * \return a reference to the instance. */ Matrix& operator -= (T value) ; /*! * \brief Multiplies each element by value. * \param value the value to multiply the elements by. * \return a reference to the instance. */ Matrix& operator *= (T value) ; /*! * \brief Divides each element by value. * \param value the value to multiply the elements by. * \throw std::invalid_argument if value is 0. * \return a reference to the instance. */ - Matrix& operator /= (T value) throw (std::invalid_argument) ; + Matrix& operator /= (T value) ; /*! * \brief Comparison operator, returns true if * both matrices are identical, that is do not * have the same data and dimensions. * \param other an other matrix. * \return true if both matrices have the same * data and dimensions. */ bool operator == (const Matrix& other) const ; /*! * \brief Comparison operator, returns true if * both matrices are different, that is do not * have the same data and dimensions. * \param other an other matrix. * \return true if both matrices are different. */ bool operator != (const Matrix& other) const ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param coord coord the coordinates of the element to get. * \return a reference to this element. */ T& operator () (const std::vector& coord) ; /*! * \brief Returns a const reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param coord coord the coordinates of the element to get. * \return a const reference to this element. */ const T& operator () (const std::vector& coord) const ; protected: // methods /*! * \brief Computes the partial dimension products and fills * this->dim_prod according to the current values of * this->_dim and this->dim_size. */ void compute_dim_product() ; /*! 
* \brief Given a vector of at least 2 dimensional coordinates, * it simply swaps the elements at index 0 (row number) and 1 * (column number) to make them fit the x,y,... matrix * reprensetation (x:number of columns, y:number of rows). * \param coord a vector of coordinates (row, column, ...). * \return a vector of coordinates corresponding to (x,y,...). */ std::vector swap_coord(const std::vector& coord) const ; /*! * \brief Complementary function of convert_coord(). Given * a vector of coordinates in (x,y,...) format, it turns it * into (row,col,...) format. * \param coord a vector of coordinates (x,y, ...). * \return a vector of coordinates corresponding to (row,col,...). */ std::vector convert_coord_back(const std::vector& coord) const ; /*! * \brief Checks whether a given offset is a valid offset or * whether it is out of range. * \param offset the offset to check. * \return whether the offset is valid. */ bool is_valid(size_t offset) const ; /*! * \brief Checks whether coordinates in (x,y,...) format are * valid or whether they are out of range. * \param offset the offset to check. * \return whether the offset is valid. */ bool is_valid(const std::vector& coord) const ; /*! * \brief Converts a vector of VALID (x,y,...) coordinates to a * the corresponding offset allowing to get an element in the * data vector. * If the coordinate vector has a (row, column, ...) format, the * result will be wrong. * \param coord a vector of coordinates with (x,y,...) format. * \return the corresponding offset. */ size_t convert_to_offset(const std::vector& coord) const ; /*! * \brief Complementary function of convert_to_offset(). Given an * offset, this function returns the corresponding coordinate * vector in (x,y,...) format. * \param offset a given offset. * \return the corresponding vector of (x,y,..) coordinates. */ std::vector convert_to_coord(size_t offset) const ; // fields /*! * \brief The dimensions values. */ std::vector _dim ; /*! * \brief Stores the data. 
*/ std::vector _data ; /*! * \brief The number of dimensions. */ size_t _dim_size ; /*! * \brief The number of data elements stored. */ size_t _data_size ; /*! * \brief Contains the partial product of the dimensions. That is, * the ith element contains the product of all the i-1 precedent * dimensions : * element 0 : 1, element 1 : x, element 2 : x*y, element 3 : x*y*z, * and so one. * This is used for coordinates to offset and offset to coordinates * conversions. */ std::vector _dim_prod ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix operator + (Matrix m, T value) { Matrix other(m) ; other += value ; return other ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix operator - (Matrix m, T value) { Matrix other(m) ; other -= value ; return other ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix operator * (Matrix m, T value) { Matrix other(m) ; other *= value ; return other ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template -const Matrix operator / (Matrix m, T value) throw (std::invalid_argument) +const Matrix operator / (Matrix m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. 
*/ template std::ostream& operator << (std::ostream& stream, const Matrix& m) { m.print(stream) ; return stream ; } // method implementation template Matrix::Matrix(const std::vector& dim) : Matrix(dim, 0) {} template Matrix::Matrix(const std::vector& dim, T value) { this->_dim_size = dim.size() ; this->_dim = this->swap_coord(dim) ; this->_data_size = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; this->_data = std::vector(this->_data_size, value) ; this->compute_dim_product() ; } template Matrix::Matrix(const Matrix &other) { *this = other ; } template -T Matrix::get(size_t offset) const throw(std::out_of_range) +T Matrix::get(size_t offset) const { if(not this->is_valid(offset)) { throw std::out_of_range("offset is out of range!") ; } return this->_data[offset] ; } template -T Matrix::get(const std::vector& coord) const throw(std::out_of_range) +T Matrix::get(const std::vector& coord) const { std::vector coord_new = this->swap_coord(coord) ; if(not this->is_valid(coord_new)) { throw std::out_of_range("coordinates are out of range!") ; } return this->_data[this->convert_to_offset(coord_new)] ; } template -void Matrix::set(size_t offset, T value) throw(std::out_of_range) +void Matrix::set(size_t offset, T value) { if(not this->is_valid(offset)) { throw std::out_of_range("offset is out of range!") ; } this->_data[offset] = value ; } template -void Matrix::set(const std::vector& coord, T value) throw(std::out_of_range) +void Matrix::set(const std::vector& coord, T value) { std::vector coord_new = this->swap_coord(coord) ; if(not this->is_valid(coord_new)) { throw std::out_of_range("coordinates are out of range!") ; } this->_data[this->convert_to_offset(coord_new)] = value ; } template std::vector Matrix::get_dim() const { return this->swap_coord(this->_dim) ; } template std::vector Matrix::get_data() { return this->_data ; } template size_t Matrix::get_dim_size() const { return this->_dim_size ; } template size_t Matrix::get_data_size() const { 
return this->_data_size ; } template std::vector Matrix::get_dim_product() const { return this->_dim_prod ; } template void Matrix::print(std::ostream& stream, size_t precision, size_t width, char sep) const { stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; for(size_t i=0; iget_data_size(); i++) { stream << std::setw(width) << this->get(i) << sep ; } } template Matrix& Matrix::operator = (const Matrix& other) { this->_dim = other._dim ; this->_dim_size = other._dim_size ; this->_data = other._data ; this->_data_size = other._data_size ; this->_dim_prod = other._dim_prod ; return *this ; } template Matrix& Matrix::operator += (T value) { for(auto& i : this->_data) { i += value ; } return *this ; } template Matrix& Matrix::operator -= (T value) { for(auto& i : this->_data) { i -= value ; } return *this ; } template Matrix& Matrix::operator *= (T value) { for(auto& i : this->_data) { i *= value ; } return *this ; } template -Matrix& Matrix::operator /= (T value) throw (std::invalid_argument) +Matrix& Matrix::operator /= (T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } for(auto& i : this->_data) { i /= value ; } return *this ; } template bool Matrix::operator == (const Matrix& other) const { if(&other == this) { return true ; } // check dim if(this->_dim_size != other._dim_size) { return false ; } for(size_t i=0; i_dim_size; i++) { if(this->_dim[i] != other._dim[i]) { return false ; } } // check data if(this->_data_size != other._data_size) { return false ; } for(size_t i=0; i_data_size; i++) { if(this->_data[i] != other._data[i]) { return false ; } } return true ; } template bool Matrix::operator !=(const Matrix& other) const { return not ((*this) == other) ;} template T& Matrix::operator () (const std::vector& coord) { std::vector coord_new = this->swap_coord(coord) ; return this->_data[this->convert_to_offset(coord_new)] ; } template const T& Matrix::operator () (const std::vector& 
coord) const { std::vector coord_new = this->swap_coord(coord) ; return this->_data[this->convert_to_offset(coord_new)] ; } template void Matrix::compute_dim_product() { this->_dim_prod = std::vector(this->_dim_size, 0) ; this->_dim_prod[0] = 1 ; if(this->_dim_size > 1) { this->_dim_prod[1] = this->_dim[0] ; } if(this->_dim_size > 2) { for(size_t i=2; i_dim_size; i++) { this->_dim_prod[i] = this->_dim_prod[i-1]*this->_dim[i-1] ; } } } template std::vector Matrix::swap_coord(const std::vector &coord) const { std::vector coord_new = coord ; // reformat coord = (row,col,...) = (y,y,...) into coord = (col,row,...) = (x,y,...) if(this->_dim_size > 1) { std::swap(coord_new[0], coord_new[1]) ; } return coord_new ; } template bool Matrix::is_valid(size_t offset) const { if(offset > this->_data_size-1) { return false ; } return true ; } template bool Matrix::is_valid(const std::vector& coord) const { if(coord.size() != this->_dim_size) { return false ; } for(size_t i=0; i this->_dim[i]) { return false ; } } return true ; } template size_t Matrix::convert_to_offset(const std::vector& coord) const { size_t offset = 0 ; for(size_t i=0; i_dim_size; i++) { offset += coord[i] * this->_dim_prod[i] ; } return offset ; } template std::vector Matrix::convert_to_coord(size_t offset) const { std::vector coord(this->_dim_size, 0) ; for(int i=this->_dim_size-1; i>=0; i--) { size_t c = offset / this->_dim_prod[i] ; coord[i] = c ; offset -= (this->_dim_prod[i]*c) ; } return coord ; } #endif // MATRIX_HPP diff --git a/src/Matrix/Matrix2D.hpp b/src/Matrix/Matrix2D.hpp index 2a532c0..297344c 100644 --- a/src/Matrix/Matrix2D.hpp +++ b/src/Matrix/Matrix2D.hpp @@ -1,481 +1,481 @@ #ifndef MATRIX2D_HPP #define MATRIX2D_HPP #include #include #include #include // ifstream #include #include // setw(), setprecision(), fixed #include // istringstream #include // runtime_error, out_of_range #define BUFFER_SIZE 4096 /*! 
The Matrix2D class is a specialisation of the Matrix * class to make work with 2D matrices easier. * * A text format is defined to store such matrices. * In this format, each row is written on a single line * and the values should separated by any blank character * (tab, space, multiple spaces, ...). Empty lines are * not allowed. * * ---- start ---- * 1 2 3 * 4 5 6 * 7 8 9 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix2D : public Matrix { public: // constructors Matrix2D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param nrow the number of rows. * \param ncol the number of columns. */ Matrix2D(size_t nrow, size_t ncol) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param nrow the number of rows. * \param ncol the number of columns. * \param value the value to initialize the matrix content * with. */ Matrix2D(size_t nrow, size_t ncol, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix2D(const Matrix2D& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ - Matrix2D(const std::string& file_address) throw (std::runtime_error) ; + Matrix2D(const std::string& file_address) ; /*! * \brief Destructor. */ virtual ~Matrix2D() = default ; // methods overloaded in Matrix using Matrix::get ; using Matrix::set ; // methods /*! * \brief Gets the element at the given coordinates. 
* \param row the row number of the element to set. * \param col the column number of the element to set. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ - T get(size_t row, size_t col) const throw(std::out_of_range) ; + T get(size_t row, size_t col) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param row the row number of the element to set. * \param col the column number of the element to set. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ - void set(size_t row, size_t col, T value) throw (std::out_of_range) ; + void set(size_t row, size_t col, T value) ; /*! * \brief Gets the number of rows. * \return the number of rows. */ size_t get_nrow() const ; /*! * \brief Gets the number of columns. * \return the number of columns. */ size_t get_ncol() const ; /*! * \brief Gets the values in the i-th row. * \param i the row of interest. * \throw std::out_of_range if i is out of range. * \return the values in this row. */ - std::vector get_row(size_t i) const throw (std::out_of_range) ; + std::vector get_row(size_t i) const ; /*! * \brief Gets the values in the i-th column. * \param i the column of interest. * \throw std::out_of_range if i is out of range. * \return the values in this column. */ - std::vector get_col(size_t i) const throw (std::out_of_range) ; + std::vector get_col(size_t i) const ; /*! * \brief Sets the values of a given rows with the values of a given * vector. * \param i the row of interest. * \param values the new values. * \throw std::out_of_range if i is out of range. * \throw std::invalid_argument if values does not have a length equal * to the number of columns of the matrix. */ - void set_row(size_t i, const std::vector& values) throw (std::out_of_range, std::invalid_argument) ; + void set_row(size_t i, const std::vector& values) ; /*! 
* \brief Sets the values of a given column with the values of a given * vector. * \param i the column of interest. * \param values the new values. * \throw std::out_of_range if i is out of range. * \throw std::invalid_argument if values does not have a length equal * to the number of rows of the matrix. */ - void set_col(size_t i, const std::vector& values) throw (std::out_of_range, std::invalid_argument) ; + void set_col(size_t i, const std::vector& values) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stram, size_t precision=4, size_t width=8, char sep=' ') const override ; // operators /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \return a reference to this element. */ T& operator () (size_t row, size_t col) ; /*! * \brief Returns a const reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param row the row number of the element to set. * \param col the column number of the element to set. * \return a const reference to this element. */ const T& operator () (size_t row, size_t col) const ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix2D operator + (Matrix2D m, T value) { Matrix2D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. 
*/ template const Matrix2D operator - (Matrix2D m, T value) { Matrix2D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix2D operator * (Matrix2D m, T value) { Matrix2D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template -const Matrix2D operator / (Matrix2D m, T value) throw (std::invalid_argument) +const Matrix2D operator / (Matrix2D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix2D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix2D& m) { m.print(stream) ; return stream ; } // other usefull functions /*! * \brief Produces a transpose of the given matrix. * \param m a matrix. 
*/ template Matrix2D transpose(const Matrix2D& m) ; // method implementation template Matrix2D transpose(const Matrix2D& m) { std::vector dim = m.get_dim() ; size_t nrow = dim[0] ; size_t ncol = dim[1] ; Matrix2D m2(ncol, nrow, 0) ; for(size_t i=0; i Matrix2D::Matrix2D(size_t nrow, size_t ncol) : Matrix2D(nrow, ncol, static_cast(0)) {} template Matrix2D::Matrix2D(size_t nrow, size_t ncol, T value) : Matrix({nrow, ncol}, value) {} template Matrix2D::Matrix2D(const Matrix2D& other) : Matrix(other) {} template -Matrix2D::Matrix2D(const std::string &file_address) throw (std::runtime_error) +Matrix2D::Matrix2D(const std::string &file_address) // : Matrix({0,0}) { this->_dim = {0,0} ; this->_data = std::vector() ; this->_dim_size = this->_dim.size() ; this->_data_size = this->_data.size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; // read file size_t n_line = 0 ; size_t row_len = 0 ; while(getline(file, buffer_str)) { // check stream status and read content if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(n_line == 0 and file.peek() == EOF and file.eof()) { break ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // parse line buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line == 0) { row_len = buffer_vec.size() ; } else if(buffer_vec.size() != row_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! variable number of columns in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { this->_data.push_back(i) ; this->_data_size++ ; } this->_dim[1]++ ; n_line++ ; } file.close() ; this->_dim[0] = row_len ; this->compute_dim_product() ; } template -T Matrix2D::get(size_t row, size_t col) const throw(std::out_of_range) +T Matrix2D::get(size_t row, size_t col) const { try { return this->get({row, col}) ; } catch(std::out_of_range& e) { throw e ; } } template -void Matrix2D::set(size_t row, size_t col, T value) throw(std::out_of_range) +void Matrix2D::set(size_t row, size_t col, T value) { try { this->set({row, col}, value) ; } catch(std::out_of_range& e) { throw e ; } } template size_t Matrix2D::get_nrow() const { return this->_dim[1] ; } template size_t Matrix2D::get_ncol() const { return this->_dim[0] ; } template -std::vector Matrix2D::get_row(size_t i) const throw (std::out_of_range) +std::vector Matrix2D::get_row(size_t i) const { if(i>=this->get_nrow()) { throw std::out_of_range("row index is out of range!") ; } std::vector row(this->get_ncol()) ; for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) { row[n] = 
this->_data[j] ; } return row ; } template -std::vector Matrix2D::get_col(size_t i) const throw (std::out_of_range) +std::vector Matrix2D::get_col(size_t i) const { if(i>=this->get_ncol()) { throw std::out_of_range("column index is out of range!") ; } std::vector col(this->get_nrow()) ; for(size_t j=i, n=0; nget_nrow(); j+=this->get_ncol(), n++) { col[n] = this->_data[j] ; } return col ; } template -void Matrix2D::set_row(size_t i, const std::vector& values) throw (std::out_of_range, std::invalid_argument) +void Matrix2D::set_row(size_t i, const std::vector& values) { if(i>=this->get_nrow()) { throw std::out_of_range("row index is out of range!") ; } else if(values.size() != this->get_ncol()) { throw std::invalid_argument("the given vector length is not equal to the number of columns!") ; } for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) { this->_data[j] = values[n] ; } } template -void Matrix2D::set_col(size_t i, const std::vector& values) throw (std::out_of_range, std::invalid_argument) +void Matrix2D::set_col(size_t i, const std::vector& values) { if(i>=this->get_ncol()) { throw std::out_of_range("row index is out of range!") ; } else if(values.size() != this->get_nrow()) { throw std::invalid_argument("the given vector length is not equal to the number of rows!") ; } for(size_t n=0, j=i; nget_nrow(); n++, j+=this->get_ncol()) { this->_data[j] = values[n] ; } } template void Matrix2D::print(std::ostream& stream, size_t precision, size_t width, char sep) const { stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; size_t n = 0 ; size_t n_tot = this->get_nrow()*this->get_ncol() ; for(size_t i=0; iget_nrow(); i++) { for(size_t j=0; jget_ncol(); j++, n++) { stream << std::setw(width) << (*this)(i,j) << sep ; } if(n T& Matrix2D::operator () (size_t row, size_t col) { std::vector coord = {col, row} ; return this->_data[this->convert_to_offset(coord)] ; } template const T& Matrix2D::operator () (size_t row, size_t col) const 
{ std::vector coord = {col, row} ; return this->_data[this->convert_to_offset(coord)] ; } #endif // MATRIX2D_HPP diff --git a/src/Matrix/Matrix3D.hpp b/src/Matrix/Matrix3D.hpp index 5fc6572..a812f4b 100644 --- a/src/Matrix/Matrix3D.hpp +++ b/src/Matrix/Matrix3D.hpp @@ -1,444 +1,444 @@ #ifndef MATRIX3D_HPP #define MATRIX3D_HPP #include #include #include #include #include // setw(), setprecision(), fixed #include // ifstream #include // istringstream #include // runtime_error, out_of_range #include // equal() #define BUFFER_SIZE 4096 /*! * The Matrix3D class is a specialisation of the Matrix * class to make work with 3D matrices more easily. * * A text file format is defined to store such matrices. The specifications are as * follows : * Absolutely NO empty lines are allowed! * The following lines should contain : * * 1st line : a slice header, ',,0' indicates that a slice of the 3rd dimension * is beginning (this is a z slice). * 2nd - Nth line : the firt slice, as a 2d matrix (the exemple below has dimensions 3x4). * N+1th line : a slice header, ',,1' indicates that the 2nd slice is beginning. * N+1th - ... : the second slice * and so on... * * Example of a 3x4x2 3D matrix * ---- start ---- * ,,0 * 1 2 3 4 * 5 6 7 8 * 8 9 10 11 *,,1 * 12 13 14 15 * 16 17 18 19 * 20 21 22 23 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix3D : public Matrix { public: // constructors Matrix3D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. */ Matrix3D(size_t dim1, size_t dim2, size_t dim3) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. 
* \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param value the value to initialize the matrix content * with. */ Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) ; /*! * \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix3D(const Matrix3D& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ - Matrix3D(const std::string& file_address) throw (std::runtime_error) ; + Matrix3D(const std::string& file_address) ; /*! * \brief Destructor. */ virtual ~Matrix3D() = default ; // methods overloaded from Matrix using Matrix::get ; using Matrix::set ; // methods /*! * \brief Gets the element at the given coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ - T get(size_t dim1, size_t dim2, size_t dim3) const throw (std::out_of_range) ; + T get(size_t dim1, size_t dim2, size_t dim3) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ - void set(size_t dim1, size_t dim2, size_t dim3, T value) throw (std::out_of_range) ; + void set(size_t dim1, size_t dim2, size_t dim3, T value) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. 
* \param precision the rounding precision. * \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stream, size_t precision=4 ,size_t width=8, char sep=' ') const override ; // operators /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \return a reference to this element. */ T& operator() (size_t dim1, size_t dim2, size_t dim3) ; /*! * \brief Returns a constant reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \return a constant reference to this element. */ const T& operator() (size_t dim1, size_t dim2, size_t dim3) const ; private: // methods /*! * \brief Checks whether a given string is a slice header * (such as ",,0"), as found in files storing Matrix3D. * \param str the string to check. * \return whether the string is a slice header. */ bool is_header(const std::string& str) const ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix3D operator + (Matrix3D m, T value) { Matrix3D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix3D operator - (Matrix3D m, T value) { Matrix3D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. 
*/ template const Matrix3D operator * (Matrix3D m, T value) { Matrix3D other(m) ; m *= value ; return m ; } /*! * \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template -const Matrix3D operator / (Matrix3D m, T value) throw (std::invalid_argument) +const Matrix3D operator / (Matrix3D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix3D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix3D& m) { m.print(stream) ; return stream ; } // method implementation template Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3) : Matrix3D(dim1, dim2, dim3, 0) {} template Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) : Matrix({dim1, dim2, dim3}, value) {} template Matrix3D::Matrix3D(const Matrix3D &other) : Matrix(other) {} template -Matrix3D::Matrix3D(const std::string &file_address) throw (std::runtime_error) +Matrix3D::Matrix3D(const std::string &file_address) { this->_dim = {0,0,0} ; this->_data = std::vector() ; this->_dim_size = this->_dim.size() ; this->_data_size = this->_data.size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; // read file size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col while(getline(file, buffer_str)) { if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(n_line == 0 and file.peek() == EOF and file.eof()) { break ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check whether it is the beginning of a slice // 1st line in file should be one like this if(this->is_header(buffer_str)) { // check that slice have a constant number of rows if(this->_dim[2] == 1) { col_len = col_len_cur ; // this->_dim[0] = row_len ; // this->_dim[1] = col_len ; } else if(col_len_cur != col_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 1 in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } this->_dim[2]++ ; col_len_cur = 0 ; n_line++ ; continue ; } // 1st line in file should be a header and entering // this block is forbidden if(n_line == 0) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
first line is not a slice header in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // parse line row_len_cur = 0 ; buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; row_len_cur++ ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line_data == 0) { row_len = row_len_cur ; } else if(row_len_cur != row_len) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 2 in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { this->_data.push_back(i) ; this->_data_size++ ; } col_len_cur++ ; n_line_data++ ; n_line++ ; // update matrix dimensions this->_dim[0] = row_len_cur ; this->_dim[1] = col_len_cur ; } // check dimensions of last slice if(col_len_cur != this->_dim[1]) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } file.close() ; this->compute_dim_product() ; } template -T Matrix3D::get(size_t dim1, size_t dim2, size_t dim3) const throw(std::out_of_range) +T Matrix3D::get(size_t dim1, size_t dim2, size_t dim3) const { try { return this->get({dim1, dim2, dim3}) ; } catch(std::out_of_range& e) { throw e ; } } template -void Matrix3D::set(size_t dim1, size_t dim2, size_t dim3, T value) throw(std::out_of_range) +void Matrix3D::set(size_t dim1, size_t dim2, size_t dim3, T value) { try { return this->set({dim1, dim2, dim3}, value) ; } catch(std::out_of_range& e) { throw e ; } } template T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) { std::vector coord = {dim2, dim1, dim3} ; return this->_data[this->convert_to_offset(coord)] ; } template void Matrix3D::print(std::ostream& stream, size_t precision, size_t width, char sep) const { // if the matrix has at least one 0 dimension (no data), don't do anything if(this->_dim[0]==0 or this->_dim[1]==0 or this->_dim[2]==0) { return ; } stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; std::vector dim = this->get_dim() ; size_t n = 0 ; size_t n_tot = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; for(size_t z=0; z const T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) const { std::vector coord = {dim2, dim1, dim3} ; return this->_data[this->convert_to_offset(coord)] ; } template bool Matrix3D::is_header(const std::string& str) const { if(str[0] == ',' and str[1] == ',' and str.find(',', 2) == std::string::npos) { return true ; } return false ; } #endif // MATRIX3D_HPP diff --git a/src/Matrix/Matrix4D.hpp b/src/Matrix/Matrix4D.hpp index d9cb111..d0a280a 100644 --- a/src/Matrix/Matrix4D.hpp +++ b/src/Matrix/Matrix4D.hpp @@ -1,594 +1,594 @@ #ifndef MATRIX4D_HPP #define MATRIX4D_HPP #include #include #include #include // runtime_error, out_of_range #include #include 
// setw(), setprecision(), fixed #include // ifstream #include // sstream #define BUFFER_SIZE 4096 /*! * The Matrix4D class is a specialisation of the Matrix * class to make work with 4D matrices more easily. * * A text file format is defined to store such matrices. The specifications are as * follows : * Absolutely NO empty lines are allowed! * The following lines should contain : * * 1st line : a slice header ',,,0' indicating that a slice of the 4th dimension * is beginning. * 3nd - Nth line : the slice of the 4th dimension. It contains slice in the 3rd dimension * which are 2D matrices separated by headers (',,0' and ',,1', in the * example below, they have 2x3 dimensions). * N+1th line : ',,,1' indicating that the 2nd slice of the 4th dimension is beginning. * and so on... * Example * ---- start ---- * ,,,0 * ,,0 * 1 2 3 * 4 5 6 * ,,1 * 7 8 9 * 10 11 12 * ,,,1 * ,,0 * 21 22 23 * 24 25 26 * ,,1 * 27 28 29 * 30 31 32 * ----- end ----- * * Constructing a matrix from an empty file (0 bytes or only an EOL char) returns a null * matrix (0x0x0x0 dimensions). Writting a null matrix (that is with at least one null * dimension creates an empty file. * */ template class Matrix4D : public Matrix { public: // constructors Matrix4D() = default ; /*! * \brief Constructs a matrix with the given dimensions, * filled with 0 values. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param dim4 the fourth dimension. */ Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4) ; /*! * \brief Constructs a matrix with the given dimensions and * initialize the values to the given value. * \param dim1 the first dimension. * \param dim2 the second dimension. * \param dim3 the third dimension. * \param dim4 the fourth dimension. * \param value the value to initialize the matrix content * with. */ Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) ; /*! 
* \brief Copy constructor * \param other the matrix to copy the content from. */ Matrix4D(const Matrix4D& other) ; /*! * \brief Constructs a matrix from a text file. A matrix contructed * from an empty file (or a file containing only one EOL char) returns * an empty matrix (null dimensions). * \param file_address the address of the file containing the matrix. * \throw std::runtime_error if anything happen while reading the * file (format error, file not found, etc). */ - Matrix4D(const std::string& file_address) throw (std::runtime_error) ; + Matrix4D(const std::string& file_address) ; /*! * \brief Destructor. */ virtual ~Matrix4D() = default ; // methods overloaded from Matrix using Matrix::get ; using Matrix::set ; // methods OK /*! * \brief Gets the element at the given coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the fourth dimension coordinate. * \throw std::out_of_range exception if the coordinates * are out of range. * \return the element. */ - T get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const throw (std::out_of_range) ; + T get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; /*! * \brief Sets the element at the given coordinates * to the given value. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the fourth dimension coordinate. * \param value the new value. * \throw std::out_of_range exception if the coordinates * are out of range. */ - void set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) throw (std::out_of_range) ; + void set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) ; /*! * \brief Produces a nice representation of the matrix on the given * stream. * \param stream the stream. * \param precision the rounding precision. 
* \param width the column width in number of characters. * \param sep the character separator. */ virtual void print(std::ostream& stream, size_t precision=4 ,size_t width=8, char sep=' ') const override ; // operators OK /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the third dimension coordinate. * \return a reference to this element. */ T& operator() (size_t dim1, size_t dim2, size_t dim3, size_t dim4) ; /*! * \brief Returns a reference to the corrresponding * element. This method does not perform any check on * the coordinates. * \param dim1 the first dimension coordinate. * \param dim2 the second dimension coordinate. * \param dim3 the third dimension coordinate. * \param dim4 the third dimension coordinate. * \return a reference to this element. */ const T& operator() (size_t dim1, size_t dim2, size_t dim3, size_t dim4) const ; private: // methods /*! * \brief Checks whether a given string is a 3D header * (such as ",,0"), as found in files storing Matrix4D. * \param str the string to check. * \return whether the string is such a slice header. */ bool is_header_3d(const std::string& str) const ; /*! * \brief Checks whether a given string is a 4D header * (such as ",,,0"), as found in files storing Matrix4D. * \param str the string to check. * \return whether the string is such a slice header. */ bool is_header_4d(const std::string& str) const ; /*! * \brief Routine to load 4D matrices from files. * This method reads from a std::ifstream object, * from the current pointer location until i) a 4D * header line is found (such as ',,,1') or ii) until * it cannot read anymore from the stream. 
All * data are pushed back into the data vector and * the dimensions of the data read are stored into * the dim vector (these data are actually a 3D * matrix). If the method returned because it * found another 4D header, it returns true, false * otherwise. * To read an entire 4D matrix from a file, simply * use this scheme : i) read the 1st 4D header * ii) call this function while it returns true. * \param file_name a reference to a string containing * the address of the file currently read (for exception * messages). * \param file a reference to the std::ifstream to read * from. Obviously, the stream state will be modified as * the method reads from it. However, it will never be * closed by the method. * \param data a reference to an empty vector where the * read data will be pushed back. * \param dim a reference to an empty vector where the * dimensions of the read data will be stored. * \return whether the last piece of data read from the * stream was a 4D header. */ bool get_3d_slice(const std::string& file_name, std::ifstream& file, - std::vector& data, std::vector& dim) const throw (std::runtime_error) ; + std::vector& data, std::vector& dim) const ; } ; // operators /*! * \brief Addition operator. * \param m the matrix of interest * \param value the value to add to each element. * \return the resulting matrix. */ template const Matrix4D operator + (Matrix4D m, T value) { Matrix4D other(m) ; m += value ; return m ; } /*! * \brief Substraction operator * \param m the matrix of interest. * \param value the value to substract to each element. * \return the resulting matrix. */ template const Matrix4D operator - (Matrix4D m, T value) { Matrix4D other(m) ; m -= value ; return m ; } /*! * \brief Multiplication operator. * \param m the matrix of interest. * \param value the value to multiply each elements by. * \return the resulting matrix. */ template const Matrix4D operator * (Matrix4D m, T value) { Matrix4D other(m) ; m *= value ; return m ; } /*! 
* \brief Division operator. * \param m the matrix of interest. * \param value the value to divide each elements by. * \throw std::invalid_argument if value is 0. * \return the resulting matrix. */ template -const Matrix4D operator / (Matrix4D m, T value) throw (std::invalid_argument) +const Matrix4D operator / (Matrix4D m, T value) { if(value == static_cast(0)) { throw std::invalid_argument("division by 0!") ; } Matrix4D other(m) ; other /= value ; return other ; } /*! * \brief Sends a representation of the matrix to the stream. * \param stream the stream of interest. * \param m the matrix of interest. * \return a reference to the stream. */ template std::ostream& operator << (std::ostream& stream, const Matrix4D& m) { m.print(stream) ; return stream ; } // method implementation template Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4) : Matrix({dim1, dim2, dim3, dim4}, 0) {} template Matrix4D::Matrix4D(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) : Matrix({dim1, dim2, dim3, dim4}, value) {} template Matrix4D::Matrix4D(const Matrix4D &other) : Matrix(other) {} template -Matrix4D::Matrix4D(const std::string &file_address) throw (std::runtime_error) +Matrix4D::Matrix4D(const std::string &file_address) { this->_dim = {0,0,0,0} ; this->_data = std::vector() ; this->_dim_size = this->_dim.size() ; this->_data_size = this->_data.size() ; this->_dim_prod = std::vector(this->_dim_size, 0) ; std::ifstream file(file_address, std::ifstream::in) ; if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } std::string buffer_str ; std::vector buffer_t ; std::vector dim ; // read 1st line getline(file, buffer_str) ; // empty line if(buffer_str.size() == 0) { // this file only contains one eol char and should be considered as empty, // -> returns empty matrix not an error if(file.peek() == EOF and file.eof()) { file.close() ; return ; } file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } bool found_4d_header = this->is_header_4d(buffer_str) ; do { if(file.fail()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ; throw std::runtime_error(msg) ; } // this is the beginning of a 3D slice -> get it using routine if(found_4d_header) { try { // get slice buffer_t.clear() ; dim.clear() ; found_4d_header = this->get_3d_slice(file_address, file, buffer_t, dim); // update data for(const auto& i : buffer_t) { this->_data.push_back(i) ; this->_data_size++ ; } // update dim only for the 1st slice (the 1st slice set the dimensions) if(this->_dim[3] == 0) { this->_dim[0] = dim[0] ; this->_dim[1] = dim[1] ; this->_dim[2] = dim[2] ; } // check dimensions of the slice else { if(dim[0] != this->_dim[0] or dim[1] != this->_dim[1] or dim[2] != this->_dim[2]) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions in %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } } this->_dim[3]++ ; } catch(std::runtime_error& e) { file.close() ; throw e ; } } // this is an error, everything between two ',,,N' header // should be read at once. The only way out of the loop // is that no more header has been read because of eof else if(not found_4d_header and not file.eof()) { file.close() ; char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_address.c_str()) ; throw std::runtime_error(msg) ; } } while(found_4d_header) ; file.close() ; this->compute_dim_product() ; } template -T Matrix4D::get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const throw (std::out_of_range) +T Matrix4D::get(size_t dim1, size_t dim2, size_t dim3, size_t dim4) const { try { return this->get({dim1, dim2, dim3, dim4}) ; } catch(std::out_of_range& e) { throw e ; } } template -void Matrix4D::set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) throw (std::out_of_range) +void Matrix4D::set(size_t dim1, size_t dim2, size_t dim3, size_t dim4, T value) { try { this->set({dim1, dim2, dim3, dim4}, value) ; } catch(std::out_of_range& e) { throw e ; } } template void Matrix4D::print(std::ostream &stream, size_t precision, size_t width, char sep) const { // if the matrix has at least one 0 dimension (no data), don't do anything if(this->_dim[0]==0 or this->_dim[1]==0 or this->_dim[2]==0 or this->_dim[3]==0) { return ; } stream.setf(std::ios::left) ; stream << std::setprecision(precision) << std::fixed ; std::vector dim = this->get_dim() ; size_t n = 0 ; size_t n_tot = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; for(size_t dim4=0; dim4 T& Matrix4D::operator () (size_t dim1, size_t dim2, size_t dim3, size_t dim4) { std::vector coord = {dim2, dim1, dim3, dim4} ; return this->_data[this->convert_to_offset(coord)] ; } template const T& Matrix4D::operator () (size_t dim1, size_t dim2, size_t dim3, size_t dim4) const { std::vector coord 
= {dim2, dim1, dim3, dim4} ; return this->_data[this->convert_to_offset(coord)] ; } template bool Matrix4D::is_header_3d(const std::string &str) const { if(str[0] == ',' and str[1] == ',' and str.find(',', 2) == std::string::npos) { return true ; } return false ; } template bool Matrix4D::is_header_4d(const std::string &str) const { if(str[0] == ',' and str[1] == ',' and str[2] == ',' and str.find(',', 3) == std::string::npos) { return true ; } return false ; } template bool Matrix4D::get_3d_slice(const std::string& file_name, std::ifstream& file, - std::vector &data, std::vector &dim) const throw (std::runtime_error) + std::vector &data, std::vector &dim) const { bool found_4d_header = false ; // the flag to return dim = {0,0,0} ; std::string buffer_str ; std::vector buffer_vec ; T buffer_T ; size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col while(getline(file, buffer_str)) { if(file.fail()) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check empty line if(buffer_str.size() == 0) { char msg[BUFFER_SIZE] ; sprintf(msg, "error! while reading %s (empty line)", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check whether this is the beginning of a 4D slice header, if so // break if(this->is_header_4d(buffer_str)) { found_4d_header = true ; break ; } // check whether it is the beginning of a slice // 1st line in file should be if(this->is_header_3d(buffer_str)) { // check that slice have a constant number of rows if(dim[2] == 1) { col_len = col_len_cur ; // dim[0] = row_len ; // dim[1] = col_len ; } else if(col_len_cur != col_len) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! 
slice have variable dimensions in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } dim[2]++ ; col_len_cur = 0 ; n_line++ ; continue ; } // 1st line in file should be a header and entering // this block is forbidden if(n_line == 0) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! first line is not a slice header in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // parse line row_len_cur = 0 ; buffer_vec.clear() ; std::istringstream buffer_ss(buffer_str) ; while(buffer_ss >> buffer_T) { buffer_vec.push_back(buffer_T) ; row_len_cur++ ; } // check for an error which likely indicates that a value could not be // casted into a type T (mixed data types in the file) if(buffer_ss.fail() and not buffer_ss.eof()) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_name.c_str()) ; throw std::runtime_error(msg) ; } // check that number of column is constant if(n_line_data == 0) { row_len = row_len_cur ; } else if(row_len_cur != row_len) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } // update matrix content for(auto i : buffer_vec) { data.push_back(i) ; } col_len_cur++ ; n_line_data++ ; n_line++ ; // update dimension dim[0] = row_len_cur ; dim[1] = col_len_cur ; } // check dimensions of last slice if(col_len_cur != dim[1]) { char msg[BUFFER_SIZE] ; sprintf(msg, "format error! slice have variable dimensions 333 in %s", file_name.c_str()) ; throw std::runtime_error(msg) ; } return found_4d_header ; } #endif // MATRIX4D_HPP diff --git a/src/Matrix_old/Matrix.hpp b/src/Matrix_old/Matrix.hpp deleted file mode 100755 index 7568412..0000000 --- a/src/Matrix_old/Matrix.hpp +++ /dev/null @@ -1,506 +0,0 @@ -#ifndef MATRIX_HPP -#define MATRIX_HPP - - -#include -#include // accumulate() -#include -#include // out_of_range - - - -/*! 
- * \brief The Matrix class is a generic class to store data in a matrix. - * The matrix dimensionality can be any value : 1 is a vector, 2 is a regular - * 2D matrix, 3 is a 3D matrix, etc. - * - * In order to store the data properly and to perform all operations smoothly, the - * internal representation format differs from the "usual format". That is : the user - * provides coordinates as (x,y,z,...) where x referes to the row number, y to - * the column number, z the the z slice, etc. - * Internally however, x corresponds to the column number and y to the row number. - * Every other dimension has the same meaning. - * - * Internal representation : - * - * Here is an example of a 2x3 matrix (2D) - * - * {0,1,2,3,4,5} vector is turned to - * X - * ----------> - * 0 1 2 | - * 3 4 5 | Y - * \|/ - * - * dimensions are stored as {nx, ny} which corresponds to {ncol, nrow}. Coordinates - * are given using the universal format coord=(x,y) which are interpreted as {row, col}. - * Thus a simple swap(coord[0],coord[1]) should be performed to ensurethat the user given - * coordinates can be used in this referencial. - * - * - * Here is an example of a 2x3x2x2 matrix(4D) - * {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23} is turned to - * - * X - * -----------> | | - * 0 1 2 | | | - * 3 4 5 | Y | | - * \|/ | Z | - * 6 7 8 | | | - * 9 10 11 | Y | | - * \|/ \|/ | - * | A - * 12 13 14 | | | - * 15 16 17 | Y | | - * \|/ | Z | - * 18 19 20 | | | - * 21 22 23 | Y | | - * \|/ \|/ \|/ - * - * dimensions are stored as {nx, ny, nz, na} which corredponds to {ncol, nrow, nz, na}. - * Coordinates are given using the universal format coord=(x,y,z,a) which are interpreted - * as {row, col, z, a}. Thus a simple swap(coord[0],coord[1]) should be performed to ensure - * that the user given coordinates can be used in this referencial. - * - */ - -template -class Matrix -{ - public: - // constructors - Matrix() = default ; - - /*! 
- * \brief Constructs an matrix with the given dimension with - * 0 values. - * \param dim the dimensions. - */ - Matrix(const std::vector& dim) ; - - /*! - * \brief Constructs a matrix with the given dimensions and - * initialize the values to the given value. - * \param dim the dimensions. - * \param value the value to initialize the matrix content - * with. - */ - Matrix(const std::vector& dim, T value) ; - - /*! - * \brief Copy constructor. - * \param other the matrix to copy. - */ - Matrix (const Matrix& other) ; - - // methods - /*! - * \brief Gets the element at the given offset. - * \param offset the offset of the element to get. - * \throw std::out_of_range exception if the offset - * is out of range. - * \return the element. - */ - T get(size_t offset) const /* throw(std::out_of_range) */ ; - - /*! - * \brief Gets the element at the given coordinates. - * \param coord the coordinates of the element to get. - * \throw std::out_of_range exception if the coordinates - * are out of range. - * \return the element. - */ - T get(const std::vector& coord) const /* throw(std::out_of_range) */ ; - - /*! - * \brief Sets the element at the given offset - * to the given value. - * \param offset the offset of the element to set. - * \param value the new value. - * \throw std::out_of_range exception if the offset - * is out of range. - */ - void set(size_t offset, T value) /*throw(std::out_of_range) */ ; - /*! - * \brief Sets the element at the given coordinates - * to the given value. - * \param coord the coordinates of the element to set. - * \param value the new value. - * \throw std::out_of_range exception if the coordinates - * are out of range. - */ - void set(const std::vector& coord, T value) /* throw(std::out_of_range) */ ; - - /*! - * \brief Gets the matrix dimensions. - * \return the dimensions. - */ - std::vector get_dim() const ; - - /*! - * \brief Gets the data contained in the - * matrix as a vector. - * \return a a vector containing the data. 
- */ - std::vector get_data() ; - - /*! - * \brief Gets the number of dimensions (the length - * of the dimension vector). - * \return the number of dimensions - */ - size_t get_dim_size() const ; - - /*! - * \brief Gets the number of elements contained in the - * matrix. - * \return the number of element contained in the - * matrix. - */ - size_t get_data_size() const ; - - // operator - /*! - * \brief Assignment operator. - * \param other an other matrix to copy the values from. - * \return a reference to the current instance. - */ - Matrix& operator = (const Matrix& other) ; - - /*! - * \brief Comparison operator, returns true if - * both matrices are identical, that is do not - * have the same data and dimensions. - * \param other an other matrix. - * \return true if both matrices have the same - * data and dimensions. - */ - bool operator == (const Matrix& other) const ; - - /*! - * \brief Comparison operator, returns true if - * both matrices are different, that is do not - * have the same data and dimensions. - * \param other an other matrix. - * \return true if both matrices are different. - */ - bool operator != (const Matrix& other) const ; - - /*! - * \brief Returns a reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param coord coord the coordinates of the element to get. - * \return a reference to this element. - */ - T& operator () (const std::vector& coord) ; - - /*! - * \brief Returns a const reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param coord coord the coordinates of the element to get. - * \return a const reference to this element. - */ - const T& operator () (const std::vector& coord) const ; - - protected: - // methods - /*! - * \brief Computes the partial dimension products and fills - * this->dim_prod according to the current values of - * this->_dim and this->dim_size. 
These values are used to - * access the elements given a set of coordinates. - */ - void compute_dim_product() ; - - /*! - * \brief Given a vector of at least 2 elements corresponding - * to coordinates, it simply swaps the elements at index 0 (row - * number) and 1 (column number) to make them fit the x,y,... - * matrix reprensetation (x:number of columns, y:number of rows). - * \param coord a vector of coordinates (row, column, ...). - * \return a vector of coordinates corresponding to (x,y,...). - */ - std::vector swap_coord(const std::vector& coord) const ; - - /*! - * \brief Complementary function of convert_coord(). Given - * a vector of coordinates in (x,y,...) format, it turns it - * into (row,col,...) format. - * \param coord a vector of coordinates (x,y, ...). - * \return a vector of coordinates corresponding to (row,col,...). - */ - std::vector convert_coord_back(const std::vector& coord) const ; - - /*! - * \brief Checks whether a given offset is a valid offset or - * whether it is out of range. - * \param offset the offset to check. - * \return whether the offset is valid. - */ - bool is_valid(size_t offset) const ; - - /*! - * \brief Checks whether coordinates in (x,y,...) format are - * valid or whether they are out of range. - * \param offset the offset to check. - * \return whether the offset is valid. - */ - bool is_valid(const std::vector& coord) const ; - - /*! - * \brief Converts a vector of VALID (x,y,...) coordinates to a - * the corresponding offset allowing to get an element in the - * data vector. - * If the coordinate vector has a (row, column, ...) format, the - * result will be wrong. - * \param coord a vector of coordinates with (x,y,...) format. - * \return the corresponding offset. - */ - size_t convert_to_offset(const std::vector& coord) const ; - - /*! - * \brief Complementary function of convert_to_offset(). Given an - * offset, this function returns the corresponding coordinate - * vector in (x,y,...) format. 
- * \param offset a given offset. - * \return the corresponding vector of (x,y,..) coordinates. - */ - std::vector convert_to_coord(size_t offset) const ; - - // fields - /*! - * \brief The dimensions values. - */ - std::vector _dim ; - /*! - * \brief Stores the data. - */ - std::vector _data ; - /*! - * \brief The number of dimensions. - */ - size_t _dim_size ; - /*! - * \brief The number of data elements stored. - */ - size_t _data_size ; - - /*! - * \brief Contains the partial product of the dimensions. That is, - * the ith element contains the product of all the i-1 precedent - * dimensions : - * element 0 : 1, element 1 : x, element 2 : x*y, element 3 : x*y*z, - * and so one. - * This is used for coordinates to offset and offset to coordinates - * conversions. - */ - std::vector _dim_prod ; -} ; - - -/*! - * \brief Sends a representation of the matrix to the stream. - * \param stream the stream of interest. - * \param m the matrix of interest. - * \return a reference to the stream. 
- */ -template -std::ostream& operator << (std::ostream& stream, const Matrix& m) -{ for(size_t i=0; i -Matrix::Matrix(const std::vector& dim) - : Matrix(dim, 0) -{} - -template -Matrix::Matrix(const std::vector& dim, T value) -{ this->_dim_size = dim.size() ; - this->_dim = this->swap_coord(dim) ; - this->_data_size = std::accumulate(dim.begin(), dim.end(), 1, std::multiplies()) ; - this->_data = std::vector(this->_data_size, value) ; - - this->compute_dim_product() ; -} - -template -Matrix::Matrix(const Matrix &other) -{ *this = other ; } - - -template -T Matrix::get(size_t offset) const /* throw(std::out_of_range) */ -{ if(not this->is_valid(offset)) - { throw std::out_of_range("offset is out of range!") ; } - return this->_data[offset] ; -} - -template -T Matrix::get(const std::vector& coord) const /*throw(std::out_of_range) */ -{ std::vector coord_new = this->swap_coord(coord) ; - if(not this->is_valid(coord_new)) - { throw std::out_of_range("coordinates are out of range!") ; } - return this->_data[this->convert_to_offset(coord_new)] ; -} - - -template -void Matrix::set(size_t offset, T value) /* throw(std::out_of_range) */ -{ if(not this->is_valid(offset)) - { throw std::out_of_range("offset is out of range!") ; } - this->_data[offset] = value ; -} - -template -void Matrix::set(const std::vector& coord, T value) /* throw(std::out_of_range) */ -{ std::vector coord_new = this->swap_coord(coord) ; - if(not this->is_valid(coord_new)) - { throw std::out_of_range("coordinates are out of range!") ; } - this->_data[this->convert_to_offset(coord_new)] = value ; -} - - -template -std::vector Matrix::get_dim() const -{ return this->swap_coord(this->_dim) ; } - -template -std::vector Matrix::get_data() -{ return this->_data ; } - -template -size_t Matrix::get_dim_size() const -{ return this->_dim_size ; } - -template -size_t Matrix::get_data_size() const -{ return this->_data_size ; } - - -template -Matrix& Matrix::operator = (const Matrix& other) -{ this->_dim = 
other._dim ; - this->_dim_size = other._dim_size ; - this->_data = other._data ; - this->_data_size = other._data_size ; - this->_dim_prod = other._dim_prod ; - return *this ; -} - -template -bool Matrix::operator == (const Matrix& other) const -{ if(&other == this) - { return true ; } - // check dim - if(this->_dim_size != other._dim_size) - { return false ; } - for(size_t i=0; i_dim_size; i++) - { if(this->_dim[i] != other._dim[i]) - { return false ; } - } - // check data - if(this->_data_size != other._data_size) - { return false ; } - for(size_t i=0; i_data_size; i++) - { if(this->_data[i] != other._data[i]) - { return false ; } - } - return true ; -} - -template -bool Matrix::operator !=(const Matrix& other) const -{ return not ((*this) == other) ;} - -template -T& Matrix::operator () (const std::vector& coord) -{ std::vector coord_new = this->swap_coord(coord) ; - return this->_data[this->convert_to_offset(coord_new)] ; -} - -template -const T& Matrix::operator () (const std::vector& coord) const -{ std::vector coord_new = this->swap_coord(coord) ; - return this->_data[this->convert_to_offset(coord_new)] ; -} - - -template -void Matrix::compute_dim_product() -{ this->_dim_prod = std::vector(this->_dim_size, 0) ; - this->_dim_prod[0] = 1 ; - if(this->_dim_size > 1) - { this->_dim_prod[1] = this->_dim[0] ; } - if(this->_dim_size > 2) - { for(size_t i=2; i_dim_size; i++) - { this->_dim_prod[i] = this->_dim_prod[i-1]*this->_dim[i-1] ; } - } -} - - -template -std::vector Matrix::swap_coord(const std::vector &coord) const -{ std::vector coord_new = coord ; - // reformat coord = (row,col,...) = (y,y,...) into coord = (col,row,...) = (x,y,...) 
- if(this->_dim_size > 1) - { std::swap(coord_new[0], coord_new[1]) ; } - return coord_new ; -} - - -template -bool Matrix::is_valid(size_t offset) const -{ if(offset > this->_data_size-1) - { return false ; } - return true ; -} - -template -bool Matrix::is_valid(const std::vector& coord) const -{ if(coord.size() != this->_dim_size) - { return false ; } - for(size_t i=0; i this->_dim[i]) - { return false ; } - } - return true ; -} - - - -template -size_t Matrix::convert_to_offset(const std::vector& coord) const -{ size_t offset = 0 ; - - for(size_t i=0; i_dim_size; i++) - { offset += coord[i] * this->_dim_prod[i] ; } - - return offset ; -} - - -template -std::vector Matrix::convert_to_coord(size_t offset) const -{ - std::vector coord(this->_dim_size, 0) ; - - for(int i=this->_dim_size-1; i>=0; i--) - { size_t c = offset / this->_dim_prod[i] ; - coord[i] = c ; - offset -= (this->_dim_prod[i]*c) ; - } - - return coord ; -} - - - - -#endif // MATRIX_HPP diff --git a/src/Matrix_old/Matrix2D.hpp b/src/Matrix_old/Matrix2D.hpp deleted file mode 100755 index 2a18813..0000000 --- a/src/Matrix_old/Matrix2D.hpp +++ /dev/null @@ -1,414 +0,0 @@ -#ifndef MATRIX2D_HPP -#define MATRIX2D_HPP - -#include "Matrix.hpp" - -#include -#include -#include -#include -#include -#include - -#define BUFFER_SIZE 4096 -// const size_t BUFFER_SIZE = 4096 ; - -/*! The Matrix2D class is a specialisation of the Matrix - * class to make work with 2D matrices easier. - * - * A text format is defined to store such matrices. - * In this format, each row is written on a single line - * and the values should separated by any blank character - * (tab, space, multiple spaces, ...). Empty lines are - * not allowed. - * - * ---- start ---- - * 1 2 3 - * 4 5 6 - * 7 8 9 - * ----- end ----- - * - */ -template -class Matrix2D : public Matrix -{ - public: - // constructors - Matrix2D() = default ; - /*! - * \brief Constructs a matrix with the given dimensions, - * filled with 0 values. 
- * \param nrow the number of rows. - * \param ncol the number of columns. - */ - Matrix2D(size_t nrow, size_t ncol) ; - - /*! - * \brief Constructs a matrix with the given dimensions and - * initialize the values to the given value. - * \param nrow the number of rows. - * \param ncol the number of columns. - * \param value the value to initialize the matrix content - * with. - */ - Matrix2D(size_t nrow, size_t ncol, T value) ; - - /*! - * \brief Copy constructor - * \param other the matrix to copy the content from. - */ - Matrix2D(const Matrix2D& other) ; - - /*! - * \brief Constructs a matrix from a text file. - * \param file_address the address of the file containing the matrix. - * \throw std::runtime_error if anything happen while reading the - * file (format error, file not found, etc). - */ - Matrix2D(const std::string& file_address) /* throw (std::runtime_error) */ ; - - // methods overloaded in Matrix - using Matrix::get ; - using Matrix::set ; - - // methods - /*! - * \brief Gets the element at the given coordinates. - * \param row the row number of the element to set. - * \param col the column number of the element to set. - * \throw std::out_of_range exception if the coordinates - * are out of range. - * \return the element. - */ - T get(size_t row, size_t col) const /* throw(std::out_of_range) */ ; - - /*! - * \brief Sets the element at the given coordinates - * to the given value. - * \param row the row number of the element to set. - * \param col the column number of the element to set. - * \param value the new value. - * \throw std::out_of_range exception if the coordinates - * are out of range. - */ - void set(size_t row, size_t col, T value) /* throw (std::out_of_range) */ ; - - /*! - * \brief Gets the number of rows. - * \return the number of rows. - */ - size_t get_nrow() const ; - - /*! - * \brief Gets the number of columns. - * \return the number of columns. - */ - size_t get_ncol() const ; - - /*! - * \brief Gets the values in the i-th row. 
- * \param i the row of interest. - * \throw std::out_of_range if i is out of range. - * \return the values in this row. - */ - std::vector get_row(size_t i) const /* throw (std::out_of_range) */ ; - - /*! - * \brief Gets the values in the i-th column. - * \param i the column of interest. - * \throw std::out_of_range if i is out of range. - * \return the values in this column. - */ - std::vector get_col(size_t i) const /* throw (std::out_of_range) */ ; - - /*! - * \brief Sets the values of a given rows with the values of a given - * vector. - * \param i the row of interest. - * \param values the new values. - * \throw std::out_of_range if i is out of range. - * \throw std::invalid_argument if values does not have a length equal - * to the number of columns of the matrix. - */ - void set_row(size_t i, const std::vector& values) /* throw (std::out_of_range, std::invalid_argument) */ ; - - /*! - * \brief Sets the values of a given column with the values of a given - * vector. - * \param i the column of interest. - * \param values the new values. - * \throw std::out_of_range if i is out of range. - * \throw std::invalid_argument if values does not have a length equal - * to the number of rows of the matrix. - */ - void set_col(size_t i, const std::vector& values) /* throw (std::out_of_range, std::invalid_argument) */ ; - - /*! - * \brief Produces a nice representation of the matrix on the given - * stream. - * \param stream the stream. - * \param precision the rounding precision. - * \param width the column width in number of characters. - * \param sep the character separator. - */ - void print(std::ostream& stram, size_t precision=4, size_t width=6, char sep=' ') const ; - - // operators - /*! - * \brief Returns a reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param row the row number of the element to set. - * \param col the column number of the element to set. - * \return a reference to this element. 
- */ - T& operator () (size_t row, size_t col) ; - - /*! - * \brief Returns a const reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param row the row number of the element to set. - * \param col the column number of the element to set. - * \return a const reference to this element. - */ - const T& operator () (size_t row, size_t col) const ; - -} ; - -/*! - * \brief Sends a representation of the matrix to the stream. - * \param stream the stream of interest. - * \param m the matrix of interest. - * \return a reference to the stream. - */ -template -std::ostream& operator << (std::ostream& stream, const Matrix2D& m) -{ m.print(stream) ; - return stream ; -} - - -/*! - * \brief Produces a transpose of the given matrix. - * \param m a matrix. - */ -template -Matrix2D transpose(const Matrix2D& m) ; - - - - -template -Matrix2D transpose(const Matrix2D& m) -{ std::vector dim = m.get_dim() ; - size_t nrow = dim[0] ; - size_t ncol = dim[1] ; - Matrix2D m2(ncol, nrow, 0) ; - for(size_t i=0; i -Matrix2D::Matrix2D(size_t nrow, size_t ncol) - : Matrix2D(nrow, ncol, 0) -{} - -template -Matrix2D::Matrix2D(size_t nrow, size_t ncol, T value) - : Matrix({nrow, ncol}, value) -{} - -template -Matrix2D::Matrix2D(const Matrix2D& other) - : Matrix(other) -{} - -template -Matrix2D::Matrix2D(const std::string &file_address) /* throw (std::runtime_error) */ -// : Matrix({0,0}) -{ - this->_dim = {0,0} ; - this->_data = std::vector() ; - this->_dim_size = this->_dim.size() ; - this->_data_size = this->_data.size() ; - this->_dim_prod = std::vector(this->_dim_size, 0) ; - - std::ifstream file(file_address, std::ifstream::in) ; - if(file.fail()) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - std::string buffer_str ; - std::vector buffer_vec ; - T buffer_T ; - - // read file - size_t i = 0 ; - size_t row_len = 0 ; - while(getline(file, buffer_str)) - { // check stream status and read content - if(file.eof()) - { break ; } - if(buffer_str.size() == 0) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! while reading %s (empty line)", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - if(file.fail()) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "error! while reading %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - // parse line - buffer_vec.clear() ; - std::istringstream buffer_ss(buffer_str) ; - while(buffer_ss >> buffer_T) - { buffer_vec.push_back(buffer_T) ; } - // check for an error which likely indicates that a value could not be - // casted into a type T (mixed data types in the file) - if(buffer_ss.fail() and not buffer_ss.eof()) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - // check that number of column is constant - if(i == 0) - { row_len = buffer_vec.size() ; } - else if(buffer_vec.size() != row_len) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! 
variable number of columns in %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - // update matrix content - for(auto i : buffer_vec) - { this->_data.push_back(i) ; - this->_data_size++ ; - } - this->_dim[1]++ ; - i++ ; - } - file.close() ; - - this->_dim[0] = row_len ; - this->compute_dim_product() ; -} - - - -template -T Matrix2D::get(size_t row, size_t col) const /* throw(std::out_of_range) */ -{ try - { return this->get({row, col}) ; } - catch(std::out_of_range& e) - { throw e ; } -} - - -template -void Matrix2D::set(size_t row, size_t col, T value) /* throw(std::out_of_range) */ -{ try - { this->set({row, col}, value) ; } - catch(std::out_of_range& e) - { throw e ; } -} - - -template -size_t Matrix2D::get_nrow() const -{ return this->_dim[1] ; } - - -template -size_t Matrix2D::get_ncol() const -{ return this->_dim[0] ; } - - -template -std::vector Matrix2D::get_row(size_t i) const /* throw (std::out_of_range) */ -{ if(i>=this->get_nrow()) - { throw std::out_of_range("row index is out of range!") ; } - - std::vector row(this->get_ncol()) ; - for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) - { row[n] = this->_data[j] ; } - - return row ; -} - - -template -std::vector Matrix2D::get_col(size_t i) const /* throw (std::out_of_range) */ -{ if(i>=this->get_ncol()) - { throw std::out_of_range("column index is out of range!") ; } - - std::vector col(this->get_nrow()) ; - for(size_t j=i, n=0; nget_nrow(); j+=this->get_ncol(), n++) - { col[n] = this->_data[j] ; } - - return col ; -} - - -template -void Matrix2D::set_row(size_t i, const std::vector& values) /* throw (std::out_of_range, std::invalid_argument) */ -{ if(i>=this->get_nrow()) - { throw std::out_of_range("row index is out of range!") ; } - else if(values.size() != this->get_ncol()) - { throw std::invalid_argument("the given vector length is not equal to the number of columns!") ; } - - for(size_t j=i*this->get_ncol(), n=0; nget_ncol(); j++, n++) - { this->_data[j] = values[n] ; } -} 
- - -template -void Matrix2D::set_col(size_t i, const std::vector& values) /* throw (std::out_of_range, std::invalid_argument) */ -{ if(i>=this->get_ncol()) - { throw std::out_of_range("row index is out of range!") ; } - else if(values.size() != this->get_nrow()) - { throw std::invalid_argument("the given vector length is not equal to the number of rows!") ; } - - for(size_t n=0, j=i; nget_nrow(); n++, j+=this->get_ncol()) - { this->_data[j] = values[n] ; } -} - -template -void Matrix2D::print(std::ostream& stream, size_t precision, size_t width, char sep) const -{ stream.setf(std::ios::left) ; - - for(size_t i=0; iget_nrow(); i++) - { for(size_t j=0; jget_ncol(); j++) - { stream << std::setprecision(precision) << std::setw(width) << (*this)(i,j) << sep ; } - stream << std::endl ; - } -} - -template -T& Matrix2D::operator () (size_t row, size_t col) -{ std::vector coord = {col, row} ; - return this->_data[this->convert_to_offset(coord)] ; -} - - -template -const T& Matrix2D::operator () (size_t row, size_t col) const -{ std::vector coord = {col, row} ; - return this->_data[this->convert_to_offset(coord)] ; -} - - -#endif // MATRIX2D_HPP - - diff --git a/src/Matrix_old/Matrix3D.hpp b/src/Matrix_old/Matrix3D.hpp deleted file mode 100755 index 96f840b..0000000 --- a/src/Matrix_old/Matrix3D.hpp +++ /dev/null @@ -1,361 +0,0 @@ -#ifndef MATRIX3D_HPP -#define MATRIX3D_HPP - -#include "Matrix.hpp" - -#include -#include -#include -#include -#include // ifstream -#include // istringstream -#include // std::runtime_error -#include // std::equal() - -#define BUFFER_SIZE 4096 -// const size_t BUFFER_SIZE = 4096 ; - -/*! - * The Matrix3D class is a specialisation of the Matrix - * class to make work with 3D matrices more easily. - * - * A text file format is defined to store such matrices. The specifications are as - * follows : - * Absolutely NO empty lines are allowed! 
- * The following lines should contain : - * - * 1st line : a slice header, ',,0' indicates that a slice of the 3rd dimension - * is beginning (this is a z slice). - * 2nd - Nth line : the firt slice, as a 2D matrix. In the example below, it has - * dimensions 3x4. - * N+1th line : a slice header, ',,1' indicates that the 2nd slice is beginning. - * N+1th - ... : the second slice - * and so on... - * - * Example of a 3x4x2 3D matrix - * ---- start ---- - * ,,0 - * 1 2 3 4 - * 5 6 7 8 - * 8 9 10 11 - *,,1 - * 12 13 14 15 - * 16 17 18 19 - * 20 21 22 23 - * ----- end ----- - * - */ -template -class Matrix3D : public Matrix -{ - public: - // constructors - Matrix3D() = default ; - - /*! - * \brief Constructs a matrix with the given dimensions, - * filled with 0 values. - * \param dim1 the first dimension. - * \param dim2 the second dimension. - * \param dim3 the third dimension. - */ - Matrix3D(size_t dim1, size_t dim2, size_t dim3) ; - - /*! - * \brief Constructs a matrix with the given dimensions and - * initialize the values to the given value. - * \param dim1 the first dimension. - * \param dim2 the second dimension. - * \param dim3 the third dimension. - * \param value the value to initialize the matrix content - * with. - */ - Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) ; - - /*! - * \brief Copy constructor - * \param other the matrix to copy the content from. - */ - Matrix3D(const Matrix3D& other) ; - - /*! - * \brief Constructs a matrix from a text file. - * \param file_address the address of the file containing the matrix. - * \throw std::runtime_error if anything happen while reading the - * file (format error, file not found, etc). - */ - Matrix3D(const std::string& file_address) /* throw (std::runtime_error) */ ; - - // methods overloaded from Matrix - using Matrix::get ; - using Matrix::set ; - - // methods - /*! - * \brief Gets the element at the given coordinates. - * \param dim1 the first dimension coordinate. 
- * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \throw std::out_of_range exception if the coordinates - * are out of range. - * \return the element. - */ - T get(size_t dim1, size_t dim2, size_t dim3) const /* throw (std::out_of_range) */ ; - /*! - * \brief Sets the element at the given coordinates - * to the given value. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \param value the new value. - * \throw std::out_of_range exception if the coordinates - * are out of range. - */ - void set(size_t dim1, size_t dim2, size_t dim3, T value) /* throw (std::out_of_range) */ ; - - /*! - * \brief Produces a nice representation of the matrix on the given - * stream. - * \param stream the stream. - * \param precision the rounding precision. - * \param width the column width in number of characters. - * \param sep the character separator. - */ - void print(std::ostream& stream, size_t precision=4 ,size_t width=6, char sep=' ') const ; - - // operators - /*! - * \brief Returns a reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \return a reference to this element. - */ - T& operator() (size_t dim1, size_t dim2, size_t dim3) ; - /*! - * \brief Returns a constant reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \return a constant reference to this element. - */ - const T& operator() (size_t dim1, size_t dim2, size_t dim3) const ; - - private: - // methods - /*! 
- * \brief Checks whether a given string is a slice header - * (such as ",,0"), as found in files storing Matrix3D. - * \param str the string to check. - * \return whether the string is a slice header. - */ - bool is_header(const std::string& str) const ; - -} ; - -/*! - * \brief Sends a representation of the matrix to the stream. - * \param stream the stream of interest. - * \param m the matrix of interest. - * \return a reference to the stream. - */ -template -std::ostream& operator << (std::ostream& stream, const Matrix3D& m) -{ m.print(stream) ; - return stream ; -} - -template -Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3) - : Matrix3D(dim1, dim2, dim3, 0) -{} - -template -Matrix3D::Matrix3D(size_t dim1, size_t dim2, size_t dim3, T value) - : Matrix({dim1, dim2, dim3}, value) -{} - -template -Matrix3D::Matrix3D(const Matrix3D &other) - : Matrix(other) -{} - - -template -Matrix3D::Matrix3D(const std::string &file_address) /* throw (std::runtime_error) */ -{ - this->_dim = {0,0,0} ; - this->_data = std::vector() ; - this->_dim_size = this->_dim.size() ; - this->_data_size = this->_data.size() ; - this->_dim_prod = std::vector(this->_dim_size, 0) ; - - std::ifstream file(file_address, std::ifstream::in) ; - if(file.fail()) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "error! cannot open %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - std::string buffer_str ; - std::vector buffer_vec ; - T buffer_T ; - - // read file - size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read - size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values - size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col - - while(getline(file, buffer_str)) - { // check stream status and read content - if(buffer_str.size() == 0) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "error! 
while reading %s (empty line)", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - if(file.fail()) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "error! while reading %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - // check whether it is the beginning of a slice - // 1st line in file should be - if(this->is_header(buffer_str)) - { // check that slice have a constant number of rows - if(this->_dim[2] == 1) - { col_len = col_len_cur ; - this->_dim[0] = row_len ; - this->_dim[1] = col_len ; - } - else if(col_len_cur != col_len) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! slice have variable dimensions in %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - this->_dim[2]++ ; - col_len_cur = 0 ; - n_line++ ; - continue ; - } - // 1st line in file should be a header and entering - // this block is forbidden - if(n_line == 0) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! first line is not a slice header in %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - // parse line - row_len_cur = 0 ; - buffer_vec.clear() ; - std::istringstream buffer_ss(buffer_str) ; - while(buffer_ss >> buffer_T) - { buffer_vec.push_back(buffer_T) ; - row_len_cur++ ; - } - // check for an error which likely indicates that a value could not be - // casted into a type T (mixed data types in the file) - if(buffer_ss.fail() and not buffer_ss.eof()) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! could not read a line in %s (incompatible data types)", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - // check that number of column is constant - if(n_line_data == 0) - { row_len = row_len_cur ; } - else if(row_len_cur != row_len) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! 
slice have variable dimensions in %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - // update matrix content - for(auto i : buffer_vec) - { this->_data.push_back(i) ; - this->_data_size++ ; - } - col_len_cur++ ; - n_line_data++ ; - n_line++ ; - } - // check dimensions of last slice - if(col_len_cur != col_len) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! slice have variable dimensions in %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - file.close() ; - this->compute_dim_product() ; -} - - -template -T Matrix3D::get(size_t dim1, size_t dim2, size_t dim3) const /* throw(std::out_of_range) */ -{ try - { return this->get({dim1, dim2, dim3}) ; } - catch(std::out_of_range& e) - { throw e ; } -} - -template -void Matrix3D::set(size_t dim1, size_t dim2, size_t dim3, T value) /* throw(std::out_of_range) */ -{ try - { return this->set({dim1, dim2, dim3}, value) ; } - catch(std::out_of_range& e) - { throw e ; } -} - - -template -T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) -{ std::vector coord = {dim2, dim1, dim3} ; - return this->_data[this->convert_to_offset(coord)] ; -} - - -template -void Matrix3D::print(std::ostream& stream, size_t precision, size_t width, char sep) const -{ - stream.setf(std::ios::left) ; - std::vector dim = this->get_dim() ; - for(size_t z=0; z -const T& Matrix3D::operator () (size_t dim1, size_t dim2, size_t dim3) const -{ std::vector coord = {dim2, dim1, dim3} ; - return this->_data[this->convert_to_offset(coord)] ; -} - - -template -bool Matrix3D::is_header(const std::string& str) const -{ if(str[0] == ',' and - str[1] == ',' and - str.find(',', 2) == std::string::npos) - { return true ; } - return false ; -} - -#endif // MATRIX3D_HPP diff --git a/src/Matrix_old/Matrix4D.hpp b/src/Matrix_old/Matrix4D.hpp deleted file mode 100755 index ee699bd..0000000 --- a/src/Matrix_old/Matrix4D.hpp +++ /dev/null @@ -1,539 +0,0 @@ -#ifndef MATRIX4D_HPP -#define 
MATRIX4D_HPP - -#include "Matrix.hpp" - -#include -#include -#include // std::out_of_range -#include -#include -#include // ifstream -#include // sstream - - -#define BUFFER_SIZE 4096 -// const size_t BUFFER_SIZE = 4096 ; - -/*! - * The Matrix4D class is a specialisation of the Matrix - * class to handle 4D matrices more easily. - * - * A text file format is defined to store such matrices. - * The specifications are as follows : - * - * Absolutely NO empty lines are allowed! - * The following lines should contain : - * - * 1st line : a slice header ',,,0' indicating that a - * slice of the 4th dimension is beginning. - * 3nd - Nth line : the slice of the 4th dimension. It contains - * slice in the 3rd dimension which are 2D - * matrices separated by headers (',,0' and - * ',,1' in the below example). - * N+1th line : ',,,1' indicating that the 2nd slice of the - * 4th dimension is beginning. - * and so on... - * - * Example - * ---- start ---- - * ,,,0 - * ,,0 - * 1 2 3 - * 4 5 6 - * ,,1 - * 7 8 9 - * 10 11 12 - * ,,,1 - * ,,0 - * 21 22 23 - * 24 25 26 - * ,,1 - * 27 28 29 - * 30 31 32 - * ----- end ----- - * - */ -template -class Matrix4D : public Matrix -{ - public: - // constructors - Matrix4D() = default ; - /*! - * \brief Constructs a matrix with the given dimensions, - * filled with 0 values. - * \param dim1 the first dimension. - * \param dim2 the second dimension. - * \param dim3 the third dimension. - * \param dim4 the fourth dimension. - */ - Matrix4D(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) ; - /*! - * \brief Constructs a matrix with the given dimensions and - * initialize the values to the given value. - * \param dim1 the first dimension. - * \param dim2 the second dimension. - * \param dim3 the third dimension. - * \param dim4 the fourth dimension. - * \param value the value to initialize the matrix content - * with. - */ - Matrix4D(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4, - T value) ; - /*! 
- * \brief Copy constructor - * \param other the matrix to copy the content from. - */ - Matrix4D(const Matrix4D& other) ; - /*! - * \brief Constructs a matrix from a text file. - * \param file_address the address of the file containing the matrix. - * \throw std::runtime_error if anything happen while reading the - * file (format error, file not found, etc). - */ - Matrix4D(const std::string& file_address) /* throw (std::runtime_error) */ ; - - // methods overloaded from Matrix - using Matrix::get ; - using Matrix::set ; - - // methods OK - /*! - * \brief Gets the element at the given coordinates. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \param dim4 the fourth dimension coordinate. - * \throw std::out_of_range exception if the coordinates - * are out of range. - * \return the element. - */ - T get(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) const /* throw (std::out_of_range) */ ; - /*! - * \brief Sets the element at the given coordinates - * to the given value. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \param dim4 the fourth dimension coordinate. - * \param value the new value. - * \throw std::out_of_range exception if the coordinates - * are out of range. - */ - void set(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4, - T value) /* throw (std::out_of_range) */ ; - /*! - * \brief Produces a nice representation of the matrix on the given - * stream. - * \param stream the stream. - * \param precision the rounding precision. - * \param width the column width in number of characters. - * \param sep the character separator. - */ - void print(std::ostream& stream, - size_t precision=4, - size_t width=6, - char sep=' ') const ; - - // operators OK - /*! - * \brief Returns a reference to the corrresponding - * element. 
This method does not perform any check on - * the coordinates. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \param dim4 the third dimension coordinate. - * \return a reference to this element. - */ - T& operator() (size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) ; - /*! - * \brief Returns a reference to the corrresponding - * element. This method does not perform any check on - * the coordinates. - * \param dim1 the first dimension coordinate. - * \param dim2 the second dimension coordinate. - * \param dim3 the third dimension coordinate. - * \param dim4 the third dimension coordinate. - * \return a reference to this element. - */ - const T& operator() (size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) const ; - - private: - // methods - /*! - * \brief Checks whether a given string is a 3D header - * (such as ",,0"), as found in files storing Matrix4D. - * \param str the string to check. - * \return whether the string is such a slice header. - */ - bool is_header_3d(const std::string& str) const ; - /*! - * \brief Checks whether a given string is a 4D header - * (such as ",,,0"), as found in files storing Matrix4D. - * \param str the string to check. - * \return whether the string is such a slice header. - */ - bool is_header_4d(const std::string& str) const ; - - /*! - * \brief Routine to load 4D matrices from files. - * This method reads from a std::ifstream object, - * from the current pointer location until i) a 4D - * header line is found (such as ',,,1') or ii) until - * it cannot read anymore from the stream. All - * data are pushed back into the data vector and - * the dimensions of the data read are stored into - * the dim vector (these data are actually a 3D - * matrix). If the method returned because it - * found another 4D header, it returns true, false - * otherwise. 
- * To read an entire 4D matrix from a file, simply - * use this scheme : i) read the 1st 4D header - * ii) call this function while it returns true. - * \param file_name a reference to a string containing - * the address of the file currently read (for exception - * messages). - * \param file a reference to the std::ifstream to read - * from. Obviously, the stream state will be modified as - * the method reads from it. However, it will never be - * closed by the method. - * \param data a reference to an empty vector where the - * read data will be pushed back. - * \param dim a reference to an empty vector where the - * dimensions of the read data will be stored. - * \return whether the last piece of data read from the - * stream was a 4D header. - */ - bool get_3d_slice(const std::string& file_name, - std::ifstream& file, - std::vector& data, - std::vector& dim) const - /* throw (std::runtime_error) */ ; - -} ; - -template -std::ostream& operator << (std::ostream& stream, const Matrix4D& m) -{ m.print(stream) ; - return stream ; -} - -template -Matrix4D::Matrix4D(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) - : Matrix({dim1, dim2, dim3, dim4}, 0) -{} - -template -Matrix4D::Matrix4D(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4, - T value) - : Matrix({dim1, dim2, dim3, dim4}, value) -{} - -template -Matrix4D::Matrix4D(const Matrix4D &other) - : Matrix(other) -{} - -template -Matrix4D::Matrix4D(const std::string &file_address) /* throw (std::runtime_error) */ -{ this->_dim = {0,0,0,0} ; - this->_data = std::vector() ; - this->_dim_size = this->_dim.size() ; - this->_data_size = this->_data.size() ; - this->_dim_prod = std::vector(this->_dim_size, 0) ; - - std::ifstream file(file_address, std::ifstream::in) ; - if(file.fail()) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "error! 
cannot open %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - - std::string buffer_str ; - std::vector buffer_t ; - std::vector dim ; - - getline(file, buffer_str) ; - bool found_4d_header = this->is_header_4d(buffer_str) ; - do - { // check stream status and read content - if(buffer_str.size() == 0) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "error! while reading %s (empty line)", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - if(file.fail()) - { file.close() ; - char msg[BUFFER_SIZE] ; - sprintf(msg, "error! while reading %s", file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - // this is the beginning of a 3D slice -> get it using routine - if(found_4d_header) - { try - { // get slice - buffer_t.clear() ; - dim.clear() ; - found_4d_header = this->get_3d_slice(file_address, file, buffer_t, dim); - // update data - for(const auto& i : buffer_t) - { this->_data.push_back(i) ; - this->_data_size++ ; - } - // update dim only for the 1st slice (the 1st slice set the dimensions) - if(this->_dim[3] == 0) - { this->_dim[0] = dim[0] ; - this->_dim[1] = dim[1] ; - this->_dim[2] = dim[2] ; - } - // check dimensions of the slice - else - { if(dim[0] != this->_dim[0] or - dim[1] != this->_dim[1] or - dim[2] != this->_dim[2]) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! 
slice have variable dimensions in %s", - file_address.c_str()) ; - throw std::runtime_error(msg) ; - } - } - this->_dim[3]++ ; - } - catch(std::runtime_error& e) - { file.close() ; - throw e ; - } - } - } while(found_4d_header) ; - - file.close() ; - this->compute_dim_product() ; -} - -template -T Matrix4D::get(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) const /* throw (std::out_of_range) */ -{ try - { return this->get({dim1, dim2, dim3, dim4}) ; } - catch(std::out_of_range& e) - { throw e ; } -} - -template -void Matrix4D::set(size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4, - T value) /* throw (std::out_of_range) */ -{ try - { this->set({dim1, dim2, dim3, dim4}, value) ; } - catch(std::out_of_range& e) - { throw e ; } -} - -template -void Matrix4D::print(std::ostream &stream, - size_t precision, - size_t width, - char sep) const -{ stream.setf(std::ios::left) ; - std::vector dim = this->get_dim() ; - - for(size_t dim4=0; dim4 -T& Matrix4D::operator () (size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) -{ std::vector coord = {dim2, dim1, dim3, dim4} ; - return this->_data[this->convert_to_offset(coord)] ; -} - -template -const T& Matrix4D::operator () (size_t dim1, - size_t dim2, - size_t dim3, - size_t dim4) const -{ std::vector coord = {dim2, dim1, dim3, dim4} ; - return this->_data[this->convert_to_offset(coord)] ; -} - -template -bool Matrix4D::is_header_3d(const std::string &str) const -{ if(str[0] == ',' and - str[1] == ',' and - str.find(',', 2) == std::string::npos) - { return true ; } - return false ; -} - -template -bool Matrix4D::is_header_4d(const std::string &str) const -{ if(str[0] == ',' and - str[1] == ',' and - str[2] == ',' and - str.find(',', 3) == std::string::npos) - { return true ; } - return false ; -} - -template -bool Matrix4D::get_3d_slice(const std::string& file_name, - std::ifstream& file, - std::vector &data, - std::vector &dim) const /* throw (std::runtime_error) */ -{ - bool found_4d_header = false ; // 
the flag to return - - dim = {0,0,0} ; - - std::string buffer_str ; - std::vector buffer_vec ; - T buffer_T ; - - size_t n_line = 0, n_line_data = 0 ; // number of line and of data line read - size_t row_len = 0, col_len = 0 ; // length of row and column in nber of values - size_t row_len_cur = 0, col_len_cur = 0 ; // current number of values read in row and col - - while(getline(file, buffer_str)) - { // std::cerr << "in " << n_line << " : " << buffer_str << std::endl ; - // check stream status and read content - if(buffer_str.size() == 0) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "error! while reading %s (empty line)", file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - if(file.fail()) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "error! while reading %s", file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - // check whether this is the beginning of a 4D slice header, if so - // break ; - if(this->is_header_4d(buffer_str)) - { found_4d_header = true ; - break ; - } - // check whether it is the beginning of a slice - // 1st line in file should be - if(this->is_header_3d(buffer_str)) - { // check that slice have a constant number of rows - if(dim[2] == 1) - { col_len = col_len_cur ; - dim[0] = row_len ; - dim[1] = col_len ; - } - else if(col_len_cur != col_len) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! slice have variable dimensions in %s", - file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - dim[2]++ ; - col_len_cur = 0 ; - n_line++ ; - continue ; - } - // 1st line in file should be a header and entering - // this block is forbidden - if(n_line == 0) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! 
first line is not a slice header in %s", - file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - - // parse line - row_len_cur = 0 ; - buffer_vec.clear() ; - std::istringstream buffer_ss(buffer_str) ; - while(buffer_ss >> buffer_T) - { buffer_vec.push_back(buffer_T) ; - row_len_cur++ ; - } - // check for an error which likely indicates that a value could not be - // casted into a type T (mixed data types in the file) - if(buffer_ss.fail() and not buffer_ss.eof()) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! could not read a line in %s (incompatible data types)", - file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - - // check that number of column is constant - if(n_line_data == 0) - { row_len = row_len_cur ; } - else if(row_len_cur != row_len) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! slice have variable dimensions in %s", - file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - - // update matrix content - for(auto i : buffer_vec) - { data.push_back(i) ; } - col_len_cur++ ; - n_line_data++ ; - n_line++ ; - } - - // check dimensions of last slice - if(col_len_cur != col_len) - { char msg[BUFFER_SIZE] ; - sprintf(msg, "format error! slice have variable dimensions in %s", - file_name.c_str()) ; - throw std::runtime_error(msg) ; - } - - return found_4d_header ; -} - - -#endif // MATRIX4D_HPP diff --git a/src/Parallel/ThreadPool.cpp b/src/Parallel/ThreadPool.cpp index 7d03b24..b31df0d 100755 --- a/src/Parallel/ThreadPool.cpp +++ b/src/Parallel/ThreadPool.cpp @@ -1,144 +1,156 @@ #include "ThreadPool.hpp" #include #include #include //std::pair std::vector> ThreadPool::split_range(size_t from, size_t to, size_t n_thread) { // contains the [from, to) slices std::vector> coordinate_list(n_thread) ; size_t by = to / n_thread ; size_t from_current = from, to_current = 0 ; for(size_t n=0; n to) ? 
(to_current = to) : 0 ; coordinate_list[n] = std::pair(from_current, to_current) ; } + + // if there is some remains, distribute it equally over last threads + size_t remain = to - to_current ; + if(remain) + { size_t from_correction = 0 ; + size_t to_correction = 1 ; + for(size_t n=n_thread-remain; n 0) ; this->threads = std::vector(n_threads) ; for(size_t i=0; ithreads[i] = std::thread(&ThreadPool::thread_routine, this) ; } } ThreadPool::~ThreadPool() {} size_t ThreadPool::getNThread() const { return this->threads.size() ; } void ThreadPool::addJob(std::function&& task) { // only add a job in the queues if they are open if(this->isQueueOpen()) { this->lock_mutex_queue() ; // this->debug_print(std::string("started adding job")) ; this->queue_task.push(task) ; // this->debug_print(std::string("ended adding job")) ; this->unlock_mutex_queue() ; } std::this_thread::sleep_for(std::chrono::milliseconds(5)) ; } void ThreadPool::join() { // closes the queues, later call to addJob() will be effect this->close_queue() ; // joins the threads for(auto& thr : this->threads) { if(thr.joinable()) { this->debug_print(std::string("joined a thread")) ; thr.join() ; } } } void ThreadPool::thread_routine() { this->debug_print(std::string("started")) ; while(true) { // get a function and the arguments value from the queue std::function task ; bool has_task = false ; bool is_queue_open = this->isQueueOpen() ; std::pair args ; this->lock_mutex_queue() ; bool is_queue_empty = this->queue_task.empty() ; if(not is_queue_empty) { // this->debug_print(std::string("fetching from queue")) ; task = this->queue_task.front() ; this->queue_task.pop() ; has_task = true ; } this->unlock_mutex_queue() ; // runs the task if(has_task) { this->debug_print(std::string("working")) ; task() ; } // exit else if(is_queue_empty and not is_queue_open) { break ; } std::this_thread::sleep_for(std::chrono::milliseconds(10)) ; } this->debug_print(std::string("ended")) ; } void ThreadPool::lock_mutex_queue() { 
this->queue_mutex.lock() ; // this->debug_print(std::string("locked mutex")) ; } void ThreadPool::unlock_mutex_queue() { this->queue_mutex.unlock() ; // this->debug_print(std::string("unlocked mutex")) ; } void ThreadPool::open_queue() { this->queue_open = true ; } void ThreadPool::close_queue() { this->queue_open = false ; } bool ThreadPool::isQueueOpen() { return this->queue_open ; } bool ThreadPool::isDebugOn() const { return this->debug ; } void ThreadPool::debug_print(const std::string& msg, std::ostream& out) const { if(this->isDebugOn()) { std::hash hasher ; char message[1024] ; sprintf(message, "Thread %zu : %s\n", hasher(std::this_thread::get_id()), msg.c_str()) ; out << message ; } } diff --git a/src/Statistics/Statistics.cpp b/src/Statistics/Statistics.cpp index ff322e1..c26f789 100644 --- a/src/Statistics/Statistics.cpp +++ b/src/Statistics/Statistics.cpp @@ -1,26 +1,31 @@ #include #include // M_PI, pow(), sqrt(), log(), lgamma() #include // beta_distribution #include double normal_pmf(double x, double mean, double sd) { static double pi_2 = 2.*M_PI ; return ( 1. / ( sd * sqrt(pi_2) )) * exp(-0.5 * pow((x-mean)/sd, 2.0 ) ); } double poisson_pmf(int x, double lambda) -{ if(x + lambda == 0) +{ if((x != 0) and (lambda == 0)) + { return 0. ; } + else if((x == 0) and (lambda == 0)) { return 1. ; } + + // if(x + lambda == 0) + // { return 1. 
; } return exp(x * log(lambda) - lgamma(x + 1.0) - lambda); } double beta_pmf(double x, double alpha, double beta) { boost::math::beta_distribution<> beta_dist(alpha, beta) ; double y = quantile(beta_dist, x) ; return y ; } diff --git a/src/Unittests/unittests_genomictools.cpp b/src/Unittests/unittests_genomictools.cpp new file mode 100644 index 0000000..d355a28 --- /dev/null +++ b/src/Unittests/unittests_genomictools.cpp @@ -0,0 +1,509 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include // std::invalid_argument + +std::string file_bed = "/local/groux/scATAC-seq/data/toy_data/peaks.bed" ; +std::string file_bam = "/local/groux/scATAC-seq/data/toy_data/sc_reads.bam" ; +std::string file_bai = "/local/groux/scATAC-seq/data/toy_data/sc_reads.bam.bai" ; + +// GenomeRegion test suite +SUITE(GenomeRegion) +{ + // displays message + TEST(message) + { std::cout << "Starting GenomicTools tests..." << std::endl ; } + + // tests vonstructor with value + TEST(constructor_value) + { std::string chr = "chr1" ; + int idx = 0 ; + + GenomeRegion r1(chr, idx, 0, 10) ; + CHECK_EQUAL(chr, r1.chromosome) ; + CHECK_EQUAL(0, r1.start) ; + CHECK_EQUAL(10, r1.end) ; + CHECK_EQUAL(10, r1.length) ; + + GenomeRegion r2(chr, idx, 1, 10) ; + CHECK_EQUAL(chr, r2.chromosome) ; + CHECK_EQUAL(1, r2.start) ; + CHECK_EQUAL(10, r2.end) ; + CHECK_EQUAL(9, r2.length) ; + + CHECK_THROW(GenomeRegion(chr, idx, -1, 10), std::invalid_argument) ; + CHECK_THROW(GenomeRegion(chr, idx, 0, -10), std::invalid_argument) ; + } + + + // tests constructFragment factory function to create regions from bam + /* + TEST(test_contructFragment) + { + // expected content of bam file + std::vector regions ; + regions.push_back(GenomeRegion("chr1", 400, 480)) ; + regions.push_back(GenomeRegion("chr1", 470, 550)) ; + regions.push_back(GenomeRegion("chr1", 560, 800)) ; + regions.push_back(GenomeRegion("chr1", 560, 640)) ; + regions.push_back(GenomeRegion("chr1", 610, 690)) ; + 
regions.push_back(GenomeRegion("chr1", 670, 750)) ; + regions.push_back(GenomeRegion("chr1", 730, 810)) ; + regions.push_back(GenomeRegion("chr1", 770, 850)) ; + regions.push_back(GenomeRegion("chr1", 950, 1150)) ; + regions.push_back(GenomeRegion("chr1", 960, 1040)) ; + regions.push_back(GenomeRegion("chr1", 1010, 1090)) ; + regions.push_back(GenomeRegion("chr1", 1060, 1140)) ; + regions.push_back(GenomeRegion("chr1", 1070, 1150)) ; + regions.push_back(GenomeRegion("chr1", 1350, 1430)) ; + regions.push_back(GenomeRegion("chr1", 1360, 1440)) ; + regions.push_back(GenomeRegion("chr1", 1410, 1490)) ; + regions.push_back(GenomeRegion("chr1", 1500, 1600)) ; + regions.push_back(GenomeRegion("chr1", 1600, 1700)) ; + + regions.push_back(GenomeRegion("chr2", 400, 480)) ; + regions.push_back(GenomeRegion("chr2", 470, 550)) ; + regions.push_back(GenomeRegion("chr2", 560, 800)) ; + regions.push_back(GenomeRegion("chr2", 560, 640)) ; + regions.push_back(GenomeRegion("chr2", 610, 690)) ; + regions.push_back(GenomeRegion("chr2", 670, 750)) ; + regions.push_back(GenomeRegion("chr2", 730, 810)) ; + regions.push_back(GenomeRegion("chr2", 770, 850)) ; + regions.push_back(GenomeRegion("chr2", 950, 1150)) ; + regions.push_back(GenomeRegion("chr2", 960, 1040)) ; + regions.push_back(GenomeRegion("chr2", 1010, 1090)) ; + regions.push_back(GenomeRegion("chr2", 1060, 1140)) ; + regions.push_back(GenomeRegion("chr2", 1070, 1150)) ; + regions.push_back(GenomeRegion("chr2", 1350, 1430)) ; + regions.push_back(GenomeRegion("chr2", 1360, 1440)) ; + regions.push_back(GenomeRegion("chr2", 1410, 1490)) ; + regions.push_back(GenomeRegion("chr2", 1500, 1600)) ; + regions.push_back(GenomeRegion("chr2", 1600, 1700)) ; + + seqan::BamAlignmentRecord record ; + std::string bam_path = "src/Unittests/data/sc_reads.bam" ; + + // read file for fragments starting on + strand + seqan::BamFileIn bam_file(bam_path.c_str()) ; + // header + seqan::BamHeader bam_header ; + seqan::readHeader(bam_header, bam_file) ; + 
for(size_t i=0; not seqan::atEnd(bam_file); i++) + { seqan::readRecord(record, bam_file) ; + if(seqan::hasFlagFirst(record) and not seqan::hasFlagRC(record)) + { std::cout << regions[i] << " " + << GenomeRegion::constructFragment(record) + << std::endl ; + CHECK_EQUAL(regions[i], GenomeRegion::constructFragment(record)) ; } + } + seqan::close(bam_file) ; + + // read file for fragments starting on - strand + seqan::BamFileIn bam_file(bam_path.c_str()) ; + // header + seqan::BamHeader bam_header ; + seqan::readHeader(bam_header, bam_file) ; + for(size_t i=0; not seqan::atEnd(bam_file); i++) + { seqan::readRecord(record, bam_file) ; + if(seqan::hasFlagFirst(record) and seqan::hasFlagRC(record)) + { CHECK_EQUAL(regions[i], GenomeRegion::constructFragment(record)) ; } + } + seqan::close(bam_file) ; + } + */ + + TEST(test_contructRead) + { // expected content of bam file + std::list regions_exp ; + // chromosome 1 -> has index 0 in BAM file header + regions_exp.push_back(GenomeRegion("chr1", 0, 400, 435)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 400, 435)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 445, 480)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 445, 480)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 470, 505)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 470, 505)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 515, 550)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 515, 550)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 605, 640)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 605, 640)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 610, 645)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 610, 645)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 655, 690)) ; + 
regions_exp.push_back(GenomeRegion("chr1", 0, 655, 690)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 670, 705)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 670, 705)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 715, 750)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 715, 750)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 730, 765)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 730, 765)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 765, 800)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 765, 800)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 770, 805)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 770, 805)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 775, 810)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 775, 810)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 815, 850)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 815, 850)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 950, 985)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 950, 985)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 960, 995)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 960, 995)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1005, 1040)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1005, 1040)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1010, 1045)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1010, 1045)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1055, 1090)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1055, 1090)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1060, 1095)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1060, 1095)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1070, 1105)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1070, 1105)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1105, 1140)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1105, 1140)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; + 
regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1350, 1385)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1350, 1385)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1360, 1395)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1360, 1395)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1395, 1430)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1395, 1430)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1405, 1440)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1405, 1440)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1410, 1445)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1410, 1445)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1455, 1490)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1455, 1490)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1500, 1535)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1500, 1535)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1565, 1600)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1565, 1600)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1600, 1635)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1600, 1635)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1665, 1700)) ; + regions_exp.push_back(GenomeRegion("chr1", 0, 1665, 1700)) ; + + // chromosome 2 -> has index 1 in BAM file header + regions_exp.push_back(GenomeRegion("chr2", 1, 400, 435)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 400, 435)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 445, 480)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 445, 480)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 470, 505)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 470, 505)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 515, 550)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 515, 550)) ; + 
regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 560, 595)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 605, 640)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 605, 640)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 610, 645)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 610, 645)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 655, 690)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 655, 690)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 670, 705)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 670, 705)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 715, 750)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 715, 750)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 730, 765)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 730, 765)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 765, 800)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 765, 800)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 770, 805)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 770, 805)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 775, 810)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 775, 810)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 815, 850)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 815, 850)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 950, 985)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 950, 985)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 960, 995)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 960, 995)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1005, 1040)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1005, 1040)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1010, 1045)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1010, 1045)) ; + 
regions_exp.push_back(GenomeRegion("chr2", 1, 1055, 1090)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1055, 1090)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1060, 1095)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1060, 1095)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1070, 1105)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1070, 1105)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1105, 1140)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1105, 1140)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1115, 1150)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1350, 1385)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1350, 1385)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1360, 1395)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1360, 1395)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1395, 1430)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1395, 1430)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1405, 1440)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1405, 1440)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1410, 1445)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1410, 1445)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1455, 1490)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1455, 1490)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1500, 1535)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1500, 1535)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1565, 1600)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1565, 1600)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1600, 1635)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1600, 1635)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 1665, 1700)) ; + regions_exp.push_back(GenomeRegion("chr2", 1, 
1665, 1700)) ; + + // open file + seqan::BamFileIn bam_file ; + if (!seqan::open(bam_file, file_bam.c_str())) + { char msg[4096] ; + sprintf(msg, "ERROR: could not open input file %s", file_bam.c_str()) ; + } + + // read file + seqan::BamAlignmentRecord record ; + seqan::BamHeader header ; + seqan::readHeader(header, bam_file) ; + std::list regions_val ; + while(not seqan::atEnd(bam_file)) + { seqan::readRecord(record, bam_file) ; + regions_val.push_back(GenomeRegion::constructRead(record, bam_file)) ; + } + seqan::close(bam_file) ; + + // compare + CHECK_EQUAL(regions_exp.size(), regions_val.size()) ; + auto iter_exp = regions_exp.begin() ; + auto iter_val = regions_val.begin() ; + while(iter_exp != regions_exp.end()) + { CHECK_EQUAL(*iter_exp, *iter_val) ; + iter_exp++ ; + iter_val++ ; + } + } + + // tests the method to check overlaps + TEST(overlap) + { GenomeRegion r1("chr1", 0, 20, 30) ; // reference + GenomeRegion r2("chr1", 0, 20, 30) ; // same as reference + GenomeRegion r3("chr1", 0, 0, 45) ; // totally contain reference + GenomeRegion r4("chr1", 0, 0, 10) ; // no overlap, upstream reference + GenomeRegion r5("chr1", 0, 15, 25) ; // partial overlap reference + GenomeRegion r6("chr1", 0, 22, 29) ; // inside reference + GenomeRegion r7("chr1", 0, 25, 35) ; // partial overlap reference + GenomeRegion r8("chr1", 0, 35, 45) ; // no overlap, downstream reference + GenomeRegion r9("chr2", 1, 20, 30) ; // diff chromosome + + // always check reciprocity + CHECK_EQUAL(true, r1 | r1) ; + CHECK_EQUAL(true, r1 | r2) ; CHECK_EQUAL(true, r2 | r1) ; + CHECK_EQUAL(true, r1 | r3) ; CHECK_EQUAL(true, r3 | r1) ; + CHECK_EQUAL(false, r1 | r4) ; CHECK_EQUAL(false, r4 | r1) ; + CHECK_EQUAL(true, r1 | r5) ; CHECK_EQUAL(true, r5 | r1) ; + CHECK_EQUAL(true, r1 | r6) ; CHECK_EQUAL(true, r6 | r1) ; + CHECK_EQUAL(true, r1 | r7) ; CHECK_EQUAL(true, r7 | r1) ; + CHECK_EQUAL(false, r1 | r8) ; CHECK_EQUAL(false, r8 | r1) ; + CHECK_EQUAL(false, r1 | r9) ; CHECK_EQUAL(false, r9 | r1) ; + } 
+ + // tests the methods to get overlap length + TEST(overlap_len) + { GenomeRegion r1("chr1", 0, 10, 20) ; // reference + GenomeRegion r2("chr1", 0, 10, 20) ; // same as reference + GenomeRegion r3("chr1", 0, 0, 45) ; // totally contain reference + GenomeRegion r4("chr2", 1, 10, 20) ; // diff chromosome + + // always check reciprocity + CHECK_EQUAL(10, r1.overlap_len(r1)) ; + CHECK_EQUAL(10, r1.overlap_len(r2)) ; CHECK_EQUAL(10, r1.overlap_len(r2)) ; + CHECK_EQUAL(10, r1.overlap_len(r3)) ; CHECK_EQUAL(10, r1.overlap_len(r3)) ; + CHECK_EQUAL(0, r1.overlap_len(r4)) ; CHECK_EQUAL(0, r1.overlap_len(r4)) ; + + // slide a smaller region along reference, from before to after + std::vector overlaps = {0,0,1,2,3,4,4,4,4,4,4,4,3,2,1,0,0,0} ; + int len = 4 ; + for(int i=0, start=5; start<23; i++, start++) + { int end = start + len ; + GenomeRegion s1("chr1", 0, start, end) ; + CHECK_EQUAL(overlaps[i], r1.overlap_len(s1)) ; + CHECK_EQUAL(overlaps[i], s1.overlap_len(r1)) ; + } + } + + // tests the is upstream and is downstream operators + TEST(upstream_downstream) + { GenomeRegion r1("chr1", 0, 10, 20) ; // reference + GenomeRegion r2("chr1", 0, 10, 20) ; // same as reference + GenomeRegion r3("chr1", 0, 0, 45) ; // totally contain reference + GenomeRegion r4("chr2", 1, 10, 20) ; // diff chromosome (downstream has 0 < 1) + + // always check reciprocity + CHECK_EQUAL(false, r1 < r1) ; CHECK_EQUAL(false, r1 > r1) ; + CHECK_EQUAL(false, r1 < r2) ; CHECK_EQUAL(false, r1 > r2) ; + + CHECK_EQUAL(false, r1 < r3) ; CHECK_EQUAL(false, r1 < r3) ; + CHECK_EQUAL(false, r3 < r1) ; CHECK_EQUAL(false, r3 < r1) ; + + // not on the same chromosome -> depends on the index value + CHECK_EQUAL(r1 < r4, true) ; CHECK_EQUAL(r1 > r4, false) ; + CHECK_EQUAL(r4 < r1, false) ; CHECK_EQUAL(r4 > r1, true) ; + + // slide a smaller region along reference, from before to after + std::vector s1_upstream = {1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} ; // s1 < r1 + std::vector r1_downstream = 
{1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} ; // r1 > s1 + std::vector s1_downstream = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1} ; // s1 > r1 + std::vector r1_upstream = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1} ; // r1 < s1 + int len = 4 ; + for(int i=0, start=5; start<23; i++, start++) + { // the sliding one + int end = start + len ; + GenomeRegion s1("chr1", 0, start, end) ; + + CHECK_EQUAL(s1_upstream[i], s1 < r1) ; + CHECK_EQUAL(r1_downstream[i], r1 > s1) ; + + CHECK_EQUAL(r1_upstream[i], r1 < s1) ; + CHECK_EQUAL(s1_downstream[i], s1 > r1) ; + } + } +} + + +// CorrelationMatrixCreator test suite +SUITE(CorrelationMatrixCreator) +{ + // displays message + TEST(message) + { std::cout << "Starting CorrelationMatrixCreator tests..." << std::endl ; } + + // tests matrix creation with full fragments + TEST(create_matrix_fragment) + { CorrelationMatrixCreator creator(file_bed, + file_bam, + file_bai, + -500, + 500, + 100, + CorrelationMatrixCreator::FRAGMENT) ; + Matrix2D m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 420 ; m_exp(0,1) = 480 ; m_exp(0,2) = 380 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 440 ; m_exp(0,5) = 600 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 400 ; + + m_exp(1,0) = 420 ; m_exp(1,1) = 480 ; m_exp(1,2) = 380 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 440 ; m_exp(1,5) = 600 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 400 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 2 ; m_exp(0,1) = 6 ; m_exp(0,2) = 4 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 2 ; m_exp(0,5) = 8 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 4 ; + + m_exp(1,0) = 2 ; m_exp(1,1) = 6 ; m_exp(1,2) = 4 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 2 ; m_exp(1,5) = 8 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 4 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + 
for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 280 ; m_exp(0,1) = 250 ; m_exp(0,2) = 310 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 280 ; m_exp(0,5) = 420 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 350 ; + + m_exp(1,0) = 280 ; m_exp(1,1) = 250 ; m_exp(1,2) = 310 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 280 ; m_exp(1,5) = 420 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 350 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i m_val = creator.create_matrix() ; + Matrix2D m_exp(2, 9, 0) ; + m_exp(0,0) = 8 ; m_exp(0,1) = 8 ; m_exp(0,2) = 8 ; + m_exp(0,3) = 0 ; m_exp(0,4) = 8 ; m_exp(0,5) = 12 ; + m_exp(0,6) = 0 ; m_exp(0,7) = 0 ; m_exp(0,8) = 10 ; + + m_exp(1,0) = 8 ; m_exp(1,1) = 8 ; m_exp(1,2) = 8 ; + m_exp(1,3) = 0 ; m_exp(1,4) = 8 ; m_exp(1,5) = 12 ; + m_exp(1,6) = 0 ; m_exp(1,7) = 0 ; m_exp(1,8) = 10 ; + + CHECK_EQUAL(m_exp.get_nrow(), m_val.get_nrow()) ; + CHECK_EQUAL(m_exp.get_ncol(), m_val.get_ncol()) ; + + for(size_t i=0; i #include // accumulate() #include #include #include #include /*! * \brief Given a matrix and an offset, this methods converts * the offset into a coordinates vector (row, col, ...). It is * a simple copy/paste of Matrix::convert_to_coord() which is * private. * \param m a matrix. * \param offset an offset * \return a vector of coordinates (row,col,...) corresponding to * the offset for the given matrix. */ std::vector convert_to_coord(const Matrix& m, size_t offset) { std::vector dim = m.get_dim() ; // (row, col, ...) format if(dim.size() > 1) { std::swap(dim[0], dim[1]) ; } // (x,y,...) 
format std::vector coord(dim.size(), 0) ; std::vector dim_prod(dim.size(), 0) ; dim_prod[0] = 1 ; if(dim.size() > 1) { dim_prod[1] = dim[0] ; } if(dim.size() > 2) { for(size_t i=2; i=0; i--) { size_t c = offset / dim_prod[i] ; coord[i] = c ; offset -= (dim_prod[i]*c) ; } if(dim.size() > 1) { std::swap(coord[0], coord[1]) ; } // (row,col,...) format return coord ; } +/* // Matrix test suite SUITE(Matrix) { // displays message TEST(message) { std::cout << "Starting Matrix tests..." << std::endl ; } // tests normal constructor TEST(constructor) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_1.size(), m1.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_1, m1.get_dim(), dim_1.size()) ; CHECK_EQUAL(data_size_1, m1.get_data_size()) ; // always has a zero dimension : 0 / 0x1 / 0x1x2/ ... 
/ 0x1x...x10 Matrix m2(dim_2) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_2.size(), m2.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_2, m2.get_dim(), dim_2.size()) ; CHECK_EQUAL(data_size_2, m2.get_data_size()) ; CHECK_EQUAL(data_size_2, m2.get_data().size()) ; // is a 0 dimension matrix : 0 / 0x0 / 0x0x...x0 Matrix m3(dim_3) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_3.size(), m3.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_3, m3.get_dim(), dim_3.size()) ; CHECK_EQUAL(data_size_3, m3.get_data_size()) ; CHECK_EQUAL(data_size_3, m3.get_data().size()) ; } } // tests contructor with value TEST(constructor_value) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_1.size(), m1.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_1, m1.get_dim(), dim_1.size()) ; CHECK_EQUAL(data_size_1, m1.get_data_size()) ; for(const auto x : m1.get_data()) { CHECK_EQUAL(i, x) ; } // always has a zero dimension : 0 / 0x1 / 0x1x2/ ... 
/ 0x1x...x10 Matrix m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_2.size(), m2.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_2, m2.get_dim(), dim_2.size()) ; CHECK_EQUAL(data_size_2, m2.get_data_size()) ; CHECK_EQUAL(data_size_2, m2.get_data().size()) ; for(const auto x : m2.get_data()) { CHECK_EQUAL(i, x) ; } // is a 0 dimension matrix : 0 / 0x0 / 0x0x...x0 Matrix m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; CHECK_EQUAL(dim_3.size(), m3.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim_3, m3.get_dim(), dim_3.size()) ; CHECK_EQUAL(data_size_3, m3.get_data_size()) ; CHECK_EQUAL(data_size_3, m3.get_data().size()) ; for(const auto x : m3.get_data()) { CHECK_EQUAL(i, x) ; } } } // tests the get() method, compare a value get with offset with the value get with coordinates // (computed from offset) TEST(get) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... 
/ 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; Matrix m1_2(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m1_2(m1) ; CHECK_EQUAL(true, m1 == m1_2) ; // always has a zero dimension : 0 / 0x1 / 0x1x2/ ... 
/ 0x1x...x10 Matrix m2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m2_2(m2) ; CHECK_EQUAL(true, m2 == m2_2) ; // is a 0 dimension matrix : 0 / 0x0 / 0x0x...x0 Matrix m3(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j m3_2(m3) ; CHECK_EQUAL(true, m3 == m3_2) ; } } // tests the () operator TEST(parenthesis_operator) { std::vector dim_1, dim_2, dim_3 ; size_t data_size_1, data_size_2, data_size_3 ; // from 0D to 10D for(size_t i=1; i<11; i++) { dim_1.push_back(i+1) ; dim_2.push_back(i) ; dim_3.push_back(0) ; // has non-0 dimensions : 1 /1x2 / 1x2x3 / ... / 1x2x...x11 Matrix m1(dim_1, i) ; data_size_1 = std::accumulate(dim_1.begin(), dim_1.end(), 1, std::multiplies()) ; for(size_t j=0; j m2(dim_2, i) ; Matrix m2_2(dim_2, i) ; data_size_2 = std::accumulate(dim_2.begin(), dim_2.end(), 1, std::multiplies()) ; for(size_t j=0; j m3(dim_3, i) ; Matrix m3_2(dim_3, i) ; data_size_3 = std::accumulate(dim_3.begin(), dim_3.end(), 1, std::multiplies()) ; for(size_t j=0; j dim = {i,j} ; Matrix2D m(i,j) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; } } } // tests contructor with value TEST(constructor_value) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { std::vector dim = {i,j} ; Matrix2D m(i,j,n) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; for(const auto& i : m.get_data()) { CHECK_EQUAL(n, i) ; } } } } // tests the copy constructor TEST(constructor_copy) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... 
dim = {i, i+1} ; Matrix2D m1(i,i+1) ; for(size_t j=0; j m1_2(m1) ; CHECK_EQUAL(true, m1 == m1_2) ; // always has a zero dimension : // has a zero dimension : 0x1 / 0x2 / ... dim = {0, i} ; Matrix2D m2(0,i) ; for(size_t j=0; j m2_2(m2) ; CHECK_EQUAL(true, m2 == m2_2) ; // is a 0 dimension matrix : 0x0 dim = {0, 0} ; Matrix2D m3(0,0) ; for(size_t j=0; j m3_2(m3) ; CHECK_EQUAL(true, m3 == m3_2) ; } } // tests the get() method, compare a value get with offset with the value get with coordinates // (computed from offset) TEST(get) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... Matrix2D m1(i,i+1, i) ; dim = {i,i+1} ; for(size_t j=0; j coord = convert_to_coord(m1, j) ; CHECK_EQUAL(m1.get(j), m1.get(coord[0], coord[1])) ; } // has a zero dimension : 0x1 / 0x2 / ... Matrix2D m2(0,i,i) ; dim = {0,i} ; for(size_t j=0; j coord = convert_to_coord(m2, j) ; CHECK_EQUAL(m2.get(j), m2.get(coord[0], coord[1])) ; } // has zero dimensions : 0x0 Matrix2D m3(0,0,i) ; dim = {0,0} ; for(size_t j=0; j coord = convert_to_coord(m3, j) ; CHECK_EQUAL(m3.get(j), m3.get(coord[0], coord[1])) ; } } } // test the set() method, set a value and then check it using get() TEST(set) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... Matrix2D m1(i,i+1, i) ; dim = {i,i+1} ; for(size_t j=0; j coord = convert_to_coord(m1, j) ; m1.set(coord[0], coord[1], j) ; } for(size_t j=0; j m2(0,i,i) ; dim = {0,i} ; for(size_t j=0; j coord = convert_to_coord(m2, j) ; m2.set(coord[0], coord[1], j) ; } for(size_t j=0; j m3(0,0,i) ; dim = {0,0} ; for(size_t j=0; j coord = convert_to_coord(m3, j) ; m3.set(coord[0], coord[1], j) ; } for(size_t j=0; j m1(i,i+1) ; CHECK_EQUAL(i, m1.get_nrow()) ; // always has a zero dimension : // has a zero dimension : 0x1 / 0x2 / ... 
Matrix2D m2(0,i) ; CHECK_EQUAL(0, m2.get_nrow()) ; // is a 0 dimension matrix : 0x0 Matrix2D m3(0,0) ; CHECK_EQUAL(0, m3.get_nrow()) ; } } // tests get_ncol() TEST(get_ncol) { for(size_t i=1; i<11; i++) { // has non-0 dimensions : 1x2 / 2x3 / ... Matrix2D m1(i,i+1) ; CHECK_EQUAL(i+1, m1.get_ncol()) ; // always has a zero dimension : // has a zero dimension : 0x1 / 0x2 / ... Matrix2D m2(0,i) ; CHECK_EQUAL(i, m2.get_ncol()) ; // is a 0 dimension matrix : 0x0 Matrix2D m3(0,0) ; CHECK_EQUAL(0, m3.get_ncol()) ; } } // tests get_row() TEST(get_row) { for(size_t i=0; i<11; i++) { Matrix2D m(5,i) ; for(size_t j=0; j row(m.get_ncol()) ; for(size_t n=0, k=j*m.get_ncol(); n m(i,5) ; for(size_t j=0; j col(m.get_nrow()) ; for(size_t n=0, k=j; n m(5,i) ; for(size_t j=0; j new_row(i, 999) ; m.set_row(j, new_row) ; CHECK_EQUAL(i, m.get_row(j).size()) ; CHECK_ARRAY_EQUAL(new_row, m.get_row(j), new_row.size()) ; } CHECK_THROW(m.set_row(9999, std::vector(i,0)), std::out_of_range) ; CHECK_THROW(m.set_row(0, std::vector(i+1,0)), std::invalid_argument) ; } } // tests set_col() TEST(set_col) { for(size_t i=0; i<11; i++) { Matrix2D m(i,5) ; for(size_t j=0; j new_col(i, 999) ; m.set_col(j, new_col) ; CHECK_EQUAL(i, m.get_col(j).size()) ; CHECK_ARRAY_EQUAL(new_col, m.get_col(j), new_col.size()) ; } CHECK_THROW(m.set_col(9999, std::vector(i,0)), std::out_of_range) ; CHECK_THROW(m.set_col(0, std::vector(i+1,0)), std::invalid_argument) ; } } TEST(parenthesis_operator) { for(size_t i=1; i<11; i++) { std::vector dim ; // has non-0 dimensions : 1x2 / 2x3 / ... 
Matrix2D m1(i,i+1, i) ; dim = {i,i+1} ; for(size_t j=0; j coord = convert_to_coord(m1, j) ; m1(coord[0], coord[1]) = j ; } for(size_t j=0; j m2(0,i,i) ; dim = {0,i} ; for(size_t j=0; j coord = convert_to_coord(m2, j) ; m2(coord[0], coord[1]) = j ; } for(size_t j=0; j m3(0,0,i) ; dim = {0,0} ; for(size_t j=0; j coord = convert_to_coord(m3, j) ; m3(coord[0], coord[1]) = j ; } for(size_t j=0; j> v_int({{0,1,2,3},{4,5,6,7}}) ; std::vector> v_char({{'A','A','A'},{'C','C','C'}, {'G','G','G'},{'T','T','T'}}) ; std::vector> v_double({{0.,1.,2.,3.},{4.,5.,6.,7.}}) ; Matrix2D m_int(2,4) ; m_int.set_row(0, {0,1,2,3}) ; m_int.set_row(1, {4,5,6,7}) ; Matrix2D m_char(4,3) ; m_char.set_row(0, {'A','A','A'}) ; m_char.set_row(1, {'C','C','C'}) ; m_char.set_row(2, {'G','G','G'}) ; m_char.set_row(3, {'T','T','T'}) ; Matrix2D m_dbl(2,4) ; m_dbl.set_row(0, {0.,1.,2.,3.}) ; m_dbl.set_row(1, {4.,5.,6.,7.}) ; // matrix of int Matrix2D m_int1(file_int1) ; // this one is perfect Matrix2D m_int2(file_int2) ; // this one has inhomogeneous spaceers but is OK CHECK_EQUAL(m_int, m_int1) ; CHECK_EQUAL(m_int, m_int2) ; // matrix with only 1 int Matrix2D m_int3(file_int7) ; CHECK_EQUAL( Matrix2D(1,1,1), m_int3) ; // empty matrix (empty file) Matrix2D m_int4(file_int8) ; CHECK_EQUAL(Matrix2D(0,0), m_int4) ; // empty matrix (only eol in file) Matrix2D m_int5(file_int9) ; CHECK_EQUAL(Matrix2D(0,0), m_int5) ; // these files are not well formatted CHECK_THROW(m_int2 = Matrix2D(file_int3), std::runtime_error) ; // data are inhomogeneous CHECK_THROW(m_int2 = Matrix2D(file_int4), std::runtime_error) ; // empty line CHECK_THROW(m_int2 = Matrix2D(file_int5), std::runtime_error) ; // empty line CHECK_THROW(m_int2 = Matrix2D(file_int6), std::runtime_error) ; // empty line // matrix of char Matrix2D m_char1(file_char1) ; CHECK_EQUAL(m_char, m_char1) ; // matrix of double Matrix2D m_dbl1(file_double1) ; CHECK_EQUAL(m_dbl, m_dbl1) ; // file does not exist CHECK_THROW(Matrix2D m_int2(file_ghost), 
std::runtime_error) ; } // tests file format, writting a matrix and reading it should return the // same matrix, uses set() and the == operator // loading an empty file is not allowed (has no meaning, the file is empty) TEST(file_format) { for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { Matrix2D m(i,j) ; for(size_t a=0; a m2("./src/Unittests/data/matrix2d_out.mat") ; // any matrix with at least one zero dimension is a null // matrix if(i==0 or j==0) { CHECK_EQUAL(Matrix2D(0,0), m2) ; } else { CHECK_EQUAL(m, m2) ; } } } } } SUITE(Matrix3D) { // displays message TEST(message) { std::cout << "Starting Matrix3D tests..." << std::endl ; } // tests constructor TEST(constructor) { for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; } } } } // test constructor value TEST(constructor_value) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; for(const auto& i : m.get_data()) { CHECK_EQUAL(n, i) ; } } } } } // tests copy constructor TEST(constructor_copy) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; Matrix3D m2(m) ; CHECK_EQUAL(m, m2) ; } } } } // tests contructor from file, uses the == operator TEST(constructor_file) { std::string file_int1("./src/Unittests/data/matrix3d_int1.mat") ; std::string file_int2("./src/Unittests/data/matrix3d_int2.mat") ; std::string 
file_int3("./src/Unittests/data/matrix3d_int3.mat") ; std::string file_int4("./src/Unittests/data/matrix3d_int4.mat") ; std::string file_int5("./src/Unittests/data/matrix3d_int5.mat") ; std::string file_int6("./src/Unittests/data/matrix3d_int6.mat") ; std::string file_int7("./src/Unittests/data/matrix3d_int7.mat") ; std::string file_int8("./src/Unittests/data/matrix3d_int8.mat") ; std::string file_int9("./src/Unittests/data/matrix3d_int9.mat") ; std::string file_int10("./src/Unittests/data/matrix3d_int10.mat") ; std::string file_int11("./src/Unittests/data/matrix3d_int11.mat") ; std::string file_int12("./src/Unittests/data/matrix3d_int12.mat") ; std::string file_int13("./src/Unittests/data/matrix3d_int13.mat") ; std::string file_int14("./src/Unittests/data/matrix3d_int14.mat") ; std::string file_double("./src/Unittests/data/matrix3d_double.mat") ; std::string file_ghost("./src/Unittests/data/foo.mat") ; std::vector v_int = {-1,0,2,0, 0,3,0,4, 0,0,0,0, 0,0,0,0, 0,5,-6,0, 0,7,0,0} ; std::vector v_int2 = {1} ; std::vector v_dbl = {-1.,0., 2.,0., 0.,3., 0.,4., 0.,0., 0.,0., 0.,0., 0.,0., 0.,5.,-6.,0., 0.,7., 0.,0.} ; std::vector dim = {2,4,3} ; std::vector dim2 = {1,1,1} ; // matrix of int Matrix3D m_int(file_int1) ; CHECK_EQUAL(dim.size(), m_int.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_int.get_dim(), dim.size()) ; CHECK_EQUAL(v_int.size(), m_int.get_data_size()) ; CHECK_ARRAY_EQUAL(v_int, m_int.get_data(), v_int.size()) ; // matrix with only 1 int Matrix3D m_int2(file_int12) ; CHECK_EQUAL(Matrix3D(1,1,1,1), m_int2) ; // empty matrix (empty file) Matrix3D m_int3(file_int13) ; CHECK_EQUAL(Matrix3D(0,0,0), m_int3) ; // empty matrix (only eol in file) Matrix3D m_int4(file_int13) ; CHECK_EQUAL(Matrix3D(0,0,0), m_int4) ; // these files are not well formatted CHECK_THROW(Matrix3D m_int3(file_int2), std::runtime_error) ; // mixed data types CHECK_THROW(Matrix3D m_int3(file_int3), std::runtime_error) ; // slice of variable dim CHECK_THROW(Matrix3D m_int3(file_int4), 
std::runtime_error) ; // slice of variable dim CHECK_THROW(Matrix3D m_int3(file_int5), std::runtime_error) ; // slice of variable dim CHECK_THROW(Matrix3D m_int3(file_int6), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int7), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int8), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int9), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int10), std::runtime_error) ; // empty line CHECK_THROW(Matrix3D m_int3(file_int11), std::runtime_error) ; // empty line // this file does not exist CHECK_THROW(Matrix3D m_int3(file_ghost), std::runtime_error) ; // matrix of double Matrix3D m_double(file_double) ; CHECK_EQUAL(dim.size(), m_double.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_double.get_dim(), dim.size()) ; CHECK_EQUAL(v_int.size(), m_double.get_data_size()) ; CHECK_ARRAY_EQUAL(v_int, m_double.get_data(), v_int.size()) ; } // tests get() TEST(get) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; for(size_t l=0; l coord = convert_to_coord(m, l) ; CHECK_EQUAL(m.get(l), m.get(coord[0], coord[1], coord[2])) ; } } } } } // tests set() TEST(set) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { std::vector dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; for(size_t l=0; l coord = convert_to_coord(m, l) ; m.set(coord[0], coord[1], coord[2], l) ; } for(size_t l=0; l dim = {i,j,k} ; Matrix3D m(i,j,k,n) ; for(size_t l=0; l coord = convert_to_coord(m, l) ; m(coord[0], coord[1], coord[2]) = l ; } for(size_t l=0; l m(i,j,k) ; for(size_t a=0; a m2("./src/Unittests/data/matrix3d_out.mat") ; // any matrix with at least one zero dimension is a null // matrix if(i==0 or j==0 or k==0) { CHECK_EQUAL(Matrix3D(0,0,0), m2) ; } else { CHECK_EQUAL(m, m2) ; } } } } } } SUITE(Matrix4D) { // displays message TEST(message) { 
std::cout << "Starting Matrix4D tests..." << std::endl ; } // constructor TEST(constructor) { for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; } } } } } // test constructor value TEST(constructor_value) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l,n) ; CHECK_EQUAL(dim.size(), m.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m.get_dim(), dim.size()) ; CHECK_EQUAL(std::accumulate(begin(dim), end(dim), 1, std::multiplies()), m.get_data_size()) ; for(const auto& i : m.get_data()) { CHECK_EQUAL(n, i) ; } } } } } } // tests copy constructor TEST(constructor_copy) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l,n) ; Matrix4D m2(m) ; CHECK_EQUAL(m, m2) ; } } } } } // tests contructor from file, uses the == operator TEST(constructor_file) { std::string file_int1("./src/Unittests/data/matrix4d_int1.mat") ; std::string file_int2("./src/Unittests/data/matrix4d_int2.mat") ; std::string file_int3("./src/Unittests/data/matrix4d_int3.mat") ; std::string file_int4("./src/Unittests/data/matrix4d_int4.mat") ; std::string file_int5("./src/Unittests/data/matrix4d_int5.mat") ; std::string file_int6("./src/Unittests/data/matrix4d_int6.mat") ; std::string file_int7("./src/Unittests/data/matrix4d_int7.mat") ; std::string file_int8("./src/Unittests/data/matrix4d_int8.mat") ; std::string file_int9("./src/Unittests/data/matrix4d_int9.mat") ; std::string file_int10("./src/Unittests/data/matrix4d_int10.mat") ; std::string 
file_int11("./src/Unittests/data/matrix4d_int11.mat") ; std::string file_int12("./src/Unittests/data/matrix4d_int12.mat") ; std::string file_int13("./src/Unittests/data/matrix4d_int13.mat") ; std::string file_int14("./src/Unittests/data/matrix4d_int14.mat") ; std::string file_int15("./src/Unittests/data/matrix4d_int15.mat") ; std::string file_int16("./src/Unittests/data/matrix4d_int16.mat") ; std::string file_int17("./src/Unittests/data/matrix4d_int17.mat") ; std::string file_int18("./src/Unittests/data/matrix4d_int18.mat") ; std::string file_int19("./src/Unittests/data/matrix4d_int19.mat") ; std::string file_int20("./src/Unittests/data/matrix4d_int20.mat") ; std::string file_dbl1("./src/Unittests/data/matrix4d_double1.mat") ; std::string file_ghost("./src/Unittests/data/foo.mat") ; std::vector v_int = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24} ; std::vector v_dbl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12, 13,14,15, 16,17,18, 19,20,21, 22,23,24} ; std::vector dim = {2,3,2,4} ; // matrix of int Matrix4D m_int(file_int1) ; CHECK_EQUAL(dim.size(), m_int.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_int.get_dim(), dim.size()) ; CHECK_EQUAL(v_int.size(), m_int.get_data_size()) ; CHECK_ARRAY_EQUAL(v_int, m_int.get_data(), v_int.size()) ; // matrix with only 1 int Matrix4D m_int2(file_int18) ; CHECK_EQUAL(Matrix4D(1,1,1,1,1), m_int2) ; // empty matrix (empty file) Matrix4D m_int3(file_int19) ; CHECK_EQUAL(Matrix4D(0,0,0,0), m_int3) ; // empty matrix (only eol in file) Matrix4D m_int4(file_int20) ; CHECK_EQUAL(Matrix4D(0,0,0,0), m_int3) ; // these files are not well formatted CHECK_THROW(Matrix4D m_int5(file_int2), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int3), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int4), std::runtime_error) ; // 
empty lines CHECK_THROW(Matrix4D m_int5(file_int5), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int6), std::runtime_error) ; // empty lines CHECK_THROW(Matrix4D m_int5(file_int7), std::runtime_error) ; // first line problem CHECK_THROW(Matrix4D m_int5(file_int8), std::runtime_error) ; // first line problem CHECK_THROW(Matrix4D m_int5(file_int9), std::runtime_error) ; // first line problem CHECK_THROW(Matrix4D m_int5(file_int10), std::runtime_error) ; // second line problem CHECK_THROW(Matrix4D m_int5(file_int11), std::runtime_error) ; // extra column CHECK_THROW(Matrix4D m_int5(file_int12), std::runtime_error) ; // missing column CHECK_THROW(Matrix4D m_int5(file_int13), std::runtime_error) ; // extra row CHECK_THROW(Matrix4D m_int5(file_int14), std::runtime_error) ; // extra 2d slice CHECK_THROW(Matrix4D m_int5(file_int15), std::runtime_error) ; // extra 2d slice CHECK_THROW(Matrix4D m_int5(file_int16), std::runtime_error) ; // last line problem CHECK_THROW(Matrix4D m_int5(file_int17), std::runtime_error) ; // mixded data types // this file does not exist CHECK_THROW(Matrix4D m_int3(file_ghost), std::runtime_error) ; // matrix of double Matrix4D m_dbl(file_dbl1) ; CHECK_EQUAL(dim.size(), m_dbl.get_dim_size()) ; CHECK_ARRAY_EQUAL(dim, m_dbl.get_dim(), dim.size()) ; CHECK_EQUAL(v_dbl.size(), m_dbl.get_data_size()) ; CHECK_ARRAY_EQUAL(v_dbl, m_dbl.get_data(), v_dbl.size()) ; } // tests get() TEST(get) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,l,n) ; for(size_t a=0; a coord = convert_to_coord(m, a) ; CHECK_EQUAL(m.get(a), m.get(coord[0], coord[1], coord[2], coord[3])) ; } } } } } } // tests set() TEST(set) { int n = 999 ; for(size_t i=0; i<10; i++) { for(size_t j=0; j<10; j++) { for(size_t k=0; k<10; k++) { for(size_t l=0; l<10; l++) { std::vector dim = {i,j,k,l} ; Matrix4D m(i,j,k,n) ; for(size_t a=0; a 
coord = convert_to_coord(m, a) ; m.set(coord[0], coord[1], coord[2], coord[3], a) ; } for(size_t a=0; a dim = {i,j,k,l} ; Matrix4D m(i,j,k,l) ; for(size_t a=0; a m2("./src/Unittests/data/matrix4d_out.mat") ; // any matrix with at least one zero dimension is a null // matrix if(i==0 or j==0 or k==0 or l==0) { CHECK_EQUAL(Matrix4D(0,0,0,0), m2) ; } else { CHECK_EQUAL(m, m2) ; } } } } } } } - +*/ diff --git a/src/main.cpp b/src/main.cpp deleted file mode 100644 index 79a1aa7..0000000 --- a/src/main.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -#include -#include -#include - -int main() -{ - // Matrix2D data("/local/groux/scATAC-seq/toy.txt") ; - Matrix2D data("/local/groux/scATAC-seq/toy.txt") ; - size_t iter = 20 ; - size_t shift = 1 ; - bool flip = false ; - std::string seed("") ; - size_t thread = 4 ; - - size_t k_max = 6 ; - size_t i_max = 5 ; - - for(size_t k=1; k<=k_max; k++) - { // std::vector ll ; - for(size_t i=1; i<=i_max; i++) - { EMEngine em(data, - k, - iter, - shift, - flip, - EMEngine::seeding_codes::RANDOM, - seed, - thread) ; - - em.cluster() ; - // double ll = em.get_loglikelihood() ; - double n_param = ((double)k * ((double)data.get_ncol() - (double)shift + 1.)) + - ((double)shift * ((double)flip+1.) * (double)k) - 1. 
; - /* - std::cout << "k : " << k - << " n param : " << n_param - << " AIC : " << (2.*n_param) - (2.*ll) - << std::endl ; - */ - std::cout << k << " " << n_param << " " << em.get_aic() << std::endl ; - } - - } - - return EXIT_SUCCESS ; -} diff --git a/src/main_cormat.cpp b/src/main_cormat.cpp new file mode 100644 index 0000000..d9936c6 --- /dev/null +++ b/src/main_cormat.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include + +using namespace seqan; + + +template +std::ostream& operator << (std::ostream& stream, const std::vector& v) +{ + for(const auto& p : v) + { stream << p << " " ; } + return stream ; +} + +template +std::ostream& operator << (std::ostream& stream, const std::pair& p) +{ + stream << "[" << p.first << " " << p.second << "] " ; + return stream ; +} + +/* +std::pair get_bin_indices(const GenomeRegion& target, + const std::vector& bins) +{ // the bin range and chromosome + int chromosome_idx = bins.front().chromosome_idx ; + int bin_size = bins.front().end - bins.front().start ; + int from = bins.front().start ; + int to = bins.back().end ; + + // not on the same chromosome + if(target.chromosome_idx != chromosome_idx) + { return std::make_pair(0,0) ; } + // target goes over all bins + else if(target.start <= from and + target.end >= to) + { return std::make_pair(0, bins.size()) ; } + // check if overlap + else + { // define whether target limits are inside + int bin_start = -1 ; + int bin_end = -1 ; + + // define whether target limits are inside + bool target_start_in = false ; + bool target_end_in = false ; + if(target.start >= from and + target.start < to) + { target_start_in = true ; } + if(target.end > from and + target.end <= to) + { target_end_in = true ; } + + // start + if(not target_start_in) + { bin_start = 0 ; } + else + { bin_start = (target.start - from) / bin_size ; } + + // end + if(target_start_in and not target_end_in) + { bin_end = bin_start + 1 ; } + else if(not target_start_in and not target_end_in) 
+ { bin_end = 0 ; } + else + { bin_end = ((target.end - 1 - from) / bin_size) + 1 ; } + + return std::make_pair(bin_start, bin_end) ; + } +} +*/ + +std::pair get_bin_indices_naive(const GenomeRegion& target, + const std::vector& bins) +{ int bin_start = 0 ; + int bin_end = 0 ; + + GenomeRegion range(bins.front().chromosome, + bins.front().chromosome_idx, + bins.front().start, + bins.back().end) ; + + // no overlap + if(not (target | range)) + { return std::make_pair(0,0) ; } + else + { // start + if(target.start < bins.front().start) + { bin_start = 0 ; } + else + { for(int i=0; i< (int)bins.size(); i++) + { if(target.start >= bins[i].start and + target.start < bins[i].end) + { bin_start = i ; + break ; + } + } + } + // end + if(target.end > bins.back().end) + { bin_end = bins.size() ; } + else + { + for(int i=0; i<(int)bins.size(); i++) + { if(target.end <= bins[i].end and + target.end > bins[i].start) + { bin_end = i+1 ; + break ; + } + } + } + return std::make_pair(bin_start, bin_end) ; + } +} + +std::pair get_bin_indices(const GenomeRegion& target, + const std::vector& bins) +{ // the bin range and chromosome + GenomeRegion range(bins.front().chromosome, + bins.front().chromosome_idx, + bins.front().start, + bins.back().end) ; + // no overlap + if(not (target | range)) + { return std::make_pair(0,0) ; } + // overlap + else + { // target goes over all bins + if(target.start <= range.start and + target.end >= range.end) + { return std::make_pair(0, bins.size()) ; } + else + { int bin_start = -1 ; + int bin_end = -1 ; + int bin_size = bins.front().end - bins.front().start ; + + // start + if(target.start <= range.start) + { bin_start = 0 ; } + else + { bin_start = (target.start - range.start) / bin_size ; } + + // end + if(target.end >= range.end) + { bin_end = bins.size() ; } + else + { bin_end = ((target.end - 1 - range.start) / bin_size) + 1 ; } + return std::make_pair(bin_start, bin_end) ; + } + } +} + +int main() +{ + std::string bed = 
"data/10xgenomics_PBMC_5k/ctcf_motifs_10e-6.bed" ; + std::string bam = "data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" ; + std::string bai = "data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" ; + + CorrelationMatrixCreator mc(bed, + bam, + bai, + -100, + 100, + 10, + CorrelationMatrixCreator::methods::READ_ATAC) ; + + mc.create_matrix() ; + return 0; +} diff --git a/src/main_em.cpp b/src/main_em.cpp new file mode 100644 index 0000000..1ea4eea --- /dev/null +++ b/src/main_em.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include + +int main() +{ + Matrix2D data("/local/groux/scATAC-seq/toy.txt") ; + size_t iter = 5 ; + bool flip = false ; + std::string seed("") ; + size_t thread = 10 ; + + size_t shift_min = 1, shift_max = 5 ; + size_t k_min = 1, k_max = 6 ; + size_t i_max = 1 ; + + for(size_t k=k_min; k<=k_max; k++) + { // std::vector ll ; + for(size_t shift=shift_min; shift<=shift_max; shift++) + for(size_t i=1; i<=i_max; i++) + { EMEngine em(data, + k, + iter, + shift, + flip, + EMEngine::seeding_codes::RANDOM, + seed, + thread) ; + + em.cluster() ; + std::cout << i << " " << k << " " << shift << " " << em.get_aic() << std::endl ; + // std::cout << em.get_loglikelihood2() << std::endl ; + // std::cout << em.get_posterior_prob() << std::endl ; + // std::cout << em.get_references() << std::endl ; + // std::cout << em.test() << " " << em.get_loglikelihood() << " " << em.get_loglikelihood2() << std::endl ; + } + + } + + return EXIT_SUCCESS ; +} diff --git a/src/main_seqan.cpp b/src/main_seqan.cpp new file mode 100644 index 0000000..5967ae9 --- /dev/null +++ b/src/main_seqan.cpp @@ -0,0 +1,419 @@ +#include +#include +#include +#include + +#include +#include + +using namespace seqan; + +template +std::ostream& operator << (std::ostream& o, const std::unordered_map& map) +{ for(const auto& pair : map) + { o << "< " << pair.first << " " + << pair.second << " >" + << std::endl ; + } + return o 
; +} + + +void bam_stat(const std::string& path_bam) +{ // CharString bamFileInName = path_bam.c_str() ; + // Open input BAM file. + BamFileIn bamFileIn; + if (!open(bamFileIn, path_bam.c_str())) + { + char msg[1024] ; + sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; + throw std::runtime_error(msg); + } + + // read header. + BamHeader header; + try + { readHeader(header, bamFileIn); + } + catch (ParseError const & e) + { char msg[1024] ; + sprintf(msg, "ERROR: input header is badly formatted. %s", e.what()) ; + throw std::runtime_error(msg); + } + + // counters + int n_frag = 0 ; + int n_frag_bad = 0 ; + int n_read_fw_1 = 0 ; + int n_read_fw_2 = 0 ; + int n_read_rv_1 = 0 ; + int n_read_rv_2 = 0 ; + + BamAlignmentRecord record; + while (!atEnd(bamFileIn)) + { readRecord(record, bamFileIn) ; + n_frag++ ; + if(not seqan::hasFlagAllProper(record)) + { n_frag_bad++ ; + continue ; + } + else if((seqan::hasFlagRC(record) and + seqan::hasFlagNextRC(record)) or + (not seqan::hasFlagRC(record) and + not seqan::hasFlagNextRC(record))) + { n_frag_bad++ ; + continue ; + } + + // read + if(seqan::hasFlagFirst(record) and not seqan::hasFlagRC(record)) + { n_read_fw_1++ ; } + + if(not seqan::hasFlagFirst(record) and not seqan::hasFlagRC(record)) + { n_read_fw_2++ ; } + + if(seqan::hasFlagFirst(record) and seqan::hasFlagRC(record)) + { n_read_rv_1++ ; } + + if(not seqan::hasFlagFirst(record) and seqan::hasFlagRC(record)) + { n_read_rv_2++ ; } + } + close(bamFileIn) ; + std::cout << path_bam << std::endl ; + std::cout << "n frag : " << n_frag << std::endl ; + std::cout << "n frag bad qual : " << n_frag_bad << std::endl ; + std::cout << "n read fw 1st : " << n_read_fw_1 << std::endl ; + std::cout << "n read fw 2nd : " << n_read_fw_2 << std::endl ; + std::cout << "n read rv 1st : " << n_read_rv_1 << std::endl ; + std::cout << "n read rv 2nd : " << n_read_rv_2 << std::endl << std::endl ; +} + +bool is_good_read(const seqan::BamAlignmentRecord& record) +{ + 
if((seqan::hasFlagUnmapped(record)) or // read unmapped flag + seqan::hasFlagQCNoPass(record) or // not passing QC flag + seqan::hasFlagDuplicate(record)) // PCR duplicate flag + { return false ; } + return true ; +} + +bool is_good_pair(const seqan::BamAlignmentRecord& record) +{ + if((not seqan::hasFlagMultiple(record)) or // is paired flag + (not seqan::hasFlagAllProper(record))) // each read properly aligned flag + { return false ; } + + if((not seqan::hasFlagFirst(record)) or // read 1st in pair flag + seqan::hasFlagLast(record)) // mate 1st in pair flag + { return false ; } + + // read info + bool read_is_rev = seqan::hasFlagRC(record) ; // read is rev flag + int read_start = record.beginPos ; + // mate info + bool mate_is_rev = seqan::hasFlagNextRC(record) ; // mate is rev flag + int mate_start = record.pNext ; + + // qc + if((not is_good_read(record)) or + // --> --> + (not read_is_rev and not mate_is_rev) or + // <-- <-- + (read_is_rev and mate_is_rev) or + // <-- --> 1/2 + ((read_is_rev and not mate_is_rev) and (read_start < mate_start)) or + // <-- --> 2/2 + ((not read_is_rev and mate_is_rev) and (read_start > mate_start))) + { return false ; } + return true ; +} + +void read_bam(const std::string& path_bam) +{ + BamFileIn bamFileIn; + if (!open(bamFileIn, path_bam.c_str())) + { + char msg[1024] ; + sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; + throw std::runtime_error(msg); + } + + // read header. + BamHeader header; + try + { readHeader(header, bamFileIn); + } + catch (ParseError const & e) + { char msg[1024] ; + sprintf(msg, "ERROR: input header is badly formatted. 
%s", e.what()) ; + throw std::runtime_error(msg); + } + + BamAlignmentRecord record; + while (!atEnd(bamFileIn)) + { + readRecord(record, bamFileIn) ; + + bool read_rev = seqan::hasFlagRC(record) ; + bool mate_rev = seqan::hasFlagNextRC(record) ; + int read_start = record.beginPos ; + int mate_start = record.pNext ; + std::string chrom = seqan::toCString(seqan::getContigName(record, bamFileIn)) ; + + if(not is_good_pair(record)) + { continue ; } + + char msg[1024] ; + if(not read_rev and mate_rev) + { sprintf(msg, "[fw %s %d] [rv %s %d]", + chrom.c_str(), read_start, chrom.c_str(), mate_start) ; + } + else if(read_rev and not mate_rev) + { sprintf(msg, "[rv %s %d] [fw %s %d]", + chrom.c_str(), read_start, chrom.c_str(), mate_start) ; + } + std::cout << msg << std::endl ; + } + close(bamFileIn) ; +} + +void count_record_bam(const std::string& path_bam) +{ + BamFileIn bamFileIn; + if (!open(bamFileIn, path_bam.c_str())) + { + char msg[1024] ; + sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; + throw std::runtime_error(msg); + } + + // read header. + BamHeader header; + try + { readHeader(header, bamFileIn); + } + catch (ParseError const & e) + { char msg[1024] ; + sprintf(msg, "ERROR: input header is badly formatted. %s", e.what()) ; + throw std::runtime_error(msg); + } + + size_t n_rec = 0 ; + BamAlignmentRecord record; + while (!atEnd(bamFileIn)) + { + readRecord(record, bamFileIn) ; + n_rec++ ; + } + close(bamFileIn) ; + std::cout << "nber record : " << n_rec << std::endl ; +} + +void check_chromosomes_bam(const std::string& path_bam) +{ + BamFileIn bamFileIn; + if (!open(bamFileIn, path_bam.c_str())) + { + char msg[1024] ; + sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; + throw std::runtime_error(msg); + } + + // read header. + BamHeader header; + try + { readHeader(header, bamFileIn); + } + catch (ParseError const & e) + { char msg[1024] ; + sprintf(msg, "ERROR: input header is badly formatted. 
%s", e.what()) ; + throw std::runtime_error(msg); + } + + int chrom_n = 0 ; + std::unordered_map map ; + + BamAlignmentRecord record; + size_t i=0 ; + while (!atEnd(bamFileIn)) + { + readRecord(record, bamFileIn) ; + std::string chrom = seqan::toCString( + seqan::getContigName( + record, bamFileIn)) ; + if(map.find(chrom) == map.end()) + { map[chrom] = chrom_n ; + chrom_n++ ; + } + /* + else if(map.find(chrom) != map.end() and + chrom_n-1 != map.find(chrom)->second) + { auto chrom_tmp = map.find(chrom)->first ; + auto chrom_n_tmp = map.find(chrom)->second ; + std::cout << "sorting issue with " + << chrom_tmp << " " << chrom_n_tmp << "/" << chrom_n + << std::endl ; + } + */ + std::cout << i << std::endl ; + i++ ; + } + close(bamFileIn) ; + std::cout << "chromosomes :" << std::endl << map << std::endl ; +} + +void test(const std::string& path_bam) +{ + BamFileIn bamFileIn; + if (!open(bamFileIn, path_bam.c_str())) + { + char msg[1024] ; + sprintf(msg, "ERROR: could not open input file %s", path_bam.c_str()) ; + throw std::runtime_error(msg); + } + + // Open output SAM which is the standard output. + BamFileOut samFileOut(context(bamFileIn), std::cout, Sam()); + + // read header. + BamHeader header; + try + { readHeader(header, bamFileIn) ; } + catch (ParseError const & e) + { char msg[1024] ; + sprintf(msg, "ERROR: input header is badly formatted. 
%s", e.what()) ; + throw std::runtime_error(msg); + } + + size_t n_rec = 0 ; + BamAlignmentRecord record; + while (!atEnd(bamFileIn)) + { + readRecord(record, bamFileIn) ; + if(n_rec == 165421293) + { writeRecord(samFileOut, record) ; + std::cout << is_good_read(record) << std::endl ; + } + else if(n_rec == 366090419) + { writeRecord(samFileOut, record) ; + std::cout << is_good_read(record) << std::endl ; + // should bug + /* + std::string chrom = seqan::toCString( + seqan::getContigName( + record, bamFileIn)) ; + */ + } + n_rec++ ; + } + close(bamFileIn) ; + std::cout << "nber record : " << n_rec << std::endl ; +} + + +int copy_top_bam(const std::string& bam_path_in, + const std::string& bam_path_out, + int n_lines) +{ + + // Open input file, BamFileIn can read SAM and BAM files. + BamFileIn bamFileIn; + if (!open(bamFileIn, bam_path_in.c_str())) + { + std::cerr << "ERROR: Could not open " << bam_path_in << std::endl; + return 1; + } + // Open output file, BamFileOut accepts also an ostream and a format tag. + BamFileOut bamFileOut(bam_path_out.c_str()); + + try + { + // Copy header. + BamHeader header; + readHeader(header, bamFileIn); + writeHeader(bamFileOut, header); + + // Copy records. + BamAlignmentRecord record; + int n=0 ; + while (not atEnd(bamFileIn) and + n