# This toy dataset contains 2 chromosomes with exactly the same fragments # on each one of them. Each fragment belong to 1 cell For each chromosome, the situation is # the following. # In the following picture, the fragments are depicted as [from,to) inverval. Thus the fragments # contain the from position but do NOT contain the end position. Each fragment is present 2x, # once the fw read is the 1st read of the pair, once the rv read is the 1st read of the pair. # # Each fragment is composed of two reads of 35bp each. The fw read is always the 1st read of the # pair and the rv read the 2nd. # # <--------------------------> # AAAAAAA # ------> <------ # TTTTTTT # read fw (35bp) read rv (35bp) # # The genome is only made of A on the fw strand and T on the rv strand. # # 550 650 750 850 950 1050 1150 1250 1350 1450 # | | | | | | | | | | # --------------------------------------------------------------------------------------------------> chrom # 400 480 | | | | | | | | | | # cell 0 <-----> | | | | | | | | | | # 480 550 | | | | | | | | | # cell 1 <----->| | | | | | | | | | # | 560 | | 800 | | | | | | | # cell 2 | <------------------> | | | | | | | # | 560 640| | | | | | | | | # cell 3 | <----->| | | | | | | | | # | 610| 690 | | | | | | | | # cell 4 | <-----> | | | | | | | | # | |670 750 | | | | | | | # cell 5 | | <-----> | | | | | | | # | | 730 810 | | | | | | | # cell 6 | | <-----> | | | | | | | # | | | 770 850 | | | | | | # cell 7 | | | <-----> | | | | | | # | | | | 950 | 1150 | | | # cell 8 | | | | <-----------------> | | | # | | | | |960 1040 | | | | # cell 9 | | | | | <----->| | | | | # | | | | | 1010 1090 | | | | # cell 10 | | | | | <-----> | | | | # | | | | | |1060 1140 | | | # cell 11 | | | | | | <----->| | | | # | | | | | | 1070 1150 | | | # cell 12 | | | | | | <-----> | | | # | | | | | | | | 1350 1430| # cell 13 | | | | | | | | <-----> | # | | | | | | | | |1360 1440 # cell 14 | | | | | | | | | <----->| # | | | | | | | | | 1410 | 1490 # cell 15 | | | | | | | | | <-----> # | | | | | | | | | | 1500 1600 # cell 16 | | | | | | | | | | <------------> # | | | | | | | | | | 1600 1700 # cell 17 | | | | | | | | | | <------------> import pysam import os def create_read_pair(frag_start, ref_id, frag_len): # Creates 2 pairs of read representing twice # the same fragment. In one pair, the 1st # read is fw and the 2n is rev, in the other # pair, the 1st read is rev and the 2nd is fw. # ---------------------------------> reference # read_fw1 read_rv1 # --------> <-------- # |start end| # |-----------frag len-----------| # |end start| # read_fw2 read_rv2 # --------> <-------- n = create_read_pair.counter # the reads read_fw1 = pysam.AlignedSegment() read_rv1 = pysam.AlignedSegment() read_fw2 = pysam.AlignedSegment() read_rv2 = pysam.AlignedSegment() # the start of the reverse mate (rightmost pos) # start_rv = frag_start + frag_len - 1 # the start of the reverse mate (leftmost pos) start_rv = frag_start + frag_len - 35 print("%d %d" % (frag_start, start_rv)) # flags flag_fw1 = 99 # paired read, read mapped in proper pair # mate rev strand, first in pair flag_rv1 = 147 # paired read, read mapped in proper pair # read in rev strand, second in pair flag_fw2 = 163 # paired read, read mapped in proper pair # mate rev strand, second in pair flag_rv2 = 83 # paired read, read mapped in proper pair # read in rev strand, first in pair # optional field tags extra_tags = (("NM", 1), # edit distance with ref ("RG", "L1"), # read group ("CB", "cell_%d" % n)) # cell barcode # pair 1 : 1st read fw, 2nd read rev ## fw read read_fw1.query_name = "read_fw1_%d" % n read_fw1.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" read_fw1.flag = flag_fw1 read_fw1.reference_id = ref_id read_fw1.reference_start = frag_start read_fw1.mapping_quality = 20 # read_fw1.cigar = ((0,10), (2,1), (0,25)) read_fw1.next_reference_id = ref_id read_fw1.next_reference_start = start_rv read_fw1.template_length = frag_len read_fw1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") read_fw1.tags = extra_tags ## rev read read_rv1.query_name = "read_rv1_%d" % n read_rv1.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" read_rv1.flag = flag_rv1 read_rv1.reference_id = ref_id read_rv1.reference_start = start_rv read_rv1.mapping_quality = 20 # read_rv1.cigar = ((0,10), (2,1), (0,25)) read_rv1.next_reference_id = ref_id read_rv1.next_reference_start = frag_start read_rv1.template_length = -frag_len read_rv1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") read_rv1.tags = extra_tags create_read_pair.counter += 1 # pair 2 : 1st read rev, 2nd read fw ## fw read read_fw2.query_name = "read_fw2_%d" % n read_fw2.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" read_fw2.flag = flag_fw2 read_fw2.reference_id = ref_id read_fw2.reference_start = frag_start read_fw2.mapping_quality = 20 # read_fw2.cigar = ((0,10), (2,1), (0,25)) read_fw2.next_reference_id = ref_id read_fw2.next_reference_start = start_rv read_fw2.template_length = frag_len read_fw2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") read_fw2.tags = extra_tags ## rev read read_rv2.query_name = "read_rv2_%d" % n read_rv2.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" read_rv2.flag = flag_rv2 read_rv2.reference_id = ref_id read_rv2.reference_start = start_rv read_rv2.mapping_quality = 20 # read_rv2.cigar = ((0,10), (2,1), (0,25)) read_rv2.next_reference_id = ref_id read_rv2.next_reference_start = frag_start read_rv2.template_length = -frag_len read_rv2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") read_rv2.tags = extra_tags create_read_pair.counter += 1 return ((read_fw1, read_rv1), (read_fw2, read_rv2)) create_read_pair.counter = 0 if __name__ == "__main__": # file header, the genome will have 2 chromosomes header = { 'HD': {'VN': '1.0', 'SO': 'unsorted'}, 'SQ': [{'LN': 2000, 'SN': 'chr1'}, # chrom index 0 {'LN': 2000, 'SN': 'chr2'}] } # chrom index 1 file_out = os.path.join("data", "toy_data", "sc_reads.bam") f_out = pysam.AlignmentFile(file_out, header=header, mode="wb") chromosomes = [0, 1] read_fw_starts = (400, 470, 560, 560, 610, 670, 730, 770, 950, 960, \ 1010, 1060, 1070, 1350, 1360, 1410, 1500, 1600) frag_lengths = (80, 80, 240, 80, 80, 80, 80, 80, 200, 80, \ 80, 80, 80, 80, 80, 80, 100, 100) for chrom in chromosomes: for i in range(0, len(read_fw_starts), 1): read_fw_start = read_fw_starts[i] frag_len = frag_lengths[i] reads = create_read_pair(read_fw_start, chrom, frag_len) f_out.write(reads[0][0]) f_out.write(reads[0][1]) f_out.write(reads[1][0]) f_out.write(reads[1][1]) f_out.close() # read_fw, read_rev = create_read_pair(0, 0, 100) # f_out.write(read_fw) # f_out.write(read_rev) # f_out.close() # read_fw = pysam.AlignedSegment() # read_fw.query_name = "read_1" # read_fw.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" # read_fw.flag = 99 # read_fw.reference_id = 0 # read_fw.reference_start = 32 # read_fw.mapping_quality = 20 # # read_fw.cigar = ((0,10), (2,1), (0,25)) # read_fw.next_reference_id = 0 # read_fw.next_reference_start=199 # read_fw.template_length=167 # read_fw.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") # read_fw.tags = (("NM", 1),("RG", "L1")) # f_out.write(read_fw) # read_rv = pysam.AlignedSegment() # read_rv.query_name = "read_1" # read_rv.query_sequence= "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" # read_rv.flag = 147 # read_rv.reference_id = 0 # read_rv.reference_start = 199 # read_rv.mapping_quality = 20 # read_fw.cigar = ((0,10), (2,1), (0,25)) # read_rv.next_reference_id = 0 # read_rv.next_reference_start=32 # read_rv.template_length=167 # read_rv.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<") # read_rv.tags = (("NM", 1),("RG", "L1")) # f_out.write(read_rv)