# This toy dataset contains 2 chromosomes with exactly the same fragments
# on each one of them. Each fragment belong to 1 cell For each chromosome, the situation is
# the following.
# In the following picture, the fragments are depicted as [from,to) inverval. Thus the fragments
# contain the from position but do NOT contain the end position. Each fragment is present 2x, 
# once the fw read is the 1st read of the pair, once the rv read is the 1st read of the pair.
#
# Each fragment is composed of two reads of 35bp each. The fw read is always the 1st read of the 
# pair and the rv read the 2nd.
#
#    <-------------------------->
#    AAAAAAA
#    ------>              <------
#                         TTTTTTT
#    read fw (35bp)       read rv (35bp)
#
# The genome is only made of A on the fw strand and T on the rv strand.
#
#                       550      650      750      850      950      1050     1150     1250     1350     1450
#                        |        |        |        |        |        |        |        |        |        |
#         --------------------------------------------------------------------------------------------------> chrom
#         400  480       |        |        |        |        |        |        |        |        |        |
# cell 0   <----->       |        |        |        |        |        |        |        |        |        |
#                480  550         |        |        |        |        |        |        |        |        |
# cell 1          <----->|        |        |        |        |        |        |        |        |        | 
#                        | 560    |        | 800    |        |        |        |        |        |        |
# cell 2                 | <------------------>     |        |        |        |        |        |        |
#                        | 560 640|        |        |        |        |        |        |        |        |
# cell 3                 | <----->|        |        |        |        |        |        |        |        |
#                        |     610| 690    |        |        |        |        |        |        |        |
# cell 4                 |      <----->    |        |        |        |        |        |        |        |
#                        |        |670    750       |        |        |        |        |        |        |
# cell 5                 |        |  <----->        |        |        |        |        |        |        |
#                        |        |      730    810 |        |        |        |        |        |        |
# cell 6                 |        |       <----->   |        |        |        |        |        |        |
#                        |        |        | 770   850       |        |        |        |        |        |
# cell 7                 |        |        |  <----->        |        |        |        |        |        |
#                        |        |        |        |       950       |       1150      |        |        |
# cell 8                 |        |        |        |        <----------------->        |        |        |
#                        |        |        |        |        |960   1040       |        |        |        |
# cell 9                 |        |        |        |        | <----->|        |        |        |        |
#                        |        |        |        |        |     1010  1090  |        |        |        |
# cell 10                |        |        |        |        |      <----->    |        |        |        |
#                        |        |        |        |        |        |1060  1140       |        |        |
# cell 11                |        |        |        |        |        | <----->|        |        |        |
#                        |        |        |        |        |        | 1070  1150      |        |        |
# cell 12                |        |        |        |        |        |  <----->        |        |        |
#                        |        |        |        |        |        |        |        |       1350  1430|
# cell 13                |        |        |        |        |        |        |        |        <----->  |
#                        |        |        |        |        |        |        |        |        |1360  1440
# cell 14                |        |        |        |        |        |        |        |        | <----->|
#                        |        |        |        |        |        |        |        |        |   1410 | 1490
# cell 15                |        |        |        |        |        |        |        |        |     <----->
#                        |        |        |        |        |        |        |        |        |        |   1500          1600
# cell 16                |        |        |        |        |        |        |        |        |        |     <------------>
#                        |        |        |        |        |        |        |        |        |        |                 1600         1700    
# cell 17                |        |        |        |        |        |        |        |        |        |                  <------------>

import pysam
import os

def create_read_pair(frag_start, ref_id, frag_len):

	# Creates 2 pairs of read representing twice 
	# the same fragment. In one pair, the 1st 
	# read is fw and the 2n is rev, in the other 
	# pair, the 1st read is rev and the 2nd is fw.
	# ---------------------------------> reference
	#  read_fw1               read_rv1	
	# -------->              <--------
	# |start                      end|
	# |-----------frag len-----------|
	# |end                      start|
	#  read_fw2               read_rv2	
	# -------->              <--------
	
	n = create_read_pair.counter

	# the reads	
	read_fw1 = pysam.AlignedSegment()
	read_rv1 = pysam.AlignedSegment()
	read_fw2 = pysam.AlignedSegment()
	read_rv2 = pysam.AlignedSegment()

	# the start of the reverse mate (rightmost pos)
	# start_rv = frag_start + frag_len - 1

	# the start of the reverse mate (leftmost pos)
	start_rv = frag_start + frag_len - 35
	print("%d  %d" % (frag_start, start_rv))

	# flags
	flag_fw1 = 99  # paired read, read mapped in proper pair
	               # mate rev strand, first in pair	
	flag_rv1 = 147 # paired read, read mapped in proper pair
	               # read in rev strand, second in pair

	flag_fw2 = 163 # paired read, read mapped in proper pair
	               # mate rev strand, second in pair	
	flag_rv2 = 83  # paired read, read mapped in proper pair
	               # read in rev strand, first in pair 

	# optional field tags
	extra_tags = (("NM", 1),    # edit distance with ref
	              ("RG", "L1"),	# read group
                  ("CB", "cell_%d" % n)) # cell barcode

	# pair 1 : 1st read fw, 2nd read rev
	## fw read
	read_fw1.query_name = "read_fw1_%d" % n
	read_fw1.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
	read_fw1.flag = flag_fw1
	read_fw1.reference_id = ref_id
	read_fw1.reference_start = frag_start
	read_fw1.mapping_quality = 20
	# read_fw1.cigar = ((0,10), (2,1), (0,25))
	read_fw1.next_reference_id = ref_id
	read_fw1.next_reference_start = start_rv
	read_fw1.template_length = frag_len
	read_fw1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
	read_fw1.tags = extra_tags

	## rev read
	read_rv1.query_name = "read_rv1_%d" % n
	read_rv1.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
	read_rv1.flag = flag_rv1
	read_rv1.reference_id = ref_id
	read_rv1.reference_start = start_rv
	read_rv1.mapping_quality = 20
	# read_rv1.cigar = ((0,10), (2,1), (0,25))
	read_rv1.next_reference_id = ref_id
	read_rv1.next_reference_start = frag_start
	read_rv1.template_length = -frag_len
	read_rv1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
	read_rv1.tags = extra_tags
	create_read_pair.counter += 1

	# pair 2 : 1st read rev, 2nd read fw
	## fw read
	read_fw2.query_name = "read_fw2_%d" % n
	read_fw2.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
	read_fw2.flag = flag_fw2
	read_fw2.reference_id = ref_id
	read_fw2.reference_start = frag_start
	read_fw2.mapping_quality = 20
	# read_fw2.cigar = ((0,10), (2,1), (0,25))
	read_fw2.next_reference_id = ref_id
	read_fw2.next_reference_start = start_rv
	read_fw2.template_length = frag_len
	read_fw2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
	read_fw2.tags = extra_tags

	## rev read
	read_rv2.query_name = "read_rv2_%d" % n
	read_rv2.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
	read_rv2.flag = flag_rv2
	read_rv2.reference_id = ref_id
	read_rv2.reference_start = start_rv
	read_rv2.mapping_quality = 20
	# read_rv2.cigar = ((0,10), (2,1), (0,25))
	read_rv2.next_reference_id = ref_id
	read_rv2.next_reference_start = frag_start
	read_rv2.template_length = -frag_len
	read_rv2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
	read_rv2.tags = extra_tags
	create_read_pair.counter += 1

	return ((read_fw1, read_rv1), (read_fw2, read_rv2))

create_read_pair.counter = 0

if __name__ == "__main__":
	# file header, the genome will have 2 chromosomes
	header = { 'HD': {'VN': '1.0',
		              'SO': 'unsorted'}, 
		       'SQ': [{'LN': 2000, 'SN': 'chr1'},    # chrom index 0
		              {'LN': 2000, 'SN': 'chr2'}] }  # chrom index 1

	file_out = os.path.join("data", "toy_data", "sc_reads.bam")
	f_out = pysam.AlignmentFile(file_out, header=header, mode="wb")

	chromosomes = [0, 1]

	read_fw_starts = (400, 470, 560, 560, 610, 670, 730, 770, 950, 960, \
		              1010, 1060, 1070, 1350, 1360, 1410, 1500, 1600)

	frag_lengths = (80, 80, 240, 80, 80, 80, 80, 80, 200, 80, \
		            80, 80, 80, 80, 80, 80, 100, 100)

	for chrom in chromosomes:
		for i in range(0, len(read_fw_starts), 1):
			read_fw_start = read_fw_starts[i]
			frag_len = frag_lengths[i]
			reads = create_read_pair(read_fw_start, chrom, frag_len)
			
			f_out.write(reads[0][0])
			f_out.write(reads[0][1])
			f_out.write(reads[1][0])
			f_out.write(reads[1][1])

	f_out.close()

# read_fw, read_rev = create_read_pair(0, 0, 100)
# f_out.write(read_fw)
# f_out.write(read_rev)
# f_out.close()


# read_fw = pysam.AlignedSegment()
# read_fw.query_name = "read_1"
# read_fw.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
# read_fw.flag = 99
# read_fw.reference_id = 0
# read_fw.reference_start = 32
# read_fw.mapping_quality = 20
# # read_fw.cigar = ((0,10), (2,1), (0,25))
# read_fw.next_reference_id = 0
# read_fw.next_reference_start=199
# read_fw.template_length=167
# read_fw.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
# read_fw.tags = (("NM", 1),("RG", "L1"))
# f_out.write(read_fw)

# read_rv = pysam.AlignedSegment()
# read_rv.query_name = "read_1"
# read_rv.query_sequence= "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
# read_rv.flag = 147
# read_rv.reference_id = 0
# read_rv.reference_start = 199 
# read_rv.mapping_quality = 20
# read_fw.cigar = ((0,10), (2,1), (0,25))
# read_rv.next_reference_id = 0
# read_rv.next_reference_start=32
# read_rv.template_length=167
# read_rv.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
# read_rv.tags = (("NM", 1),("RG", "L1"))
# f_out.write(read_rv)