Page MenuHomec4science

split_bam.py
No OneTemporary

File Metadata

Created
Tue, May 7, 15:52

split_bam.py

import optparse
import sys
import os
import typing as tp
import pysam
def construct_dict_files(f_values: str, f_prefix: str) -> tp.Dict[str, str]:
"""
Reads a file containing a list of tag values (one value per line) and constructs a dictionary
of values (key) and file addresses (values) to later dispatch the reads in.
:param f_values: the address of the file containing the tag values
:param f_prefix: a common prefix for the addresses of all the file addresses in the dictionary.
:return: a dictionary with tag values and their associated file addresses.
"""
d = dict()
with open(f_values, "rt") as f:
for line in f:
value = line.rstrip()
if d.get(value, None) is None:
f_reads = "%s%s.sam" % (f_prefix, value)
d[value] = f_reads
else:
pass
return d
def split_bam(f_bam: str, tag:str, d_files: tp.Dict[str, str]):
"""
Splits the bam file according to the given tag values.
The bam file is read and each read is check for the given tag value. If the value is listed in the given file
dictionary, then the read is written to the corresponding file.
:param f_bam: the address of the bam file to split.
:param tag: the tag which should be used for splitting.
:param d_files: a dictionary containing the accepted values for sorting (key) and the addresses of the
corresponding files in which the reads should be dispatched.
"""
# Create all files and a 2nd dictionary telling whether header has already been written, the key is still the
# value. Don't write the sam file headers now. If a file is given no read, then it will be empty, not only
# containing a header
d_header = dict()
for key in d_files.keys():
f = open(d_files[key], "wt")
f.close()
d_header[key] = False
# read bam file and dispatch the reads
bam = pysam.AlignmentFile(f_bam)
for read in bam:
if read.has_tag(tag):
value = read.get_tag(tag)
# only treat value present in the list
if d_files.get(value, None) is not None:
# cannot keep all files open, raises an OS Error if too many are open at the same time
with open(d_files[value], "at") as f:
# write header if file has not been written before
if d_header[value] is False:
f.write(str(read.header))
d_header[value] = True
f.write("%s\n" % read.to_string())
bam.close()
if __name__ == "__main__":
# parse options
usage = "usage: %s [options]" % os.path.basename(__file__)
epilog = "This program reads a bam file and dispatches the reads into separated sam files according to the " \
"values associated with a specified tag. The accepted values should be listed into a text file. The " \
"output files will be located in the current directory.\n" \
"Written by Romain Groux, January 2019"
parser = optparse.OptionParser(usage=usage, epilog=epilog)
parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
help="the addresse of the bam file to split.")
parser.add_option("-p", "--prefix", dest="prefix", default="", type="string", action="store",
help="a name prefix for the files in which the reads will be dispatched.")
parser.add_option("--values", dest="file_values", default=None, type="string", action="store",
help="the address of the file containing the associated tag values relevant for the splitting.")
parser.add_option("--tag", dest="tag", default=None, type="string", action="store",
help="The tag which values will be used for the splitting.")
(options, args) = parser.parse_args()
file_in = options.file_in
file_values = options.file_values
prefix = options.prefix
tag = options.tag
# check options
if file_in is None:
print("Error! No input file given (-i)!", sys.stderr)
exit(1)
elif not os.path.isfile(file_in):
print("Error! %s does not exist!" % file_in)
exit(1)
elif file_values is None:
print("Error! No value file given (--values)!", sys.stderr)
exit(1)
elif not os.path.isfile(file_values):
print("Error! %s does not exist!" % file_values)
exit(1)
elif tag is None:
print("Error! no tag was given (--tag)!")
if prefix != "":
prefix = "%s_" % prefix
# split bam file
dict_files = construct_dict_files(file_values, prefix)
split_bam(file_in, tag, dict_files)

Event Timeline