Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61588209
split_bam.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 7, 15:52
Size
4 KB
Mime Type
text/x-python
Expires
Thu, May 9, 15:52 (2 d)
Engine
blob
Format
Raw Data
Handle
17531844
Attached To
R8820 scATAC-seq
split_bam.py
View Options
import
optparse
import
sys
import
os
import
typing
as
tp
import
pysam
def
construct_dict_files
(
f_values
:
str
,
f_prefix
:
str
)
->
tp
.
Dict
[
str
,
str
]:
"""
Reads a file containing a list of tag values (one value per line) and constructs a dictionary
of values (key) and file addresses (values) to later dispatch the reads in.
:param f_values: the address of the file containing the tag values
:param f_prefix: a common prefix for the addresses of all the file addresses in the dictionary.
:return: a dictionary with tag values and their associated file addresses.
"""
d
=
dict
()
with
open
(
f_values
,
"rt"
)
as
f
:
for
line
in
f
:
value
=
line
.
rstrip
()
if
d
.
get
(
value
,
None
)
is
None
:
f_reads
=
"
%s%s
.sam"
%
(
f_prefix
,
value
)
d
[
value
]
=
f_reads
else
:
pass
return
d
def
split_bam
(
f_bam
:
str
,
tag
:
str
,
d_files
:
tp
.
Dict
[
str
,
str
]):
"""
Splits the bam file according to the given tag values.
The bam file is read and each read is check for the given tag value. If the value is listed in the given file
dictionary, then the read is written to the corresponding file.
:param f_bam: the address of the bam file to split.
:param tag: the tag which should be used for splitting.
:param d_files: a dictionary containing the accepted values for sorting (key) and the addresses of the
corresponding files in which the reads should be dispatched.
"""
# Create all files and a 2nd dictionary telling whether header has already been written, the key is still the
# value. Don't write the sam file headers now. If a file is given no read, then it will be empty, not only
# containing a header
d_header
=
dict
()
for
key
in
d_files
.
keys
():
f
=
open
(
d_files
[
key
],
"wt"
)
f
.
close
()
d_header
[
key
]
=
False
# read bam file and dispatch the reads
bam
=
pysam
.
AlignmentFile
(
f_bam
)
for
read
in
bam
:
if
read
.
has_tag
(
tag
):
value
=
read
.
get_tag
(
tag
)
# only treat value present in the list
if
d_files
.
get
(
value
,
None
)
is
not
None
:
# cannot keep all files open, raises an OS Error if too many are open at the same time
with
open
(
d_files
[
value
],
"at"
)
as
f
:
# write header if file has not been written before
if
d_header
[
value
]
is
False
:
f
.
write
(
str
(
read
.
header
))
d_header
[
value
]
=
True
f
.
write
(
"
%s
\n
"
%
read
.
to_string
())
bam
.
close
()
if
__name__
==
"__main__"
:
# parse options
usage
=
"usage:
%s
[options]"
%
os
.
path
.
basename
(
__file__
)
epilog
=
"This program reads a bam file and dispatches the reads into separated sam files according to the "
\
"values associated with a specified tag. The accepted values should be listed into a text file. The "
\
"output files will be located in the current directory.
\n
"
\
"Written by Romain Groux, January 2019"
parser
=
optparse
.
OptionParser
(
usage
=
usage
,
epilog
=
epilog
)
parser
.
add_option
(
"-i"
,
"--input"
,
dest
=
"file_in"
,
default
=
None
,
type
=
"string"
,
action
=
"store"
,
help
=
"the addresse of the bam file to split."
)
parser
.
add_option
(
"-p"
,
"--prefix"
,
dest
=
"prefix"
,
default
=
""
,
type
=
"string"
,
action
=
"store"
,
help
=
"a name prefix for the files in which the reads will be dispatched."
)
parser
.
add_option
(
"--values"
,
dest
=
"file_values"
,
default
=
None
,
type
=
"string"
,
action
=
"store"
,
help
=
"the address of the file containing the associated tag values relevant for the splitting."
)
parser
.
add_option
(
"--tag"
,
dest
=
"tag"
,
default
=
None
,
type
=
"string"
,
action
=
"store"
,
help
=
"The tag which values will be used for the splitting."
)
(
options
,
args
)
=
parser
.
parse_args
()
file_in
=
options
.
file_in
file_values
=
options
.
file_values
prefix
=
options
.
prefix
tag
=
options
.
tag
# check options
if
file_in
is
None
:
print
(
"Error! No input file given (-i)!"
,
sys
.
stderr
)
exit
(
1
)
elif
not
os
.
path
.
isfile
(
file_in
):
print
(
"Error!
%s
does not exist!"
%
file_in
)
exit
(
1
)
elif
file_values
is
None
:
print
(
"Error! No value file given (--values)!"
,
sys
.
stderr
)
exit
(
1
)
elif
not
os
.
path
.
isfile
(
file_values
):
print
(
"Error!
%s
does not exist!"
%
file_values
)
exit
(
1
)
elif
tag
is
None
:
print
(
"Error! no tag was given (--tag)!"
)
if
prefix
!=
""
:
prefix
=
"
%s
_"
%
prefix
# split bam file
dict_files
=
construct_dict_files
(
file_values
,
prefix
)
split_bam
(
file_in
,
tag
,
dict_files
)
Event Timeline
Log In to Comment