Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102790364
20200804_cmsearch_comparison.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Feb 24, 05:25
Size
2 KB
Mime Type
text/x-python
Expires
Wed, Feb 26, 05:25 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
24425471
Attached To
R10722 git_metatranscriptomic_LBE
20200804_cmsearch_comparison.py
View Options
#!/usr/bin/python3
#dictionary of the reads mapping rRNA
import
os
import
re
import
argparse
import
fnmatch
# example command line :
# /media/bioinfoteam/SCRATCH_850GB/temp_metatranscriptomic/20200804_cmsearch_comparison.py -i '/run/user/1002/gvfs/smb-share:server=lbenas2.epfl.ch,share=lbe_data-bioinfo/ASAP_AGS/metatranscriptomic/3_RNA_sorting/infernal_output/test_script_20200804' -o /media/bioinfoteam/SCRATCH_850GB/temp_metatranscriptomic/cmsearch_summary.tsv
################################################### PARSER ###################################################
parser
=
argparse
.
ArgumentParser
(
description
=
'Script that creates a genome file for bedtool coverage calculation'
)
parser
.
add_argument
(
'-i'
,
'--input_folder'
,
help
=
'name of input folder containing infernal file'
,
required
=
True
)
parser
.
add_argument
(
'-o'
,
'--output_file'
,
help
=
'name of the output file'
,
required
=
True
)
args
=
parser
.
parse_args
()
input_folder
=
args
.
input_folder
output_file_name
=
args
.
output_file
####################################################################################################
os
.
chdir
(
input_folder
)
d_cmsearch
=
{}
d_count
=
{}
#test on one file
filename_list
=
fnmatch
.
filter
(
os
.
listdir
(
input_folder
),
'*_R1_*_infernal.txt'
)
print
(
filename_list
)
for
filename
in
filename_list
:
#filename = '37C_L1_R2_5S_infernal.txt'
with
open
(
filename
,
'r'
)
as
document
:
split_file_name
=
filename
.
split
(
'_'
)
sample_name
=
split_file_name
[
0
]
RNA_name
=
split_file_name
[
3
]
# create entries in d_cmsearch for sample_names and RNA_names if necessary
if
sample_name
not
in
d_cmsearch
:
d_cmsearch
[
sample_name
]
=
{}
if
RNA_name
not
in
d_cmsearch
[
sample_name
]
:
d_cmsearch
[
sample_name
][
RNA_name
]
=
0
# create an entry in d_count for the new sample_name if necessary
if
sample_name
not
in
d_count
:
d_count
[
sample_name
]
=
{}
line
=
document
.
readline
()
while
line
and
'inclusion threshold'
not
in
line
:
seq_list
=
re
.
findall
(
'K00382:[A-Z,:,0-9]+'
,
line
)
if
seq_list
:
seq
=
seq_list
[
0
]
d_cmsearch
[
sample_name
][
RNA_name
]
+=
1
if
seq
not
in
d_count
[
sample_name
]:
d_count
[
sample_name
][
seq
]
=
0
d_count
[
sample_name
][
seq
]
+=
1
line
=
document
.
readline
()
# write in output file
output_file
=
open
(
output_file_name
,
'w'
)
header
=
'sample'
for
RNA_name
in
d_cmsearch
[
sample_name
]
:
header
+=
'
\t
'
+
RNA_name
header
+=
'
\t
total
\n
'
output_file
.
write
(
header
)
for
sample_name
in
d_cmsearch
:
output_line
=
sample_name
for
RNA_name
in
d_cmsearch
[
sample_name
]
:
output_line
+=
'
\t
'
+
str
(
d_cmsearch
[
sample_name
][
RNA_name
])
output_line
+=
'
\t
'
+
str
(
len
(
d_count
[
sample_name
]))
output_line
+=
'
\n
'
output_file
.
write
(
output_line
)
output_file
.
close
()
Event Timeline
Log In to Comment