Page MenuHomec4science

20200804_cmsearch_comparison.py
No OneTemporary

File Metadata

Created
Mon, Feb 24, 05:25

20200804_cmsearch_comparison.py

#!/usr/bin/python3
#dictionary of the reads mapping rRNA
import os
import re
import argparse
import fnmatch
# example command line :
# /media/bioinfoteam/SCRATCH_850GB/temp_metatranscriptomic/20200804_cmsearch_comparison.py -i '/run/user/1002/gvfs/smb-share:server=lbenas2.epfl.ch,share=lbe_data-bioinfo/ASAP_AGS/metatranscriptomic/3_RNA_sorting/infernal_output/test_script_20200804' -o /media/bioinfoteam/SCRATCH_850GB/temp_metatranscriptomic/cmsearch_summary.tsv
################################################### PARSER ###################################################
parser = argparse.ArgumentParser(description = 'Script that creates a genome file for bedtool coverage calculation')
parser.add_argument('-i','--input_folder', help = 'name of input folder containing infernal file', required = True)
parser.add_argument('-o','--output_file', help = 'name of the output file', required = True)
args = parser.parse_args()
input_folder = args.input_folder
output_file_name = args.output_file
####################################################################################################
os.chdir(input_folder)
d_cmsearch = {}
d_count = {}
#test on one file
filename_list = fnmatch.filter(os.listdir(input_folder),'*_R1_*_infernal.txt')
print(filename_list)
for filename in filename_list :
#filename = '37C_L1_R2_5S_infernal.txt'
with open(filename,'r') as document:
split_file_name = filename.split('_')
sample_name = split_file_name[0]
RNA_name = split_file_name[3]
# create entries in d_cmsearch for sample_names and RNA_names if necessary
if sample_name not in d_cmsearch :
d_cmsearch[sample_name] = {}
if RNA_name not in d_cmsearch[sample_name] :
d_cmsearch[sample_name][RNA_name] = 0
# create an entry in d_count for the new sample_name if necessary
if sample_name not in d_count :
d_count[sample_name] = {}
line = document.readline()
while line and 'inclusion threshold' not in line :
seq_list = re.findall('K00382:[A-Z,:,0-9]+', line)
if seq_list :
seq = seq_list[0]
d_cmsearch[sample_name][RNA_name] += 1
if seq not in d_count[sample_name]:
d_count[sample_name][seq] = 0
d_count[sample_name][seq] += 1
line = document.readline()
# write in output file
output_file = open(output_file_name,'w')
header = 'sample'
for RNA_name in d_cmsearch[sample_name] :
header += '\t' + RNA_name
header += '\ttotal\n'
output_file.write(header)
for sample_name in d_cmsearch:
output_line = sample_name
for RNA_name in d_cmsearch[sample_name] :
output_line += '\t' + str(d_cmsearch[sample_name][RNA_name])
output_line += '\t' + str(len(d_count[sample_name]))
output_line += '\n'
output_file.write(output_line)
output_file.close()

Event Timeline