Page MenuHomec4science

blast_file_reader.py
No OneTemporary

File Metadata

Created
Sun, Jul 13, 17:56

blast_file_reader.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
" module blast_file_reader contains the definition of the class BlastFileReader"
##########################################################################
from reader import FileReader
##########################################################################
class BlastFileReader(FileReader):
"""
class BlastFileReader: can read and store information from blastn software output
"""
# ------------------------------------------------------------------ #
# Constructors/Destructors #
# ------------------------------------------------------------------ #
def __init__(self,filename):
"""__init__: the reader is initialized with 2 members : a filename and the dictionnary where the information will be stored """
# Members ---------------------- #
# the name of the blastfile to read (with full path)
self.filename = filename
# self.dict[seqID] = [refID, identity%]
self.dict= {}
def __del__(self):
"""__del__: not implemented """
pass
# ------------------------------------------------------------------ #
# Methods #
# ------------------------------------------------------------------ #
# public:
def read(self,filter_reader, key_string = None):
"""read: this function read the information in the blast_file; for each sequenceID from the query it will get the matching reference and the percentage of identity with it, if there is a match, the reference ID will be 'NONE' if there was no match and the identity will be 0. """
blast_file = open(self.filename,'r')
# read the file line by line from the start
blast_line = blast_file.readline()
while blast_line:
# the seqID of the query is extracted and a new item is initialized in the dictionary 'self.dict'
if blast_line[0:6]=='Query=':
seqID = blast_line.split('ry= ')[1].replace('\n','')# extract sequence ID
self.dict[seqID]=['','']
#pass three lines
for i in range(0,3):
blast_file.readline()
blast_line=blast_file.readline()
# if 'Sequences is in the line, there is a match
if 'Sequences ' in blast_line:
# pass for line and extract the refID of the matching reference
for i in range(0,4):
blast_file.readline()
blast_line=blast_file.readline()
refID = blast_line.split('>')[1].replace('\n','').replace(' ','')
# else, there is no match, the refID is set to 'NONE'
else:
blast_line=blast_file.readline()
if '****' in blast_line :
refID = 'NONE'
# if neither 'Sequences' or '****' is found, there is a problem of reading
else:
print 'probleme de capture du blast'
# the ID of the matching reference is stored in self.dict
self.dict[seqID][0]=refID
# pass 3 lines
for i in range(0,3):
blast_file.readline()
blast_line=blast_file.readline()
#extract identity score
identities = blast_line.split(' ')[3].split('/')
try :
# if there is a match the percentage of identity is : number of matches/alignment length
ID_percent = str(round(float(identities[0])*100/float(identities[1]),2))
except:
# if there is no match this identity percentage is set to zero
ID_percent = str(0)
# the percentage of identity is stored in self.dict
self.dict[seqID][1]= ID_percent
blast_line = blast_file.readline()
# if the begining of the line is not 'Query=', a new line is read.
else:
blast_line = blast_file.readline()
blast_file.close()
return 0
##########################################################################
if __name__ == '__main__':
test = BlastFileReader('/home/aline/spe_repository_aa/project/class_diagram/input_files/R2_trim.contigs.good_all_cluster_heads_blast_results')
#print 'blast dictionnary ', test.read(None,None)

Event Timeline