Page MenuHomec4science

pdbsort.py
No OneTemporary

File Metadata

Created
Fri, Jul 19, 14:53

pdbsort.py

#!/usr/bin/env python
"""
Unfortunately, the lines in a PDB files are not always listed in the
correct order. Software that reads PDB files is expected to re-sort this
data before interpreting it. (One reason I don't like them.)
This script reads a PDB file from the standard input, sorts the lines
according to ChainID, SeqNum (residue-number), Icode (insert-code),
and AtomID (in that order) and prints the result to the standard-out.
Only the ATOM and HETATM records are effected.
All other lines in the PDB file are printed back to the user verbatim.
Note: the "altLoc" column (character #17 on each line) is ignored
and is not used for sorting purposes.
"""
import sys
from operator import attrgetter
from collections import defaultdict
g_program_name = __file__.split('/')[-1]
g_version_str = '0.12.0'
g_date_str = '2016-12-21'
# In order to specify any amino acid in a PDB file, you must provide 3
# identifiers:
# the ChainID a single letter specifying
# the SeqNum an integer indicating the location within that chain
# the ICode ("insert code" usually 0. I don't know why this number is
# necessary. ..For the record, I never loved the PDB file format.)
class AtomDescr:
def __init__(self, setChainID, setSeqNum, setICode, setAtomID):
self.chainID = setChainID
self.seqNum = setSeqNum
self.iCode = setICode
self.atomID = setAtomID
# I plan to store this information in a python dictionary.
# Unfortunately, in order to use such classes as keys in python dictionaries
# I must define comparison operators, and a hash function.
# In retrospect I figured out it would have been easier just to use tuples
# as dictionary keys. I suppose it was a good excercise to try to do it
# using python classes instead. I'm sorry it made the code so long.
def __le__(self, other):
# return ((self.chainID < other.chainID) or ((self.chainID == other.cha
# instead I'll exploit python's ability to compare tuples
return (self.chainID, self.seqNum, self.iCode, self.atomID) <= (other.chainID, other.seqNum, other.iCode, other.atomID)
def __lt__(self, other):
return (self.chainID, self.seqNum, self.iCode, self.atomID) < (other.chainID, other.seqNum, other.iCode, other.atomID)
def __eq__(self, other):
# return ((self.chainID == x.chainID) and (self.seqNum == x.seqNum) and
# (self.iCode == x.iCode))
return (self.chainID, self.seqNum, self.iCode, self.atomID) == (other.chainID, other.seqNum, other.iCode, other.atomID)
def __ne__(self, other):
return not __eq__(self, other)
def __gt__(self, other):
return not __le__(self, other)
def __ge__(self, other):
return not __lt__(self, other)
def __cmp__(self, other):
if __lt__(self, other):
return -1
elif __gt__(self, other):
return 1
else:
return 0
def __hash__(self):
numChainIDs = 128
numICodes = 128
i = self.seqNum
i *= numChainIDs
i += ord(self.chainID)
i *= numICodes
i += ord(self.iCode)
i *= 10
i += self.atomID
return i
def main():
if len(sys.argv) == 1:
use_all_residues = True
elif len(sys.argv) == 7:
use_all_residues = False
first = AtomDescr(sys.argv[1], int(sys.argv[2]), sys.argv[3], 0)
last = AtomDescr(sys.argv[4], int(sys.argv[5]), sys.argv[6], 2147483647)
else:
sys.stderr.write("Error(" + g_program_name + "): This program requires either 0 or 6 arguments.\n"
" By default, the the sequence is extracted from the entire PDB file.\n"
" In that case, no arguments are required.\n"
" Alternately, you can limit the selection to a single interval of\n"
" residues from one of the chains in the PDB file.\n"
" To specify an interval, you must passing 6 arguments to this program.\n"
" This program requires a pair of residues to designate the first and\n"
" last members of the interval. Each residue requires 3 identifiers.\n"
" Consequently the six arguments needed are:\n"
"ChainID_first SeqNum_first ICode_first ChainID_last SeqNum_last ICode_last\n")
exit(-1)
atoms2lines = defaultdict(list)
for line in sys.stdin:
if (line[0:6] == "ATOM ") or (line[0:6] == "HETATM"):
atomID = int(line[6:11])
#atomType = line[12:16]
#altLoc = line[16:17]
iCode = line[26:27]
#resType = line[17:20]
chainID = line[21:22]
seqNumStr = line[22:26]
seqNum = int(seqNumStr)
atomdescr = AtomDescr(chainID, int(seqNumStr), iCode, int(atomID))
atoms2lines[atomdescr].append(line.rstrip('\n'))
else:
sys.stdout.write(line)
# Extract an (unordered) list of the atomdescrs of the atoms in the sequence
atomdescrs = [atomdescr for atomdescr in atoms2lines]
# Residues in PDB files are often not listed in order.
# Consequently, we must sort the list by chainID, seqNum, and finnaly iCode:
sequence_of_atomdescrs = sorted(atomdescrs, key=attrgetter(
'chainID', 'seqNum', 'iCode', 'atomID'))
for atomdescr in sequence_of_atomdescrs:
for line in atoms2lines[atomdescr]:
sys.stdout.write(line + '\n')
return
if __name__ == '__main__':
main()

Event Timeline