File Metadata

Created: Fri, Jul 19, 14:53

pdbsort.py
View Options

	#!/usr/bin/env python
	"""
	Unfortunately, the lines in a PDB files are not always listed in the
	correct order. Software that reads PDB files is expected to re-sort this
	data before interpreting it. (One reason I don't like them.)
	This script reads a PDB file from the standard input, sorts the lines
	according to ChainID, SeqNum (residue-number), Icode (insert-code),
	and AtomID (in that order) and prints the result to the standard-out.
	Only the ATOM and HETATM records are effected.
	All other lines in the PDB file are printed back to the user verbatim.
	Note: the "altLoc" column (character #17 on each line) is ignored
	and is not used for sorting purposes.

	"""

	import sys
	from operator import attrgetter
	from collections import defaultdict

	g_program_name = __file__.split('/')[-1]
	g_version_str = '0.12.0'
	g_date_str = '2016-12-21'

	# In order to specify any amino acid in a PDB file, you must provide 3
	# identifiers:
	# the ChainID a single letter specifying
	# the SeqNum an integer indicating the location within that chain
	# the ICode ("insert code" usually 0. I don't know why this number is
	# necessary. ..For the record, I never loved the PDB file format.)


	class AtomDescr:

	def __init__(self, setChainID, setSeqNum, setICode, setAtomID):
	self.chainID = setChainID
	self.seqNum = setSeqNum
	self.iCode = setICode
	self.atomID = setAtomID

	# I plan to store this information in a python dictionary.
	# Unfortunately, in order to use such classes as keys in python dictionaries
	# I must define comparison operators, and a hash function.
	# In retrospect I figured out it would have been easier just to use tuples
	# as dictionary keys. I suppose it was a good excercise to try to do it
	# using python classes instead. I'm sorry it made the code so long.

	def __le__(self, other):
	# return ((self.chainID < other.chainID) or ((self.chainID == other.cha
	# instead I'll exploit python's ability to compare tuples
	return (self.chainID, self.seqNum, self.iCode, self.atomID) <= (other.chainID, other.seqNum, other.iCode, other.atomID)

	def __lt__(self, other):
	return (self.chainID, self.seqNum, self.iCode, self.atomID) < (other.chainID, other.seqNum, other.iCode, other.atomID)

	def __eq__(self, other):
	# return ((self.chainID == x.chainID) and (self.seqNum == x.seqNum) and
	# (self.iCode == x.iCode))
	return (self.chainID, self.seqNum, self.iCode, self.atomID) == (other.chainID, other.seqNum, other.iCode, other.atomID)

	def __ne__(self, other):
	return not __eq__(self, other)

	def __gt__(self, other):
	return not __le__(self, other)

	def __ge__(self, other):
	return not __lt__(self, other)

	def __cmp__(self, other):
	if __lt__(self, other):
	return -1
	elif __gt__(self, other):
	return 1
	else:
	return 0

	def __hash__(self):
	numChainIDs = 128
	numICodes = 128
	i = self.seqNum
	i *= numChainIDs
	i += ord(self.chainID)
	i *= numICodes
	i += ord(self.iCode)
	i *= 10
	i += self.atomID
	return i


	def main():
	if len(sys.argv) == 1:
	use_all_residues = True
	elif len(sys.argv) == 7:
	use_all_residues = False
	first = AtomDescr(sys.argv[1], int(sys.argv[2]), sys.argv[3], 0)
	last = AtomDescr(sys.argv[4], int(sys.argv[5]), sys.argv[6], 2147483647)
	else:
	sys.stderr.write("Error(" + g_program_name + "): This program requires either 0 or 6 arguments.\n"
	" By default, the the sequence is extracted from the entire PDB file.\n"
	" In that case, no arguments are required.\n"
	" Alternately, you can limit the selection to a single interval of\n"
	" residues from one of the chains in the PDB file.\n"
	" To specify an interval, you must passing 6 arguments to this program.\n"
	" This program requires a pair of residues to designate the first and\n"
	" last members of the interval. Each residue requires 3 identifiers.\n"
	" Consequently the six arguments needed are:\n"
	"ChainID_first SeqNum_first ICode_first ChainID_last SeqNum_last ICode_last\n")
	exit(-1)


	atoms2lines = defaultdict(list)

	for line in sys.stdin:
	if (line[0:6] == "ATOM ") or (line[0:6] == "HETATM"):
	atomID = int(line[6:11])
	#atomType = line[12:16]
	#altLoc = line[16:17]
	iCode = line[26:27]
	#resType = line[17:20]
	chainID = line[21:22]
	seqNumStr = line[22:26]
	seqNum = int(seqNumStr)
	atomdescr = AtomDescr(chainID, int(seqNumStr), iCode, int(atomID))
	atoms2lines[atomdescr].append(line.rstrip('\n'))
	else:
	sys.stdout.write(line)

	# Extract an (unordered) list of the atomdescrs of the atoms in the sequence
	atomdescrs = [atomdescr for atomdescr in atoms2lines]

	# Residues in PDB files are often not listed in order.
	# Consequently, we must sort the list by chainID, seqNum, and finnaly iCode:
	sequence_of_atomdescrs = sorted(atomdescrs, key=attrgetter(
	'chainID', 'seqNum', 'iCode', 'atomID'))

	for atomdescr in sequence_of_atomdescrs:
	for line in atoms2lines[atomdescr]:
	sys.stdout.write(line + '\n')

	return


	if __name__ == '__main__':
	main()

pdbsort.py
No OneTemporary
Actions

File Metadata

pdbsort.py
View Options

Event Timeline

pdbsort.pyNo OneTemporaryActions

File Metadata

pdbsort.pyView Options

Event Timeline

pdbsort.py
No OneTemporary
Actions

pdbsort.py
View Options