Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F73207713
pdbsort.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jul 19, 14:53
Size
5 KB
Mime Type
text/x-python
Expires
Sun, Jul 21, 14:53 (2 d)
Engine
blob
Format
Raw Data
Handle
19162800
Attached To
rLAMMPS lammps
pdbsort.py
View Options
#!/usr/bin/env python
"""
Unfortunately, the lines in a PDB files are not always listed in the
correct order. Software that reads PDB files is expected to re-sort this
data before interpreting it. (One reason I don't like them.)
This script reads a PDB file from the standard input, sorts the lines
according to ChainID, SeqNum (residue-number), Icode (insert-code),
and AtomID (in that order) and prints the result to the standard-out.
Only the ATOM and HETATM records are effected.
All other lines in the PDB file are printed back to the user verbatim.
Note: the "altLoc" column (character #17 on each line) is ignored
and is not used for sorting purposes.
"""
import
sys
from
operator
import
attrgetter
from
collections
import
defaultdict
g_program_name
=
__file__
.
split
(
'/'
)[
-
1
]
g_version_str
=
'0.12.0'
g_date_str
=
'2016-12-21'
# In order to specify any amino acid in a PDB file, you must provide 3
# identifiers:
# the ChainID a single letter specifying
# the SeqNum an integer indicating the location within that chain
# the ICode ("insert code" usually 0. I don't know why this number is
# necessary. ..For the record, I never loved the PDB file format.)
class
AtomDescr
:
def
__init__
(
self
,
setChainID
,
setSeqNum
,
setICode
,
setAtomID
):
self
.
chainID
=
setChainID
self
.
seqNum
=
setSeqNum
self
.
iCode
=
setICode
self
.
atomID
=
setAtomID
# I plan to store this information in a python dictionary.
# Unfortunately, in order to use such classes as keys in python dictionaries
# I must define comparison operators, and a hash function.
# In retrospect I figured out it would have been easier just to use tuples
# as dictionary keys. I suppose it was a good excercise to try to do it
# using python classes instead. I'm sorry it made the code so long.
def
__le__
(
self
,
other
):
# return ((self.chainID < other.chainID) or ((self.chainID == other.cha
# instead I'll exploit python's ability to compare tuples
return
(
self
.
chainID
,
self
.
seqNum
,
self
.
iCode
,
self
.
atomID
)
<=
(
other
.
chainID
,
other
.
seqNum
,
other
.
iCode
,
other
.
atomID
)
def
__lt__
(
self
,
other
):
return
(
self
.
chainID
,
self
.
seqNum
,
self
.
iCode
,
self
.
atomID
)
<
(
other
.
chainID
,
other
.
seqNum
,
other
.
iCode
,
other
.
atomID
)
def
__eq__
(
self
,
other
):
# return ((self.chainID == x.chainID) and (self.seqNum == x.seqNum) and
# (self.iCode == x.iCode))
return
(
self
.
chainID
,
self
.
seqNum
,
self
.
iCode
,
self
.
atomID
)
==
(
other
.
chainID
,
other
.
seqNum
,
other
.
iCode
,
other
.
atomID
)
def
__ne__
(
self
,
other
):
return
not
__eq__
(
self
,
other
)
def
__gt__
(
self
,
other
):
return
not
__le__
(
self
,
other
)
def
__ge__
(
self
,
other
):
return
not
__lt__
(
self
,
other
)
def
__cmp__
(
self
,
other
):
if
__lt__
(
self
,
other
):
return
-
1
elif
__gt__
(
self
,
other
):
return
1
else
:
return
0
def
__hash__
(
self
):
numChainIDs
=
128
numICodes
=
128
i
=
self
.
seqNum
i
*=
numChainIDs
i
+=
ord
(
self
.
chainID
)
i
*=
numICodes
i
+=
ord
(
self
.
iCode
)
i
*=
10
i
+=
self
.
atomID
return
i
def
main
():
if
len
(
sys
.
argv
)
==
1
:
use_all_residues
=
True
elif
len
(
sys
.
argv
)
==
7
:
use_all_residues
=
False
first
=
AtomDescr
(
sys
.
argv
[
1
],
int
(
sys
.
argv
[
2
]),
sys
.
argv
[
3
],
0
)
last
=
AtomDescr
(
sys
.
argv
[
4
],
int
(
sys
.
argv
[
5
]),
sys
.
argv
[
6
],
2147483647
)
else
:
sys
.
stderr
.
write
(
"Error("
+
g_program_name
+
"): This program requires either 0 or 6 arguments.
\n
"
" By default, the the sequence is extracted from the entire PDB file.
\n
"
" In that case, no arguments are required.
\n
"
" Alternately, you can limit the selection to a single interval of
\n
"
" residues from one of the chains in the PDB file.
\n
"
" To specify an interval, you must passing 6 arguments to this program.
\n
"
" This program requires a pair of residues to designate the first and
\n
"
" last members of the interval. Each residue requires 3 identifiers.
\n
"
" Consequently the six arguments needed are:
\n
"
"ChainID_first SeqNum_first ICode_first ChainID_last SeqNum_last ICode_last
\n
"
)
exit
(
-
1
)
atoms2lines
=
defaultdict
(
list
)
for
line
in
sys
.
stdin
:
if
(
line
[
0
:
6
]
==
"ATOM "
)
or
(
line
[
0
:
6
]
==
"HETATM"
):
atomID
=
int
(
line
[
6
:
11
])
#atomType = line[12:16]
#altLoc = line[16:17]
iCode
=
line
[
26
:
27
]
#resType = line[17:20]
chainID
=
line
[
21
:
22
]
seqNumStr
=
line
[
22
:
26
]
seqNum
=
int
(
seqNumStr
)
atomdescr
=
AtomDescr
(
chainID
,
int
(
seqNumStr
),
iCode
,
int
(
atomID
))
atoms2lines
[
atomdescr
]
.
append
(
line
.
rstrip
(
'
\n
'
))
else
:
sys
.
stdout
.
write
(
line
)
# Extract an (unordered) list of the atomdescrs of the atoms in the sequence
atomdescrs
=
[
atomdescr
for
atomdescr
in
atoms2lines
]
# Residues in PDB files are often not listed in order.
# Consequently, we must sort the list by chainID, seqNum, and finnaly iCode:
sequence_of_atomdescrs
=
sorted
(
atomdescrs
,
key
=
attrgetter
(
'chainID'
,
'seqNum'
,
'iCode'
,
'atomID'
))
for
atomdescr
in
sequence_of_atomdescrs
:
for
line
in
atoms2lines
[
atomdescr
]:
sys
.
stdout
.
write
(
line
+
'
\n
'
)
return
if
__name__
==
'__main__'
:
main
()
Event Timeline
Log In to Comment