edPDB.xpdb

1 # edPDB.xpdb 2 """ 3 :mod:`edPDB.xpdb` -- Extensions to :mod:`Bio.PDB` 4 ================================================= 5 6 Extensions to Bio.PDB, such as handling of large pdb files and some 7 useful selections (see :mod:`edPDB.selections`). 8 9 Partly published on http://biopython.org/wiki/Reading_large_PDB_files 10 11 License: like Biopython 12 13 Module content 14 -------------- 15 16 .. autoclass:: SloppyStructureBuilder 17 .. autoclass:: SloppyPDBIO 18 .. autoclass:: AtomGroup 19 20 .. autofunction:: get_structure 21 .. autofunction:: write_pdb 22 """ 23 24 import sys 25 import Bio.PDB 26 import Bio.PDB.StructureBuilder 27 28 from utilities import asiterable 29 30 import logging 31 logger = logging.getLogger('edPDB.xpdb')

32 33 -class SloppyStructureBuilder(Bio.PDB.StructureBuilder.StructureBuilder):

34 """Cope with resSeq < 10,000 limitation by just incrementing internally. 35 36 Solves the follwing problem with :class:`Bio.PDB.StructureBuilder.StructureBuilder`: 37 Q: What's wrong here?? 38 Some atoms or residues will be missing in the data structure. 39 WARNING: Residue (' ', 8954, ' ') redefined at line 74803. 40 PDBConstructionException: Blank altlocs in duplicate residue SOL (' ', 8954, ' ') at line 74803. 41 42 A: resSeq only goes to 9999 --> goes back to 0 (PDB format is not really good here) 43 44 .. warning:: H and W records are probably not handled yet (don't have examples to test) 45 """ 46

47 - def __init__(self,verbose=False):

48 Bio.PDB.StructureBuilder.StructureBuilder.__init__(self) 49 self.max_resseq = -1 50 self.verbose = verbose

51

52 - def init_residue(self, resname, field, resseq, icode):

53 """ 54 Initiate a new Residue object. 55 56 Arguments: 57 o resname - string, e.g. "ASN" 58 o field - hetero flag, "W" for waters, "H" for 59 hetero residues, otherwise blanc. 60 o resseq - int, sequence identifier 61 o icode - string, insertion code 62 """ 63 if field!=" ": 64 if field=="H": 65 # The hetero field consists of H_ + the residue name (e.g. H_FUC) 66 field="H_"+resname 67 res_id=(field, resseq, icode) 68 69 if resseq > self.max_resseq: 70 self.max_resseq = resseq 71 72 if field==" ": 73 fudged_resseq = False 74 while (self.chain.has_id(res_id) or resseq == 0): 75 # There already is a residue with the id (field, resseq, icode). 76 # resseq == 0 catches already wrapped residue numbers which do not 77 # trigger the has_id() test. 78 # 79 # Be sloppy and just increment... 80 # (This code will not leave gaps in resids... I think) 81 # 82 # XXX: shouldn't we also do this for hetero atoms and water?? 83 self.max_resseq += 1 84 resseq = self.max_resseq 85 res_id = (field, resseq, icode) # use max_resseq! 86 fudged_resseq = True 87 88 if fudged_resseq and self.verbose: 89 logger.debug("Residues are wrapping (Residue ('%s', %i, '%s') at line %i)." 90 % (field, resseq, icode, self.line_counter) + 91 ".... assigning new resid %d.\n" % self.max_resseq) 92 residue=Bio.PDB.Residue.Residue(res_id, resname, self.segid) 93 self.chain.add(residue) 94 self.residue=residue

95

96 -class SloppyPDBIO(Bio.PDB.PDBIO):

97 """PDBIO class that can deal with large pdb files as used in MD simulations. 98 99 - resSeq simply wrap and are printed modulo 10,000. 100 - atom numbers wrap at 99,999 and are printed modulo 100,000 101 """ 102 # directly copied from PDBIO.py 103 # (has to be copied because of the package layout it is not externally accessible) 104 _ATOM_FORMAT_STRING="%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f%6.2f%6.2f %4s%2s%2s\n" 105

106 - def _get_atom_line(self, atom, hetfield, segid, atom_number, resname, 107 resseq, icode, chain_id, element=" ", charge=" "):

108 """ 109 Returns an ATOM PDB string that is guaranteed to fit into the ATOM format. 110 111 - Resid (resseq) is wrapped (modulo 10,000) to fit into %4i (4I) format 112 - Atom number (atom_number) is wrapped (modulo 100,000) to fit into %4i (4I) format 113 """ 114 if hetfield!=" ": 115 record_type="HETATM" 116 else: 117 record_type="ATOM " 118 name=atom.get_fullname() 119 altloc=atom.get_altloc() 120 x, y, z=atom.get_coord() 121 bfactor=atom.get_bfactor() 122 occupancy=atom.get_occupancy() 123 args=(record_type, atom_number % 100000, name, altloc, resname, chain_id, 124 resseq % 10000, icode, x, y, z, occupancy, bfactor, segid, 125 element, charge) 126 return self._ATOM_FORMAT_STRING % args

127 128 129 130 131 sloppyparser = Bio.PDB.PDBParser(PERMISSIVE=True,structure_builder=SloppyStructureBuilder())

132 133 -def get_structure(pdbfile,pdbid='system'):

134 return sloppyparser.get_structure(pdbid,pdbfile)

135

136 137 -class AtomGroup(set):

138 - def __init__(self, atoms=None):

139 atoms = atoms or [] 140 super(AtomGroup, self).__init__(atoms)

141 142 @property

143 - def identifiers(self):

144 return [(a.parent.resname, a.parent.id[1], a.name) for a in self]

145

146 - def __repr__(self):

147 return "[" + ", ".join(["%s%d:%s" % x for x in self.identifiers]) + "]"

148 149 150 from selections import ResidueSelect, NotResidueSelect 151 152 writelogger = logging.getLogger('edPDB.write_pdb')

153 -def write_pdb(structure, filename, **kwargs):

154 """Write Bio.PDB molecule *structure* to *filename*. 155 156 :Arguments: 157 *structure* 158 Bio.PDB structure instance 159 *filename* 160 pdb file 161 *selection* 162 Bio.PDB.Selection 163 *exclusions* 164 list of **residue** instances that will *not* be included 165 *inclusions* 166 list of **residue** instances that will be included 167 *chain* 168 set the chain identifier for **all** atoms written; this can be 169 useful to simply to erase all chain ids by setting it to ' ' 170 171 Typical use is to supply a list of water molecules that should not 172 be written or a ligand that should be include. 173 174 .. Note:: Currently only one of *selection, *exclusions* or 175 *inclusions* is supported. 176 """ 177 exclusions = kwargs.pop('exclusions', None) 178 inclusions = kwargs.pop('inclusions', None) 179 selection = kwargs.pop('selection', None) 180 181 if selection and (exclusions or inclusions): 182 raise NotImplementedError("only use selection on its own") 183 if exclusions and inclusions: 184 raise NotImplementedError("Sorry, only either exclusions OR " 185 "inclusions are supported") 186 if exclusions: 187 selection = NotResidueSelect(exclusions) 188 elif inclusions: 189 selection = ResidueSelect(inclusions) 190 191 chain = kwargs.pop('chain',None) 192 if not chain is None: 193 writelogger.info("Setting the chain id for ALL atoms to %(chain)r", vars()) 194 for c in structure.get_chains(): 195 c.id = chain 196 197 writelogger.debug("Starting PDBIO...") 198 io = SloppyPDBIO() # deals with resSeq > 9999 199 io.set_structure(structure) 200 io.save(filename, select=selection) 201 202 writelogger.info("Wrote pdb %(filename)r." % vars())

203

Source Code for Module edPDB.xpdb