1
2
3
4
5
6 """
7 ``analysis.plugins.dist`` --- Helper Class for ``g_dist``
8 =========================================================
9
10 :mod:`dist` contains helper classes for other analysis plugins that
11 want to make use of the Gromacs command ``g_dist``.
12
13
14 Overview
15 --------
16
17 The task we are solving is to analyze output from ::
18
19 g_dist -f md.xtc -s md.tpr -n cys_ow.ndx -dist 1.0 | bzip2 -vc > mindist_C60_OW_1nm.dat.bz2
20
21 and produce a histogram of minimum contact distances. This should
22 provide an estimate for water accessibility of the atom (here: SG of
23 Cys60).
24
25 File format
26 -----------
27
28 ``g_dist`` with the ``-dist CUTOFF`` option writes to stdout the
29 identity of all atoms within the cutoff distance and the distance
30 itself::
31
32 Selected 22: 'CYSH_CYSH_60_&_SG'
33 Selected 25: 'OW'
34 ....
35 t: 184 6682 SOL 35993 OW 0.955138 (nm)
36 t: 184 10028 SOL 46031 OW 0.803889 (nm)
37 t: 185 6682 SOL 35993 OW 0.879949 (nm)
38 t: 185 10028 SOL 46031 OW 0.738299 (nm)
39 t: 186 6682 SOL 35993 OW 0.897016 (nm)
40 t: 186 10028 SOL 46031 OW 0.788268 (nm)
41 t: 187 6682 SOL 35993 OW 0.997688 (nm)
42 ....
43
44
45 Classes
46 -------
47
48 .. autoclass:: Mindist
49 :members: histogram, hist, dist, edges, midpoints, plot
50 .. autoclass:: GdistData
51 :members: __iter__
52
53 """
54 __docformat__ = "restructuredtext en"
55
56 import re
57 import numpy
58
59 from recsql import SQLarray
60
61 import gromacs.utilities
62
64 """The Mindist class allows analysis of the output from ``g_dist -dist CUTOFF``.
65
66 Output is read from a file or stream. The raw data is transformed
67 into a true 'mindist' time series (available in the
68 :attr:`Mindist.distances` attribute): for each frame only the
69 shortest distance is stored (whereas ``g_dist`` provides *all*
70 distances below the cutoff).
71
72 :TODO:
73 * Save analysis to pickle or data files.
74 * Export data as simple data files for plotting in other programs.
75
76 .. Note:: :class:`gromacs.tools.G_mindist` is apparently providing
77 exactly the service that is required: a timeseries of
78 the minimum distance between two groups. Feel free to
79 use that tool instead :-).
80
81 """
82
83 - def __init__(self,datasource,cutoff=None):
84 """Read mindist data from file or stream.
85
86 :Arguments:
87 *datasource*
88 a filename (plain, gzip, bzip2) or file object
89 *cutoff*
90 the ``-dist CUTOFF`` that was provided to ``g_dist``; if supplied
91 we work around a bug in ``g_dist`` (at least in Gromacs 4.0.2) in which
92 sometimes numbers >> CUTOFF are printed.
93 """
94 stream, self.filename = gromacs.utilities.anyopen(datasource)
95 try:
96 M = GdistData(stream)
97
98 all_distances = SQLarray('distances', records=M, columns=('frame','distance'))
99 finally:
100 stream.close()
101 if cutoff is None:
102 cutoff_filter = ""
103 else:
104 cutoff_filter = "WHERE distance <= %d" % float(cutoff)
105 self.distances = all_distances.selection(
106 "SELECT frame, MIN(distance) AS distance FROM __self__ "+cutoff_filter+" GROUP BY frame",
107 name="mindistances", cache=False)
108
109 - def histogram(self,nbins=None,lo=None,hi=None,midpoints=False,normed=True):
110 """Returns a distribution or histogram of the minimum distances::
111
112 If no values for the bin edges are given then they are set to
113 0.1 below and 0.1 above the minimum and maximum values seen in
114 the data.
115
116 If the number of bins is not provided then it is set so that
117 on average 100 counts come to a bin. Set nbins manually if the
118 histogram only contains a single bin (and then get more data)!
119
120 :Keywords:
121 *nbins* : int
122 number of bins
123 *lo* : float
124 lower edge of histogram
125 *hi* : float
126 upper edge of histogram
127 *midpoints* : boolean
128 False: return edges. True: return midpoints
129 *normed* : boolean
130 True: return probability distribution. False: histogram
131 """
132 D = self.distances
133 if lo is None or hi is None:
134 dmin,dmax = D.limits('distance')
135 if lo is None:
136 lo = round(dmin - 0.1, 1)
137 if lo < 0:
138 lo = 0.0
139 if hi is None:
140 hi = round(dmax + 0.1, 1)
141 if nbins is None:
142 nbins = int(len(D)/100)
143 FUNC = 'distribution'
144 if not normed:
145 FUNC = 'histogram'
146 SQL = """SELECT %(FUNC)s(distance,%(nbins)d,%(lo)f,%(hi)f) AS "h [Object]"
147 FROM __self__""" % vars()
148 (((h,e),),) = D.sql(SQL, asrecarray=False)
149
150 if normed:
151 self.__distribution = h
152 else:
153 self.__histogram = h
154 self.__edges = e
155 if midpoints:
156 return h, self.midpoints
157 return h, e
158
160 doc = "Histogram of the minimum distances."
161 def fget(self):
162 try:
163 return self.__histogram
164 except AttributeError:
165 raise RuntimeError("run histogram(...,normed=False) first")
166 return locals()
167 hist = property(**hist())
168
170 doc = "Distribution of the minimum distances."
171 def fget(self):
172 try:
173 return self.__distribution
174 except AttributeError:
175 raise RuntimeError("run histogram(...,normed=True) first")
176 return locals()
177 dist = property(**dist())
178
180 doc = "Edges of the histogram of the minimum distances."
181 def fget(self):
182 try:
183 return self.__edges
184 except AttributeError:
185 raise RuntimeError("run histogram(...) first")
186 return locals()
187 edges = property(**edges())
188
190 doc = "Midpoints of the histogram of the minimum distances."
191 def fget(self):
192 e = self.__edges
193 return 0.5*(e[:-1] + e[1:])
194 return locals()
195 midpoints = property(**midpoints())
196
197 - def plot(self,**kwargs):
198 """Plot histograms with matplotlib's plot() function::
199
200 plot(**histogramargs, **plotargs)
201
202 Arguments for both :meth:`Mindist.histogram` and :func:`pylab.plot` can be provided (qv).
203 """
204 import pylab
205 kwargs['midpoints'] = True
206 histogram_args = ('nbins','lo','hi','midpoints','normed')
207 histargs = kwargs.copy()
208 histargs = dict([(k, kwargs.pop(k)) for k in histargs if k in histogram_args])
209 h,m = self.histogram(**histargs)
210
211 pylab.plot(m, h, **kwargs)
212 pylab.xlabel('minimum distance (nm)')
213
215 """Object that represents the standard output of ``g_dist -dist CUTOFF``.
216
217 Initialize from a stream (e.g. a pipe) and the iterate over the
218 instance to get the data line by line. Each line consists of a
219 tuple
220
221 (*frame*, *distance*)
222 """
223 data_pattern = re.compile("""t:\s* # marker for beginning of line
224 (?P<FRAME>\d+)\s+ # frame number (?)
225 (?P<RESID>\d+)\s+(?P<RESNAME>\w+)\s+ # resid and residue name
226 (?P<ATOMID>\d+)\s+(?P<ATOMNAME>\w+)\s+ # atomid and atom name
227 (?P<DISTANCE>[0-9]+\.?[0-9]+) # distance
228 \s+\(nm\)""", re.VERBOSE)
229
231 """Initialize with an open stream to the data (eg stdin or file).
232
233 :Arguments:
234 *stream*
235 open stream (file or pipe or really any iterator
236 providing the data from ``g_dist``); the *stream* is not
237 closed automatically when the iterator completes.
238 """
239 self.stream = stream
240
242 """Iterator that filters the input stream and returns (frame, distance) tuples."""
243 for line in self.stream:
244 line = line.strip()
245 m = self.data_pattern.search(line)
246 if m:
247 distance = float(m.group('DISTANCE'))
248 frame = int(m.group('FRAME'))
249 yield frame,distance
250