1
2
3
4
5 """
6 :mod:`recsql.sqlfunctions` --- Functions that enhance a SQLite db
7 =================================================================
8
9 This module contains new SQL functions to be added to a SQLite database that
10 can be used in the same way as the builtin functions.
11
12 Example:
13
14 Add the functions to an existing connection in the following way (assuming
15 that the db connection is available in ``self.connection``)::
16
17 from sqlfunctions import *
18 self.connection.create_function("sqrt", 1, _sqrt)
19 self.connection.create_function("fformat",2,_fformat)
20 self.connection.create_aggregate("std",1,_Stdev)
21 self.connection.create_aggregate("median",1,_Median)
22 self.connection.create_aggregate("array",1,_NumpyArray)
23 self.connection.create_aggregate("histogram",4,_NumpyHistogram)
24 self.connection.create_aggregate("distribution",4,_NormedNumpyHistogram)
25 self.connection.create_aggregate("meanhistogram",5,_MeanHistogram)
26 self.connection.create_aggregate("stdhistogram",5,_StdHistogram)
27 self.connection.create_aggregate("minhistogram",5,_MinHistogram)
28 self.connection.create_aggregate("maxhistogram",5,_MaxHistogram)
29 self.connection.create_aggregate("medianhistogram",5,_MedianHistogram)
30 self.connection.create_aggregate("zscorehistogram",5,_ZscoreHistogram)
31
32 Module content
33 --------------
34 .. See the autogenerated content in the online docs or the source code.
35
36 """
37 import numpy
38
39
40 _numpyversion = map(int, numpy.version.version.split('.'))
41 if _numpyversion[0] < 1:
42 raise ImportError('Need at least numpy 1.x, only have %r' % numpy.version.version)
43 if _numpyversion[1] < 1:
44
46 _range = kwargs.pop('range',None)
47 if not _range is None:
48 kwargs['range'] = (_range,)
49 h,e = numpy.histogramdd(*args,**kwargs)
50 return h,e[0]
51 histogram1d.__doc__ = "1D histogram, based on numpy histogramdd; returns edges as in numpy 1.1.x\n"+\
52 numpy.histogram.__doc__
53 else:
54
56 kwargs['new'] = True
57 h,e = numpy.histogram(*args,**kwargs)
58 return h,e
59 histogram1d.__doc__ = numpy.histogram.__doc__
60
61
62 from sqlutil import adapt_numpyarray, convert_numpyarray,\
63 adapt_object, convert_object
64
65
67 try:
68 x = float(x)
69 except TypeError:
70 return None
71 return numpy.sqrt(x)
72
75
77 """Implement standard deviation of the sample as SQL aggregate function.
78 (Uses N-1 variance.)
79 Do it in one pass (see eg
80 http://smallcode.weblogs.us/2006/11/27/calculate-standard-deviation-in-one-pass/
81 though we may run in an underflow by calculating N/N-1<X^2-<X>^2>.).
82
83 Also, we don't check if our arguments are valid as numbers.
84 """
86 self.x2 = 0
87 self.x = 0
88 self.n = 0
90 try:
91 x = float(x)
92 self.x2 += x*x
93 self.x += x
94 self.n += 1
95 except TypeError:
96 pass
98 if self.n<2: return 0.0
99 return numpy.sqrt((self.n*self.x2 - self.x*self.x)/(self.n*(self.n-1)))
100
112
120
123 self.is_initialized = False
124 self.data = []
125 - def step(self,x,bins,xmin,xmax):
126 if not self.is_initialized:
127 self.bins = bins
128 self.range = (xmin,xmax)
129 self.is_initialized = True
130 self.data.append(x)
132 hist,edges = histogram1d(self.data,bins=self.bins,range=self.range,
133 normed=False)
134 return adapt_object((hist,edges))
135
138 hist,edges = histogram1d(self.data,bins=self.bins,range=self.range,
139 normed=True)
140 return adapt_object((hist,edges))
141
143 """Baseclass for histogrammed functions.
144
145 A histogrammed function is created by applying a function
146 to all values y that have been accumulated in a bin x.
147 """
151 - def step(self,x,y,bins,xmin,xmax):
155 raise NotImplementedError("_FunctionHistogram must be inherited from.")
156
157
159 """Mean of the weights in each bin.
160 Takes TWO column arguments: value and weight"""
164
166 """Standard deviation of the weights in each bin.
167 Takes TWO column arguments: value and weight"""
171
173 """Min value of the weights in each bin.
174 Takes TWO column arguments: value and weight"""
176 try:
177 return numpy.min(v)
178 except ValueError:
179 return numpy.nan
180
184
186 """Max value of the weights in each bin.
187 Takes TWO column arguments: value and weight"""
189 try:
190 return numpy.max(v)
191 except ValueError:
192 return numpy.nan
193
197
204
206 """Z-score of the weights in each bin abs(Y - <Y>)/std(Y).
207 Takes TWO column arguments: value and weight"""
209 m = v.mean()
210 s = v.std()
211 return numpy.nan_to_num( numpy.mean(numpy.abs(v - m))/s )
212
216
217
218
219
221 """Compute func() over data aggregated in bins.
222
223 (x,y) --> (x', func(Y')) with Y' = {y: y(x) where x in x' bin}
224
225 First the data is collected in bins x' along x and then func is applied to
226 all data points Y' that have been collected in the bin.
227
228 :Arguments:
229 x
230 abscissa values (for binning)
231 y
232 ordinate values (func is applied)
233 func
234 a numpy ufunc that takes one argument, func(Y')
235 bins
236 number or array
237 range
238 limits (used with number of bins)
239
240 :Returns:
241 F,edges
242 function and edges (midpoints = 0.5*(edges[:-1]+edges[1:]))
243 """
244 _x = numpy.asarray(x)
245 _y = numpy.asarray(y)
246
247
248 if (range is not None):
249 mn, mx = range
250 if (mn > mx):
251 raise AttributeError('max must be larger than min in range parameter.')
252
253 if not numpy.iterable(bins):
254 if range is None:
255 range = (_x.min(), _x.max())
256 mn, mx = [float(mi) for mi in range]
257 if mn == mx:
258 mn -= 0.5
259 mx += 0.5
260 bins = numpy.linspace(mn, mx, bins+1, endpoint=True)
261 else:
262 bins = numpy.asarray(bins)
263 if (numpy.diff(bins) < 0).any():
264 raise AttributeError('bins must increase monotonically.')
265
266 sorting_index = numpy.argsort(_x)
267 sx = _x[sorting_index]
268 sy = _y[sorting_index]
269
270
271 bin_index = numpy.r_[sx.searchsorted(bins[:-1], 'left'),
272 sx.searchsorted(bins[-1], 'right')]
273
274
275
276
277
278
279
280 F = numpy.zeros(len(bins)-1)
281 F[:] = [func(sy[start:stop]) for start,stop in zip(bin_index[:-1],bin_index[1:])]
282 return F,bins
283