Package recsql :: Module sqlfunctions
[hide private]
[frames] | no frames]

Source Code for Module recsql.sqlfunctions

  1  # $Id: sqlfunctions.py 3379 2009-04-21 17:57:29Z oliver $ 
  2  # Copyright (C) 2009 Oliver Beckstein <orbeckst@gmail.com> 
  3  # Released under the GNU Public License, version 3 or higher (your choice) 
  4   
  5  """ 
  6  :mod:`recsql.sqlfunctions` --- Functions that enhance a SQLite db 
  7  ================================================================= 
  8   
  9  This module contains new SQL functions to be added to a SQLite database that 
 10  can be used in the same way as the builtin functions. 
 11   
 12  Example: 
 13   
 14    Add the functions to an existing connection in the following way (assuming 
 15    that the db connection is available in ``self.connection``):: 
 16   
 17       from sqlfunctions import * 
 18       self.connection.create_function("sqrt", 1, _sqrt) 
 19       self.connection.create_function("fformat",2,_fformat) 
 20       self.connection.create_aggregate("std",1,_Stdev) 
 21       self.connection.create_aggregate("median",1,_Median) 
 22       self.connection.create_aggregate("array",1,_NumpyArray) 
 23       self.connection.create_aggregate("histogram",4,_NumpyHistogram) 
 24       self.connection.create_aggregate("distribution",4,_NormedNumpyHistogram) 
 25       self.connection.create_aggregate("meanhistogram",5,_MeanHistogram) 
 26       self.connection.create_aggregate("stdhistogram",5,_StdHistogram) 
 27       self.connection.create_aggregate("minhistogram",5,_MinHistogram) 
 28       self.connection.create_aggregate("maxhistogram",5,_MaxHistogram) 
 29       self.connection.create_aggregate("medianhistogram",5,_MedianHistogram) 
 30       self.connection.create_aggregate("zscorehistogram",5,_ZscoreHistogram) 
 31   
 32  Module content 
 33  -------------- 
 34  .. See the autogenerated content in the online docs or the source code. 
 35   
 36  """ 
 37  import numpy 
 38  # compatibility check: we NEED consistent 1d histogram functions: we 
 39  # decided to use numpy 1.x style, which returns edges, NOT lower bin edges 
 40  _numpyversion = map(int, numpy.version.version.split('.')) 
 41  if _numpyversion[0] < 1: 
 42      raise ImportError('Need at least numpy 1.x, only have %r' % numpy.version.version) 
 43  if _numpyversion[1] < 1: 
 44      # we want a histogram that returns edges 
45 - def histogram1d(*args,**kwargs):
46 _range = kwargs.pop('range',None) 47 if not _range is None: 48 kwargs['range'] = (_range,) # needs to be a sequence 49 h,e = numpy.histogramdd(*args,**kwargs) 50 return h,e[0]
51 histogram1d.__doc__ = "1D histogram, based on numpy histogramdd; returns edges as in numpy 1.1.x\n"+\ 52 numpy.histogram.__doc__ 53 else: 54 # once deprecation for new=True sets in we can catch this here
55 - def histogram1d(*args,**kwargs):
56 kwargs['new'] = True 57 h,e = numpy.histogram(*args,**kwargs) 58 return h,e
59 histogram1d.__doc__ = numpy.histogram.__doc__ 60 61 62 from sqlutil import adapt_numpyarray, convert_numpyarray,\ 63 adapt_object, convert_object 64 65
66 -def _sqrt(x):
67 try: 68 x = float(x) 69 except TypeError: 70 return None 71 return numpy.sqrt(x)
72
73 -def _fformat(format,x):
74 return format % x
75
76 -class _Stdev(object):
77 """Implement standard deviation of the sample as SQL aggregate function. 78 (Uses N-1 variance.) 79 Do it in one pass (see eg 80 http://smallcode.weblogs.us/2006/11/27/calculate-standard-deviation-in-one-pass/ 81 though we may run in an underflow by calculating N/N-1<X^2-<X>^2>.). 82 83 Also, we don't check if our arguments are valid as numbers. 84 """
85 - def __init__(self):
86 self.x2 = 0 87 self.x = 0 88 self.n = 0
89 - def step(self,x):
90 try: 91 x = float(x) 92 self.x2 += x*x 93 self.x += x 94 self.n += 1 95 except TypeError: 96 pass # don't contribute to average
97 - def finalize(self):
98 if self.n<2: return 0.0 99 return numpy.sqrt((self.n*self.x2 - self.x*self.x)/(self.n*(self.n-1)))
100
101 -class _Median(object):
102 - def __init__(self):
103 self.data = []
104 - def step(self,x):
105 try: 106 x = float(x) 107 self.data.append(x) 108 except TypeError: 109 pass # don't contribute
110 - def finalize(self):
111 return numpy.median(self.data)
112
113 -class _NumpyArray(object):
114 - def __init__(self):
115 self.data = []
116 - def step(self,x):
117 self.data.append(x)
118 - def finalize(self):
119 return adapt_numpyarray(numpy.array(self.data))
120
121 -class _NumpyHistogram(object):
122 - def __init__(self):
123 self.is_initialized = False 124 self.data = []
125 - def step(self,x,bins,xmin,xmax):
126 if not self.is_initialized: 127 self.bins = bins 128 self.range = (xmin,xmax) 129 self.is_initialized = True 130 self.data.append(x)
131 - def finalize(self):
132 hist,edges = histogram1d(self.data,bins=self.bins,range=self.range, 133 normed=False) 134 return adapt_object((hist,edges))
135
136 -class _NormedNumpyHistogram(_NumpyHistogram):
137 - def finalize(self):
138 hist,edges = histogram1d(self.data,bins=self.bins,range=self.range, 139 normed=True) 140 return adapt_object((hist,edges))
141
142 -class _FunctionHistogram(_NumpyHistogram):
143 """Baseclass for histogrammed functions. 144 145 A histogrammed function is created by applying a function 146 to all values y that have been accumulated in a bin x. 147 """
148 - def __init__(self):
149 _NumpyHistogram.__init__(self) 150 self.y = []
151 - def step(self,x,y,bins,xmin,xmax):
152 _NumpyHistogram.step(self,x,bins,xmin,xmax) 153 self.y.append(y)
154 - def finalize(self):
155 raise NotImplementedError("_FunctionHistogram must be inherited from.")
156 # return adapt_object( (...,...,...) ) 157
158 -class _MeanHistogram(_FunctionHistogram):
159 """Mean of the weights in each bin. 160 Takes TWO column arguments: value and weight"""
161 - def finalize(self):
162 return adapt_object(regularized_function(\ 163 self.data,self.y,numpy.mean,bins=self.bins,range=self.range))
164
165 -class _StdHistogram(_FunctionHistogram):
166 """Standard deviation of the weights in each bin. 167 Takes TWO column arguments: value and weight"""
168 - def finalize(self):
169 return adapt_object(regularized_function(\ 170 self.data,self.y,numpy.std,bins=self.bins,range=self.range))
171
172 -class _MinHistogram(_FunctionHistogram):
173 """Min value of the weights in each bin. 174 Takes TWO column arguments: value and weight"""
175 - def _min(self,v):
176 try: 177 return numpy.min(v) 178 except ValueError: # empty array 179 return numpy.nan
180
181 - def finalize(self):
182 return adapt_object(regularized_function(\ 183 self.data,self.y,self._min,bins=self.bins,range=self.range))
184
185 -class _MaxHistogram(_FunctionHistogram):
186 """Max value of the weights in each bin. 187 Takes TWO column arguments: value and weight"""
188 - def _max(self,v):
189 try: 190 return numpy.max(v) 191 except ValueError: # empty array 192 return numpy.nan
193
194 - def finalize(self):
195 return adapt_object(regularized_function(\ 196 self.data,self.y,self._max,bins=self.bins,range=self.range))
197
198 -class _MedianHistogram(_FunctionHistogram):
199 """Median value of the weights in each bin. 200 Takes TWO column arguments: value and weight"""
201 - def finalize(self):
202 return adapt_object(regularized_function(\ 203 self.data,self.y,numpy.median,bins=self.bins,range=self.range))
204
205 -class _ZscoreHistogram(_FunctionHistogram):
206 """Z-score of the weights in each bin abs(Y - <Y>)/std(Y). 207 Takes TWO column arguments: value and weight"""
208 - def Zscore(self,v):
209 m = v.mean() 210 s = v.std() 211 return numpy.nan_to_num( numpy.mean(numpy.abs(v - m))/s )
212
213 - def finalize(self):
214 return adapt_object(\ 215 regularized_function(self.data,self.y,self.Zscore,bins=self.bins,range=self.range))
216 217 218 # Helper functions 219
220 -def regularized_function(x,y,func,bins=None,range=None):
221 """Compute func() over data aggregated in bins. 222 223 (x,y) --> (x', func(Y')) with Y' = {y: y(x) where x in x' bin} 224 225 First the data is collected in bins x' along x and then func is applied to 226 all data points Y' that have been collected in the bin. 227 228 :Arguments: 229 x 230 abscissa values (for binning) 231 y 232 ordinate values (func is applied) 233 func 234 a numpy ufunc that takes one argument, func(Y') 235 bins 236 number or array 237 range 238 limits (used with number of bins) 239 240 :Returns: 241 F,edges 242 function and edges (midpoints = 0.5*(edges[:-1]+edges[1:])) 243 """ 244 _x = numpy.asarray(x) 245 _y = numpy.asarray(y) 246 247 # setup of bins from numpy.histogram 248 if (range is not None): 249 mn, mx = range 250 if (mn > mx): 251 raise AttributeError('max must be larger than min in range parameter.') 252 253 if not numpy.iterable(bins): 254 if range is None: 255 range = (_x.min(), _x.max()) 256 mn, mx = [float(mi) for mi in range] 257 if mn == mx: 258 mn -= 0.5 259 mx += 0.5 260 bins = numpy.linspace(mn, mx, bins+1, endpoint=True) 261 else: 262 bins = numpy.asarray(bins) 263 if (numpy.diff(bins) < 0).any(): 264 raise AttributeError('bins must increase monotonically.') 265 266 sorting_index = numpy.argsort(_x) 267 sx = _x[sorting_index] 268 sy = _y[sorting_index] 269 270 # boundaries in SORTED data that demarcate bins; position in bin_index is the bin number 271 bin_index = numpy.r_[sx.searchsorted(bins[:-1], 'left'), 272 sx.searchsorted(bins[-1], 'right')] 273 274 # naive implementation: apply operator to each chunk = sy[start:stop] separately 275 # 276 # It's not clear to me how one could effectively block this procedure (cf 277 # block = 65536 in numpy.histogram) because there does not seem to be a 278 # general way to combine the chunks for different blocks, just think of 279 # func=median 280 F = numpy.zeros(len(bins)-1) # final function 281 F[:] = [func(sy[start:stop]) for start,stop in zip(bin_index[:-1],bin_index[1:])] 282 return F,bins
283