recsql.sqlfunctions

1 # $Id: sqlfunctions.py 3379 2009-04-21 17:57:29Z oliver $ 2 # Copyright (C) 2009 Oliver Beckstein <orbeckst@gmail.com> 3 # Released under the GNU Public License, version 3 or higher (your choice) 4 5 """ 6 :mod:`recsql.sqlfunctions` --- Functions that enhance a SQLite db 7 ================================================================= 8 9 This module contains new SQL functions to be added to a SQLite database that 10 can be used in the same way as the builtin functions. 11 12 Example: 13 14 Add the functions to an existing connection in the following way (assuming 15 that the db connection is available in ``self.connection``):: 16 17 from sqlfunctions import * 18 self.connection.create_function("sqrt", 1, _sqrt) 19 self.connection.create_function("fformat",2,_fformat) 20 self.connection.create_aggregate("std",1,_Stdev) 21 self.connection.create_aggregate("median",1,_Median) 22 self.connection.create_aggregate("array",1,_NumpyArray) 23 self.connection.create_aggregate("histogram",4,_NumpyHistogram) 24 self.connection.create_aggregate("distribution",4,_NormedNumpyHistogram) 25 self.connection.create_aggregate("meanhistogram",5,_MeanHistogram) 26 self.connection.create_aggregate("stdhistogram",5,_StdHistogram) 27 self.connection.create_aggregate("minhistogram",5,_MinHistogram) 28 self.connection.create_aggregate("maxhistogram",5,_MaxHistogram) 29 self.connection.create_aggregate("medianhistogram",5,_MedianHistogram) 30 self.connection.create_aggregate("zscorehistogram",5,_ZscoreHistogram) 31 32 Module content 33 -------------- 34 .. See the autogenerated content in the online docs or the source code. 35 36 """ 37 import numpy 38 # compatibility check: we NEED consistent 1d histogram functions: we 39 # decided to use numpy 1.x style, which returns edges, NOT lower bin edges 40 _numpyversion = map(int, numpy.version.version.split('.')) 41 if _numpyversion[0] < 1: 42 raise ImportError('Need at least numpy 1.x, only have %r' % numpy.version.version) 43 if _numpyversion[1] < 1: 44 # we want a histogram that returns edges

45 - def histogram1d(*args,**kwargs):

46 _range = kwargs.pop('range',None) 47 if not _range is None: 48 kwargs['range'] = (_range,) # needs to be a sequence 49 h,e = numpy.histogramdd(*args,**kwargs) 50 return h,e[0]

51 histogram1d.__doc__ = "1D histogram, based on numpy histogramdd; returns edges as in numpy 1.1.x\n"+\ 52 numpy.histogram.__doc__ 53 else: 54 # once deprecation for new=True sets in we can catch this here

55 - def histogram1d(*args,**kwargs):

56 kwargs['new'] = True 57 h,e = numpy.histogram(*args,**kwargs) 58 return h,e

59 histogram1d.__doc__ = numpy.histogram.__doc__ 60 61 62 from sqlutil import adapt_numpyarray, convert_numpyarray,\ 63 adapt_object, convert_object 64 65

66 -def _sqrt(x):

67 try: 68 x = float(x) 69 except TypeError: 70 return None 71 return numpy.sqrt(x)

72

73 -def _fformat(format,x):

74 return format % x

75

76 -class _Stdev(object):

77 """Implement standard deviation of the sample as SQL aggregate function. 78 (Uses N-1 variance.) 79 Do it in one pass (see eg 80 http://smallcode.weblogs.us/2006/11/27/calculate-standard-deviation-in-one-pass/ 81 though we may run in an underflow by calculating N/N-1<X^2-<X>^2>.). 82 83 Also, we don't check if our arguments are valid as numbers. 84 """

85 - def __init__(self):

86 self.x2 = 0 87 self.x = 0 88 self.n = 0

89 - def step(self,x):

90 try: 91 x = float(x) 92 self.x2 += x*x 93 self.x += x 94 self.n += 1 95 except TypeError: 96 pass # don't contribute to average

97 - def finalize(self):

98 if self.n<2: return 0.0 99 return numpy.sqrt((self.n*self.x2 - self.x*self.x)/(self.n*(self.n-1)))

100

101 -class _Median(object):

102 - def __init__(self):

103 self.data = []

104 - def step(self,x):

105 try: 106 x = float(x) 107 self.data.append(x) 108 except TypeError: 109 pass # don't contribute

110 - def finalize(self):

111 return numpy.median(self.data)

112

113 -class _NumpyArray(object):

114 - def __init__(self):

115 self.data = []

116 - def step(self,x):

117 self.data.append(x)

118 - def finalize(self):

119 return adapt_numpyarray(numpy.array(self.data))

120

121 -class _NumpyHistogram(object):

122 - def __init__(self):

123 self.is_initialized = False 124 self.data = []

125 - def step(self,x,bins,xmin,xmax):

126 if not self.is_initialized: 127 self.bins = bins 128 self.range = (xmin,xmax) 129 self.is_initialized = True 130 self.data.append(x)

131 - def finalize(self):

132 hist,edges = histogram1d(self.data,bins=self.bins,range=self.range, 133 normed=False) 134 return adapt_object((hist,edges))

135

136 -class _NormedNumpyHistogram(_NumpyHistogram):

137 - def finalize(self):

138 hist,edges = histogram1d(self.data,bins=self.bins,range=self.range, 139 normed=True) 140 return adapt_object((hist,edges))

141

142 -class _FunctionHistogram(_NumpyHistogram):

143 """Baseclass for histogrammed functions. 144 145 A histogrammed function is created by applying a function 146 to all values y that have been accumulated in a bin x. 147 """

148 - def __init__(self):

149 _NumpyHistogram.__init__(self) 150 self.y = []

151 - def step(self,x,y,bins,xmin,xmax):

152 _NumpyHistogram.step(self,x,bins,xmin,xmax) 153 self.y.append(y)

154 - def finalize(self):

155 raise NotImplementedError("_FunctionHistogram must be inherited from.")

156 # return adapt_object( (...,...,...) ) 157

158 -class _MeanHistogram(_FunctionHistogram):

159 """Mean of the weights in each bin. 160 Takes TWO column arguments: value and weight"""

161 - def finalize(self):

162 return adapt_object(regularized_function(\ 163 self.data,self.y,numpy.mean,bins=self.bins,range=self.range))

164

165 -class _StdHistogram(_FunctionHistogram):

166 """Standard deviation of the weights in each bin. 167 Takes TWO column arguments: value and weight"""

168 - def finalize(self):

169 return adapt_object(regularized_function(\ 170 self.data,self.y,numpy.std,bins=self.bins,range=self.range))

171

172 -class _MinHistogram(_FunctionHistogram):

173 """Min value of the weights in each bin. 174 Takes TWO column arguments: value and weight"""

175 - def _min(self,v):

176 try: 177 return numpy.min(v) 178 except ValueError: # empty array 179 return numpy.nan

180

181 - def finalize(self):

182 return adapt_object(regularized_function(\ 183 self.data,self.y,self._min,bins=self.bins,range=self.range))

184

185 -class _MaxHistogram(_FunctionHistogram):

186 """Max value of the weights in each bin. 187 Takes TWO column arguments: value and weight"""

188 - def _max(self,v):

189 try: 190 return numpy.max(v) 191 except ValueError: # empty array 192 return numpy.nan

193

194 - def finalize(self):

195 return adapt_object(regularized_function(\ 196 self.data,self.y,self._max,bins=self.bins,range=self.range))

197

198 -class _MedianHistogram(_FunctionHistogram):

199 """Median value of the weights in each bin. 200 Takes TWO column arguments: value and weight"""

201 - def finalize(self):

202 return adapt_object(regularized_function(\ 203 self.data,self.y,numpy.median,bins=self.bins,range=self.range))

204

205 -class _ZscoreHistogram(_FunctionHistogram):

206 """Z-score of the weights in each bin abs(Y - <Y>)/std(Y). 207 Takes TWO column arguments: value and weight"""

208 - def Zscore(self,v):

209 m = v.mean() 210 s = v.std() 211 return numpy.nan_to_num( numpy.mean(numpy.abs(v - m))/s )

212

213 - def finalize(self):

214 return adapt_object(\ 215 regularized_function(self.data,self.y,self.Zscore,bins=self.bins,range=self.range))

216 217 218 # Helper functions 219

220 -def regularized_function(x,y,func,bins=None,range=None):

221 """Compute func() over data aggregated in bins. 222 223 (x,y) --> (x', func(Y')) with Y' = {y: y(x) where x in x' bin} 224 225 First the data is collected in bins x' along x and then func is applied to 226 all data points Y' that have been collected in the bin. 227 228 :Arguments: 229 x 230 abscissa values (for binning) 231 y 232 ordinate values (func is applied) 233 func 234 a numpy ufunc that takes one argument, func(Y') 235 bins 236 number or array 237 range 238 limits (used with number of bins) 239 240 :Returns: 241 F,edges 242 function and edges (midpoints = 0.5*(edges[:-1]+edges[1:])) 243 """ 244 _x = numpy.asarray(x) 245 _y = numpy.asarray(y) 246 247 # setup of bins from numpy.histogram 248 if (range is not None): 249 mn, mx = range 250 if (mn > mx): 251 raise AttributeError('max must be larger than min in range parameter.') 252 253 if not numpy.iterable(bins): 254 if range is None: 255 range = (_x.min(), _x.max()) 256 mn, mx = [float(mi) for mi in range] 257 if mn == mx: 258 mn -= 0.5 259 mx += 0.5 260 bins = numpy.linspace(mn, mx, bins+1, endpoint=True) 261 else: 262 bins = numpy.asarray(bins) 263 if (numpy.diff(bins) < 0).any(): 264 raise AttributeError('bins must increase monotonically.') 265 266 sorting_index = numpy.argsort(_x) 267 sx = _x[sorting_index] 268 sy = _y[sorting_index] 269 270 # boundaries in SORTED data that demarcate bins; position in bin_index is the bin number 271 bin_index = numpy.r_[sx.searchsorted(bins[:-1], 'left'), 272 sx.searchsorted(bins[-1], 'right')] 273 274 # naive implementation: apply operator to each chunk = sy[start:stop] separately 275 # 276 # It's not clear to me how one could effectively block this procedure (cf 277 # block = 65536 in numpy.histogram) because there does not seem to be a 278 # general way to combine the chunks for different blocks, just think of 279 # func=median 280 F = numpy.zeros(len(bins)-1) # final function 281 F[:] = [func(sy[start:stop]) for start,stop in zip(bin_index[:-1],bin_index[1:])] 282 return F,bins

283

Source Code for Module recsql.sqlfunctions