1
2 """
3 :mod:`recsql.rest_table` --- Parse a simple reST table
4 ======================================================
5
6 Turn a `restructured text simple table`_ into a numpy array. See the Example_
below for what the table must look like. The module allows inclusion of
8 parameters and data in the documentation itself in a natural way. Thus the
9 parameters are automatically documented and only exist in a single place. The
10 idea is inspired by `literate programming`_ and is embodied by the DRY_ ("Do not
11 repeat yourself") principle.
12
13 .. _restructured text simple table:
14 http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#simple-tables
15 .. _literate programming:
16 http://en.wikipedia.org/wiki/Literate_programming
17 .. _DRY:
18 http://c2.com/cgi/wiki?DontRepeatYourself
19
20 Limitations
21 -----------
22
Note that the full specification of the original `restructured
text simple table`_ is not supported. In order to keep the parser simple,
the following additional restrictions apply:
26
27 * All row data must be on a single line.
28 * Column spans are not supported.
29 * Headings must be single legal SQL and python words as they are used
30 as column names.
31 * The delimiters are used to extract the fields. Only data within the
32 range of the '=====' markers is used. Thus, each column marker
33 *must* span the whole range of input. Otherwise, data will be lost.
34 * The keyword 'Table' must precede the first marker line and the table
35 name must be provided in square brackets; the table name should be a
36 valid SQL identifier.
37 * Currently, only a *single* table can be present in the string.
38 * Autoconversion of list fields might not always work...
39
40
41 Example
42 -------
43
44 The following table is converted::
45
46 Table[laureates]: Physics Nobel prize statistics.
47 ============= ========== =========
48 name age year
49 ============= ========== =========
50 A. Einstein 42 1921
51 P. Dirac 31 1933
52 R. P. Feynman 47 1965
53 ============= ========== =========
54
55 with
56
57 >>> import recsql.rest_table as T
58 >>> P = T.Table2array(T.__doc__)
59 >>> P.recarray()
60 rec.array([(u'A. Einstein', 42, 1921), (u'P. Dirac', 31, 1933),
61 (u'R. P. Feynman', 47, 1965)],
62 dtype=[('name', '<U52'), ('age', '<i4'), ('year', '<i4')])
63
64
65 Module content
66 --------------
67
68 The only class that the user really needs to know anything about is
69 :class:`recsql.rest_table.Table2array`.
70
71 .. autoclass:: Table2array
72 :members: __init__, recarray
73
74 .. autoexception:: ParseError
75
76 """
77
78 import re
79 import numpy
80 import convert
81
82
83
84
85
# Pattern for one complete table: a 'Table[name]: title' line followed by a
# top rule, the column names, a mid rule, the data rows and a bottom rule.
# Written as a raw string so that the regex escapes (\[, \w, \s, ...) reach
# the regex engine untouched instead of being (invalid) string escapes.
TABLE = re.compile(r"""
    ^[ \t]*Table(\[(?P<name>\w*)\])?:\s*(?P<title>[^\n]*)[ \t]*$  # 'Table' and ':' are required, '[name]' is optional here
    [\n]+
    ^(?P<toprule>[ \t]*==+[ \t=]+)[ \t]*$                         # top rule
    [\n]+
    ^(?P<fields>[\w\t ]+?)$                                       # field names (columns), must only contain A-z0-9_
    [\n]+
    ^(?P<midrule>[ \t]*==+[ \t=]+)[ \t]*$                         # mid rule
    [\n]+
    (?P<data>.*?)                                                 # all data across multiple lines
    [\n]+
    ^(?P<botrule>[ \t]*==+[ \t=]+)[ \t]*$                         # bottom rule
    """, re.VERBOSE | re.DOTALL | re.MULTILINE)
99
100
# Matches data lines that carry no record: blank lines and '----' dividers.
# Raw string so that \s is a regex escape rather than an invalid string escape.
EMPTY_ROW = re.compile(r"""
    ^[-\s]*$       # white-space lines or '----' dividers are ignored (or '-- - ---')
    """, re.VERBOSE)
104
105
class ParseError(Exception):
    """Signifies a failure to parse."""
108
class Table2array(object):
    """Primitive parser that converts a simple reST table into ``numpy.recarray``.

    The table must be the only table in the text. It must look similar to the
    example below (variable parts in angle brackets, optional in double
    brackets, everything else must be there, matching is case sensitive, '....'
    signifies repetition in kind)::

      Table[<NAME>]: <<CAPTION>>
      ============  ===========  ======================  ....
      <COLNAME 1>   <COLNAME 2>  ....                    ....
      ============  ===========  ======================  ....
      <VALUE>       <VALUE>      <VALUE> <VALUE> ....
      ....
      ....
      ============  ===========  ======================  ....

    Rows may *not* span multiple lines. The column names must be single words
    and legal python names (no spaces, no dots, not starting with a number).

    Field values are converted to one of the following python types: *int*,
    *float*, or *str*.

    If a value is quoted with single or double quotation marks then the
    outermost quotation marks are stripped and the enclosed value is treated
    as a string.

    .. Note:: Values such as 001 must be quoted as '001' or they will be
              interpreted as integers (1 in this case).
    """

    def __init__(self, string=None, **kwargs):
        """Table2array(string) --> parser

        :Arguments:
           *string*
              string to be parsed
           *filename*
              read from *filename* instead of string
           *autoconvert*
              EXPERIMENTAL. ``True``: replace certain values
              with special python values (see :class:`convert.Autoconverter`) and possibly
              split values into lists (see *sep*).
              ``False``: leave everything as it is (numbers as numbers and strings
              as strings).
           *mode*
              mode of the :class:`~convert.Autoconverter`
           *sep*
              If set and *autoconvert* = ``True`` then split field values on the
              separator (using :func:`split`) before possible autoconversion.
              (NOT WORKING PROPERLY YET)

        :Raises:
           :exc:`ParseError` if no table is found or its rules/columns are
           inconsistent; :exc:`TypeError` if neither *string* nor *filename*
           is given.
        """
        self.filename = kwargs.pop('filename', None)
        if self.filename:
            # Read as text: the TABLE pattern and the column slicing in
            # parse() both operate on str, not on bytes.
            with open(self.filename, 'r') as f:
                string = f.read()
        if string is None:
            # Fail here with a clear message instead of inside the regex engine.
            raise TypeError("Table2array requires either a string or a filename")
        self.string = string
        m = TABLE.search(string)
        if m is None:
            raise ParseError('Table cannot be parsed.')
        self.t = m.groupdict()

        self.tablename = self.t['name']

        self.caption = self.t['title']

        self.records = None
        self.names = None
        # All remaining keyword arguments configure the value converter.
        self.autoconvert = convert.Autoconverter(**kwargs).convert

        self.parse()

    def parse(self):
        """Parse the table data string into records.

        Sets :attr:`records` to a list with one tuple per data row; every
        field is cut out of the row by column position (see
        :meth:`parse_fields`) and passed through :attr:`autoconvert`.
        Blank lines and '----' divider lines are skipped.
        """
        self.parse_fields()
        convert_value = self.autoconvert
        self.records = [
            # end positions are inclusive, hence the +1
            tuple(convert_value(line[start:end + 1]) for start, end in self.fields)
            for line in self.t['data'].split('\n')
            if not EMPTY_ROW.match(line)
        ]

    def recarray(self):
        """Return a recarray from the (parsed) string.

        The records are first handed to :func:`numpy.rec.fromrecords`. If
        that raises :exc:`ValueError`, a structured dtype is built by hand:
        a column whose values all have the same python type uses that type,
        any other column is stored as ``object``. The result of this
        fallback is passed through ``convert.to_int64``.
        """
        if self.records is None:
            self.parse()
        try:
            return numpy.rec.fromrecords(self.records, names=self.names)
        except ValueError:
            # One tuple of python types per column.
            type_columns = zip(*[[type(value) for value in record]
                                 for record in self.records])
            field_types = [
                column[0] if all(t is column[0] for t in column) else object
                for column in type_columns
            ]
            # numpy.dtype needs a real list of (name, type) pairs.
            dtype = numpy.dtype(list(zip(self.names, field_types)))
            # NOTE(review): a bare ``str`` field type may give numpy a
            # zero-width string field -- confirm text columns survive this path.
            retval = numpy.array(self.records, dtype=dtype)
            res = retval.view(numpy.recarray)
            return convert.to_int64(res)

    def parse_fields(self):
        """Determine the start and end columns and names of the fields.

        Sets :attr:`names` (list of column names) and :attr:`fields` (list
        of ``(start, end)`` character positions, *end* inclusive) from the
        rule lines of the matched table.

        :Raises: :exc:`ParseError` if the three rules differ or if the number
                 of column names does not match the number of rule columns.
        """
        rule = self.t['toprule'].rstrip()
        if not (rule == self.t['midrule'].rstrip() and rule == self.t['botrule'].rstrip()):
            raise ParseError("Table rules differ from each other (check white space).")
        names = self.t['fields'].split()
        nfields = len(rule.split())
        if nfields != len(names):
            raise ParseError("number of field names (%d) does not match number of fields (%d)"
                             % (len(names), nfields))
        fields = []
        last = len(rule) - 1
        in_field = False
        start_field = 0
        for pos, char in enumerate(rule):
            if not in_field and char == '=':
                start_field = pos
                in_field = True
            # A column ends at the first blank after its '=' run (the blank
            # is included in the slice) or at the last character of the rule.
            # Tabs count as blanks, as they do for TABLE and rule.split().
            if in_field and (char in ' \t' or pos == last):
                fields.append((start_field, pos))
                in_field = False
        self.names = names
        self.fields = fields
249