14
14
# See the License for the specific language governing permissions and
15
15
# limitations under the License.
16
16
17
- """Typed CSV reader.
17
+ """Typed column (CSV) reader.
18
18
19
19
Based on the built-in ``csv`` module, this Generator module provides the user
20
- with the ability to load _typed_ CSV content, a CSV file with optional type
21
- specifications provided in the header (which must be supplied).
20
+ with the ability to load _typed_ CSV-like content, a text file of values
21
+ that includes a header with optional type specifications provided in the
22
+ header.
22
23
23
24
Alan Christie
24
25
October 2018
@@ -36,8 +37,8 @@ class UnknownTypeError(Error):
36
37
"""Exception raised for an unknown type in the header.
37
38
38
39
Attributes:
39
- column -- the problematic column number
40
- column_type -- The column type
40
+ column -- the problematic (1-based) column number
41
+ column_type -- The column's type value
41
42
"""
42
43
43
44
def __init__ (self , column , column_type ):
@@ -51,9 +52,9 @@ class ContentError(Error):
51
52
comply with the defined type.
52
53
53
54
Attributes:
54
- column -- the problematic column number
55
+ column -- the problematic (1-based) column number
55
56
row -- the problematic (1-based) row number
56
- value -- The value (or None is n/a)
57
+ value -- The value (or None if n/a)
57
58
message -- explanation of the error
58
59
"""
59
60
@@ -65,56 +66,69 @@ def __init__(self, column, row, value, message):
65
66
66
67
67
68
def convert_int (string_value ):
68
- """Converts string to integer (see CONVERTERS).
69
- There is a ``converter_?()`` function for each column type.
69
+ """Converts a string to integer (see CONVERTERS).
70
+ There is a converter function for each column type.
70
71
71
72
:param string_value: The string to convert
73
+
74
+ :raises: ValueError if the string cannot be represented by an int
72
75
"""
73
76
return int (string_value .strip ())
74
77
75
78
76
79
def convert_float (string_value ):
77
80
"""Converts string to float (see CONVERTERS).
78
- There is a ``converter_?()`` function for each column type.
81
+ There is a converter function for each column type.
79
82
80
83
:param string_value: The string to convert
84
+
85
+ :raises: ValueError if the string cannot be represented by a float
81
86
"""
82
87
return float (string_value .strip ())
83
88
84
89
85
90
def convert_string (string_value ):
86
91
"""String and default converter (see CONVERTERS).
87
- There is a ``converter_?()`` function for each column type.
92
+ There is a converter function for each column type.
88
93
89
94
:param string_value: The string to convert
90
95
"""
91
96
return string_value
92
97
93
98
94
- # A map of built-in column type to string conversion function.
95
- # If a column is 'name:INT' then we call 'convert_int()'
96
- # for the column values.
99
+ # A map of column type names (case-insensitive) to string conversion function.
100
+ # If a column is 'name:INT' then we call 'convert_int()' for the column values.
97
101
CONVERTERS = {'int' : convert_int ,
98
102
'float' : convert_float ,
99
103
'string' : convert_string }
100
104
101
105
102
106
class TypedColumnReader (object ):
107
+ """A generator to handle 'typed' CSV-like files, files that include
108
+ a header that may also define data types. This class supports
109
+ neo4j-like node/edge column typing where fields are annotated
110
+ with type information where each column header is of the format
111
+ ``name[:type]``.
103
112
104
- """A generator to handle 'typed' CSV-like files, files (normally)
105
- with a header that can include type information. This class supports
106
- neo4j-like column typing where field are annotated
107
- with type information. The class returns
108
- a list of values for each row in the file where, if the column header
109
- defines a type, the value is converted to that type.
113
+ As a Generator it returns a dictionary of values for each row in the file
114
+ where, if the column header defines a type, the value is converted to that
115
+ type.
110
116
111
117
There is built-in support for ``int``, ``float`` and ``string`` data types.
112
118
113
- The following is a comma-separated header for a file where the first two
114
- columns contain strings and the last two contain `int`` and ``float``
115
- types: -
119
+ As an example, the following is a comma-separated header for a file with
120
+ columns named "smiles", "comment", "hac" and "ratio" where
121
+ the first two column types are strings and the last two are
122
+ ``int`` and ``float`` types: -
123
+
124
+ "smiles,comment:string,hac:int,ratio:float"
125
+
126
+ * The ``name`` cannot be blank and must be unique.
127
+ * Whitespace is stripped from the start and end of the column ``name``
128
+ * If a column value is empty/blank the corresponding dictionary
129
+ value is ``None``
130
+
116
131
117
- "smiles,comment:string,hac:int,ratio:float"
118
132
"""
119
133
120
134
def __init__ (self , csv_file ,
@@ -123,20 +137,19 @@ def __init__(self, csv_file,
123
137
header = None ):
124
138
"""Basic initialiser.
125
139
126
- :param csvfile : The typed CSV file. csvfile can be any object which
127
- supports the iterator protocol and returns a string
128
- each time its next() method is called
140
+ :param csv_file : The typed CSV-like file. csv_file can be any object
141
+ that supports the iterator protocol and returns a string
142
+ each time its next() method is called
129
143
:param column_sep: The file column separator
130
- :param type_sep: The type separator
131
- :param header: An optional header. If provided the must not have
132
- a header line . This is provided to allow processing
133
- of exiting files that have no header. The headder
134
- must contain column names and optional types .
144
+ :param type_sep: The type separator, the character between the column
145
+ header name and its type declaration.
146
+ :param header: An optional header. This is provided to allow processing
147
+ of existing files that have no header. The header
148
+ must contain column names.
135
149
"smiles:string" would be a column named "smiles"
136
150
of type "string" and "n:int" would be a column known as
137
- "n" of type "integer". Although you can provide
138
- thew header here you are strongly
139
- encouraged to add a header line to all new files.
151
+ "n" of type "integer". When provided here the header
152
+ columns must be separated by commas.
140
153
"""
141
154
142
155
self ._csv_file = csv_file
@@ -149,16 +162,19 @@ def __init__(self, csv_file,
149
162
strict = True )
150
163
151
164
# Column value type converter functions.
152
- # An entry for each column in the file and compiled by _handle_header
153
- # using the provided header or file content oin the first iteration.
165
+ # An entry for each column in the file and compiled by _handle_header()
166
+ # using the provided header or file content on the first iteration.
154
167
self ._converters = []
155
- # The the column names extracted from the header
168
+ # The ordered list of unique column names extracted from the header
156
169
self ._column_names = []
157
170
158
171
def __iter__ (self ):
159
172
"""Return the next type-converted row from the file.
160
173
Unless the header is provided in the initialiser, the first row is
161
- expected to be a header with optional type definitions.
174
+ expected to be a header with optional type declarations.
175
+
176
+ If the column value is empty/blank the corresponding dictionary
177
+ value is None.
162
178
163
179
:returns: A dictionary of type-converted values for the next row
164
180
where the dictionary key is the name of the column
@@ -167,6 +183,7 @@ def __iter__(self):
167
183
:raises: ValueError if a column value cannot be converted
168
184
:raises: ContentError if the column value is unknown or does not
169
185
comply with the column type.
186
+ :raises: UnknownTypeError if the column type is unknown.
170
187
"""
171
188
172
189
# If we have not generated the converter array but we have been given
@@ -180,42 +197,50 @@ def __iter__(self):
180
197
181
198
# Handle the first row?
182
199
# (which defines column names and types)
200
+ # If we were given a header during initialisation
201
+ # then there's no header in the file
183
202
if not self ._converters :
184
203
self ._handle_hdr (row )
185
204
continue
186
205
187
- # Must have seen a header if we get here...
206
+ # Must have a header if we get here...
188
207
if len (self ._converters ) == 0 :
189
208
raise ContentError (1 , 1 , None , 'Missing header' )
190
209
191
210
# Construct a dictionary of row column names and values,
192
211
# applying type conversions based on the
193
- # type defined in the header....
212
+ # type defined in the header
194
213
row_content = {}
195
214
col_index = 0
196
- # Convert...
197
215
for col in row :
198
216
# Too many items in the row?
217
+ # Can't have a header with 4 columns and a file of 5
199
218
if col_index >= len (self ._converters ):
200
219
raise ContentError (col_index + 1 , self ._c_reader .line_num ,
201
220
None , 'Too many values' )
202
- try :
203
- row_content [self ._column_names [col_index ]] = \
204
- self ._converters [col_index ][1 ](col )
205
- except ValueError :
206
- raise ContentError (col_index + 1 , self ._c_reader .line_num ,
207
- col ,
208
- 'Does not comply with column type' )
221
+ lean_col = col .strip ()
222
+ col_val = None
223
+ if lean_col :
224
+ try :
225
+ col_val = self ._converters [col_index ][1 ](col )
226
+ except ValueError :
227
+ raise ContentError (col_index + 1 , self ._c_reader .line_num ,
228
+ col ,
229
+ 'Does not comply with column type' )
230
+ row_content [self ._column_names [col_index ]] = col_val
209
231
col_index += 1
210
232
211
233
yield row_content
212
234
213
235
def _handle_hdr (self , hdr ):
214
- """Given the file header line (or one provided when the class
215
- is instantiated) this method populates the self.converters array,
216
- a list of type converters indexed by column.
236
+ """Given the file header line (or one provided when the object
237
+ is instantiated) this method populates the ``self._converters`` array,
238
+ a list of type converters indexed by column position.
217
239
218
240
:param hdr: The header line.
241
+
242
+ :raises: ContentError for any formatting problems
243
+ :raises: UnknownTypeError if the type is not known
219
244
"""
220
245
221
246
column_number = 1
@@ -233,7 +258,7 @@ def _handle_hdr(self, hdr):
233
258
name , 'Duplicate column name' )
234
259
235
260
if len (cell_parts ) == 2 :
236
- column_type = cell_parts [1 ].lower ()
261
+ column_type = cell_parts [1 ].strip (). lower ()
237
262
if column_type not in CONVERTERS :
238
263
raise UnknownTypeError (column_number , column_type )
239
264
else :
0 commit comments