Skip to content

Commit 36afc67

Browse files
author
Alan Christie
committed
- TypedColumnReader now returns None for empty column values
1 parent ea3500f commit 36afc67

File tree

4 files changed

+84
-56
lines changed

4 files changed

+84
-56
lines changed

src/python/pipelines_utils/TypedColumnReader.py

Lines changed: 78 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
"""Typed CSV reader.
17+
"""Typed column (CSV) reader.
1818
1919
Based on the built-in ``csv`` module, this Generator module provides the user
20-
with the ability to load _typed_ CSV content, a CSV file with optional type
21-
specifications provided in the header (which must be supplied).
20+
with the ability to load _typed_ CSV-like content, a text file of values
21+
that include a header with optional type specifications provided in the
22+
header.
2223
2324
Alan Christie
2425
October 2018
@@ -36,8 +37,8 @@ class UnknownTypeError(Error):
3637
"""Exception raised for an unknown type in the header.
3738
3839
Attributes:
39-
column -- the problematic column number
40-
column_type -- The column type
40+
column -- the problematic (1-based) column number
41+
column_type -- The column's type value
4142
"""
4243

4344
def __init__(self, column, column_type):
@@ -51,9 +52,9 @@ class ContentError(Error):
5152
comply with the defined type.
5253
5354
Attributes:
54-
column -- the problematic column number
55+
column -- the problematic (1-based) column number
5556
row -- the problematic (1-based) row number
56-
value -- The value (or None is n/a)
57+
value -- The value (or None if n/a)
5758
message -- explanation of the error
5859
"""
5960

@@ -65,56 +66,69 @@ def __init__(self, column, row, value, message):
6566

6667

6768
def convert_int(string_value):
68-
"""Converts string to integer (see CONVERTERS).
69-
There is a ``converter_?()`` function for each column type.
69+
"""Converts a string to integer (see CONVERTERS).
70+
There is a converter function for each column type.
7071
7172
:param string_value: The string to convert
73+
74+
:raises: ValueError if the string cannot be represented by an int
7275
"""
7376
return int(string_value.strip())
7477

7578

7679
def convert_float(string_value):
7780
"""Converts a string to float (see CONVERTERS).
78-
There is a ``converter_?()`` function for each column type.
81+
There is a converter function for each column type.
7982
8083
:param string_value: The string to convert
84+
85+
:raises: ValueError if the string cannot be represented by a float
8186
"""
8287
return float(string_value.strip())
8388

8489

8590
def convert_string(string_value):
8691
"""String and default converter (see CONVERTERS).
87-
There is a ``converter_?()`` function for each column type.
92+
There is a converter function for each column type.
8893
8994
:param string_value: The string to convert
9095
"""
9196
return string_value
9297

9398

94-
# A map of built-in column type to string conversion function.
95-
# If a column is 'name:INT' then we call 'convert_int()'
96-
# for the column values.
99+
# A map of column type names (case-insensitive) to string conversion function.
100+
# If a column is 'name:INT' then we call 'convert_int()' for the column values.
97101
CONVERTERS = {'int': convert_int,
98102
'float': convert_float,
99103
'string': convert_string}
100104

101105

102106
class TypedColumnReader(object):
107+
"""A generator to handle 'typed' CSV-like files, files that include
108+
a header that may also define data types. This class supports
109+
neo4j-like node/edge column typing where fields are annotated
110+
with type information where each column header is of the format
111+
``name[:type]``.
103112
104-
"""A generator to handle 'typed' CSV-like files, files (normally)
105-
with a header that can include type information. This class supports
106-
neo4j-like column typing where field are annotated
107-
with type information. The class returns
108-
a list of values for each row in the file where, if the column header
109-
defines a type, the value is converted to that type.
113+
As a Generator it returns a dictionary of values for each row in the file
114+
where, if the column header defines a type, the value is converted to that
115+
type.
110116
111117
There is built-in support for ``int``, ``float`` and ``string`` data types.
112118
113-
The following is a comma-separated header for a file where the first two
114-
columns contain strings and the last two contain `int`` and ``float``
115-
types: -
119+
As an example, the following is a comma-separated header for a file with
120+
columns named "smiles", "comment", "hac" and "ratio" where
121+
the first two column types are strings and the last two are
122+
``int`` and ``float`` types: -
123+
124+
"smiles,comment:string,hac:int,ratio:float"
125+
126+
* The ``name`` cannot be blank and must be unique.
127+
* Whitespace is stripped from the start and end of the column ``name``
128+
* If a column value is empty/blank the corresponding dictionary
129+
value is ``None``
130+
116131
117-
"smiles,comment:string,hac:int,ratio:float"
118132
"""
119133

120134
def __init__(self, csv_file,
@@ -123,20 +137,19 @@ def __init__(self, csv_file,
123137
header=None):
124138
"""Basic initialiser.
125139
126-
:param csvfile: The typed CSV file. csvfile can be any object which
127-
supports the iterator protocol and returns a string
128-
each time its next() method is called
140+
:param csv_file: The typed CSV-like file. csv_file can be any object
141+
that supports the iterator protocol and returns a string
142+
each time its next() method is called
129143
:param column_sep: The file column separator
130-
:param type_sep: The type separator
131-
:param header: An optional header. If provided the must not have
132-
a header line. This is provided to allow processing
133-
of exiting files that have no header. The headder
134-
must contain column names and optional types.
144+
:param type_sep: The type separator, the character between the column
145+
header name and its type declaration.
146+
:param header: An optional header. This is provided to allow processing
147+
of existing files that have no header. The header
148+
must contain column names.
135149
"smiles:string" would be a column named "smiles"
136150
of type "string" and "n:int" would be a column known as
137-
"n" of type "integer". Although you can provide
138-
thew header here you are strongly
139-
encouraged to add a header line to all new files.
151+
"n" of type "integer". When provided here the header
152+
column separator must be a comma.
140153
"""
141154

142155
self._csv_file = csv_file
@@ -149,16 +162,19 @@ def __init__(self, csv_file,
149162
strict=True)
150163

151164
# Column value type converter functions.
152-
# An entry for each column in the file and compiled by _handle_header
153-
# using the provided header or file content oin the first iteration.
165+
# An entry for each column in the file and compiled by _handle_header()
166+
# using the provided header or file content on the first iteration.
154167
self._converters = []
155-
# The the column names extracted from the header
168+
# The ordered list of unique column names extracted from the header
156169
self._column_names = []
157170

158171
def __iter__(self):
159172
"""Return the next type-converted row from the file.
160173
Unless the header is provided in the initialiser, the first row is
161-
expected to be a header with optional type definitions.
174+
expected to be a header with optional type declarations.
175+
176+
If the column value is empty/blank the corresponding dictionary
177+
value is None.
162178
163179
:returns: A dictionary of type-converted values for the next row
164180
where the dictionary key is the name of the column
@@ -167,6 +183,7 @@ def __iter__(self):
167183
:raises: ValueError if a column value cannot be converted
168184
:raises: ContentError if the column value is unknown or does not
169185
comply with the column type.
186+
:raises: UnknownTypeError if the column type is unknown.
170187
"""
171188

172189
# If we have not generated the converter array but we have been given
@@ -180,42 +197,50 @@ def __iter__(self):
180197

181198
# Handle the first row?
182199
# (which defines column names and types)
200+
# If we were given a header during initialisation
201+
# then there's no header in the file
183202
if not self._converters:
184203
self._handle_hdr(row)
185204
continue
186205

187-
# Must have seen a header if we get here...
206+
# Must have a header if we get here...
188207
if len(self._converters) == 0:
189208
raise ContentError(1, 1, None, 'Missing header')
190209

191210
# Construct a dictionary of row column names and values,
192211
# applying type conversions based on the
193-
# type defined in the header....
212+
# type defined in the header
194213
row_content = {}
195214
col_index = 0
196-
# Convert...
197215
for col in row:
198216
# Too many items in the row?
217+
# Can't have a header with 4 columns and a file of 5
199218
if col_index >= len(self._converters):
200219
raise ContentError(col_index + 1, self._c_reader.line_num,
201220
None, 'Too many values')
202-
try:
203-
row_content[self._column_names[col_index]] =\
204-
self._converters[col_index][1](col)
205-
except ValueError:
206-
raise ContentError(col_index + 1, self._c_reader.line_num,
207-
col,
208-
'Does not comply with column type')
221+
lean_col = col.strip()
222+
col_val = None
223+
if lean_col:
224+
try:
225+
col_val = self._converters[col_index][1](col)
226+
except ValueError:
227+
raise ContentError(col_index + 1, self._c_reader.line_num,
228+
col,
229+
'Does not comply with column type')
230+
row_content[self._column_names[col_index]] = col_val
209231
col_index += 1
210232

211233
yield row_content
212234

213235
def _handle_hdr(self, hdr):
214-
"""Given the file header line (or one provided when the class
215-
is instantiated) this method populates the self.converters array,
216-
a list of type converters indexed by column.
236+
"""Given the file header line (or one provided when the object
237+
is instantiated) this method populates the ``self._converters`` array,
238+
a list of type converters indexed by column position.
217239
218240
:param hdr: The header line.
241+
242+
:raises: ContentError for any formatting problems
243+
:raises: UnknownTypeError if the type is not known
219244
"""
220245

221246
column_number = 1
@@ -233,7 +258,7 @@ def _handle_hdr(self, hdr):
233258
name, 'Duplicate column name')
234259

235260
if len(cell_parts) == 2:
236-
column_type = cell_parts[1].lower()
261+
column_type = cell_parts[1].strip().lower()
237262
if column_type not in CONVERTERS:
238263
raise UnknownTypeError(column_number, column_type)
239264
else:

src/python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def get_long_description():
2222
setup(
2323

2424
name='im-pipelines-utils',
25-
version='2.4.1',
25+
version='2.4.2',
2626
author='Alan Christie',
2727
author_email='[email protected]',
2828
url='https://github.com/InformaticsMatters/pipelines-utils',
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
one,two:int,three:float,four:string
2-
A string,45,46,and finally
3-
Another string,55,56,that's it
2+
A string, 45 ,,and finally
3+
Another string, 55 ,,that's it

src/python/test/python2_3/pipelines_utils/test_TypedColumnReader.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ def test_basic_example_a(self):
2222
first_row = row
2323
num_lines += 1
2424
self.assertEqual(2, num_lines)
25+
# Examine the first row...
2526
self.assertEqual('A string', first_row['one'])
27+
self.assertEqual(45, first_row['two'])
28+
self.assertEqual(None, first_row['three'])
2629
self.assertEqual('and finally', first_row['four'])
2730
csv_file.close()
2831

0 commit comments

Comments
 (0)