Skip to content

Commit 3c7bb96

Browse files
author
Alan Christie
committed
- Adds a TypedColumnReader and associated csv-like test files
- Prepares for utils 2.4.0
1 parent da2f7fd commit 3c7bb96

11 files changed

+379
-1
lines changed
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2017 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""Typed CSV reader.
18+
19+
Based on the built-in ``csv`` module, this module provides the user with
20+
the ability to load _typed_ CSV content, a CSV file with optional type
21+
specifications provided in the header (which must be supplied).
22+
23+
Alan Christie
24+
October 2018
25+
"""
26+
27+
import csv
28+
import gzip
29+
30+
31+
class Error(Exception):
    """Root of this module's exception hierarchy.

    Catch this type to handle any error raised by the typed CSV reader.
    """
34+
35+
36+
class UnknownType(Error):
    """Raised when a header cell names a column type that is not supported.

    Attributes:
        column -- the (1-based) number of the offending column
        column_type -- the unrecognised type name found in the header
    """

    def __init__(self, column, column_type):
        self.column, self.column_type = column, column_type
47+
48+
49+
class ContentError(Error):
    """Raised when the content of the CSV file is not acceptable.

    Typical causes are a value that cannot be converted to the type
    declared for its column, or a row/header cell with an unexpected shape.

    Attributes:
        column -- the problematic column number
        row -- the problematic (1-based) row number
        value -- the offending value (or None if not applicable)
        message -- explanation of the error
    """

    def __init__(self, column, row, value, message):
        self.column, self.row = column, row
        self.value, self.message = value, message
66+
67+
68+
def convert_int(string_value):
    """Return the integer represented by a string (see CONVERTERS).

    Every supported column type has a matching ``convert_<type>()``
    function; this is the one used for ``int`` columns.

    :param string_value: The string to convert; surrounding whitespace
                         is ignored
    :returns: The value as an ``int``
    :raises: ValueError if the text is not a valid integer
    """
    trimmed = string_value.strip()
    return int(trimmed)
75+
76+
77+
def convert_float(string_value):
    """Return the float represented by a string (see CONVERTERS).

    Every supported column type has a matching ``convert_<type>()``
    function; this is the one used for ``float`` columns.

    :param string_value: The string to convert; surrounding whitespace
                         is ignored
    :returns: The value as a ``float``
    :raises: ValueError if the text is not a valid floating-point number
    """
    trimmed = string_value.strip()
    return float(trimmed)
84+
85+
86+
def convert_string(string_value):
    """Pass a string value through unchanged (see CONVERTERS).

    This is the converter for ``string`` columns and the default for
    columns whose header gives no type. No stripping or other
    normalisation is applied.

    :param string_value: The value read from the file
    :returns: ``string_value``, exactly as supplied
    """
    return string_value
93+
94+
95+
# Lookup table from (lower-case) column type name to the function that
# converts a raw cell string into that type. For example, a header cell
# of 'name:INT' selects 'convert_int()' for every value in that column.
CONVERTERS = {
    'int': convert_int,
    'float': convert_float,
    'string': convert_string,
}
101+
102+
103+
class TypedColumnReader(object):
    """A reader for 'typed' CSV-like files.

    The file (normally) starts with a header line whose cells may carry
    neo4j-like type annotations of the form ``name<type_sep>type``.
    Iterating over the reader yields one list per data row where each
    value has been converted according to its column's declared type.
    Columns without a type are treated as ``string``.

    There is built-in support for ``int``, ``float`` and ``string`` types.

    The following is a comma-separated header for a file where the first two
    columns contain strings and the last two contain ``int`` and ``float``
    types: -

        "smiles,comment:string,hac:int,ratio:float"

    The reader owns the underlying file handle. Release it explicitly with
    ``close()`` or by using the reader as a context manager; as a last
    resort it is also released when the object is finalised.
    """

    def __init__(self, filename,
                 column_sep='\t',
                 type_sep=':',
                 header=None):
        """Basic initialiser.

        :param filename: The typed CSV file name. Names ending in ``.gz``
                         are opened as gzip-compressed text.
        :param column_sep: The file column separator
        :param type_sep: The separator between a column's name and its type
        :param header: An optional (always comma-separated) header. If
                       provided the file must not have a header line.
                       This is provided to allow processing of existing
                       files that have no header. You are strongly
                       encouraged to create new files with a header.
        """
        # Assigned before anything that can fail so that close() and
        # __del__ are safe even when the file cannot be opened.
        self._csv_file = None

        # Column value type converter functions.
        # One [name, converter] entry per column, compiled by _handle_hdr
        # from the provided header or from the first row of the file
        # on the first iteration.
        self._converters = []

        self._filename = filename
        self._type_sep = type_sep
        self._header = header

        # Open the CSV file (which may be compressed)
        if filename.endswith('.gz'):
            self._csv_file = gzip.open(filename, 'rt')
        else:
            self._csv_file = open(filename, 'rt')
        self._c_reader = csv.reader(self._csv_file,
                                    delimiter=column_sep,
                                    skipinitialspace=True,
                                    strict=True)

    def __enter__(self):
        """Enter a ``with`` block; the reader itself is the managed object."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving a ``with`` block.

        Exceptions raised inside the block are never suppressed.
        """
        self.close()
        return False

    def __iter__(self):
        """Yield each type-converted row from the file.

        Unless a header was supplied to the initialiser, the first row is
        expected to be a header with optional type definitions and is
        consumed without being yielded.

        :returns: A generator of lists, one list of type-converted values
                  per data row

        :raises: ContentError if a row has more values than the header has
                 columns, or a value does not comply with its column type
        :raises: UnknownType if the header names an unsupported type
        """
        # If we have not generated the converter array but we have been given
        # a header then now's the time to build the list of type converters.
        # A specified header is always comma-separated, regardless of
        # the separator used in the file.
        if not self._converters and self._header:
            self._handle_hdr(self._header.split(','))

        for row in self._c_reader:

            # No converters yet? Then this row is the header
            # (which defines column names and types).
            if not self._converters:
                self._handle_hdr(row)
                continue

            # Construct a list of row column values, applying the
            # conversion selected by the header for each column.
            num_columns = len(self._converters)
            row_values = []
            for col_index, col in enumerate(row):
                # Too many items in the row?
                if col_index >= num_columns:
                    raise ContentError(col_index + 1,
                                       self._c_reader.line_num,
                                       None, 'Too many values')
                converter = self._converters[col_index][1]
                try:
                    row_values.append(converter(col))
                except ValueError:
                    raise ContentError(col_index + 1,
                                       self._c_reader.line_num,
                                       col,
                                       'Does not comply with column type')

            yield row_values

    def _handle_hdr(self, hdr):
        """Given the cells of the header line (from the file or the one
        provided when the class was instantiated) this method populates
        ``self._converters``, a list of ``[name, converter]`` pairs
        indexed by column.

        The list is only extended once the whole header has been validated,
        so a bad header never leaves a partial set of converters behind.

        :param hdr: The header, as a list of cell strings.

        :raises: ContentError if a cell contains more than one type separator
        :raises: UnknownType if a cell names a type not in CONVERTERS
        """
        parsed = []
        for column_number, cell in enumerate(hdr, 1):
            cell_parts = cell.split(self._type_sep)
            if len(cell_parts) not in (1, 2):
                raise ContentError(column_number, self._c_reader.line_num,
                                   cell,
                                   'Expected name and type (up to 2 items)')
            name = cell_parts[0]
            if len(cell_parts) == 2:
                # Type names are matched case-insensitively
                column_type = cell_parts[1].lower()
                if column_type not in CONVERTERS:
                    raise UnknownType(column_number, column_type)
            else:
                # Unspecified - assume built-in 'string'
                column_type = 'string'
            parsed.append([name, CONVERTERS[column_type]])

        self._converters.extend(parsed)

    def close(self):
        """Close the underlying file, if it is open.

        Safe to call more than once, and safe on an instance whose
        initialiser did not complete.
        """
        csv_file = getattr(self, '_csv_file', None)
        self._csv_file = None
        if csv_file is not None:
            csv_file.close()

    def __del__(self):
        """Delete method. Releases the file if it is still open."""
        self.close()

src/python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def get_long_description():
2222
setup(
2323

2424
name='im-pipelines-utils',
25-
version='2.3.1',
25+
version='2.4.0',
2626
author='Alan Christie',
2727
author_email='[email protected]',
2828
url='https://github.com/InformaticsMatters/pipelines-utils',
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
A string,45,46,and finally
2+
Another string,55,56,that's it
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
one,two:int,three:float,four:string
2+
A string,45,46,and finally
3+
Another string,55,56,that's it
130 Bytes
Binary file not shown.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
one,two:int,three:float,four:unknown-type
2+
A string,45,46,and finally
3+
Another string,55,56,that's it
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
one,two:int,three:float,four:unknown-type:too-many-colons
2+
A string,45,46,and finally
3+
Another string,55,56,that's it
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
one:int
2+
A string
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
one two:int three:float four:string
2+
A string 45 46 and finally
3+
Another string 55 56 that's it
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
one,two
2+
String 1,string 2, string 3

0 commit comments

Comments
 (0)