|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2017 Informatics Matters Ltd. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +"""Typed CSV reader. |
| 18 | +
|
| 19 | +Based on the built-in ``csv`` module, this module provides the user with |
| 20 | +the ability to load _typed_ CSV content, a CSV file with optional type |
| 21 | +specifications provided in the header (which must be supplied). |
| 22 | +
|
| 23 | +Alan Christie |
| 24 | +October 2018 |
| 25 | +""" |
| 26 | + |
| 27 | +import csv |
| 28 | +import gzip |
| 29 | + |
| 30 | + |
class Error(Exception):
    """Root of this module's exception hierarchy.

    Catching this type handles every module-specific exception
    that derives from it.
    """
| 34 | + |
| 35 | + |
class UnknownType(Error):
    """Exception raised for an unknown type in the header.

    Attributes:
        column -- the problematic column number
        column_type -- the unrecognised column type
    """

    def __init__(self, column, column_type):
        # Pass the original arguments to the base class so that ``args``
        # mirrors the constructor signature (keeps copy/pickle working).
        super(UnknownType, self).__init__(column, column_type)
        self.column = column
        self.column_type = column_type

    def __str__(self):
        # Without this, str() would only show the raw argument tuple.
        return 'Unknown column type {!r} in column {}'.format(
            self.column_type, self.column)
| 47 | + |
| 48 | + |
class ContentError(Error):
    """Exception raised for CSV content errors.
    This is raised if the column value is unknown or does not
    comply with the defined type.

    Attributes:
        column -- the problematic column number
        row -- the problematic (1-based) row number
        value -- the offending value (or None if not applicable)
        message -- explanation of the error
    """

    def __init__(self, column, row, value, message):
        # Pass the original arguments to the base class so that ``args``
        # mirrors the constructor signature (keeps copy/pickle working).
        super(ContentError, self).__init__(column, row, value, message)
        self.column = column
        self.row = row
        self.value = value
        self.message = message

    def __str__(self):
        # Surface the explanation and location; without this, str()
        # would only show the raw argument tuple.
        return '{} (row {}, column {}, value {!r})'.format(
            self.message, self.row, self.column, self.value)
| 66 | + |
| 67 | + |
def convert_int(string_value):
    """Convert a column value to an ``int`` (see CONVERTERS).

    Leading and trailing whitespace is removed before the conversion.

    :param string_value: The string to convert
    :returns: The integer value of the string
    :raises: ValueError if the stripped string is not a valid integer
    """
    trimmed = string_value.strip()
    return int(trimmed)
| 75 | + |
| 76 | + |
def convert_float(string_value):
    """Convert a column value to a ``float`` (see CONVERTERS).

    Leading and trailing whitespace is removed before the conversion.

    :param string_value: The string to convert
    :returns: The floating-point value of the string
    :raises: ValueError if the stripped string is not a valid float
    """
    trimmed = string_value.strip()
    return float(trimmed)
| 84 | + |
| 85 | + |
def convert_string(string_value):
    """Pass-through converter for string columns (see CONVERTERS).

    The value is handed back exactly as received; no whitespace is
    stripped and no copy is made.

    :param string_value: The string to convert
    :returns: The unmodified input value
    """
    unchanged = string_value
    return unchanged
| 93 | + |
| 94 | + |
# Built-in column types, each mapped to the function that converts a
# column's text into that type. A column declared as 'name:INT' has its
# values passed through 'convert_int()'.
CONVERTERS = dict(int=convert_int,
                  float=convert_float,
                  string=convert_string)
| 101 | + |
| 102 | + |
class TypedColumnReader(object):

    """A class to handle 'typed' CSV-like files, files (normally) with a header
    that can include type information. This class supports
    neo4j-like column typing where fields are annotated
    with type information. Iterating over an instance yields
    a list of values for each row in the file where, if the column header
    defines a type, the value is converted to that type.

    There is built-in support for ``int``, ``float`` and ``string`` data types.

    The following is a comma-separated header for a file where the first two
    columns contain strings and the last two contain ``int`` and ``float``
    types: -

    "smiles,comment:string,hac:int,ratio:float"

    The underlying file is opened by the initialiser. Release it explicitly
    with ``close()`` or by using the reader as a context manager; the
    finaliser closes it as a last resort.
    """

    def __init__(self, filename,
                 column_sep='\t',
                 type_sep=':',
                 header=None):
        """Basic initialiser. Opens the file (transparently handling
        ``.gz`` files) ready for iteration.

        :param filename: The typed CSV file name
        :param column_sep: The file column separator
        :param type_sep: The type separator
        :param header: An optional header. If provided the file must not
                       have a header line. This is provided to allow
                       processing of existing files that have no header.
                       You are strongly encouraged to create new files
                       with a header.
        """

        # Assigned before the file is opened so that close() and __del__
        # remain safe if the open below raises.
        self._csv_file = None

        self._filename = filename
        self._type_sep = type_sep
        self._header = header

        # Open the CSV file (which may be compressed)
        if filename.endswith('.gz'):
            self._csv_file = gzip.open(filename, 'rt')
        else:
            self._csv_file = open(filename, 'rt')
        self._c_reader = csv.reader(self._csv_file,
                                    delimiter=column_sep,
                                    skipinitialspace=True,
                                    strict=True)

        # Column value type converter functions.
        # An entry ([name, converter]) for each column in the file,
        # compiled by _handle_hdr using the provided header or the
        # file content on the first iteration.
        self._converters = []

    def __enter__(self):
        """Context manager entry.

        :returns: This reader
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Context manager exit. Closes the file; exceptions raised in
        the ``with`` body are not suppressed.
        """
        self.close()
        return False

    def __iter__(self):
        """Yield each type-converted row from the file.
        Unless a header was given to the initialiser, the first row is
        expected to be a header with optional type definitions.

        :returns: A generator of lists of type-converted values,
                  one list per row

        :raises: ContentError if a row has more values than the header
                 has columns or a value does not comply with the
                 column type (a converter's ValueError is translated).
        :raises: UnknownType if the header names an unknown type.
        """

        # If we have not generated the converter array but we have been given
        # a header then now's the time to build the list of type converters.
        # A specified header is always comma-separated, regardless of
        # the separator used in the file.
        if not self._converters and self._header:
            self._handle_hdr(self._header.split(','))

        for row in self._c_reader:

            # Handle the first row?
            # (which defines column names and types)
            if not self._converters:
                self._handle_hdr(row)
                continue

            # Construct a list of row column values,
            # applying type conversions based on the
            # type defined in the header....
            row_values = []
            for col_index, col in enumerate(row):
                # Too many items in the row?
                if col_index >= len(self._converters):
                    raise ContentError(col_index + 1, self._c_reader.line_num,
                                       None, 'Too many values')
                try:
                    row_values.append(self._converters[col_index][1](col))
                except ValueError:
                    raise ContentError(col_index + 1, self._c_reader.line_num,
                                       col,
                                       'Does not comply with column type')

            yield row_values

    def _handle_hdr(self, hdr):
        """Given the file header line (or one provided when the class
        is instantiated) this method populates the self._converters array,
        a list of [name, converter] pairs indexed by column.

        :param hdr: The header line, as a list of header cells.

        :raises: ContentError if a cell has more than one type separator
        :raises: UnknownType if a cell names a type not in CONVERTERS
        """

        # Build into a local list and only publish it once the whole
        # header has been validated. A header rejected part-way must not
        # leave a partial converter list behind, as a later iteration
        # would mistake it for a complete header.
        converters = []
        for column_number, cell in enumerate(hdr, 1):
            cell_parts = cell.split(self._type_sep)
            if len(cell_parts) not in [1, 2]:
                raise ContentError(column_number, self._c_reader.line_num,
                                   cell, 'Expected name and type (up to 2 items)')
            name = cell_parts[0]
            if len(cell_parts) == 2:
                column_type = cell_parts[1].lower()
                if column_type not in CONVERTERS:
                    raise UnknownType(column_number, column_type)
            else:
                # Unspecified - assume built-in 'string'
                column_type = 'string'
            converters.append([name, CONVERTERS[column_type]])
        self._converters.extend(converters)

    def close(self):
        """Close the underlying file. Safe to call more than once, and
        safe on an instance whose initialiser failed to open the file.
        """
        # getattr: the attribute is absent if the instance was never
        # initialised (the finaliser still runs on such objects).
        csv_file = getattr(self, '_csv_file', None)
        if csv_file is not None:
            csv_file.close()
        self._csv_file = None

    def __del__(self):
        """Delete method. Closes the file if it is still open.
        """
        self.close()
0 commit comments