forked from biolab/orange3
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathio_util.py
More file actions
209 lines (176 loc) · 7.26 KB
/
io_util.py
File metadata and controls
209 lines (176 loc) · 7.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import subprocess
from collections import defaultdict
import numpy as np
from chardet.universaldetector import UniversalDetector
from Orange.data import (
is_discrete_values, MISSING_VALUES, Variable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
)
from Orange.misc.collections import natural_sorted
__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
"guess_data_type", "sanitize_variable"]
class Compression:
    """File-name suffixes of the compression formats that are recognised."""
    GZIP, BZIP2, XZ = '.gz', '.bz2', '.xz'
    # Kept as a tuple so it can be handed straight to ``str.endswith``.
    all = GZIP, BZIP2, XZ
def open_compressed(filename, *args, _open=open, **kwargs):
    """
    Return seamlessly decompressed open file handle for `filename`.

    Parameters
    ----------
    filename : str or file-like
        A path to open. Anything that is not a ``str`` is assumed to be an
        already opened file and is returned unchanged.
    *args, **kwargs
        Forwarded to the opener (e.g. the mode).
    _open : callable
        Opener used when the name carries none of the known compression
        suffixes; replaced by the matching ``gzip`` / ``bz2`` / ``lzma``
        opener otherwise.
    """
    if not isinstance(filename, str):
        # Not a path: already a file, just pass it through
        return filename

    # The compression modules are imported lazily, only when needed.
    if filename.endswith(Compression.GZIP):
        import gzip
        _open = gzip.open
    elif filename.endswith(Compression.BZIP2):
        import bz2
        _open = bz2.open
    elif filename.endswith(Compression.XZ):
        import lzma
        _open = lzma.open
    return _open(filename, *args, **kwargs)
def detect_encoding(filename):
    """
    Detect encoding of `filename`, which can be a ``str`` filename, a
    ``file``-like object, or ``bytes``.

    Parameters
    ----------
    filename : str, bytes or file-like
        * ``str``: a path; uncompressed files are first probed with the
          ``file`` utility, then (also for compressed files) with chardet.
        * ``bytes``: the data itself; at most the first ``MAX_BYTES`` are
          examined.
        * an object with an ``encoding`` attribute: that attribute is
          returned as is.
        * any other object: treated as a binary file-like object with a
          ``read`` method.

    Returns
    -------
    encoding : str
        Name of the detected encoding. When reading from a file, ``'utf-8'``
        is returned if chardet's confidence is below 0.85. For ``bytes``
        input chardet's guess is returned without a confidence check.
    """
    # Try with Unix file utility first because it's faster (~10ms vs 100ms)
    if isinstance(filename, str) and not filename.endswith(Compression.all):
        try:
            # ``run`` drains stdout while waiting for the process, so the
            # child can never block on a full pipe (unlike ``wait()``
            # followed by ``read()``). The ``--`` ends option parsing so a
            # file name starting with ``-`` is not taken for a switch.
            completed = subprocess.run(
                ('file', '--brief', '--mime-encoding', '--', filename),
                stdout=subprocess.PIPE, check=False)
        except OSError:
            pass  # `file` utility is not available (e.g. on Windows)
        else:
            if completed.returncode == 0:
                encoding = completed.stdout.strip()
                # file only supports these encodings; for others it says
                # unknown-8bit or binary. So we give chardet a chance to do
                # better
                if encoding in (b'utf-8', b'us-ascii', b'iso-8859-1',
                                b'utf-7', b'utf-16le', b'utf-16be',
                                b'ebcdic'):
                    return encoding.decode('us-ascii')

    # file not available or unable to guess the encoding, have chardet do it
    detector = UniversalDetector()
    # We examine only first N 4kB blocks of file because chardet is really slow
    MAX_BYTES = 4 * 1024 * 12

    def _from_file(f):
        # Feed one bounded chunk and fall back to UTF-8 on a weak guess.
        detector.feed(f.read(MAX_BYTES))
        detector.close()
        return (detector.result.get('encoding')
                if detector.result.get('confidence', 0) >= .85 else
                'utf-8')

    if isinstance(filename, str):
        with open_compressed(filename, 'rb') as f:
            return _from_file(f)
    elif isinstance(filename, bytes):
        detector.feed(filename[:MAX_BYTES])
        detector.close()
        return detector.result.get('encoding')
    elif hasattr(filename, 'encoding'):
        return filename.encoding
    else:  # assume file-like object that you can iter through
        return _from_file(filename)
# Element-wise membership test against the string members of MISSING_VALUES
__isnastr = np.frompyfunc(
    {token for token in MISSING_VALUES if isinstance(token, str)}.__contains__,
    1, 1)


# wrapper for __isnastr with proper default out dtype
def isnastr(arr, out=None):
    """
    Build a boolean mask marking the entries of `arr` that are one of the
    string constants considered as N/A.

    Parameters
    ----------
    arr : np.ndarray
        Input (object) array of strings.
    out : Optional[np.ndarray]
        Optional output array of the same shape as arr. When omitted, a
        boolean array is allocated for any input that is not 0-dimensional;
        for a 0-dimensional input no buffer is allocated.

    Returns
    -------
    mask : np.ndarray
    """
    arr = np.asarray(arr)
    if out is None and arr.ndim != 0:
        # Force a bool result; the raw ufunc would produce an object array.
        out = np.empty_like(arr, dtype=bool)
    return __isnastr(arr, out=out)
def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.

    The column is first tested for being discrete, then for being parsable
    as floats; if neither yields a continuous variable, the values are
    additionally tried as ISO time stamps.

    Parameters
    ----------
    orig_values : array-like
        Raw column values; converted to a ``str`` array for parsing.
    namask : Optional[np.ndarray]
        Boolean mask of the missing entries. Computed with `isnastr` when
        not given.

    Returns
    -------
    valuemap : list or None
        Naturally sorted discrete values when the column is discrete,
        otherwise None.
    values : array-like
        `orig_values` exactly as passed in for a discrete column, the
        ``str`` array for a string column, or a float array (NaN at the
        masked positions) for a continuous or time column.
    coltype : type
        One of DiscreteVariable, ContinuousVariable, StringVariable or
        TimeVariable.
    """
    valuemap, values = None, orig_values
    is_discrete = is_discrete_values(orig_values)
    orig_values = np.asarray(orig_values, dtype=str)
    if namask is None:
        namask = isnastr(orig_values)
    if is_discrete:
        valuemap = natural_sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        # try to parse as float
        values = np.empty_like(orig_values, dtype=float)
        values[namask] = np.nan
        try:
            np.copyto(values, orig_values, where=~namask, casting="unsafe")
        except ValueError:
            values = orig_values
            coltype = StringVariable
        else:
            coltype = ContinuousVariable

    if coltype is not ContinuousVariable:
        # when not continuous variable it can still be time variable even it
        # was before recognized as a discrete
        tvar = TimeVariable('_')
        # introducing new variable prevent overwriting orig_values and values
        temp_values = np.empty_like(orig_values, dtype=float)
        # empty_like leaves the buffer uninitialised; mark missing entries
        # explicitly so no garbage is returned (same as the float branch).
        temp_values[namask] = np.nan
        try:
            temp_values[~namask] = [
                tvar.parse_exact_iso(i) for i in orig_values[~namask]]
        except ValueError:
            pass
        else:
            valuemap = None
            coltype = TimeVariable
            values = temp_values
    return valuemap, values, coltype
def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      name=None):
    """
    Create the variable for a column and bring its values in line with it.

    Parameters
    ----------
    valuemap : list or None
        Values of a discrete variable; passed to ``coltype.make`` as
        ``values`` when `coltype` is a DiscreteVariable subclass.
    values : array-like
        Values parsed so far; returned unchanged unless one of the
        type-specific steps below replaces them.
    orig_values : array-like
        The raw (string) column values.
    coltype : type
        A subclass of Variable.
    coltype_kwargs : dict
        Keyword arguments for ``coltype.make``. Updated in place with
        ``values=valuemap`` in the discrete case.
    name : Optional[str]
        Name of the variable.

    Returns
    -------
    values, var
        The column values and the variable returned by ``coltype.make``.
    """
    assert issubclass(coltype, Variable)

    def get_number_of_decimals(values):
        # Number of characters after the first "." in the longest such
        # tail; 0 when no value contains a ".".
        tail_lengths = [len(text) - text.find(".") - 1
                        for text in values if "." in text]
        return max(tail_lengths, default=0)

    if issubclass(coltype, DiscreteVariable) and valuemap is not None:
        coltype_kwargs["values"] = valuemap

    var = coltype.make(name, **coltype_kwargs)

    if isinstance(var, DiscreteVariable):
        # Map discrete data to 'ints' (or at least what passes as int around
        # here); anything unknown, and the empty string, becomes NaN.
        codes = defaultdict(lambda: np.nan)
        codes.update((label, index) for index, label in enumerate(var.values))
        codes[""] = np.nan
        lookup = np.frompyfunc(codes.__getitem__, 1, 1)
        raw = np.asarray(orig_values, dtype=object)
        values = lookup(raw, out=np.empty_like(raw, dtype=float))

    if coltype is StringVariable:
        values = orig_values

    # ContinuousVariable.number_of_decimals is supposed to be handled by
    # ContinuousVariable.to_val. In the interest of speed, the reader bypasses
    # it, so we set the number of decimals here.
    # The number of decimals is increased if not set manually (in which case
    # var.adjust_decimals would be 0).
    if isinstance(var, ContinuousVariable) and var.adjust_decimals:
        ndecimals = get_number_of_decimals(orig_values)
        if var.adjust_decimals == 2 or ndecimals > var.number_of_decimals:
            # Assignment order matters: the flag is written after the count.
            var.number_of_decimals = ndecimals
            var.adjust_decimals = 1

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        parser = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [parser.parse(item) for item in orig_values]

    return values, var