# orange3/Orange/data/io_util.py at d8ccced96cf7f912a8ab67b5795ecaa2a3abe89f · PrimozGodec/orange3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import subprocess
from collections import defaultdict

import numpy as np
from chardet.universaldetector import UniversalDetector

from Orange.data import (
    is_discrete_values, MISSING_VALUES, Variable,
    DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
)
from Orange.misc.collections import natural_sorted

__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
           "guess_data_type", "sanitize_variable"]


class Compression:
    """File-name suffixes of the compressed formats handled by this module."""
    # Suffix constants, one per supported compression format
    GZIP = '.gz'
    BZIP2 = '.bz2'
    XZ = '.xz'
    # Every suffix above collected in a tuple, which is the form
    # `str.endswith` accepts for testing several suffixes at once
    all = GZIP, BZIP2, XZ


def open_compressed(filename, *args, _open=open, **kwargs):
    """
    Open `filename`, transparently decompressing it if its name carries one
    of the `Compression` suffixes.

    Parameters
    ----------
    filename : str or file-like
        Path of the file to open. Anything that is not a ``str`` is taken to
        be an already opened file and is returned unchanged.
    *args, **kwargs
        Forwarded verbatim to the opener that ends up being called.
    _open : callable
        Opener used when `filename` has none of the compression suffixes.

    Returns
    -------
    The object returned by the selected opener, or `filename` itself when it
    is not a ``str``.
    """
    if not isinstance(filename, str):
        # Not a path; hand the object back untouched
        return filename

    # Choose the opener from the suffix. The compression modules are imported
    # lazily, so only the one that is actually needed gets loaded.
    if filename.endswith(Compression.GZIP):
        import gzip
        opener = gzip.open
    elif filename.endswith(Compression.BZIP2):
        import bz2
        opener = bz2.open
    elif filename.endswith(Compression.XZ):
        import lzma
        opener = lzma.open
    else:
        opener = _open
    return opener(filename, *args, **kwargs)


def detect_encoding(filename):
    """
    Detect encoding of `filename`, which can be a ``str`` filename, a
    ``file``-like object, or ``bytes``.

    Parameters
    ----------
    filename : str or bytes or file-like
        A path, raw bytes to inspect, or an open file. An object exposing an
        ``encoding`` attribute is trusted and that attribute is returned
        without reading any data.

    Returns
    -------
    encoding : str or None
        Name of the detected encoding. For paths and file-like objects,
        ``'utf-8'`` is returned when chardet's confidence is below 0.85.
        For ``bytes`` the raw chardet guess is returned without applying
        that threshold (presumably ``None`` when chardet cannot decide).
    """
    # Try with Unix file utility first because it's faster (~10ms vs 100ms)
    if isinstance(filename, str) and not filename.endswith(Compression.all):
        try:
            # subprocess.run collects stdout while it waits for the child.
            # Calling Popen.wait() before reading a PIPE can deadlock once
            # the pipe buffer fills up, so the output is never read that way.
            completed = subprocess.run(
                ('file', '--brief', '--mime-encoding', filename),
                stdout=subprocess.PIPE, check=False)
        except OSError:
            pass  # `file` could not be executed (e.g. on Windows)
        else:
            if completed.returncode == 0:
                encoding = completed.stdout.strip()
                # file only supports these encodings; for others it says
                # unknown-8bit or binary. So we give chardet a chance to do
                # better
                if encoding in (b'utf-8', b'us-ascii', b'iso-8859-1',
                                b'utf-7', b'utf-16le', b'utf-16be',
                                b'ebcdic'):
                    return encoding.decode('us-ascii')

    # file not available or unable to guess the encoding, have chardet do it
    detector = UniversalDetector()
    # We examine only first N 4kB blocks of file because chardet is really slow
    MAX_BYTES = 4 * 1024 * 12

    def _from_file(f):
        # Feed at most MAX_BYTES and fall back to utf-8 on a low-confidence
        # guess
        detector.feed(f.read(MAX_BYTES))
        detector.close()
        return (detector.result.get('encoding')
                if detector.result.get('confidence', 0) >= .85 else
                'utf-8')

    if isinstance(filename, str):
        with open_compressed(filename, 'rb') as f:
            return _from_file(f)
    elif isinstance(filename, bytes):
        # NOTE: unlike _from_file, no confidence threshold / utf-8 fallback
        # is applied here; kept as is because callers may depend on it.
        detector.feed(filename[:MAX_BYTES])
        detector.close()
        return detector.result.get('encoding')
    elif hasattr(filename, 'encoding'):
        return filename.encoding
    else:  # assume file-like object that you can iter through
        return _from_file(filename)


# Element-wise "is this one of the N/A strings" test. Only the ``str`` members
# of MISSING_VALUES take part in the lookup.
__isnastr = np.frompyfunc(
    {na for na in MISSING_VALUES if isinstance(na, str)}.__contains__,
    1, 1,
)


# Thin front end for __isnastr that supplies a boolean output buffer
def isnastr(arr, out=None):
    """
    Build a boolean mask marking the entries of `arr` that are equal to one
    of the string constants treated as N/A.

    Parameters
    ----------
    arr : np.ndarray
        Array (typically of dtype object) holding string values.
    out : Optional[np.ndarray]
        Array to write the result into; it must have the same shape as
        `arr`. When omitted, a new boolean array is allocated, except for
        0-dimensional input, where no buffer is preallocated.

    Returns
    -------
    mask : np.ndarray
    """
    values = np.asarray(arr)
    needs_buffer = out is None and values.ndim > 0
    if needs_buffer:
        # Without an explicit buffer the ufunc would yield an object array
        out = np.empty_like(values, dtype=bool)
    return __isnastr(values, out=out)


def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.

    The column is first tested for being discrete, then for being parseable
    as floats; anything else is treated as strings. Columns that did not end
    up continuous are additionally tried as ISO-formatted time values.

    Parameters
    ----------
    orig_values : array-like
        Raw column values; converted to a ``str`` array for parsing.
    namask : Optional[np.ndarray]
        Boolean mask that is True where a value is missing. When omitted it
        is computed with `isnastr`.

    Returns
    -------
    valuemap : list or None
        Naturally sorted discrete values when the column is guessed to be
        discrete, otherwise None.
    values : array-like
        The values to use for the column: a float array (NaN at missing
        positions) for continuous and time columns, the ``str`` array for
        string columns, and `orig_values` as passed in for discrete columns.
    coltype : type
        One of DiscreteVariable, ContinuousVariable, StringVariable or
        TimeVariable.
    """
    valuemap, values = None, orig_values
    # Truthy result is the collection of distinct values (it is fed to
    # natural_sorted below); a falsy result means "not discrete".
    is_discrete = is_discrete_values(orig_values)
    orig_values = np.asarray(orig_values, dtype=str)
    if namask is None:
        namask = isnastr(orig_values)
    if is_discrete:
        valuemap = natural_sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        # try to parse as float
        values = np.empty_like(orig_values, dtype=float)
        values[namask] = np.nan
        try:
            np.copyto(values, orig_values, where=~namask, casting="unsafe")
        except ValueError:
            values = orig_values
            coltype = StringVariable
        else:
            coltype = ContinuousVariable

    if coltype is not ContinuousVariable:
        # when not continuous variable it can still be time variable even it
        # was before recognized as a discrete
        tvar = TimeVariable('_')
        # introducing new variable prevent overwriting orig_values and values.
        # Pre-fill with NaN so that the missing positions, which are never
        # assigned below, do not expose uninitialised memory.
        temp_values = np.full_like(orig_values, np.nan, dtype=float)
        try:
            temp_values[~namask] = [
                tvar.parse_exact_iso(i) for i in orig_values[~namask]]
        except ValueError:
            pass
        else:
            valuemap = None
            coltype = TimeVariable
            values = temp_values
    return valuemap, values, coltype


def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      name=None):
    """
    Create the variable for a column and bring the column values into the
    form that matches it.

    Parameters
    ----------
    valuemap : list or None
        Discrete values; when given and `coltype` is a DiscreteVariable
        subclass, stored under ``values`` in `coltype_kwargs`.
    values : array-like
        Already converted column values; returned unchanged unless one of
        the type-specific branches below replaces them.
    orig_values : array-like
        The raw string values of the column.
    coltype : type
        Variable subclass to instantiate through ``coltype.make``.
    coltype_kwargs : dict
        Keyword arguments for ``coltype.make``. Note that this dict is
        updated in place for discrete columns.
    name : Optional[str]
        Name handed to ``coltype.make``.

    Returns
    -------
    values : array-like
    var : Variable
    """
    assert issubclass(coltype, Variable)

    if valuemap is not None and issubclass(coltype, DiscreteVariable):
        coltype_kwargs["values"] = valuemap

    var = coltype.make(name, **coltype_kwargs)

    if isinstance(var, DiscreteVariable):
        # Translate each raw value into the index it has in var.values.
        # Unknown values, and the empty string, translate to NaN.
        index_of = defaultdict(lambda: np.nan)
        index_of.update((val, i) for i, val in enumerate(var.values))
        index_of[""] = np.nan
        lookup = np.frompyfunc(index_of.__getitem__, 1, 1)

        raw = np.asarray(orig_values, dtype=object)
        values = lookup(raw, out=np.empty_like(raw, dtype=float))

    if coltype is StringVariable:
        values = orig_values

    # ContinuousVariable.to_val would normally take care of
    # number_of_decimals, but the reader skips it for speed, so the number of
    # decimals is derived from the raw strings here. It is only raised when
    # it was not set manually (a manual setting shows up as
    # var.adjust_decimals being 0).
    if isinstance(var, ContinuousVariable) and var.adjust_decimals:
        # Length of the part from the decimal point to the end of the string,
        # for every value that has a decimal point
        tail_lengths = (len(text) - text.find(".")
                        for text in orig_values if "." in text)
        ndecimals = max(tail_lengths, default=1) - 1
        if var.adjust_decimals == 2 or ndecimals > var.number_of_decimals:
            var.number_of_decimals = ndecimals
            var.adjust_decimals = 1

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # The values have to be parsed again: the proper variable exists only
        # now, after the coltype.make call above
        if isinstance(var, TimeVariable):
            parser = var
        else:
            parser = TimeVariable('_')
        values = list(map(parser.parse, orig_values))

    return values, var