|
1 | 1 | import contextlib |
2 | 2 | import csv |
| 3 | +import json |
3 | 4 | import locale |
4 | 5 | import pickle |
5 | 6 | import re |
|
18 | 19 | from urllib.request import urlopen, Request |
19 | 20 | from pathlib import Path |
20 | 21 |
|
| 22 | +import h5py |
21 | 23 | import numpy as np |
22 | 24 |
|
23 | 25 | import xlrd |
24 | 26 | import xlsxwriter |
25 | 27 | import openpyxl |
26 | 28 |
|
27 | | -from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin |
| 29 | +from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \ |
| 30 | + StringVariable |
28 | 31 | from Orange.data import Compression, open_compressed, detect_encoding, \ |
29 | 32 | isnastr, guess_data_type, sanitize_variable |
30 | 33 | from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL |
@@ -511,3 +514,93 @@ def _suggest_filename(self, content_disposition): |
511 | 514 | matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)", |
512 | 515 | content_disposition or '') |
513 | 516 | return urlunquote(matches[-1]) if matches else default_name |
| 517 | + |
class HDF5Reader(FileFormat):
    """Reader/writer for Orange HDF5 files.

    On-disk layout (all under the root group):
      * ``domain/{attributes,class_vars,metas}`` -- (n, 2) byte-string
        arrays of (name, type header) pairs, one row per variable.
      * ``domain/{...}_args`` -- JSON-encoded constructor kwargs, one
        string per variable.
      * ``X``, ``Y`` -- numeric data arrays (``Y`` written only when
        non-empty).
      * ``metas/<i>`` -- one dataset per meta column: bytes (``'S'``) for
        string metas, floats (``'f'``) otherwise.
    """
    EXTENSIONS = ('.hdf5',)
    DESCRIPTION = 'Orange on-disk data'
    SUPPORT_COMPRESSED = False
    SUPPORT_SPARSE_DATA = False

    def read(self):
        """Read ``self.filename`` and return an Orange ``Table``.

        NOTE(review): the file handle is deliberately not closed here --
        ``X``/``Y`` are returned as h5py datasets so the table can stay
        backed by on-disk data, and h5py keeps the file alive while
        datasets reference it. Confirm this matches callers' lifetime
        expectations.
        """
        f = h5py.File(self.filename, "r")

        if 'domain' not in f:
            # Explicit check instead of `assert`: asserts are stripped
            # under `python -O`, and a malformed file is a runtime error,
            # not a programming error.
            raise IOError("No 'domain' group found in the file")

        def read_domain(sub):
            # Yield (name, type_header, kwargs) for every variable of the
            # given sub-domain; a missing *_args dataset means "no kwargs".
            d = f['domain']
            subdomain = d[sub].asstr() if sub in d else []
            subdomain_args = d[f'{sub}_args'].asstr() \
                if f'{sub}_args' in d else ['{}'] * len(subdomain)
            for attr, args in zip(subdomain, subdomain_args):
                yield attr[0], attr[1], json.loads(args)

        def make_var(name, header, args):
            # Map the stored type header back onto a variable class; fail
            # with a readable error on headers from an unknown format
            # (the previous `[...][0]` raised a bare IndexError).
            try:
                var_cls = next(var for var in (ContinuousVariable,
                                               DiscreteVariable,
                                               StringVariable,
                                               TimeVariable)
                               if header in var.TYPE_HEADERS)
            except StopIteration:
                raise IOError(f"Unknown variable type header '{header}'") \
                    from None
            new_var = var_cls(name, **{key: val for key, val in args.items()
                                       if key != "attributes"})
            # `attributes` is not a constructor argument; attach afterwards.
            new_var.attributes = args.get("attributes", {})
            return new_var

        def read_hdf5(name, as_str=False):
            # Return the named dataset (decoded to str objects when
            # `as_str`), or None when it was never written.
            if name in f:
                if as_str:
                    return f[name].asstr()[:]
                return f[name]
            return None

        domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)]
                          for subdomain in ['attributes', 'class_vars',
                                            'metas']])

        X = read_hdf5("X")
        Y = read_hdf5("Y")

        if len(domain.metas) > 1:
            # String metas are read back as object arrays (h5py `asstr`),
            # numeric ones as floats; hstack promotes to a common dtype.
            metas = np.hstack([read_hdf5(f'metas/{i}',
                                         isinstance(attr, StringVariable))
                               for i, attr in enumerate(domain.metas)])
        elif len(domain.metas) == 1:
            metas = read_hdf5('metas/0',
                              isinstance(domain.metas[0], StringVariable))
        else:
            metas = None

        table = Table.from_numpy(domain, X, Y, metas)
        if isinstance(self.filename, str):
            # Name the table after the file, without directory or suffix.
            table.name = path.splitext(path.split(self.filename)[-1])[0]

        return table

    @classmethod
    def write_file(cls, filename, data):
        """Write `data` (an Orange Table) to `filename` in the layout
        documented on the class."""
        def parse(attr):
            # (name, canonical type header, constructor kwargs) for one
            # variable; only type-specific kwargs are recorded.
            params = (attr.name, attr.TYPE_HEADERS[1],
                      {"attributes": attr.attributes})
            if isinstance(attr, DiscreteVariable):
                params[2].update(values=attr.values)
            elif isinstance(attr, TimeVariable):
                params[2].update(have_date=attr.have_date,
                                 have_time=attr.have_time)
            elif isinstance(attr, ContinuousVariable):
                params[2].update(number_of_decimals=attr.number_of_decimals)
            return params

        with h5py.File(filename, 'w') as f:
            for subdomain in ['attributes', 'class_vars', 'metas']:
                parsed = [parse(feature)
                          for feature in getattr(data.domain, subdomain)]
                domain = np.array([[name, header]
                                   for name, header, _ in parsed], 'S')
                domain_args = np.array([json.dumps(args)
                                        for *_, args in parsed], 'S')
                f.create_dataset(f'domain/{subdomain}', data=domain)
                f.create_dataset(f'domain/{subdomain}_args', data=domain_args)
            f.create_dataset("X", data=data.X)
            if data.Y.size:
                f.create_dataset("Y", data=data.Y)
            if data.metas.size:
                for i, attr in enumerate(data.domain.metas):
                    # NOTE(review): numpy astype('S') is ASCII-only --
                    # non-ASCII string metas would raise here; confirm
                    # whether that is acceptable for this format.
                    col_type = 'S' if isinstance(attr, StringVariable) else 'f'
                    col_data = data.metas[:, [i]].astype(col_type)
                    f.create_dataset(f'metas/{i}', data=col_data)
0 commit comments