Skip to content

Commit c48ea98

Browse files
committed
io: Table-only version of Orange on-disk format (HDF5)
1 parent a2a0cb0 commit c48ea98

File tree

1 file changed

+94
-1
lines changed

1 file changed

+94
-1
lines changed

Orange/data/io.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import contextlib
22
import csv
3+
import json
34
import locale
45
import pickle
56
import re
@@ -18,13 +19,15 @@
1819
from urllib.request import urlopen, Request
1920
from pathlib import Path
2021

22+
import h5py
2123
import numpy as np
2224

2325
import xlrd
2426
import xlsxwriter
2527
import openpyxl
2628

27-
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
29+
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \
30+
StringVariable
2831
from Orange.data import Compression, open_compressed, detect_encoding, \
2932
isnastr, guess_data_type, sanitize_variable
3033
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
@@ -520,3 +523,93 @@ def _suggest_filename(self, content_disposition):
520523
matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)",
521524
content_disposition or '')
522525
return urlunquote(matches[-1]) if matches else default_name
526+
527+
class HDF5Reader(FileFormat):
    """Reader for Orange HDF5 files

    Layout of the file, as produced by `write_file` and consumed by `read`:

    - ``domain/<part>`` for ``<part>`` in ``attributes``, ``class_vars`` and
      ``metas``: one ``[name, type header]`` row per variable;
    - ``domain/<part>_args``: one JSON object per variable holding the
      keyword arguments used to re-create it and its ``attributes`` dict;
    - ``X`` and (only when non-empty) ``Y``: the data arrays;
    - ``metas/<i>``: one single-column dataset per meta variable.
    """
    EXTENSIONS = ('.hdf5',)
    DESCRIPTION = 'Orange on-disk data'
    SUPPORT_COMPRESSED = False
    SUPPORT_SPARSE_DATA = False

    def read(self):
        """Load the table stored in ``self.filename``.

        Returns:
            The `Table` built from the stored domain, ``X``, ``Y`` and metas.
            When ``self.filename`` is a `str`, the table is named after the
            file (base name without extension).

        Raises:
            AssertionError: if the file has no ``domain`` entry.
        """
        known_var_types = (ContinuousVariable, DiscreteVariable,
                           StringVariable, TimeVariable)

        # The file is closed on exit from the `with` block (also on errors),
        # so everything needed from it is copied into memory inside the block.
        with h5py.File(self.filename, "r") as f:

            def read_domain(sub):
                # Yield (name, type header, kwargs) for each variable in `sub`.
                group = f['domain']
                described = group[sub].asstr()[:] if sub in group else []
                args_key = f'{sub}_args'
                if args_key in group:
                    encoded_args = group[args_key].asstr()[:]
                else:
                    # Files without stored arguments: use empty kwargs.
                    encoded_args = ['{}'] * len(described)
                for attr, args in zip(described, encoded_args):
                    yield attr[0], attr[1], json.loads(args)

            def make_var(name, header, args):
                # Pick the variable class by its type header; an unknown
                # header raises IndexError here.
                var_cls = [var for var in known_var_types
                           if header in var.TYPE_HEADERS][0]
                # "attributes" is not a constructor argument; it is assigned
                # on the instance afterwards.
                kwargs = {key: val for key, val in args.items()
                          if key != "attributes"}
                new_var = var_cls(name, **kwargs)
                new_var.attributes = args.get("attributes", {})
                return new_var

            def read_hdf5(name, as_str=False):
                # Return dataset `name` as an in-memory array, or None when
                # the file has no such dataset.
                if name not in f:
                    return None
                dataset = f[name]
                if as_str:
                    return dataset.asstr()[:]
                # Read everything now; a lazy h5py dataset would become
                # unusable once the file is closed.
                return dataset[()]

            assert 'domain' in f, "not an Orange HDF5 file: 'domain' is missing"

            domain = Domain(*[[make_var(*args)
                               for args in read_domain(subdomain)]
                              for subdomain in ['attributes', 'class_vars',
                                                'metas']])

            X = read_hdf5("X")
            Y = read_hdf5("Y")

            if len(domain.metas) > 1:
                # Each meta is stored as its own column; join them side by side.
                metas = np.hstack([read_hdf5(f'metas/{i}',
                                             isinstance(attr, StringVariable))
                                   for i, attr in enumerate(domain.metas)])
            elif len(domain.metas) == 1:
                metas = read_hdf5('metas/0',
                                  isinstance(domain.metas[0], StringVariable))
            else:
                metas = None

        table = Table.from_numpy(domain, X, Y, metas)
        if isinstance(self.filename, str):
            # NOTE(review): `path` is presumably `os.path`, imported at the top
            # of this module outside the visible part of the file - confirm.
            table.name = path.splitext(path.split(self.filename)[-1])[0]

        return table

    @classmethod
    def write_file(cls, filename, data):
        """Write the table `data` to the HDF5 file `filename`.

        An existing file is overwritten (it is opened in ``'w'`` mode).

        Args:
            filename: name of the file to create.
            data: table to store; its ``domain``, ``X``, ``Y`` and ``metas``
                are written.
        """
        def parse(attr):
            # Describe a variable as (name, type header, constructor kwargs).
            args = {"attributes": attr.attributes}
            if isinstance(attr, DiscreteVariable):
                args.update(values=attr.values)
            # TimeVariable is tested before ContinuousVariable, as in the
            # original branch order.
            elif isinstance(attr, TimeVariable):
                args.update(have_date=attr.have_date,
                            have_time=attr.have_time)
            elif isinstance(attr, ContinuousVariable):
                args.update(number_of_decimals=attr.number_of_decimals)
            return attr.name, attr.TYPE_HEADERS[1], args

        # Variable-length UTF-8 strings. NumPy's 'S' dtype can only encode
        # ASCII, so non-ASCII variable names or string metas raised
        # UnicodeEncodeError.
        str_dtype = h5py.string_dtype()

        with h5py.File(filename, 'w') as f:
            for subdomain in ['attributes', 'class_vars', 'metas']:
                parsed = [parse(feature)
                          for feature in getattr(data.domain, subdomain)]
                domain = np.array([[name, header]
                                   for name, header, _ in parsed],
                                  dtype=str_dtype)
                domain_args = np.array([json.dumps(args)
                                        for *_, args in parsed],
                                       dtype=str_dtype)
                f.create_dataset(f'domain/{subdomain}', data=domain,
                                 dtype=str_dtype)
                f.create_dataset(f'domain/{subdomain}_args', data=domain_args,
                                 dtype=str_dtype)
            f.create_dataset("X", data=data.X)
            if data.Y.size:
                f.create_dataset("Y", data=data.Y)
            if data.metas.size:
                for i, attr in enumerate(data.domain.metas):
                    # [i] (a list) keeps the column two-dimensional.
                    column = data.metas[:, [i]]
                    if isinstance(attr, StringVariable):
                        # Stringify first (as the former 'S' conversion did),
                        # then store as UTF-8.
                        col_data = column.astype(str).astype(str_dtype)
                        f.create_dataset(f'metas/{i}', data=col_data,
                                         dtype=str_dtype)
                    else:
                        # 64-bit floats: a 32-bit column silently rounds large
                        # values such as time stamps.
                        col_data = column.astype(np.float64)
                        f.create_dataset(f'metas/{i}', data=col_data)

0 commit comments

Comments
 (0)