Skip to content

Commit 8bf4bf9

Browse files
committed
io: Table-only version of Orange on-disk format (HDF5)
1 parent e045632 commit 8bf4bf9

File tree

1 file changed

+94
-1
lines changed

1 file changed

+94
-1
lines changed

Orange/data/io.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import contextlib
22
import csv
3+
import json
34
import locale
45
import pickle
56
import re
@@ -18,13 +19,15 @@
1819
from urllib.request import urlopen, Request
1920
from pathlib import Path
2021

22+
import h5py
2123
import numpy as np
2224

2325
import xlrd
2426
import xlsxwriter
2527
import openpyxl
2628

27-
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
29+
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \
30+
StringVariable
2831
from Orange.data import Compression, open_compressed, detect_encoding, \
2932
isnastr, guess_data_type, sanitize_variable
3033
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
@@ -511,3 +514,93 @@ def _suggest_filename(self, content_disposition):
511514
matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)",
512515
content_disposition or '')
513516
return urlunquote(matches[-1]) if matches else default_name
517+
518+
class HDF5Reader(FileFormat):
    """Reader and writer for tables stored in Orange's HDF5 layout.

    Layout of a file, as produced by :meth:`write_file`:

    * ``domain/<part>`` -- string array with one ``[name, type header]`` row
      per variable, for ``<part>`` in ``attributes``, ``class_vars``,
      ``metas``;
    * ``domain/<part>_args`` -- one JSON object per variable with the extra
      constructor arguments plus the variable's ``attributes`` dict;
    * ``X`` and ``Y`` -- the data arrays (``Y`` is omitted when empty);
    * ``metas/<i>`` -- one single-column dataset per meta variable.
    """
    EXTENSIONS = ('.hdf5',)
    DESCRIPTION = 'Orange on-disk data'
    SUPPORT_COMPRESSED = False
    SUPPORT_SPARSE_DATA = False

    # Parts of the domain, in the positional order expected by ``Domain``.
    _SUBDOMAINS = ('attributes', 'class_vars', 'metas')

    def read(self):
        """Load the table stored in ``self.filename``.

        Returns:
            Table: table built from the stored domain, ``X``, ``Y`` and metas.

        Raises:
            ValueError: if the file has no ``domain`` group or a variable
                carries a type header that matches no known variable class.
        """
        variable_types = (ContinuousVariable, DiscreteVariable,
                          StringVariable, TimeVariable)

        def read_domain(h5, sub):
            # Yield (name, type header, constructor args) for each variable
            # of one domain part; a missing part yields nothing.
            group = h5['domain']
            subdomain = group[sub].asstr() if sub in group else []
            args_key = f'{sub}_args'
            subdomain_args = group[args_key].asstr() \
                if args_key in group else ['{}'] * len(subdomain)
            for attr, args in zip(subdomain, subdomain_args):
                yield attr[0], attr[1], json.loads(args)

        def make_var(name, header, args):
            # Instantiate the first variable class that accepts `header`.
            var_cls = next((candidate for candidate in variable_types
                            if header in candidate.TYPE_HEADERS), None)
            if var_cls is None:
                raise ValueError(
                    f"Unknown variable type header {header!r} "
                    f"for variable {name!r}")
            new_var = var_cls(name, **{key: val for key, val in args.items()
                                       if key != "attributes"})
            new_var.attributes = args.get("attributes", {})
            return new_var

        def read_hdf5(h5, name, as_str=False):
            # Read the dataset fully into memory so that nothing returned
            # still refers to the file once it has been closed.
            if name not in h5:
                return None
            dataset = h5[name]
            return dataset.asstr()[()] if as_str else dataset[()]

        # The context manager guarantees the file handle is released even
        # when parsing fails half-way through.
        with h5py.File(self.filename, "r") as f:
            # Explicit check instead of `assert`, which is stripped under -O.
            if 'domain' not in f:
                raise ValueError(
                    "Invalid HDF5 file format: missing 'domain' group")

            domain = Domain(*[[make_var(*args)
                               for args in read_domain(f, subdomain)]
                              for subdomain in self._SUBDOMAINS])

            X = read_hdf5(f, "X")
            Y = read_hdf5(f, "Y")

            meta_columns = [
                read_hdf5(f, f'metas/{i}', isinstance(attr, StringVariable))
                for i, attr in enumerate(domain.metas)]

        if len(meta_columns) > 1:
            metas = np.hstack(meta_columns)
        elif meta_columns:
            metas = meta_columns[0]
        else:
            metas = None

        table = Table.from_numpy(domain, X, Y, metas)
        if isinstance(self.filename, str):
            table.name = path.splitext(path.split(self.filename)[-1])[0]

        return table

    @classmethod
    def write_file(cls, filename, data):
        """Write the table `data` to `filename` in Orange's HDF5 layout.

        Args:
            filename: target file; an existing file is overwritten.
            data: table to store.
        """
        def parse(attr):
            # Describe a variable as (name, type header, constructor args).
            args = {"attributes": attr.attributes}
            # TimeVariable is tested before ContinuousVariable, as in the
            # original branch order, so the more specific arguments win.
            if isinstance(attr, DiscreteVariable):
                args.update(values=attr.values)
            elif isinstance(attr, TimeVariable):
                args.update(have_date=attr.have_date,
                            have_time=attr.have_time)
            elif isinstance(attr, ContinuousVariable):
                args.update(number_of_decimals=attr.number_of_decimals)
            return attr.name, attr.TYPE_HEADERS[1], args

        # Variable-length UTF-8 strings: NumPy's 'S' dtype is ASCII-only and
        # raises UnicodeEncodeError on non-ASCII names, values or metas.
        str_dtype = h5py.string_dtype()

        with h5py.File(filename, 'w') as f:
            for subdomain in cls._SUBDOMAINS:
                parsed = [parse(feature)
                          for feature in getattr(data.domain, subdomain)]
                domain = np.array(
                    [[name, header] for name, header, _ in parsed],
                    dtype=object)
                domain_args = np.array(
                    [json.dumps(args) for *_, args in parsed],
                    dtype=object)
                f.create_dataset(f'domain/{subdomain}',
                                 data=domain, dtype=str_dtype)
                f.create_dataset(f'domain/{subdomain}_args',
                                 data=domain_args, dtype=str_dtype)

            f.create_dataset("X", data=data.X)
            if data.Y.size:
                f.create_dataset("Y", data=data.Y)
            if data.metas.size:
                for i, attr in enumerate(data.domain.metas):
                    column = data.metas[:, [i]]
                    if isinstance(attr, StringVariable):
                        # Stringify every cell (as the former 'S' cast did),
                        # then hand h5py plain Python str objects.
                        col_data = column.astype(str).astype(object)
                        f.create_dataset(f'metas/{i}',
                                         data=col_data, dtype=str_dtype)
                    else:
                        # float64 rather than 'f' (float32): single precision
                        # cannot hold e.g. epoch-second time values exactly.
                        f.create_dataset(f'metas/{i}',
                                         data=column.astype(np.float64))

0 commit comments

Comments
 (0)