Skip to content

Commit 92915c2

Browse files
authored
Update HERD for user defined zip file (#941)
1 parent 8376a6a commit 92915c2

File tree

6 files changed

+69
-67
lines changed

6 files changed

+69
-67
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- Increase default chunk size for `GenericDataChunkIterator` from 1 MB to 10 MB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
99
- Added the magic `__reduce__` method as well as two private semi-abstract helper methods to enable pickling of the `GenericDataChunkIterator`. @codycbakerphd [#924](https://github.com/hdmf-dev/hdmf/pull/924)
1010
- Added Dynamic Enumerations and Schemasheets support to `TermSet`. @mavaylon1 [#923](https://github.com/hdmf-dev/hdmf/pull/923)
11+
- Updated `HERD` to support a user-defined file name for the `HERD` zip file. @mavaylon1 [#941](https://github.com/hdmf-dev/hdmf/pull/941)
1112

1213
## HDMF 3.8.1 (July 25, 2023)
1314

docs/gallery/plot_external_resources.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -311,18 +311,18 @@ def __init__(self, **kwargs):
311311
# ------------------------------------------------------
312312
# :py:class:`~hdmf.common.resources.HERD` is written as a zip file of
313313
# the individual tables written to tsv.
314-
# The user provides the path, which contains the name of the directory.
314+
# The user provides the path to the zip file, including the file name.
315315

316-
er.to_norm_tsv(path='./')
316+
er.to_zip(path='./HERD.zip')
317317

318318
###############################################################################
319319
# Read HERD
320320
# ------------------------------------------------------
321-
# Users can read :py:class:`~hdmf.common.resources.HERD` from the tsv format
322-
# by providing the path to the directory.
321+
# Users can read :py:class:`~hdmf.common.resources.HERD` from the zip file
322+
# by providing the path to the file itself.
323323

324-
er_read = HERD.from_norm_tsv(path='./')
325-
os.remove('./er.zip')
324+
er_read = HERD.from_zip(path='./HERD.zip')
325+
os.remove('./HERD.zip')
326326

327327
###############################################################################
328328
# Using TermSet with HERD

src/hdmf/backends/io.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def read(self, **kwargs):
6262
if self.herd_path is not None:
6363
from hdmf.common import HERD
6464
try:
65-
self.herd = HERD.from_norm_tsv(path=self.herd_path)
65+
self.herd = HERD.from_zip(path=self.herd_path)
6666
if isinstance(container, HERDManager):
6767
container.link_resources(herd=self.herd)
6868
except FileNotFoundError:
@@ -84,7 +84,7 @@ def write(self, **kwargs):
8484
if self.herd_path is not None:
8585
herd = container.get_linked_resources()
8686
if herd is not None:
87-
herd.to_norm_tsv(path=self.herd_path)
87+
herd.to_zip(path=self.herd_path)
8888
else:
8989
msg = "Could not find linked HERD. Container was still written to IO source."
9090
warn(msg)

src/hdmf/common/resources.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -836,19 +836,20 @@ def to_dataframe(self, **kwargs):
836836
# return the result
837837
return result_df
838838

839-
@docval({'name': 'path', 'type': str, 'doc': 'path of the folder tsv file to write'})
840-
def to_norm_tsv(self, **kwargs):
839+
@docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
840+
def to_zip(self, **kwargs):
841841
"""
842-
Write the tables in HERD to individual tsv files.
842+
Write the tables in HERD to zipped tsv files.
843843
"""
844-
path = kwargs['path']
845-
files = [path+child.name+'.tsv' for child in self.children]
844+
zip_file = kwargs['path']
845+
directory = os.path.dirname(zip_file)
846846

847+
files = [os.path.join(directory, child.name)+'.tsv' for child in self.children]
847848
for i in range(len(self.children)):
848849
df = self.children[i].to_dataframe()
849850
df.to_csv(files[i], sep='\t', index=False)
850851

851-
with zipfile.ZipFile('er.zip', 'w') as zipF:
852+
with zipfile.ZipFile(zip_file, 'w') as zipF:
852853
for file in files:
853854
zipF.write(file)
854855

@@ -857,13 +858,17 @@ def to_norm_tsv(self, **kwargs):
857858
os.remove(file)
858859

859860
@classmethod
860-
@docval({'name': 'path', 'type': str, 'doc': 'path of the folder containing the tsv files to read'},
861-
returns="HERD loaded from TSV", rtype="HERD")
862-
def from_norm_tsv(cls, **kwargs):
863-
path = kwargs['path']
864-
with zipfile.ZipFile(path+'/er.zip', 'r') as zip:
865-
zip.extractall(path)
866-
tsv_paths = glob(path+'/*')
861+
@docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
862+
def from_zip(cls, **kwargs):
863+
"""
864+
Method to read in zipped tsv files to populate HERD.
865+
"""
866+
zip_file = kwargs['path']
867+
directory = os.path.dirname(zip_file)
868+
869+
with zipfile.ZipFile(zip_file, 'r') as zip:
870+
zip.extractall(directory)
871+
tsv_paths = glob(directory+'/*')
867872

868873
for file in tsv_paths:
869874
file_name = os.path.basename(file)

tests/unit/common/test_resources.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,16 @@ def remove_er_files(self):
5151
remove_test_file('./object_keys.tsv')
5252
remove_test_file('./keys.tsv')
5353
remove_test_file('./files.tsv')
54-
remove_test_file('./er.tsv')
55-
remove_test_file('./er.zip')
54+
remove_test_file('./HERD.zip')
5655

5756
def child_tsv(self, external_resources):
5857
for child in external_resources.children:
5958
df = child.to_dataframe()
6059
df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)
6160

62-
def zip_child(self):
61+
def zip_child(self, zip_file):
6362
files = glob('*.tsv')
64-
with zipfile.ZipFile('er.zip', 'w') as zipF:
63+
with zipfile.ZipFile(zip_file, 'w') as zipF:
6564
for file in files:
6665
zipF.write(file)
6766

@@ -590,159 +589,159 @@ def test_get_obj_entities_attribute(self):
590589

591590
pd.testing.assert_frame_equal(df, expected_df)
592591

593-
def test_to_and_from_norm_tsv(self):
592+
def test_to_and_from_zip(self):
594593
er = HERD()
595594
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
596595
er.add_ref(file=HERDManagerContainer(name='file'),
597596
container=data,
598597
key='key1',
599598
entity_id='entity_id1',
600599
entity_uri='entity1')
601-
er.to_norm_tsv(path='./')
600+
er.to_zip(path='./HERD.zip')
602601

603-
er_read = HERD.from_norm_tsv(path='./')
602+
er_read = HERD.from_zip(path='./HERD.zip')
604603
HERD.assert_external_resources_equal(er_read, er, check_dtype=False)
605604

606605
self.remove_er_files()
607606

608-
def test_to_and_from_norm_tsv_entity_value_error(self):
607+
def test_to_and_from_zip_entity_value_error(self):
609608
er = HERD()
610609
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
611610
er.add_ref(file=HERDManagerContainer(name='file'),
612611
container=data,
613612
key='key1',
614613
entity_id='entity_id1',
615614
entity_uri='entity1')
616-
er.to_norm_tsv(path='./')
615+
er.to_zip(path='./HERD.zip')
617616

618617
self.child_tsv(external_resources=er)
619618

620619
df = er.entities.to_dataframe()
621620
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
622621
df.to_csv('./entities.tsv', sep='\t', index=False)
623622

624-
self.zip_child()
623+
self.zip_child(zip_file='HERD.zip')
625624

626625
with self.assertRaises(ValueError):
627-
_ = HERD.from_norm_tsv(path='./')
626+
_ = HERD.from_zip(path='./HERD.zip')
628627

629628
self.remove_er_files()
630629

631-
def test_to_and_from_norm_tsv_entity_key_value_error_key(self):
630+
def test_to_and_from_zip_entity_key_value_error_key(self):
632631
er = HERD()
633632
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
634633
er.add_ref(file=HERDManagerContainer(name='file'),
635634
container=data,
636635
key='key1',
637636
entity_id='entity_id1',
638637
entity_uri='entity1')
639-
er.to_norm_tsv(path='./')
638+
er.to_zip(path='./HERD.zip')
640639

641640
self.child_tsv(external_resources=er)
642641

643642
df = er.entity_keys.to_dataframe()
644643
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
645644
df.to_csv('./entity_keys.tsv', sep='\t', index=False)
646645

647-
self.zip_child()
646+
self.zip_child(zip_file='HERD.zip')
648647

649648
with self.assertRaises(ValueError):
650-
_ = HERD.from_norm_tsv(path='./')
649+
_ = HERD.from_zip(path='./HERD.zip')
651650

652651
self.remove_er_files()
653652

654-
def test_to_and_from_norm_tsv_entity_key_value_error_entity(self):
653+
def test_to_and_from_zip_entity_key_value_error_entity(self):
655654
er = HERD()
656655
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
657656
er.add_ref(file=HERDManagerContainer(name='file'),
658657
container=data,
659658
key='key1',
660659
entity_id='entity_id1',
661660
entity_uri='entity1')
662-
er.to_norm_tsv(path='./')
661+
er.to_zip(path='./HERD.zip')
663662

664663
self.child_tsv(external_resources=er)
665664

666665
df = er.entity_keys.to_dataframe()
667666
df.at[0, ('entities_idx')] = 10 # Change key_ix 0 to 10
668667
df.to_csv('./entity_keys.tsv', sep='\t', index=False)
669668

670-
self.zip_child()
669+
self.zip_child(zip_file='HERD.zip')
671670

672671
with self.assertRaises(ValueError):
673-
_ = HERD.from_norm_tsv(path='./')
672+
_ = HERD.from_zip(path='./HERD.zip')
674673

675674
self.remove_er_files()
676675

677-
def test_to_and_from_norm_tsv_object_value_error(self):
676+
def test_to_and_from_zip_object_value_error(self):
678677
er = HERD()
679678
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
680679
er.add_ref(file=HERDManagerContainer(name='file'),
681680
container=data,
682681
key='key1',
683682
entity_id='entity_id1',
684683
entity_uri='entity1')
685-
er.to_norm_tsv(path='./')
684+
er.to_zip(path='./HERD.zip')
686685

687686
self.child_tsv(external_resources=er)
688687

689688
df = er.objects.to_dataframe()
690689
df.at[0, ('files_idx')] = 10 # Change key_ix 0 to 10
691690
df.to_csv('./objects.tsv', sep='\t', index=False)
692691

693-
self.zip_child()
692+
self.zip_child(zip_file='HERD.zip')
694693

695694
msg = "File_ID Index out of range in ObjectTable. Please check for alterations."
696695
with self.assertRaisesWith(ValueError, msg):
697-
_ = HERD.from_norm_tsv(path='./')
696+
_ = HERD.from_zip(path='./HERD.zip')
698697

699698
self.remove_er_files()
700699

701-
def test_to_and_from_norm_tsv_object_keys_object_idx_value_error(self):
700+
def test_to_and_from_zip_object_keys_object_idx_value_error(self):
702701
er = HERD()
703702
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
704703
er.add_ref(file=HERDManagerContainer(name='file'),
705704
container=data,
706705
key='key1',
707706
entity_id='entity_id1',
708707
entity_uri='entity1')
709-
er.to_norm_tsv(path='./')
708+
er.to_zip(path='./HERD.zip')
710709

711710
self.child_tsv(external_resources=er)
712711

713712
df = er.object_keys.to_dataframe()
714713
df.at[0, ('objects_idx')] = 10 # Change key_ix 0 to 10
715714
df.to_csv('./object_keys.tsv', sep='\t', index=False)
716715

717-
self.zip_child()
716+
self.zip_child(zip_file='HERD.zip')
718717

719718
msg = "Object Index out of range in ObjectKeyTable. Please check for alterations."
720719
with self.assertRaisesWith(ValueError, msg):
721-
_ = HERD.from_norm_tsv(path='./')
720+
_ = HERD.from_zip(path='./HERD.zip')
722721

723722
self.remove_er_files()
724723

725-
def test_to_and_from_norm_tsv_object_keys_key_idx_value_error(self):
724+
def test_to_and_from_zip_object_keys_key_idx_value_error(self):
726725
er = HERD()
727726
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
728727
er.add_ref(file=HERDManagerContainer(name='file'),
729728
container=data,
730729
key='key1',
731730
entity_id='entity_id1',
732731
entity_uri='entity1')
733-
er.to_norm_tsv(path='./')
732+
er.to_zip(path='./HERD.zip')
734733

735734
self.child_tsv(external_resources=er)
736735

737736
df = er.object_keys.to_dataframe()
738737
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
739738
df.to_csv('./object_keys.tsv', sep='\t', index=False)
740739

741-
self.zip_child()
740+
self.zip_child(zip_file='HERD.zip')
742741

743742
msg = "Key Index out of range in ObjectKeyTable. Please check for alterations."
744743
with self.assertRaisesWith(ValueError, msg):
745-
_ = HERD.from_norm_tsv(path='./')
744+
_ = HERD.from_zip(path='./HERD.zip')
746745

747746
self.remove_er_files()
748747

tests/unit/test_io_hdf5_h5tools.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -950,17 +950,16 @@ def remove_er_files(self):
950950
remove_test_file('./object_keys.tsv')
951951
remove_test_file('./keys.tsv')
952952
remove_test_file('./files.tsv')
953-
remove_test_file('./er.tsv')
954-
remove_test_file('./er.zip')
953+
remove_test_file('./HERD.zip')
955954

956955
def child_tsv(self, herd):
957956
for child in herd.children:
958957
df = child.to_dataframe()
959958
df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)
960959

961-
def zip_child(self):
960+
def zip_child(self, zip_file):
962961
files = glob('*.tsv')
963-
with zipfile.ZipFile('er.zip', 'w') as zipF:
962+
with zipfile.ZipFile(zip_file, 'w') as zipF:
964963
for file in files:
965964
zipF.write(file)
966965

@@ -972,13 +971,11 @@ def test_io_read_herd(self):
972971
key='key1',
973972
entity_id='entity_id1',
974973
entity_uri='entity1')
975-
er.to_norm_tsv(path='./')
976-
977-
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
974+
er.to_zip(path='./HERD.zip')
975+
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
978976
container = io.read()
979977
self.assertIsInstance(io.herd, HERD)
980978
self.assertIsInstance(container.get_linked_resources(), HERD)
981-
982979
self.remove_er_files()
983980

984981
def test_io_read_herd_file_warn(self):
@@ -989,7 +986,7 @@ def test_io_read_herd_file_warn(self):
989986
key='key1',
990987
entity_id='entity_id1',
991988
entity_uri='entity1')
992-
er.to_norm_tsv(path='./')
989+
er.to_zip(path='./HERD.zip')
993990

994991
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='wrong_path') as io:
995992
with self.assertWarns(Warning):
@@ -1005,16 +1002,16 @@ def test_io_read_herd_value_warn(self):
10051002
key='key1',
10061003
entity_id='entity_id1',
10071004
entity_uri='entity1')
1008-
er.to_norm_tsv(path='./')
1005+
er.to_zip(path='./HERD.zip')
10091006

10101007
self.child_tsv(herd=er)
10111008

10121009
df = er.entities.to_dataframe()
10131010
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
10141011
df.to_csv('./entities.tsv', sep='\t', index=False)
10151012

1016-
self.zip_child()
1017-
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
1013+
self.zip_child(zip_file='HERD.zip')
1014+
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
10181015
with self.assertWarns(Warning):
10191016
io.read()
10201017

@@ -1031,10 +1028,10 @@ def test_io_write_herd(self):
10311028
entity_id='entity_id1',
10321029
entity_uri='entity1')
10331030

1034-
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./') as io:
1031+
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./HERD.zip') as io:
10351032
io.write(self.foofile)
10361033

1037-
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
1034+
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
10381035
container = io.read()
10391036
self.assertIsInstance(io.herd, HERD)
10401037
self.assertIsInstance(container.get_linked_resources(), HERD)
@@ -1050,7 +1047,7 @@ def test_io_warn(self):
10501047
key='key1',
10511048
entity_id='entity_id1',
10521049
entity_uri='entity1')
1053-
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./') as io:
1050+
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./HERD.zip') as io:
10541051
with self.assertWarns(Warning):
10551052
io.write(self.foofile)
10561053

0 commit comments

Comments
 (0)