Update HERD for user defined zip file (#941)

mavaylon1 · web-flow · commit 92915c2128e9 · 2023-08-15T05:42:32.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 - Increase default chunk size for `GenericDataChunkIterator` from 1 MB to 10 MB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
 - Added the magic `__reduce__` method as well as two private semi-abstract helper methods to enable pickling of the `GenericDataChunkIterator`. @codycbakerphd [#924](https://github.com/hdmf-dev/hdmf/pull/924)
 - Added Dynamic Enumerations and Schemasheets support to `TermSet`. @mavaylon1 [#923](https://github.com/hdmf-dev/hdmf/pull/923)
+- Updated `HERD` to support a user-defined file name for the `HERD` zip file. @mavaylon1 [#941](https://github.com/hdmf-dev/hdmf/pull/941)
 
 ## HDMF 3.8.1 (July 25, 2023)
 
diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py
@@ -311,18 +311,18 @@ def __init__(self, **kwargs):
 # ------------------------------------------------------
 # :py:class:`~hdmf.common.resources.HERD` is written as a zip file of
 # the individual tables written to tsv.
-# The user provides the path, which contains the name of the directory.
+# The user provides the path to the zip file, including the file name.
 
-er.to_norm_tsv(path='./')
+er.to_zip(path='./HERD.zip')
 
 ###############################################################################
 # Read HERD
 # ------------------------------------------------------
-# Users can read :py:class:`~hdmf.common.resources.HERD` from the tsv format
-# by providing the path to the directory.
+# Users can read :py:class:`~hdmf.common.resources.HERD` from the zip file
+# by providing the path to the file itself.
 
-er_read = HERD.from_norm_tsv(path='./')
-os.remove('./er.zip')
+er_read = HERD.from_zip(path='./HERD.zip')
+os.remove('./HERD.zip')
 
 ###############################################################################
 # Using TermSet with HERD
diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py
@@ -62,7 +62,7 @@ def read(self, **kwargs):
         if self.herd_path is not None:
             from hdmf.common import HERD
             try:
-                self.herd = HERD.from_norm_tsv(path=self.herd_path)
+                self.herd = HERD.from_zip(path=self.herd_path)
                 if isinstance(container, HERDManager):
                     container.link_resources(herd=self.herd)
             except FileNotFoundError:
@@ -84,7 +84,7 @@ def write(self, **kwargs):
         if self.herd_path is not None:
             herd = container.get_linked_resources()
             if herd is not None:
-                herd.to_norm_tsv(path=self.herd_path)
+                herd.to_zip(path=self.herd_path)
             else:
                 msg = "Could not find linked HERD. Container was still written to IO source."
                 warn(msg)
diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py
@@ -836,19 +836,20 @@ def to_dataframe(self, **kwargs):
         # return the result
         return result_df
 
-    @docval({'name': 'path', 'type': str, 'doc': 'path of the folder tsv file to write'})
-    def to_norm_tsv(self, **kwargs):
+    @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
+    def to_zip(self, **kwargs):
         """
-        Write the tables in HERD to individual tsv files.
+        Write the tables in HERD to zipped tsv files.
         """
-        path = kwargs['path']
-        files = [path+child.name+'.tsv' for child in self.children]
+        zip_file = kwargs['path']
+        directory = os.path.dirname(zip_file)
 
+        files = [os.path.join(directory, child.name)+'.tsv' for child in self.children]
         for i in range(len(self.children)):
             df = self.children[i].to_dataframe()
             df.to_csv(files[i], sep='\t', index=False)
 
-        with zipfile.ZipFile('er.zip', 'w') as zipF:
+        with zipfile.ZipFile(zip_file, 'w') as zipF:
           for file in files:
               zipF.write(file)
 
@@ -857,13 +858,17 @@ def to_norm_tsv(self, **kwargs):
             os.remove(file)
 
     @classmethod
-    @docval({'name': 'path', 'type': str, 'doc': 'path of the folder containing the tsv files to read'},
-            returns="HERD loaded from TSV", rtype="HERD")
-    def from_norm_tsv(cls, **kwargs):
-        path = kwargs['path']
-        with zipfile.ZipFile(path+'/er.zip', 'r') as zip:
-            zip.extractall(path)
-        tsv_paths = glob(path+'/*')
+    @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
+    def from_zip(cls, **kwargs):
+        """
+        Method to read in zipped tsv files to populate HERD.
+        """
+        zip_file = kwargs['path']
+        directory = os.path.dirname(zip_file)
+
+        with zipfile.ZipFile(zip_file, 'r') as zip:
+            zip.extractall(directory)
+        tsv_paths = glob(directory+'/*')
 
         for file in tsv_paths:
             file_name = os.path.basename(file)
diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py
@@ -51,17 +51,16 @@ def remove_er_files(self):
         remove_test_file('./object_keys.tsv')
         remove_test_file('./keys.tsv')
         remove_test_file('./files.tsv')
-        remove_test_file('./er.tsv')
-        remove_test_file('./er.zip')
+        remove_test_file('./HERD.zip')
 
     def child_tsv(self, external_resources):
         for child in external_resources.children:
             df = child.to_dataframe()
             df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)
 
-    def zip_child(self):
+    def zip_child(self, zip_file):
         files = glob('*.tsv')
-        with zipfile.ZipFile('er.zip', 'w') as zipF:
+        with zipfile.ZipFile(zip_file, 'w') as zipF:
           for file in files:
               zipF.write(file)
 
@@ -590,159 +589,159 @@ def test_get_obj_entities_attribute(self):
 
         pd.testing.assert_frame_equal(df, expected_df)
 
-    def test_to_and_from_norm_tsv(self):
+    def test_to_and_from_zip(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
-        er_read = HERD.from_norm_tsv(path='./')
+        er_read = HERD.from_zip(path='./HERD.zip')
         HERD.assert_external_resources_equal(er_read, er, check_dtype=False)
 
         self.remove_er_files()
 
-    def test_to_and_from_norm_tsv_entity_value_error(self):
+    def test_to_and_from_zip_entity_value_error(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(external_resources=er)
 
         df = er.entities.to_dataframe()
         df.at[0, ('keys_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./entities.tsv', sep='\t', index=False)
 
-        self.zip_child()
+        self.zip_child(zip_file='HERD.zip')
 
         with self.assertRaises(ValueError):
-            _ = HERD.from_norm_tsv(path='./')
+            _ = HERD.from_zip(path='./HERD.zip')
 
         self.remove_er_files()
 
-    def test_to_and_from_norm_tsv_entity_key_value_error_key(self):
+    def test_to_and_from_zip_entity_key_value_error_key(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(external_resources=er)
 
         df = er.entity_keys.to_dataframe()
         df.at[0, ('keys_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./entity_keys.tsv', sep='\t', index=False)
 
-        self.zip_child()
+        self.zip_child(zip_file='HERD.zip')
 
         with self.assertRaises(ValueError):
-            _ = HERD.from_norm_tsv(path='./')
+            _ = HERD.from_zip(path='./HERD.zip')
 
         self.remove_er_files()
 
-    def test_to_and_from_norm_tsv_entity_key_value_error_entity(self):
+    def test_to_and_from_zip_entity_key_value_error_entity(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(external_resources=er)
 
         df = er.entity_keys.to_dataframe()
         df.at[0, ('entities_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./entity_keys.tsv', sep='\t', index=False)
 
-        self.zip_child()
+        self.zip_child(zip_file='HERD.zip')
 
         with self.assertRaises(ValueError):
-            _ = HERD.from_norm_tsv(path='./')
+            _ = HERD.from_zip(path='./HERD.zip')
 
         self.remove_er_files()
 
-    def test_to_and_from_norm_tsv_object_value_error(self):
+    def test_to_and_from_zip_object_value_error(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(external_resources=er)
 
         df = er.objects.to_dataframe()
         df.at[0, ('files_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./objects.tsv', sep='\t', index=False)
 
-        self.zip_child()
+        self.zip_child(zip_file='HERD.zip')
 
         msg = "File_ID Index out of range in ObjectTable. Please check for alterations."
         with self.assertRaisesWith(ValueError, msg):
-            _ = HERD.from_norm_tsv(path='./')
+            _ = HERD.from_zip(path='./HERD.zip')
 
         self.remove_er_files()
 
-    def test_to_and_from_norm_tsv_object_keys_object_idx_value_error(self):
+    def test_to_and_from_zip_object_keys_object_idx_value_error(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(external_resources=er)
 
         df = er.object_keys.to_dataframe()
         df.at[0, ('objects_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./object_keys.tsv', sep='\t', index=False)
 
-        self.zip_child()
+        self.zip_child(zip_file='HERD.zip')
 
         msg = "Object Index out of range in ObjectKeyTable. Please check for alterations."
         with self.assertRaisesWith(ValueError, msg):
-            _ = HERD.from_norm_tsv(path='./')
+            _ = HERD.from_zip(path='./HERD.zip')
 
         self.remove_er_files()
 
-    def test_to_and_from_norm_tsv_object_keys_key_idx_value_error(self):
+    def test_to_and_from_zip_object_keys_key_idx_value_error(self):
         er = HERD()
         data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
         er.add_ref(file=HERDManagerContainer(name='file'),
                    container=data,
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(external_resources=er)
 
         df = er.object_keys.to_dataframe()
         df.at[0, ('keys_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./object_keys.tsv', sep='\t', index=False)
 
-        self.zip_child()
+        self.zip_child(zip_file='HERD.zip')
 
         msg = "Key Index out of range in ObjectKeyTable. Please check for alterations."
         with self.assertRaisesWith(ValueError, msg):
-            _ = HERD.from_norm_tsv(path='./')
+            _ = HERD.from_zip(path='./HERD.zip')
 
         self.remove_er_files()
 
diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py
@@ -950,17 +950,16 @@ def remove_er_files(self):
         remove_test_file('./object_keys.tsv')
         remove_test_file('./keys.tsv')
         remove_test_file('./files.tsv')
-        remove_test_file('./er.tsv')
-        remove_test_file('./er.zip')
+        remove_test_file('./HERD.zip')
 
     def child_tsv(self, herd):
         for child in herd.children:
             df = child.to_dataframe()
             df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)
 
-    def zip_child(self):
+    def zip_child(self, zip_file):
         files = glob('*.tsv')
-        with zipfile.ZipFile('er.zip', 'w') as zipF:
+        with zipfile.ZipFile(zip_file, 'w') as zipF:
           for file in files:
               zipF.write(file)
 
@@ -972,13 +971,11 @@ def test_io_read_herd(self):
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
-
-        with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
+        er.to_zip(path='./HERD.zip')
+        with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
             container = io.read()
             self.assertIsInstance(io.herd, HERD)
             self.assertIsInstance(container.get_linked_resources(), HERD)
-
         self.remove_er_files()
 
     def test_io_read_herd_file_warn(self):
@@ -989,7 +986,7 @@ def test_io_read_herd_file_warn(self):
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='wrong_path') as io:
             with self.assertWarns(Warning):
@@ -1005,16 +1002,16 @@ def test_io_read_herd_value_warn(self):
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        er.to_norm_tsv(path='./')
+        er.to_zip(path='./HERD.zip')
 
         self.child_tsv(herd=er)
 
         df = er.entities.to_dataframe()
         df.at[0, ('keys_idx')] = 10  # Change key_ix 0 to 10
         df.to_csv('./entities.tsv', sep='\t', index=False)
 
-        self.zip_child()
-        with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
+        self.zip_child(zip_file='HERD.zip')
+        with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
             with self.assertWarns(Warning):
                 io.read()
 
@@ -1031,10 +1028,10 @@ def test_io_write_herd(self):
                    entity_id='entity_id1',
                    entity_uri='entity1')
 
-        with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./') as io:
+        with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./HERD.zip') as io:
             io.write(self.foofile)
 
-        with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
+        with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
             container = io.read()
             self.assertIsInstance(io.herd, HERD)
             self.assertIsInstance(container.get_linked_resources(), HERD)
@@ -1050,7 +1047,7 @@ def test_io_warn(self):
                    key='key1',
                    entity_id='entity_id1',
                    entity_uri='entity1')
-        with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./') as io:
+        with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./HERD.zip') as io:
             with self.assertWarns(Warning):
                 io.write(self.foofile)