Commit be2600d

Merge pull request #237 from int-brain-lab/v3.4.2

This version fixes the saving and loading of insertions parquet tables for offline processing.

### Fixed

- the insertions table has the minimal metadata to allow reloading after a cache dump

2 parents 5714446 + db84ffb commit be2600d
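
As a minimal sketch of the round trip this release fixes (the pid and paths are placeholders, and the exact set of parquet files written depends on which cache tables are populated):

```python
from one.api import ONE

one = ONE()  # remote mode, used to populate the cache tables
eid, pname = one.pid2eid('<pid>')  # fills the in-memory insertions table
one.save_cache('/path/to/cache_dir')  # dumps the tables, now incl. the insertions table

# Reloading the dumped tables in local mode now works for insertions too
one_offline = ONE(mode='local', tables_dir='/path/to/cache_dir')
assert (eid, pname) == one_offline.pid2eid('<pid>')
```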

File tree: 10 files changed, +132 −9 lines changed

CHANGELOG.md — 9 additions & 1 deletion

```diff
@@ -1,5 +1,13 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [3.4.1]
+
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [3.4.2]
+This version fixes the saving and loading of insertions parquet tables for offline processing.
+
+### Fixed
+
+- the insertions table has the minimal metadata to allow reloading after a cache dump
+
+## [3.4.1]
 This version fixes issues with corrupt REST cache and REST validation errors.
 
 ### Modified
```

README.md — 1 addition & 1 deletion

````diff
@@ -21,7 +21,7 @@ pip install ONE-api
 For using ONE with a local cache directory:
 ```python
 from one.api import One
-one = One(cache_dir='/home/user/downlaods/ONE/behavior_paper')
+one = One(cache_dir='/home/user/downloads/ONE/behavior_paper')
 ```
 
 To use the default setup settings that connect you to the [IBL public database](https://openalyx.internationalbrainlab.org):
````

docs/notebooks/recording_data_access.ipynb — 69 additions & 0 deletions

New cells appended to the notebook:

## Guide to running ONE with multi-process workflows

When using ONE with multiple processes that make simultaneous requests to the remote database, the requests can fail with connection errors such as `json.JSONDecodeError` and `HTTPError`.

To avoid these errors, it is very useful to generate the above-mentioned parquet files containing the cache data. ONE can then be initialized in local mode and the saved cache loaded. This ensures no further database requests are made during the parallel run, and hence no connection errors.

Below is an example where all the probe insertions for a project that have spike sorting data are first queried, then the cache is saved.

### Example

```python
from itertools import batched  # requires Python >= 3.12; see the fallback below

from one.api import ONE
from one.converters import datasets2records
from one.alf.cache import merge_tables

# To generate the cache, we need to use the remote mode of ONE
one = ONE()
# Get the list of eids that have the spikes.times.npy dataset
eids = one.search(project='u19_proj1_multiareacom', datasets='spikes.times.npy')
# Update the datasets table in batches of 50 sessions to avoid
# making one request to the database per session
for batch in batched(map(str, eids), 50):
    # These REST queries update the one._cache object in memory
    dsets = one.alyx.rest('datasets', 'list', django=f'session__in,{batch}')
    df = datasets2records(dsets)
    merge_tables(one._cache, datasets=df, origin=one.alyx.base_url)

# Provide the location of the directory where the cache will be saved
one.save_cache('multi_area_cache')
```

The above script will save the `datasets.pqt` and `sessions.pqt` files in the directory `multi_area_cache`.

You can then initialize ONE in local mode within your parallelized scripts, whether using SLURM, joblib, or multiprocessing:

```python
from joblib import Parallel, delayed

from one.api import ONE

if __name__ == '__main__':
    one = ONE(mode='local', tables_dir='/path/to/multi_area_cache')
    eid_list = ['eid1', 'eid2', 'eid3']
    results = Parallel(n_jobs=-1, verbose=10)(
        delayed(one.list_collections)(eid) for eid in eid_list)
    print(results)
```

one/__init__.py — 1 addition & 1 deletion

```diff
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '3.4.1'
+__version__ = '3.4.2'
```

one/alf/cache.py — 7 additions & 2 deletions

```diff
@@ -538,13 +538,18 @@ def merge_tables(cache, strict=False, origin=None, **kwargs):
         cache[table] = pd.concat(frames).sort_index()
         updated = datetime.datetime.now()
         # Update the table metadata with the origin
+        table_meta = cache['_meta']['raw'].get(table, {})
         if origin is not None:
-            table_meta = cache['_meta']['raw'].get(table, {})
             if 'origin' not in table_meta:
-                table_meta['origin'] = set(origin)
+                table_meta['origin'] = set(ensure_list(origin))
             else:
                 table_meta['origin'].add(origin)
             cache['_meta']['raw'][table] = table_meta
+        # Make sure that the `date_created` field exists for a new table
+        if 'date_created' not in table_meta.keys():
+            table_meta['date_created'] = datetime.datetime.now().isoformat(
+                sep=' ', timespec='minutes')
+            cache['_meta']['raw'][table] = table_meta
     cache['_meta']['modified_time'] = updated
     return updated
```
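
The switch to `set(ensure_list(origin))` matters because `set` over a bare string iterates its characters; a minimal illustration (not part of the commit, assuming `ensure_list` wraps a lone string in a list as in `iblutil`):

```python
origin = 'https://openalyx.internationalbrainlab.org'
set(origin)    # {'h', 't', 'p', 's', ':', '/', ...} -- set of characters, wrong
set([origin])  # {'https://openalyx.internationalbrainlab.org'} -- intended
```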

one/tests/test_alyxclient.py — 37 additions & 0 deletions

```diff
@@ -3,6 +3,7 @@
 from pathlib import Path
 import unittest
 from unittest import mock
+import http.client
 import urllib.parse
 import random
 import weakref
@@ -602,6 +603,42 @@ def test_download_cache_tables_auth(self, download_file_mock, zipfile_mock):
         finally:
             self.ac._token = token
 
+    @mock.patch('one.webclient.urllib.request')
+    @mock.patch('builtins.open')
+    def test_http_server_auth(self, open_mock, urllib_mock):
+        """Test for http_download_file authentication and headers."""
+        url_response_mock = mock.MagicMock(spec_set=http.client.HTTPResponse)
+        # Simulate file content then end of file
+        url_response_mock.read.side_effect = [b'file content', None]
+        urllib_mock.urlopen.return_value = url_response_mock
+        # When a username and password are set in the parameters, should attempt to authenticate
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_name, md5 = wc.http_download_file(
+                'https://example.com/file.txt',
+                target_dir=temp_dir,
+                username='user',
+                password='pass',
+                return_md5=True,
+                chunks=(4, 12),
+                headers={'Custom-Header': 'value'}
+            )
+            expected = Path(temp_dir).joinpath('file.txt')
+            # Check file is written to expected location
+            self.assertEqual(expected, Path(file_name))
+            open_mock.assert_called_once_with(expected, 'wb')
+            fid_mock = open_mock()
+            fid_mock.write.assert_called_once_with(b'file content')
+            fid_mock.close.assert_called_once()
+            # Check urlopen called with correct auth header
+            urllib.request.HTTPPasswordMgrWithDefaultRealm.assert_called_once()
+            manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
+            manager.add_password.assert_called_once_with(None, 'https://example.com', 'user', 'pass')
+            # Check the request headers
+            urllib.request.urlopen.assert_called_once()
+            req, = urllib.request.urlopen.call_args[0]
+            req.add_header.assert_any_call('Custom-Header', 'value')
+            req.add_header.assert_any_call('Range', 'bytes=4-15')  # chunks=(4, 12): start at byte 4, read 12 bytes
+
 
 class TestMisc(unittest.TestCase):
     def test_update_url_params(self):
```

one/tests/test_converters.py — 4 additions & 0 deletions

```diff
@@ -346,6 +346,10 @@ def test_pid2eid(self):
         # Check cache table updated
         self.assertIn('insertions', self.one._cache)
         self.assertIn(self.eid, self.one._cache['insertions'].index)
+        # Make sure the metadata of the newly created insertions table is populated
+        meta_data = self.one._cache['_meta']['raw']['insertions']
+        self.assertIn('date_created', meta_data.keys())
+        self.assertEqual(meta_data['origin'], {'https://openalyx.internationalbrainlab.org'})
         # Local mode should now work
         self.assertEqual((self.eid, 'probe00'), self.one.pid2eid(self.pid, query_type='local'))
         # Test behaviour when pid not found
```

one/tests/test_one.py — 1 addition & 1 deletion

```diff
@@ -1602,7 +1602,7 @@ def test_list_datasets(self):
         self.one._cache['datasets'] = self.one._cache['datasets'].iloc[0:0].copy()
 
         dsets = self.one.list_datasets(self.eid, details=True, query_type='remote')
-        expected_n_datasets = 267  # this may change after a BWM release or patch
+        expected_n_datasets = 280  # this may change after a BWM release or patch
         self.assertEqual(expected_n_datasets, len(dsets))
         self.assertEqual(1, dsets.index.nlevels, 'details data frame should be without eid index')
```

one/webclient.py — 1 addition & 1 deletion

```diff
@@ -413,7 +413,7 @@ def http_download_file(full_link_to_file, chunks=None, *, clobber=False, silent=
         Directory in which files are downloaded; defaults to user's Download directory
     return_md5 : bool
         If True an MD5 hash of the file is additionally returned
-    headers : list of dicts
+    headers : dict
         Additional headers to add to the request (auth tokens etc.)
 
     Returns
```
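
A short usage sketch of the corrected parameter (the URL and token are placeholders; the call shape follows the new test above):

```python
from one import webclient as wc

file_path, md5 = wc.http_download_file(
    'https://example.com/file.txt',
    target_dir='/tmp',
    return_md5=True,
    headers={'Authorization': 'Bearer <token>'},  # a single dict, not a list of dicts
)
```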

requirements.txt — 2 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 ruff
-numpy>=1.18
-pandas>=1.5.0
+numpy>=1.18, <2.4  # waiting for numba to support 2.4
+pandas>=1.5.0, <3.0.0  # pandas 3 regex on strings issue <one.tests.test_one.TestONECache testMethod=test_filter>
 tqdm>=4.32.1
 requests>=2.22.0
 iblutil>=1.14.0
```
