-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathapi.py
More file actions
3070 lines (2638 loc) · 134 KB
/
api.py
File metadata and controls
3070 lines (2638 loc) · 134 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Classes for searching, listing and (down)loading ALyx Files."""
import collections.abc
import urllib.parse
import warnings
import logging
from weakref import WeakMethod
from datetime import datetime, timedelta
from functools import lru_cache, partial
from itertools import islice
from inspect import unwrap
from pathlib import Path, PurePosixPath
from typing import Any, Union, Optional, List
from uuid import UUID
from urllib.error import URLError
import os
import re
import pandas as pd
import numpy as np
import requests.exceptions
import packaging.version
from iblutil.io import parquet, hashfile
from iblutil.io.params import FileLock
from iblutil.util import Bunch, flatten, ensure_list, Listable
import one.params
import one.webclient as wc
import one.alf.io as alfio
import one.alf.path as alfiles
import one.alf.exceptions as alferr
from one.alf.path import ALFPath, ensure_alf_path
from .alf.cache import (
make_parquet_db, load_tables, remove_table_files, merge_tables,
default_cache, cast_index_object)
from .alf.spec import is_uuid, is_uuid_string, QC, to_alf
from . import __version__
from one.converters import ConversionMixin, session_record2path, ses2records, datasets2records
from one import util
_logger = logging.getLogger(__name__)
__all__ = ['ONE', 'One', 'OneAlyx']
SAVE_ON_DELETE = (os.environ.get('ONE_SAVE_ON_DELETE') or '0').casefold() in ('true', '1')
"""bool: Whether to save modified cache tables on delete."""
_logger.debug('ONE_SAVE_ON_DELETE: %s', SAVE_ON_DELETE)
class One(ConversionMixin):
"""An API for searching and loading data on a local filesystem."""
_search_terms = (
'datasets', 'date_range', 'laboratory', 'number',
'projects', 'subject', 'task_protocol', 'dataset_qc_lte'
)
uuid_filenames = None
"""bool: whether datasets on disk have a UUID in their filename."""
def __init__(self, cache_dir=None, mode='local', wildcards=True, tables_dir=None):
    """An API for searching and loading data on a local filesystem.

    Parameters
    ----------
    cache_dir : str, Path
        Path to the data files. If Alyx parameters have been set up for this location,
        an OneAlyx instance is returned. If data_dir and base_url are None, the default
        location is used.
    mode : str
        Query mode, options include 'local' (offline) and 'remote' (online). Most methods
        have a `query_type` parameter that can override the class mode.
    wildcards : bool
        If true, use unix shell style matching instead of regular expressions.
    tables_dir : str, pathlib.Path
        An optional location of the cache tables. If None, the tables are assumed to be in the
        cache_dir.

    Raises
    ------
    ValueError
        The `mode` argument was neither 'local' nor 'remote'.
    """
    # get parameters override if inputs provided
    super().__init__()
    if not getattr(self, 'cache_dir', None):  # May already be set by subclass
        self.cache_dir = cache_dir or one.params.get_cache_dir()
    self._tables_dir = tables_dir or self.cache_dir
    self.mode = mode
    self.wildcards = wildcards  # Flag indicating whether to use regex or wildcards
    # When True, loaded dataset UUIDs are tracked (consumed by save_loaded_ids)
    self.record_loaded = False
    # assign property here as different instances may work on separate filesystems
    self.uuid_filenames = False
    # init the cache file
    self._reset_cache()
    if self.mode == 'local':
        # Ensure that we don't call any subclass method here as we only load local cache
        # tables on init. Direct calls to load_cache can be made by the user or subclass.
        One.load_cache(self)
    elif self.mode != 'remote':
        raise ValueError(f'Mode "{self.mode}" not recognized')
def __repr__(self):
    """Return a short description including connection state and cache location."""
    state = 'off' if self.offline else 'on'
    return f'One ({state}line, {self.cache_dir})'
def __del__(self):
    """Save cache tables to disk before deleting the object."""
    # Saving on delete is opt-in via the ONE_SAVE_ON_DELETE environment variable
    if not SAVE_ON_DELETE:
        return
    self.save_cache()
@property
def offline(self):
    """bool: True if mode is local or no Web client set."""
    # Instances without a web client (e.g. plain One) are always considered offline
    web_client = getattr(self, '_web_client', False)
    return self.mode == 'local' or not web_client
def search_terms(self, query_type=None) -> tuple:
    """List the search term keyword args for use in the search method.

    The `query_type` argument exists for interface parity with subclasses and is
    not used by this implementation.
    """
    return self._search_terms
def _reset_cache(self):
    """Replace the cache object with a Bunch that contains the right fields."""
    # default_cache() provides the empty tables and '_meta' entry expected by other methods
    self._cache = default_cache()
def _remove_table_files(self, tables=None):
    """Delete cache tables on disk.

    Parameters
    ----------
    tables : list of str
        A list of table names to remove, e.g. ['sessions', 'datasets'].
        If None, the currently loaded table names are removed. NB: This
        will also delete the cache_info.json metadata file.

    Returns
    -------
    list of pathlib.Path
        A list of the removed files.
    """
    if not tables:
        # Default to the loaded table names, skipping private keys such as '_meta'
        tables = [name for name in self._cache if not name.startswith('_')]
    return remove_table_files(self._tables_dir, tables)
def load_cache(self, tables_dir=None, clobber=True, **kwargs):
    """Load parquet cache files from a local directory.

    Parameters
    ----------
    tables_dir : str, pathlib.Path
        An optional directory location of the parquet files, defaults to One._tables_dir.
    clobber : bool
        If true, the cache is loaded without merging with existing table files.

    Returns
    -------
    datetime.datetime
        A timestamp of when the cache was loaded.

    Raises
    ------
    NotImplementedError
        Merging tables on load (clobber=False) is not yet supported.
    """
    if clobber:
        self._reset_cache()
    else:
        raise NotImplementedError('clobber=False not implemented yet')
    self._tables_dir = Path(tables_dir or self._tables_dir or self.cache_dir)
    self._cache = load_tables(self._tables_dir)
    if self._cache['_meta']['loaded_time'] is None:
        # No tables present
        if self.offline:  # In online mode, the cache tables should be downloaded later
            warnings.warn(f'No cache tables found in {self._tables_dir}')
    # If in remote mode and loading old tables generated on Alyx,
    # prompt the user to delete them to improve load times
    raw_meta = self._cache['_meta'].get('raw', {}).values() or [{}]
    # Tables tagged with a database tag belong to a data release; do not prompt for deletion
    tagged = any(filter(None, flatten(x.get('database_tags') for x in raw_meta)))
    origin = set(filter(None, flatten(ensure_list(x.get('origin', [])) for x in raw_meta)))
    # Tables created before this date are considered 'old' — presumably a format
    # cut-over date; TODO confirm significance of 2025-02-13
    older = (self._cache['_meta']['created_time'] or datetime.now()) < datetime(2025, 2, 13)
    remote = not self.offline and self.mode == 'remote'
    if remote and origin == {'alyx'} and older and not self._web_client.silent and not tagged:
        message = ('Old Alyx cache tables detected on disk. '
                   'It\'s recomended to remove these tables as they '
                   'negatively affect performance.\nDelete these tables? [Y/n]: ')
        # Default answer is yes when the user simply presses return
        if (input(message).casefold().strip() or 'y')[0] == 'y':
            self._remove_table_files()
            self._reset_cache()
    elif len(self._cache.datasets) > 1e6:
        warnings.warn(
            'Large cache tables affect performance. '
            'Consider removing them by calling the `_remove_table_files` method.')
    return self._cache['_meta']['loaded_time']
def save_cache(self, save_dir=None, clobber=False):
    """Save One._cache attribute into parquet tables if recently modified.

    Checks if another process is writing to file, if so waits before saving.

    Parameters
    ----------
    save_dir : str, pathlib.Path
        The directory path into which the tables are saved. Defaults to cache directory.
    clobber : bool
        If true, the cache is saved without merging with existing table files, regardless of
        modification time.
    """
    TIMEOUT = 5  # Delete lock file this many seconds after creation/modification or waiting
    save_dir = Path(save_dir or self.cache_dir)
    caches = self._cache
    meta = caches['_meta']
    modified = meta.get('modified_time') or datetime.min
    update_time = max(meta.get(x) or datetime.min for x in ('loaded_time', 'saved_time'))
    all_empty = all(x.empty for x in self._cache.values() if isinstance(x, pd.DataFrame))
    if not clobber:
        if modified < update_time or all_empty:
            return  # Not recently modified; return
        # Merge existing tables with new data
        _logger.debug('Merging cache tables...')
        caches = load_tables(save_dir)
        merge_tables(
            caches, **{k: v for k, v in self._cache.items() if not k.startswith('_')})
        # Ensure we use the minimum created date for each table
        for table in caches['_meta']['raw']:
            raw_meta = [x['_meta']['raw'].get(table, {}) for x in (caches, self._cache)]
            created = filter(None, (x.get('date_created') for x in raw_meta))
            if any(created := list(created)):
                created = min(map(datetime.fromisoformat, created))
                created = created.isoformat(sep=' ', timespec='minutes')
                meta['raw'][table]['date_created'] = created
    with FileLock(save_dir / '.ONE', log=_logger, timeout=TIMEOUT, timeout_action='delete'):
        _logger.info('Saving cache tables...')
        for table in filter(lambda x: not x[0] == '_', caches.keys()):
            metadata = meta['raw'].get(table, {})
            if isinstance(metadata.get('origin'), set):
                # Sets are not serializable; cast to list for the parquet metadata
                metadata['origin'] = list(metadata['origin'])
            metadata['date_modified'] = modified.isoformat(sep=' ', timespec='minutes')
            filename = save_dir.joinpath(f'{table}.pqt')
            # Cast indices to str before saving
            df = cast_index_object(caches[table].copy(), str)
            parquet.save(filename, df, metadata)
            # Log the actual saved path; use lazy %-formatting for the debug record
            _logger.debug('Saved %s', filename)
        meta['saved_time'] = datetime.now()
def save_loaded_ids(self, sessions_only=False, clear_list=True):
    """Save list of UUIDs corresponding to datasets or sessions where datasets were loaded.

    Parameters
    ----------
    sessions_only : bool
        If true, save list of experiment IDs, otherwise the full list of dataset IDs.
    clear_list : bool
        If true, clear the current list of loaded dataset IDs after saving.

    Returns
    -------
    list of str
        List of UUIDs.
    pathlib.Path
        The file path of the saved list.
    """
    if '_loaded_datasets' not in self._cache or self._cache['_loaded_datasets'].size == 0:
        # The tracking flag is named `record_loaded` (set in __init__, checked in
        # _check_filesystem); the previous message referred to a non-existent attribute
        warnings.warn('No datasets loaded; check "record_loaded" attribute is True')
        return [], None
    if sessions_only:
        name = 'session_uuid'
        # Map the loaded dataset IDs back to their parent session eids
        idx = self._cache['datasets'].index.isin(self._cache['_loaded_datasets'], 'id')
        ids = self._cache['datasets'][idx].index.unique('eid').values
    else:
        name = 'dataset_uuid'
        ids = self._cache['_loaded_datasets']
    # Timestamped CSV filename, e.g. '2024-01-01T12-00-00_loaded_dataset_uuids.csv'
    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S%z")
    filename = Path(self._tables_dir or self.cache_dir) / f'{timestamp}_loaded_{name}s.csv'
    pd.DataFrame(ids, columns=[name]).to_csv(filename, index=False)
    if clear_list:
        self._cache['_loaded_datasets'] = np.array([])
    return ids, filename
def _download_datasets(self, dsets, **kwargs) -> List[ALFPath]:
    """Download several datasets given a set of datasets.

    NB: This will not skip files that are already present. Use check_filesystem instead.

    Parameters
    ----------
    dsets : list
        List of dataset dictionaries from an Alyx REST query OR URL strings.

    Returns
    -------
    list of one.alf.path.ALFPath
        A local file path list.

    Notes
    -----
    No-op in this offline base class; presumably overridden by online subclasses
    (e.g. OneAlyx) — TODO confirm against subclass implementation.
    """
    # Looking to entirely remove method
    pass  # pragma: no cover
def _download_dataset(self, dset, cache_dir=None, **kwargs) -> ALFPath:
    """Download a dataset from an Alyx REST dictionary.

    Parameters
    ----------
    dset : pandas.Series, dict, str
        A single dataset dictionary from an Alyx REST query OR URL string.
    cache_dir : str, pathlib.Path
        The root directory to save the data in (home/downloads by default).

    Returns
    -------
    one.alf.path.ALFPath
        The local file path.

    Notes
    -----
    No-op in this offline base class; presumably overridden by online subclasses
    — TODO confirm against subclass implementation.
    """
    pass  # pragma: no cover
def search(self, details=False, **kwargs):
    """Searches sessions matching the given criteria and returns a list of matching eids.

    For a list of search terms, use the method

    one.search_terms()

    For all search parameters, a single value or list may be provided. For `dataset`, the
    sessions returned will contain all listed datasets. For the other parameters, the session
    must contain at least one of the entries.

    For all but `date_range` and `number`, any field that contains the search string is
    returned. Wildcards are not permitted, however if wildcards property is True, regular
    expressions may be used (see notes and examples).

    Parameters
    ----------
    datasets : str, list
        One or more (exact) dataset names. Returns sessions containing all of these datasets.
    dataset_qc_lte : str, int, one.alf.spec.QC
        A dataset QC value, returns sessions with datasets at or below this QC value, including
        those with no QC set. If `dataset` not passed, sessions with any passing QC datasets
        are returned, otherwise all matching datasets must have the QC value or below.
    date_range : str, list, datetime.datetime, datetime.date, pandas.timestamp
        A single date to search or a list of 2 dates that define the range (inclusive). To
        define only the upper or lower date bound, set the other element to None.
    lab : str
        A str or list of lab names, returns sessions from any of these labs.
    number : str, int
        Number of session to be returned, i.e. number in sequence for a given date.
    subject : str, list
        A list of subject nicknames, returns sessions for any of these subjects.
    task_protocol : str
        The task protocol name (can be partial, i.e. any task protocol containing that str
        will be found).
    projects : str, list
        The project name(s) (can be partial, i.e. any project containing that str
        will be found).
    details : bool
        If true also returns a dict of dataset details.

    Returns
    -------
    list of UUID
        A list of eids.
    (list)
        (If details is True) a list of dictionaries, each entry corresponding to a matching
        session.

    Examples
    --------
    Search for sessions with 'training' in the task protocol.

    >>> eids = one.search(task='training')

    Search for sessions by subject 'MFD_04'.

    >>> eids = one.search(subject='MFD_04')

    Do an exact search for sessions by subject 'FD_04'.

    >>> assert one.wildcards is True, 'the wildcards flag must be True for regex expressions'
    >>> eids = one.search(subject='^FD_04$')

    Search for sessions on a given date, in a given lab, containing trials and spike data.

    >>> eids = one.search(
    ...     date='2023-01-01', lab='churchlandlab',
    ...     datasets=['trials.table.pqt', 'spikes.times.npy'])

    Search for sessions containing trials and spike data where QC for both are WARNING or less.

    >>> eids = one.search(dataset_qc_lte='WARNING', dataset=['trials', 'spikes'])

    Search for sessions with any datasets that have a QC of PASS or NOT_SET.

    >>> eids = one.search(dataset_qc_lte='PASS')

    Notes
    -----
    - In default and local mode, most queries are case-sensitive partial matches. When lists
      are provided, the search is a logical OR, except for `datasets`, which is a logical AND.
    - If `dataset_qc` and `datasets` are defined, the QC criterion only applies to the provided
      datasets and all must pass for a session to be returned.
    - All search terms are true for a session to be returned, i.e. subject matches AND project
      matches, etc.
    - In remote mode most queries are case-insensitive partial matches.
    - In default and local mode, when the one.wildcards flag is True (default), queries are
      interpreted as regular expressions. To turn this off set one.wildcards to False.
    - In remote mode regular expressions are only supported using the `django` argument.
    """
    def all_present(x, dsets, exists=True):
        """Returns true if all datasets present in Series."""
        # Match on the filename component only (text after the last '/')
        name = x.str.rsplit('/', n=1, expand=True).iloc[:, -1]
        return all(any(name.str.fullmatch(y) & exists) for y in dsets)

    # Iterate over search filters, reducing the sessions table
    sessions = self._cache['sessions']

    # Ensure sessions filtered in a particular order, with datasets last
    search_order = ('date_range', 'number', 'datasets')

    def sort_fcn(itm):
        # Keys not in search_order sort first (-1); datasets sorts last
        return -1 if itm[0] not in search_order else search_order.index(itm[0])

    # Validate and get full name for queries (e.g. 'subj' -> 'subject')
    search_terms = self.search_terms(query_type='local')
    kwargs.pop('query_type', None)  # used by subclasses
    queries = {util.autocomplete(k, search_terms): v for k, v in kwargs.items()}
    for key, value in sorted(queries.items(), key=sort_fcn):
        # No matches; short circuit
        if sessions.size == 0:
            return ([], None) if details else []
        # String fields
        elif key in ('subject', 'task_protocol', 'laboratory', 'projects'):
            # Multiple values form a logical OR via regex alternation
            query = '|'.join(ensure_list(value))
            key = 'lab' if key == 'laboratory' else key
            mask = sessions[key].str.contains(query, regex=self.wildcards)
            sessions = sessions[mask.astype(bool, copy=False)]
        elif key == 'date_range':
            start, end = util.validate_date_range(value)
            session_date = pd.to_datetime(sessions['date'])
            sessions = sessions[(session_date >= start) & (session_date <= end)]
        elif key == 'number':
            query = ensure_list(value)
            sessions = sessions[sessions[key].isin(map(int, query))]
        # Dataset/QC check is biggest so this should be done last
        elif key == 'datasets' or (key == 'dataset_qc_lte' and 'datasets' not in queries):
            datasets = self._cache['datasets']
            qc = QC.validate(queries.get('dataset_qc_lte', 'FAIL')).name  # validate value
            has_dset = sessions.index.isin(datasets.index.get_level_values('eid'))
            if not has_dset.any():
                sessions = sessions.iloc[0:0]  # No datasets for any sessions
                continue
            # Restrict datasets to those belonging to the remaining sessions
            datasets = datasets.loc[(sessions.index.values[has_dset], ), :]
            # With dataset_qc_lte alone, an empty query matches any dataset name
            query = ensure_list(value if key == 'datasets' else '')
            # For each session check any dataset both contains query and exists
            mask = (
                (datasets
                 .groupby('eid', sort=False)
                 .apply(lambda x: all_present(
                     x['rel_path'], query, x['exists'] & x['qc'].le(qc))
                 ))
            )
            # eids of matching dataset records
            idx = mask[mask].index
            # Reduce sessions table by datasets mask
            sessions = sessions.loc[idx]
    # Return results
    if sessions.size == 0:
        return ([], None) if details else []
    # Most recent sessions first
    sessions = sessions.sort_values(['date', 'subject', 'number'], ascending=False)
    eids = sessions.index.to_list()
    if details:
        return eids, sessions.reset_index(drop=True).to_dict('records', into=Bunch)
    else:
        return eids
def _search_insertions(self, details=False, **kwargs):
    """Search insertions matching the given criteria and return a list of matching probe IDs.

    For a list of search terms, use the method

    one.search_terms(query_type='remote', endpoint='insertions')

    All of the search parameters, apart from dataset and dataset type require a single value.
    For dataset and dataset type, a single value or a list can be provided. Insertions
    returned will contain all listed datasets.

    Parameters
    ----------
    session : str
        A session eid, returns insertions associated with the session.
    name: str
        An insertion label, returns insertions with specified name.
    lab : str
        A lab name, returns insertions associated with the lab.
    subject : str
        A subject nickname, returns insertions associated with the subject.
    task_protocol : str
        A task protocol name (can be partial, i.e. any task protocol containing that str
        will be found).
    project(s) : str
        The project name (can be partial, i.e. any task protocol containing that str
        will be found).
    dataset : str, list
        One or more dataset names. Returns sessions containing all these datasets.
        A dataset matches if it contains the search string e.g. 'wheel.position' matches
        '_ibl_wheel.position.npy'.
    dataset_qc_lte : int, str, one.alf.spec.QC
        The maximum QC value for associated datasets.
    details : bool
        If true also returns a dict of dataset details.

    Returns
    -------
    list of UUID
        List of probe IDs (pids).
    (list of dicts)
        If details is True, also returns a list of dictionaries, each entry corresponding to a
        matching insertion.

    Notes
    -----
    - This method does not use the local cache and therefore can not work in 'local' mode.

    Examples
    --------
    List the insertions associated with a given subject

    >>> ins = one.search_insertions(subject='SWC_043')
    """
    # Warn if no insertions table present
    if (insertions := self._cache.get('insertions')) is None:
        warnings.warn('No insertions data loaded.')
        return ([], None) if details else []
    # Validate and get full names
    search_terms = ('model', 'name', 'json', 'serial', 'chronic_insertion')
    search_terms += self._search_terms
    kwargs.pop('query_type', None)  # used by subclasses
    arguments = {util.autocomplete(key, search_terms): value for key, value in kwargs.items()}
    # Apply session filters first
    session_kwargs = {k: v for k, v in arguments.items() if k in self._search_terms}
    if session_kwargs:
        # Delegate session-level criteria to the main search method
        eids = self.search(**session_kwargs, details=False, query_type='local')
        insertions = insertions.loc[eids]
    # Apply insertion filters
    # Iterate over search filters, reducing the insertions table
    # NOTE(review): this iterates the raw kwargs keys while session_kwargs holds
    # autocompleted names; an abbreviated insertion key (e.g. 'mod') would not match
    # 'model' here and would raise NotImplementedError — confirm this is intended
    for key, value in sorted(filter(lambda x: x[0] not in session_kwargs, kwargs.items())):
        if insertions.size == 0:
            return ([], None) if details else []
        # String fields
        elif key in ('model', 'serial', 'name'):
            query = '|'.join(ensure_list(value))
            mask = insertions[key].str.contains(query, regex=self.wildcards)
            insertions = insertions[mask.astype(bool, copy=False)]
        else:
            raise NotImplementedError(key)
    # Return results
    if insertions.size == 0:
        return ([], None) if details else []
    # Sort insertions
    eids = insertions.index.get_level_values('eid').unique()
    # NB: This will raise if no session in cache; may need to improve error handling here
    sessions = self._cache['sessions'].loc[eids, ['date', 'subject', 'number']]
    insertions = (insertions
                  .join(sessions, how='inner')
                  .sort_values(['date', 'subject', 'number', 'name'], ascending=False))
    pids = insertions.index.get_level_values('id').to_list()
    if details:  # TODO replicate Alyx records here
        return pids, insertions.reset_index(drop=True).to_dict('records', into=Bunch)
    else:
        return pids
def _check_filesystem(self, datasets, offline=None, update_exists=True, check_hash=True):
    """Update the local filesystem for the given datasets.

    Given a set of datasets, check whether records correctly reflect the filesystem.
    Called by load methods, this returns a list of file paths to load and return.

    This changes datasets frame, calls _update_cache(sessions=None, datasets=None) to
    update and save tables. Download_datasets may also call this function.

    Parameters
    ----------
    datasets : pandas.Series, pandas.DataFrame, list of dicts
        A list or DataFrame of dataset records.
    offline : bool, None
        If false and Web client present, downloads the missing datasets from a remote
        repository.
    update_exists : bool
        If true, the cache is updated to reflect the filesystem.
    check_hash : bool
        Consider dataset missing if local file hash does not match. In online mode, the dataset
        will be re-downloaded.

    Returns
    -------
    A list of one.alf.path.ALFPath for the datasets (None elements for non-existent datasets).
    """
    # Normalize the input into a DataFrame copy we are free to mutate
    if isinstance(datasets, pd.Series):
        datasets = pd.DataFrame([datasets])
        assert datasets.index.nlevels <= 2
        idx_names = ['eid', 'id'] if datasets.index.nlevels == 2 else ['id']
        datasets.index.set_names(idx_names, inplace=True)
    elif not isinstance(datasets, pd.DataFrame):
        # Cast set of dicts (i.e. from REST datasets endpoint)
        datasets = datasets2records(list(datasets))
    elif datasets.empty:
        return []
    else:
        datasets = datasets.copy()
    indices_to_download = []  # indices of datasets that need (re)downloading
    files = []  # file path list to return
    # If the session_path field is missing from the datasets table, fetch from sessions table
    # Typically only aggregate frames contain this column
    if 'session_path' not in datasets.columns:
        if 'eid' not in datasets.index.names:
            # Get slice of full frame with eid in index
            _dsets = self._cache['datasets'][
                self._cache['datasets'].index.get_level_values(1).isin(datasets.index)
            ]
            idx = _dsets.index.get_level_values(1)
        else:
            _dsets = datasets
            idx = pd.IndexSlice[:, _dsets.index.get_level_values(1)]
        # Ugly but works over unique sessions, which should be quicker
        session_path = (self._cache['sessions']
                        .loc[_dsets.index.get_level_values(0).unique()]
                        .apply(session_record2path, axis=1))
        datasets.loc[idx, 'session_path'] = \
            pd.Series(_dsets.index.get_level_values(0)).map(session_path).values
    # First go through datasets and check if file exists and hash matches
    for i, rec in datasets.iterrows():
        file = ALFPath(self.cache_dir, *rec[['session_path', 'rel_path']])
        if self.uuid_filenames:
            # Files on disk include the dataset UUID; i may be an (eid, id) tuple
            file = file.with_uuid(i[1] if isinstance(i, tuple) else i)
        if file.exists():
            # Check if there's a hash mismatch
            # If so, add this index to list of datasets that need downloading
            if rec['file_size'] and file.stat().st_size != rec['file_size']:
                _logger.warning('local file size mismatch on dataset: %s',
                                PurePosixPath(rec.session_path, rec.rel_path))
                indices_to_download.append(i)
            elif check_hash and rec['hash'] is not None:
                # Size matched (or unknown); fall back to an MD5 comparison
                if hashfile.md5(file) != rec['hash']:
                    _logger.warning('local md5 mismatch on dataset: %s',
                                    PurePosixPath(rec.session_path, rec.rel_path))
                    indices_to_download.append(i)
            files.append(file)  # File exists so add to file list
        else:
            # File doesn't exist so add None to output file list
            files.append(None)
            # Add this index to list of datasets that need downloading
            indices_to_download.append(i)
    # If online and we have datasets to download, call download_datasets with these datasets
    if not (offline or self.offline) and indices_to_download:
        dsets_to_download = datasets.loc[indices_to_download]
        # Returns list of local file paths and set to variable
        new_files = self._download_datasets(dsets_to_download, update_cache=update_exists)
        # Add each downloaded file to the output list of files
        for i, file in zip(indices_to_download, new_files):
            files[datasets.index.get_loc(i)] = file
    # NB: Currently if not offline and a remote file is missing, an exception will be raised
    # before we reach this point. This could change in the future.
    exists = list(map(bool, files))
    if not all(datasets['exists'] == exists):
        with warnings.catch_warnings():
            # Suppress future warning: exist column should always be present
            msg = '.*indexing on a MultiIndex with a nested sequence of labels.*'
            warnings.filterwarnings('ignore', message=msg)
            datasets['exists'] = exists
        if update_exists:
            _logger.debug('Updating exists field')
            i = datasets.index
            if i.nlevels == 1:
                # eid index level missing in datasets input
                i = pd.IndexSlice[:, i]
            self._cache['datasets'].loc[i, 'exists'] = exists
            self._cache['_meta']['modified_time'] = datetime.now()
    if self.record_loaded:
        # Track which dataset UUIDs were successfully resolved (see save_loaded_ids)
        loaded = np.fromiter(map(bool, files), bool)
        loaded_ids = datasets.index.get_level_values('id')[loaded].to_numpy()
        if '_loaded_datasets' not in self._cache:
            self._cache['_loaded_datasets'] = np.unique(loaded_ids)
        else:
            loaded_set = np.hstack([self._cache['_loaded_datasets'], loaded_ids])
            self._cache['_loaded_datasets'] = np.unique(loaded_set)
    # Return full list of file paths
    return files
@util.parse_id
def get_details(self, eid: Union[str, Path, UUID], full: bool = False):
    """Return session details for a given session ID.

    Parameters
    ----------
    eid : str, UUID, pathlib.Path, dict
        Experiment session identifier; may be a UUID, URL, experiment reference string
        details dict or Path.
    full : bool
        If True, returns a DataFrame of session and dataset info.

    Returns
    -------
    pd.Series, pd.DataFrame
        A session record or full DataFrame with dataset information if full is True.

    Raises
    ------
    one.alf.exceptions.ALFObjectNotFound
        No session with this eid in the sessions cache table.
    one.alf.exceptions.ALFMultipleObjectsFound
        More than one cache record matched the eid.
    """
    # Look up with a single-element list so a DataFrame (not Series) is returned
    try:
        records = self._cache['sessions'].loc[[eid]]
        assert len(records) == 1
    except KeyError:
        raise alferr.ALFObjectNotFound(eid)
    except AssertionError:
        raise alferr.ALFMultipleObjectsFound(f'Multiple sessions in cache for eid {eid}')
    if not full:
        return records.iloc[0]
    # Join every dataset row onto the session record
    return self._cache['datasets'].join(records, on='eid', how='right')
def list_subjects(self) -> List[str]:
    """List all subjects in database.

    Returns
    -------
    list
        Sorted list of subject names.
    """
    subjects = self._cache['sessions']['subject']
    unique_sorted = subjects.sort_values().unique()
    return unique_sorted.tolist()
def list_datasets(
        self, eid=None, filename=None, collection=None, revision=None, qc=QC.FAIL,
        ignore_qc_not_set=False, details=False, query_type=None, default_revisions_only=False,
        keep_eid_index=False
) -> Union[np.ndarray, pd.DataFrame]:
    """Given an eid, return the datasets for those sessions.

    If no eid is provided, a list of all datasets is returned. When details is false, an
    array of unique datasets is returned (their relative paths).

    Parameters
    ----------
    eid : str, UUID, pathlib.Path, dict
        Experiment session identifier; may be a UUID, URL, experiment reference string
        details dict or Path.
    filename : str, dict, list
        Filters datasets and returns only the ones matching the filename.
        Supports lists asterisks as wildcards. May be a dict of ALF parts.
    collection : str, list
        The collection to which the object belongs, e.g. 'alf/probe01'.
        This is the relative path of the file from the session root.
        Supports asterisks as wildcards.
    revision : str
        Filters datasets and returns only the ones matching the revision.
        Supports asterisks as wildcards.
    qc : str, int, one.alf.spec.QC
        Returns datasets at or below this QC level. Integer values should correspond to the QC
        enumeration NOT the qc category column codes in the pandas table.
    ignore_qc_not_set : bool
        When true, do not return datasets for which QC is NOT_SET.
    details : bool
        When true, a pandas DataFrame is returned, otherwise a numpy array of
        relative paths (collection/revision/filename) - see one.alf.spec.describe for details.
    query_type : str
        Query cache ('local') or Alyx database ('remote').
    default_revisions_only : bool
        When true, only matching datasets that are considered default revisions are returned.
        If no 'default_revision' column is present, and ALFError is raised.
    keep_eid_index : bool
        If details is true, this determines whether the returned data frame contains the eid
        in the index. When false (default) the returned data frame index is the dataset id
        only, otherwise the index is a MultIndex with levels (eid, id).

    Returns
    -------
    np.ndarray, pd.DataFrame
        Slice of datasets table or numpy array if details is False.

    Examples
    --------
    List all unique datasets in ONE cache

    >>> datasets = one.list_datasets()

    List all datasets for a given experiment

    >>> datasets = one.list_datasets(eid)

    List all datasets for an experiment that match a collection name

    >>> probe_datasets = one.list_datasets(eid, collection='*probe*')

    List datasets for an experiment that have 'wheel' in the filename

    >>> datasets = one.list_datasets(eid, filename='*wheel*')

    List datasets for an experiment that are part of a 'wheel' or 'trial(s)' object

    >>> datasets = one.list_datasets(eid, {'object': ['wheel', 'trial?']})
    """
    all_datasets = self._cache['datasets']
    if default_revisions_only:
        # Restricting to default revisions requires the optional default_revision column
        if 'default_revision' not in all_datasets.columns:
            raise alferr.ALFError('No default revisions specified')
        all_datasets = all_datasets[all_datasets['default_revision']]
    # Filter arguments shared by both the all-sessions and single-session branches
    common_filters = dict(
        collection=collection, filename=filename, wildcards=self.wildcards, revision=revision,
        revision_last_before=False, assert_unique=False, qc=qc,
        ignore_qc_not_set=ignore_qc_not_set)
    if not eid:
        # No session given: filter the entire datasets table
        filtered = util.filter_datasets(all_datasets, **common_filters)
        if details:
            return filtered.copy()
        return filtered['rel_path'].unique().tolist()
    eid = self.to_eid(eid)  # Ensure we have a UUID str list
    if not eid:
        return all_datasets.iloc[0:0]  # Return empty
    try:
        session_datasets = all_datasets.loc[(eid,), :]
    except KeyError:
        return all_datasets.iloc[0:0]  # Return empty
    filtered = util.filter_datasets(session_datasets, **common_filters)
    if not details:
        # Return only the relative path
        return filtered['rel_path'].sort_values().values.tolist()
    if keep_eid_index and filtered.index.nlevels == 1:
        # Reinstate eid index
        filtered = pd.concat({eid: filtered}, names=['eid'])
    # Return the full data frame
    return filtered
def list_collections(self, eid=None, filename=None, collection=None, revision=None,
                     details=False, query_type=None) -> Union[np.ndarray, dict]:
    """List the collections for a given experiment.

    If no experiment ID is given, all collections are returned.

    Parameters
    ----------
    eid : [str, UUID, Path, dict]
        Experiment session identifier; may be a UUID, URL, experiment reference string
        details dict or Path
    filename : str, dict, list
        Filters datasets and returns only the collections containing matching datasets.
        Supports lists asterisks as wildcards. May be a dict of ALF parts.
    collection : str, list
        Filter by a given pattern. Supports asterisks as wildcards.
    revision : str
        Filters collections and returns only the ones with the matching revision.
        Supports asterisks as wildcards
    details : bool
        If true a dict of pandas datasets tables is returned with collections as keys,
        otherwise a numpy array of unique collections
    query_type : str
        Query cache ('local') or Alyx database ('remote')

    Returns
    -------
    list, dict
        A list of unique collections or dict of datasets tables

    Examples
    --------
    List all unique collections in ONE cache

    >>> collections = one.list_collections()

    List all collections for a given experiment

    >>> collections = one.list_collections(eid)

    List all collections for a given experiment and revision

    >>> revised = one.list_collections(eid, revision='2020-01-01')

    List all collections that have 'probe' in the name.

    >>> collections = one.list_collections(eid, collection='*probe*')

    List collections for an experiment that have datasets with 'wheel' in the name

    >>> collections = one.list_collections(eid, filename='*wheel*')

    List collections for an experiment that contain numpy datasets

    >>> collections = one.list_collections(eid, {'extension': 'npy'})
    """
    datasets = self.list_datasets(
        eid=eid, collection=collection, filename=filename,
        revision=revision, query_type=query_type, details=True).copy()

    def _collection_of(rel_path):
        # First ALF part of the relative path is the collection ('' when session root)
        return alfiles.rel_path_parts(rel_path, assert_valid=False)[0] or ''

    datasets['collection'] = datasets.rel_path.apply(_collection_of)
    if not details:
        return datasets['collection'].unique().tolist()
    return {name: frame.drop('collection', axis=1)
            for name, frame in datasets.groupby('collection')}
def list_revisions(self, eid=None, filename=None, collection=None, revision=None,
                   details=False, query_type=None):
    """List the revisions for a given experiment.

    If no experiment id is given, all collections are returned.

    Parameters
    ----------
    eid : str, UUID, Path, dict
        Experiment session identifier; may be a UUID, URL, experiment reference string
        details dict or Path.
    filename : str, dict, list
        Filters datasets and returns only the revisions containing matching datasets.
        Supports lists asterisks as wildcards. May be a dict of ALF parts.
    collection : str, list
        Filter by a given collection. Supports asterisks as wildcards.
    revision : str, list
        Filter by a given pattern. Supports asterisks as wildcards.
    details : bool
        If true a dict of pandas datasets tables is returned with collections as keys,
        otherwise a numpy array of unique collections.
    query_type : str
        Query cache ('local') or Alyx database ('remote').

    Returns
    -------
    list, dict
        A list of unique collections or dict of datasets tables.

    Examples
    --------
    List all revisions in ONE cache

    >>> revisions = one.list_revisions()

    List all revisions for a given experiment

    >>> revisions = one.list_revisions(eid)

    List all revisions for a given experiment that contain the trials object

    >>> revisions = one.list_revisions(eid, filename={'object': 'trials'})

    List all revisions for a given experiment that start with 2020 or 2021

    >>> revisions = one.list_revisions(eid, revision=['202[01]*'])
    """
    datasets = self.list_datasets(eid=eid, details=True, query_type=query_type).copy()
    # Call filter util ourselves with the revision_last_before set to False
    datasets = util.filter_datasets(
        datasets, collection=collection, filename=filename, revision=revision,
        revision_last_before=False, wildcards=self.wildcards, assert_unique=False)

    def _revision_of(rel_path):
        # Second ALF part of the relative path is the '#'-delimited revision folder
        return (alfiles.rel_path_parts(rel_path, assert_valid=False)[1] or '').strip('#')

    datasets['revision'] = datasets.rel_path.apply(_revision_of)
    if not details:
        return datasets['revision'].unique().tolist()
    return {name: frame.drop('revision', axis=1)
            for name, frame in datasets.groupby('revision')}
@util.parse_id
def load_object(self,
eid: Union[str, Path, UUID],
obj: str,
collection: Optional[str] = None,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
check_hash: bool = True,
**kwargs) -> Union[alfio.AlfBunch, List[ALFPath]]:
"""Load all attributes of an ALF object from a Session ID and an object name.
Any datasets with matching object name will be loaded.
Parameters
----------
eid : str, UUID, pathlib.Path, dict
Experiment session identifier; may be a UUID, URL, experiment reference string
details dict or Path.
obj : str
The ALF object to load. Supports asterisks as wildcards.
collection : str
The collection to which the object belongs, e.g. 'alf/probe01'.
This is the relative path of the file from the session root.
Supports asterisks as wildcards.