-
Notifications
You must be signed in to change notification settings - Fork 17
Open
Description
We're using ndindex as part of versioned-hdf5. Unfortunately, in some common circumstances it can be pretty slow. Consider the following code:
>>> # THIS CELL WAS AUTO-GENERATED BY PYFLYBY
>>> import atexit
>>> import h5py
>>> import numpy as np
>>> import shutil
>>> import tempfile
>>> from versioned_hdf5 import VersionedHDF5File
>>> # END AUTO-GENERATED BLOCK
>>> d = tempfile.mkdtemp()
>>> atexit.register(shutil.rmtree, d)
[PYFLYBY] import atexit
[PYFLYBY] import shutil
[PYFLYBY] import tempfile
<function shutil.rmtree(path, ignore_errors=False, onerror=None, *, dir_fd=None)>
>>> def r0(d):
... with h5py.File(f'{d}/data.h5', mode="w") as f:
... vf = VersionedHDF5File(f)
... with vf.stage_version("r0") as sv:
... sv.create_dataset('values', data=np.arange(100_000_000), chunks=(1_000,), maxshape=(None,))
[PYFLYBY] from versioned_hdf5 import VersionedHDF5File
[PYFLYBY] import h5py
[PYFLYBY] import numpy as np
>>> def r1(d):
... with h5py.File(f'{d}/data.h5', mode="r+") as f:
... vf = VersionedHDF5File(f)
... with vf.stage_version("r1") as sv:
... values = sv['values']
... # resizing creates an InMemoryDataset and populates data_dict
... values.resize((110_000_000,))
... # reading from InMemoryDataset is now slow
... _ = values[500:90_000_000]
>>> %load_ext pyinstrument
>>> r0(d)
>>> %pyinstrument r1(d)
_ ._ __/__ _ _ _ _ _/_ Recorded: 14:20:41 Samples: 30357
/_//_/// /_\ / //_// / //_'/ // Duration: 42.410 CPU time: 41.642
/ _/ v4.6.1
Program: /usr/local/python/python-3.11/std/lib64/python3.11/site-packages/ipykernel_launcher.py -f /u/bessen/.local/share/jupyter/runtime/kernel-f2241356-7014-4cb5-99a3-98f8ade0d4d7.json
42.409 <module> ../../../tmp/ipykernel_3610251/2296172108.py:1
`- 42.407 r1 ../../../tmp/ipykernel_3610251/2453629480.py:1
|- 15.246 DatasetWrapper.__getitem__ versioned_hdf5/wrappers.py:1267
| `- 15.246 InMemoryDataset.__getitem__ versioned_hdf5/wrappers.py:757
| `- 15.246 InMemoryDataset.get_index versioned_hdf5/wrappers.py:655
| |- 9.144 [self] versioned_hdf5/wrappers.py
| |- 4.383 InMemoryDatasetID._read_chunk versioned_hdf5/wrappers.py:1385
| | `- 4.275 Dataset.__getitem__ h5py/_hl/dataset.py:749
| | |- 1.916 Reader.read <built-in>
| | |- 1.704 Dataset._fast_reader h5py/_hl/dataset.py:527
| | | `- 1.689 [self] h5py/_hl/dataset.py
| | `- 0.604 [self] h5py/_hl/dataset.py
| `- 0.878 where <__array_function__ internals>:177
| [2 frames hidden] <__array_function__ internals>, <buil...
|- 13.388 _GeneratorContextManager.__exit__ contextlib.py:141
| `- 13.388 VersionedHDF5File.stage_version versioned_hdf5/api.py:267
| `- 13.371 commit_version versioned_hdf5/versions.py:71
| |- 8.712 create_virtual_dataset versioned_hdf5/backend.py:429
| | |- 2.525 [self] versioned_hdf5/backend.py
| | |- 2.068 Group.create_virtual_dataset h5py/_hl/group.py:188
| | | `- 1.646 VirtualLayout.make_dataset h5py/_hl/vds.py:228
| | |- 1.932 select h5py/_hl/selections.py:19
| | | `- 1.452 [self] h5py/_hl/selections.py
| | |- 0.728 Dataset.shape h5py/_hl/dataset.py:467
| | | `- 0.699 [self] h5py/_hl/dataset.py
| | `- 0.530 <dictcomp> versioned_hdf5/backend.py:437
| | `- 0.525 [self] versioned_hdf5/backend.py
| |- 3.961 write_dataset_chunks versioned_hdf5/backend.py:346
| | |- 2.349 Hashtable.hash versioned_hdf5/hashtable.py:116
| | | `- 1.822 openssl_sha256 <built-in>
| | `- 0.659 Hashtable.__contains__ <frozen _collections_abc>:778
| | `- 0.609 Hashtable.__getitem__ versioned_hdf5/hashtable.py:205
| | `- 0.595 [self] versioned_hdf5/hashtable.py
| `- 0.688 [self] versioned_hdf5/versions.py
|- 9.595 InMemoryDataset.resize versioned_hdf5/wrappers.py:609
| |- 5.824 InMemoryDatasetID.data_dict versioned_hdf5/wrappers.py:1298
| | |- 2.915 <dictcomp> versioned_hdf5/wrappers.py:1327
| | | `- 2.680 spaceid_to_slice versioned_hdf5/slicetools.py:6
| | | `- 2.317 [self] versioned_hdf5/slicetools.py
| | |- 1.683 <listcomp> versioned_hdf5/wrappers.py:1317
| | | `- 1.627 [self] versioned_hdf5/wrappers.py
| | `- 0.830 [self] versioned_hdf5/wrappers.py
| |- 1.712 [self] versioned_hdf5/wrappers.py
| `- 1.710 InMemoryDataset.get_index versioned_hdf5/wrappers.py:655
| `- 1.710 InMemoryDataset.__getitem__ h5py/_hl/dataset.py:749
| `- 1.706 Reader.read <built-in>
|- 3.185 _GeneratorContextManager.__enter__ contextlib.py:132
| `- 3.185 VersionedHDF5File.stage_version versioned_hdf5/api.py:267
| `- 3.185 create_version_group versioned_hdf5/versions.py:22
| `- 3.184 Group.visititems h5py/_hl/group.py:635
| |- 2.613 proxy h5py/_hl/group.py:660
| | |- 1.673 _get versioned_hdf5/versions.py:57
| | | `- 1.275 InMemoryGroup.__setitem__ versioned_hdf5/wrappers.py:110
| | | `- 1.275 InMemoryGroup._add_to_data versioned_hdf5/wrappers.py:114
| | | `- 1.275 InMemoryDataset.__init__ versioned_hdf5/wrappers.py:503
| | | `- 0.592 KeysViewHDF5.__iter__ <frozen _collections_abc>:835
| | | `- 0.592 AttributeManager.__iter__ h5py/_hl/attrs.py:257
| | | `- 0.441 [self] h5py/_hl/attrs.py
| | `- 0.791 Group.__getitem__ h5py/_hl/group.py:348
| | `- 0.544 [self] h5py/_hl/group.py
| `- 0.571 [self] h5py/_hl/group.py
`- 0.985 File.__exit__ h5py/_hl/files.py:601
`- 0.985 File.close h5py/_hl/files.py:576
>>> %load_ext line_profiler
>>> from versioned_hdf5.wrappers import InMemoryDataset
>>> r0(d)
>>> %lprun -f InMemoryDataset.get_index.__wrapped__ r1(d)
Timer unit: 1e-09 s
Total time: 14.9595 s
File: /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/wrappers.py
Function: get_index at line 655
Line # Hits Time Per Hit % Time Line Contents
==============================================================
655 @with_phil
656 def get_index(
657 self,
658 args: Union[slice, Slice, Tuple, tuple, h5r.RegionReference],
659 new_dtype: Optional[str] = None,
660 can_read_direct: Optional[bool] = None,
661 ) -> np.ndarray:
662 """Read a slice from the HDF5 dataset given by the index.
663
664 Takes slices and recarray-style field names (more than one is
665 allowed!) in any order. Obeys basic NumPy rules, including
666 broadcasting.
667
668 Parameters
669 ----------
670 args : Union[slice, Slice, Tuple, tuple, h5r.RegionReference]
671 Index to read from the Dataset
672 new_dtype : Optional[str]
673 Dtype of the returned array
674 can_read_direct : Optional[bool]
675 True if we can read directly from the underlying hdf5 Dataset, False otherwise.
676 This should be the value of the InMemoryDatasetID instance's ``can_read_direct``
677 property for this Dataset.
678
679 If None, ``self.id.can_read_direct`` is evaluated first to determine if data can
680 be read directly from the underlying dataset.
681
682 Returns
683 -------
684 np.ndarray
685 Array containing data from this dataset from the requested index
686 """
687 # This boilerplate code is based on h5py.Dataset.__getitem__
688 2 8062.0 4031.0 0.0 args = args if isinstance(args, tuple) else (args,)
689
690 # Sort field names from the rest of the args.
691 2 10665.0 5332.5 0.0 names = tuple(x for x in args if isinstance(x, str))
692
693 2 2500.0 1250.0 0.0 if names:
694 # Read a subset of the fields in this structured dtype
695 if len(names) == 1:
696 names = names[0] # Read with simpler dtype of this field
697 args = tuple(x for x in args if not isinstance(x, str))
698 return self.fields(names, _prior_dtype=new_dtype)[args]
699
700 2 1745.0 872.5 0.0 if new_dtype is None:
701 2 20228.0 10114.0 0.0 new_dtype = self.dtype
702 2 73856.0 36928.0 0.0 mtype = h5t.py_create(new_dtype)
703
704 # === Special-case region references ====
705
706 2 5677.0 2838.5 0.0 if len(args) == 1 and isinstance(args[0], h5r.RegionReference):
707 obj = h5r.dereference(args[0], self.id)
708 if obj != self.id:
709 raise ValueError("Region reference must point to this dataset")
710
711 sid = h5r.get_region(args[0], self.id)
712 mshape = guess_shape(sid)
713 if mshape is None:
714 # 0D with no data (NULL or deselected SCALAR)
715 return Empty(new_dtype)
716 out = np.empty(mshape, dtype=new_dtype)
717 if out.size == 0:
718 return out
719
720 sid_out = h5s.create_simple(mshape)
721 sid_out.select_all()
722 self.id.read(sid_out, sid, out, mtype)
723 return out
724
725 # === END CODE FROM h5py.Dataset.__getitem__ ===
726
727 2 328533.0 164266.5 0.0 idx = ndindex(args).expand(self.shape)
728
729 2 1098.0 549.0 0.0 if can_read_direct is None:
730 1 48158088.0 5e+07 0.3 can_read_direct = self.id.can_read_direct
731
732 2 1628.0 814.0 0.0 if can_read_direct:
733 1 1411348097.0 1e+09 9.4 return super().__getitem__(idx.raw)
734
735 1 1542927.0 2e+06 0.0 arr = np.ndarray(idx.newshape(self.shape), new_dtype, order="C")
736
737 90001 994078595.0 11045.2 6.6 for chunk in self.chunks.as_subchunks(idx, self.shape):
738 90000 367273224.0 4080.8 2.5 if chunk not in self.id.data_dict:
739 self.id.data_dict[chunk] = np.broadcast_to(
740 self.fillvalue, chunk.newshape(self.shape)
741 )
742 90000 295761742.0 3286.2 2.0 elif isinstance(self.id.data_dict[chunk], (slice, Slice, tuple, Tuple)):
743 360000 439296532.0 1220.3 2.9 raw_idx = Tuple(
744 90000 226675754.0 2518.6 1.5 self.id.data_dict[chunk],
745 90000 109581410.0 1217.6 0.7 *[slice(0, len(i)) for i in chunk.args[1:]],
746 90000 99604868.0 1106.7 0.7 ).raw
747 90000 4136405229.0 45960.1 27.7 self.id.data_dict[chunk] = self.id._read_chunk(raw_idx)
748
749 90000 269383258.0 2993.1 1.8 if self.id.data_dict[chunk].size != 0:
750 90000 3196593424.0 35517.7 21.4 arr_idx = chunk.as_subindex(idx)
751 90000 2472868596.0 27476.3 16.5 index = idx.as_subindex(chunk)
752 90000 890518785.0 9894.7 6.0 arr[arr_idx.raw] = self.id.data_dict[chunk][index.raw]
753
754 # Return arr as a scalar if it is shape () (matching h5py)
755 1 1263.0 1263.0 0.0 return arr[()]
In particular, note that in InMemoryDataset.get_index we spend 9.144 seconds in "[self]" and 0.878 seconds in "where":
| `- 15.246 InMemoryDataset.get_index versioned_hdf5/wrappers.py:655
| |- 9.144 [self] versioned_hdf5/wrappers.py
| |- 4.383 InMemoryDatasetID._read_chunk versioned_hdf5/wrappers.py:1385
| | `- 4.275 Dataset.__getitem__ h5py/_hl/dataset.py:749
| | |- 1.916 Reader.read <built-in>
| | |- 1.704 Dataset._fast_reader h5py/_hl/dataset.py:527
| | | `- 1.689 [self] h5py/_hl/dataset.py
| | `- 0.604 [self] h5py/_hl/dataset.py
| `- 0.878 where <__array_function__ internals>:177
| [2 frames hidden] <__array_function__ internals>, <buil...
These timings come from ndindex but don't show up properly because it is compiled as a Cython extension. If I disable cythonization, you can see where it's spending its time (and it's also even slower):
>>> r0(d)
>>> %pyinstrument r1(d)
_ ._ __/__ _ _ _ _ _/_ Recorded: 14:24:32 Samples: 55759
/_//_/// /_\ / //_// / //_'/ // Duration: 66.820 CPU time: 66.438
/ _/ v4.6.1
Program: /usr/local/python/python-3.11/std/lib64/python3.11/site-packages/ipykernel_launcher.py -f /u/bessen/.local/share/jupyter/runtime/kernel-f2241356-7014-4cb5-99a3-98f8ade0d4d7.json
66.820 <module> ../../../tmp/ipykernel_3619864/2296172108.py:1
`- 66.817 r1 ../../../tmp/ipykernel_3619864/2453629480.py:1
|- 32.529 DatasetWrapper.__getitem__ versioned_hdf5/wrappers.py:1267
| `- 32.529 InMemoryDataset.__getitem__ versioned_hdf5/wrappers.py:757
| `- 32.529 InMemoryDataset.get_index versioned_hdf5/wrappers.py:655
| |- 19.605 Tuple.as_subindex ndindex/tuple.py:627
| | |- 8.551 Slice.as_subindex ndindex/slice.py:501
| | | |- 3.618 Slice.reduce ndindex/slice.py:212
| | | | |- 2.811 Slice.__init__ ndindex/ndindex.py:159
| | | | | `- 2.539 Slice._typecheck ndindex/slice.py:62
| | | | | `- 1.768 operator_index ndindex/ndindex.py:681
| | | | | `- 1.345 [self] ndindex/ndindex.py
| | | | `- 0.807 [self] ndindex/slice.py
| | | |- 2.226 subindex_slice ndindex/subindex_helpers.py:83
| | | | `- 1.114 where ndindex/subindex_helpers.py:46
| | | | `- 0.722 where <__array_function__ internals>:177
| | | |- 1.144 [self] ndindex/slice.py
| | | `- 0.997 Slice.__init__ ndindex/ndindex.py:159
| | | `- 0.911 Slice._typecheck ndindex/slice.py:62
| | |- 4.300 Tuple.reduce ndindex/tuple.py:185
| | | |- 1.656 [self] ndindex/tuple.py
| | | `- 1.513 Slice.reduce ndindex/slice.py:212
| | | `- 1.107 Slice.__init__ ndindex/ndindex.py:159
| | | `- 0.972 Slice._typecheck ndindex/slice.py:62
| | | `- 0.697 operator_index ndindex/ndindex.py:681
| | |- 3.529 Tuple.__init__ ndindex/ndindex.py:159
| | | `- 3.261 Tuple._typecheck ndindex/tuple.py:49
| | | `- 1.929 [self] ndindex/tuple.py
| | `- 1.650 [self] ndindex/tuple.py
| |- 4.108 InMemoryDatasetID._read_chunk versioned_hdf5/wrappers.py:1385
| | `- 4.002 Dataset.__getitem__ h5py/_hl/dataset.py:749
| | |- 1.817 Reader.read <built-in>
| | `- 1.655 Dataset._fast_reader h5py/_hl/dataset.py:527
| | `- 1.637 [self] h5py/_hl/dataset.py
| |- 2.265 ChunkSize.as_subchunks ndindex/chunking.py:143
| | `- 2.230 _indices ndindex/chunking.py:288
| | |- 0.909 <listcomp> ndindex/chunking.py:292
| | `- 0.871 Tuple.__init__ ndindex/ndindex.py:159
| | `- 0.801 Tuple._typecheck ndindex/tuple.py:49
| |- 2.179 [self] versioned_hdf5/wrappers.py
| |- 1.487 Tuple.__hash__ ndindex/tuple.py:114
| | `- 1.071 Slice.__hash__ ndindex/slice.py:85
| |- 1.057 Tuple.__eq__ ndindex/tuple.py:107
| `- 0.932 Tuple.__init__ ndindex/ndindex.py:159
| `- 0.860 Tuple._typecheck ndindex/tuple.py:49
|- 16.648 _GeneratorContextManager.__exit__ contextlib.py:141
| `- 16.648 VersionedHDF5File.stage_version versioned_hdf5/api.py:267
| `- 16.633 commit_version versioned_hdf5/versions.py:71
| |- 10.969 create_virtual_dataset versioned_hdf5/backend.py:429
| | |- 1.989 [self] versioned_hdf5/backend.py
| | |- 1.897 Group.create_virtual_dataset h5py/_hl/group.py:188
| | | `- 1.467 VirtualLayout.make_dataset h5py/_hl/vds.py:228
| | |- 1.794 select h5py/_hl/selections.py:19
| | | `- 1.361 [self] h5py/_hl/selections.py
| | |- 1.289 <dictcomp> versioned_hdf5/backend.py:437
| | | `- 0.964 Slice.reduce ndindex/slice.py:212
| | | `- 0.816 Slice.__init__ ndindex/ndindex.py:159
| | | `- 0.762 Slice._typecheck ndindex/slice.py:62
| | |- 1.111 Tuple.__init__ ndindex/ndindex.py:159
| | | `- 1.017 Tuple._typecheck ndindex/tuple.py:49
| | `- 0.846 Dataset.shape h5py/_hl/dataset.py:467
| | `- 0.799 [self] h5py/_hl/dataset.py
| `- 4.984 write_dataset_chunks versioned_hdf5/backend.py:346
| |- 2.393 Hashtable.hash versioned_hdf5/hashtable.py:116
| | `- 1.827 openssl_sha256 <built-in>
| `- 0.953 Hashtable.__contains__ <frozen _collections_abc>:778
| `- 0.903 Hashtable.__getitem__ versioned_hdf5/hashtable.py:205
| `- 0.673 Slice.__init__ ndindex/ndindex.py:159
|- 13.589 InMemoryDataset.resize versioned_hdf5/wrappers.py:609
| |- 7.899 InMemoryDatasetID.data_dict versioned_hdf5/wrappers.py:1298
| | |- 4.633 <dictcomp> versioned_hdf5/wrappers.py:1327
| | | `- 4.225 spaceid_to_slice versioned_hdf5/slicetools.py:6
| | | |- 1.730 Tuple.__init__ ndindex/ndindex.py:159
| | | | `- 1.607 Tuple._typecheck ndindex/tuple.py:49
| | | | `- 0.932 [self] ndindex/tuple.py
| | | |- 1.228 [self] versioned_hdf5/slicetools.py
| | | `- 1.193 hyperslab_to_slice versioned_hdf5/slicetools.py:33
| | | `- 1.112 Slice.__init__ ndindex/ndindex.py:159
| | | `- 0.682 Slice._typecheck ndindex/slice.py:62
| | `- 1.464 <listcomp> versioned_hdf5/wrappers.py:1317
| | `- 1.405 [self] versioned_hdf5/wrappers.py
| |- 2.367 ChunkSize.as_subchunks ndindex/chunking.py:143
| | `- 2.327 _indices ndindex/chunking.py:288
| | |- 1.155 <listcomp> ndindex/chunking.py:292
| | | `- 0.884 Slice.__init__ ndindex/ndindex.py:159
| | | `- 0.710 Slice._typecheck ndindex/slice.py:62
| | `- 0.856 Tuple.__init__ ndindex/ndindex.py:159
| | `- 0.788 Tuple._typecheck ndindex/tuple.py:49
| |- 1.429 InMemoryDataset.get_index versioned_hdf5/wrappers.py:655
| | `- 1.429 InMemoryDataset.__getitem__ h5py/_hl/dataset.py:749
| | `- 1.422 Reader.read <built-in>
| `- 0.899 [self] versioned_hdf5/wrappers.py
|- 3.085 _GeneratorContextManager.__enter__ contextlib.py:132
| `- 3.085 VersionedHDF5File.stage_version versioned_hdf5/api.py:267
| `- 3.085 create_version_group versioned_hdf5/versions.py:22
| `- 3.084 Group.visititems h5py/_hl/group.py:635
| `- 2.533 proxy h5py/_hl/group.py:660
| |- 1.618 _get versioned_hdf5/versions.py:57
| | `- 1.225 InMemoryGroup.__setitem__ versioned_hdf5/wrappers.py:110
| | `- 1.225 InMemoryGroup._add_to_data versioned_hdf5/wrappers.py:114
| | `- 1.225 InMemoryDataset.__init__ versioned_hdf5/wrappers.py:503
| `- 0.767 Group.__getitem__ h5py/_hl/group.py:348
`- 0.958 File.__exit__ h5py/_hl/files.py:601
`- 0.958 File.close h5py/_hl/files.py:576
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels