Skip to content

Commit 5926e8a

Browse files
author
Luke Shaw
committed
Merge branch 'main' into fancyIndex
2 parents b0f6ecd + 3f2a7fc commit 5926e8a

32 files changed

+4548
-25
lines changed

ANNOUNCE.rst

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1-
Announcing Python-Blosc2 3.6.1
1+
Announcing Python-Blosc2 3.7.0
22
==============================
33

44
In this release:
55

6-
✅ Blosc2 now suppports fancy indexing (and orthogonal indexing)
7-
✅ Added fast path for 1D fancy indexing
8-
✅ More complex slicing is now supported for lazy expressions
9-
✅ Blosc2 indexing more consistent with NumPy
10-
✅ Comprehensive ``squeeze`` function which squeezes only specified dimensions
11-
✅ Correctly point to most recent C-blosc2 version 2.19.1
12-
13-
We have blogged about the new fancy indexing support:
14-
https://www.blosc.org/posts/blosc2-fancy-indexing/
6+
✅ Overhaul of documentation (API reference and Tutorials)
7+
✅ Improvements to lazy expression indexing and in particular much more efficient
8+
memory usage when applying non-unit steps
9+
✅ Extended functionality of ``expand_dims`` to match that of NumPy
10+
✅ 3(!) new data storage classes (``EmbedStore``, ``DictStore`` and ``TreeStore``)
11+
which allow for the efficient storage of heterogeneous array data
12+
13+
See `here <https://github.com/Blosc/python-blosc2/pull/451#issuecomment-3178828765>`__
14+
for plots for the new data storage classes. And
15+
`here <https://github.com/Blosc/python-blosc2/pull/446#issuecomment-3167060686>`__ for the improved performance
16+
of lazy expression slicing.
1517

1618
You can think of Python-Blosc2 3.x as an extension of NumPy/numexpr that:
1719

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ else()
5050
include(FetchContent)
5151
FetchContent_Declare(blosc2
5252
GIT_REPOSITORY https://github.com/Blosc/c-blosc2
53-
GIT_TAG 2d34ddb0b4832b58d68eae042da00c05a76b72fb # v2.19.2dev0
53+
GIT_TAG d75993535461aaf2ded996f0a625cbec8df9655c # v2.20.0
5454
)
5555
FetchContent_MakeAvailable(blosc2)
5656
include_directories("${blosc2_SOURCE_DIR}/include")

RELEASE_NOTES.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
# Release notes
2-
## Changes from 3.6.1 to 3.6.2
2+
## Changes from 3.7.0 to 3.7.1
33

44
XXX version-specific blurb XXX
55

6+
## Changes from 3.6.1 to 3.7.0
7+
8+
* Overhaul of documentation (API reference and Tutorials)
9+
10+
* Improvements to lazy expression indexing and in particular much more efficient memory usage when applying non-unit steps (PR #446).
11+
12+
* Extended functionality of ``expand_dims`` to match that of NumPy (note that this breaks the previous API) (PR #453).
13+
14+
* The biggest change is in the form of three new data storage classes (``EmbedStore``, ``DictStore`` and ``TreeStore``) which allow for the efficient storage of heterogeneous array data (PR #451). ``EmbedStore`` is essentially an ``SChunk`` wrapper which can be stored on-disk or in-memory; ``DictStore`` allows for mixed storage across memory, disk or indeed remote; and ``TreeStore`` is a hierarchically-formatted version of ``DictStore`` which mimics the HDF5 file format. Write, access and storage performance are all very competitive with other packages - see [plots here](https://github.com/Blosc/python-blosc2/pull/451#issuecomment-3178828765).
15+
616
## Changes from 3.6.0 to 3.6.1
717

818
* C-Blosc2 internal library updated to latest 2.19.1.

bench/large-embed-store.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
import os
9+
import time
10+
import numpy as np
11+
import blosc2
12+
from blosc2 import EmbedStore
13+
from memory_profiler import memory_usage
14+
15+
def make_arrays(n, min_size, max_size, dtype="f8"):
    """Create ``n`` one-dimensional blosc2 test arrays of increasing length.

    The lengths (element counts) are spaced evenly between ``min_size`` and
    ``max_size`` and truncated to integers.  Each array is filled with
    ``blosc2.linspace(0, 1, length)`` using ``dtype``.

    Returns a tuple ``(arrays, sizes, uncompressed_size)`` where
    ``uncompressed_size`` is the sum of the arrays' ``nbytes``.  The total is
    also printed in GB (10**9 bytes).
    """
    sizes = np.linspace(min_size, max_size, n).astype(int)

    # Other generators (e.g. blosc2.arange or np.random.randint) can be
    # swapped in here to benchmark different data.
    arrays = []
    for size in sizes:
        arrays.append(blosc2.linspace(0, 1, size, dtype=dtype))

    # Total number of bytes the arrays would occupy uncompressed.
    uncompressed_size = 0
    for arr in arrays:
        uncompressed_size += arr.nbytes

    print(f"Uncompressed data size: {uncompressed_size / 1e9:.2f} GB")
    return arrays, sizes, uncompressed_size
24+
25+
def get_file_size(filepath):
    """Return the size of ``filepath`` in MiB (bytes / 2**20).

    Returns 0 when the path does not exist or cannot be stat'ed, so callers
    can sum the results without special-casing missing files.
    """
    # Ask for the size directly instead of checking existence first: this
    # avoids a second stat call and the window in which the file could
    # disappear between the check and the read.
    try:
        return os.path.getsize(filepath) / 2**20
    except (OSError, ValueError):
        # Same outcomes for which os.path.exists() would have reported False.
        return 0
30+
31+
def check_arrays(tree_path, arrays, prefix="node"):
    """Verify that every array in ``arrays`` can be read back from the store.

    Opens the ``EmbedStore`` at ``tree_path`` in read mode and compares the
    entry stored under ``/{prefix}{i}`` with ``arrays[i]`` using
    ``np.allclose``.

    Raises ``ValueError`` on the first entry that does not match.
    """
    print("Checking stored arrays...")
    store = EmbedStore(urlpath=tree_path, mode="r")
    for i, expected in enumerate(arrays):
        # [:] reads the whole stored entry for the comparison.
        actual = store[f"/{prefix}{i}"][:]
        if np.allclose(expected, actual):
            continue
        raise ValueError(f"Array mismatch at {prefix}{i}")
38+
39+
def run_embed_tree(arrays, sizes, tree_path, uncompressed_size, check=False):
    """Benchmark writing ``arrays`` directly into an ``EmbedStore``.

    A store is created at ``tree_path`` in write mode and each array is
    assigned to the key ``/node{i}``.  The write is run through
    ``memory_usage`` while wall-clock time is measured around it.

    ``sizes`` is accepted for signature symmetry but is not used here.
    ``uncompressed_size`` (bytes) is used to derive the compression ratio.
    When ``check`` is true the stored arrays are read back and verified.

    Returns ``(elapsed_seconds, memory_delta, file_size)`` where
    ``memory_delta`` is the max minus the min of the samples returned by
    ``memory_usage`` and ``file_size`` is what ``get_file_size`` reports.
    """
    def embed_process():
        store = EmbedStore(urlpath=tree_path, mode="w")
        for i, arr in enumerate(arrays):
            store[f"/node{i}"] = arr
        return store

    t0 = time.time()
    mem_usage = memory_usage((embed_process, ()), interval=0.1)
    t1 = time.time()

    # Spread between the highest and lowest memory sample during the write.
    peak_mem = max(mem_usage) - min(mem_usage)
    file_size = get_file_size(tree_path)

    # file_size is in units of 2**20 bytes; convert back to bytes for the ratio.
    if file_size > 0:
        compression_ratio = uncompressed_size / (file_size * 2**20)
    else:
        compression_ratio = 0
    print(f"[Embed] Time: {t1-t0:.2f}s, Memory: {peak_mem:.2f} MB, File size: {file_size:.2f} MB, Compression: {compression_ratio:.1f}x")

    if check:
        check_arrays(tree_path, arrays, prefix="node")

    return t1-t0, peak_mem, file_size
58+
59+
def run_external_tree(arrays, sizes, tree_path, arr_prefix, uncompressed_size, check=False):
    """Benchmark an ``EmbedStore`` whose entries are arrays stored in their own files.

    Each array is first written with ``blosc2.asarray`` to
    ``{arr_prefix}_node{i}.b2nd`` and the resulting object is then assigned to
    the key ``/node{i}`` of the store created at ``tree_path``.

    ``sizes`` is accepted for signature symmetry but is not used here.
    ``uncompressed_size`` (bytes) is used to derive the compression ratio.
    When ``check`` is true the stored arrays are read back and verified.

    Returns ``(elapsed_seconds, memory_delta, file_size, total_external_size)``
    where the two sizes are what ``get_file_size`` reports for the store file
    and for the sum of the per-array files respectively.
    """
    def external_process():
        store = EmbedStore(urlpath=tree_path, mode="w")
        for i, arr in enumerate(arrays):
            arr_path = f"{arr_prefix}_node{i}.b2nd"
            store[f"/node{i}"] = blosc2.asarray(arr, urlpath=arr_path, mode="w")
        return store

    t0 = time.time()
    mem_usage = memory_usage((external_process, ()), interval=0.1)
    t1 = time.time()

    # Spread between the highest and lowest memory sample during the write.
    peak_mem = max(mem_usage) - min(mem_usage)
    file_size = get_file_size(tree_path)

    # Add up the sizes of the per-array files written next to the store.
    total_external_size = 0
    for i, _ in enumerate(arrays):
        total_external_size += get_file_size(f"{arr_prefix}_node{i}.b2nd")

    total_size_mb = (file_size + total_external_size)
    if total_size_mb > 0:
        compression_ratio = uncompressed_size / (total_size_mb * 2**20)
    else:
        compression_ratio = 0
    print(f"[External] Time: {t1-t0:.2f}s, Memory: {peak_mem:.2f} MB, EmbedStore file size: {file_size:.2f} MB, External files size: {total_external_size:.2f} MB, Total: {total_size_mb:.2f} MB, Compression: {compression_ratio:.1f}x")

    if check:
        check_arrays(tree_path, arrays, prefix="node")

    return t1-t0, peak_mem, file_size, total_external_size
82+
83+
def cleanup_files(tree_path, arr_prefix, n):
    """Delete the benchmark output files, ignoring any that are already gone.

    Removes ``tree_path`` and the ``n`` per-array files named
    ``{arr_prefix}_node{i}.b2nd`` for ``i`` in ``range(n)``.  Other errors
    from ``os.remove`` (e.g. permission problems) are propagated.
    """
    paths = [tree_path]
    paths.extend(f"{arr_prefix}_node{i}.b2nd" for i in range(n))
    for path in paths:
        # Attempt the removal directly rather than checking existence first,
        # so a file vanishing between check and delete cannot cause a crash.
        try:
            os.remove(path)
        except FileNotFoundError:
            # A missing file is the desired end state for a cleanup.
            pass
90+
91+
if __name__ == "__main__":
    # Benchmark driver: write the same arrays into an EmbedStore twice, once
    # embedded in the store file and once as separate per-array files, then
    # compare time, memory and on-disk size.
    N = 10
    # Sizes are element counts, not bytes; make_arrays defaults to dtype "f8".
    min_size = int(1e6)  # 1 million elements
    max_size = int(1e8)  # 100 million elements
    print(f"Creating {N} arrays with sizes ranging from {min_size / 1e6:.2f} to {max_size / 1e6:.2f} million elements...")
    arrays, sizes, uncompressed_size = make_arrays(N, min_size, max_size)

    print("Benchmarking EmbedStore with embed arrays...")
    tree_path_embed = "large_embed_store.b2e"
    t_embed, mem_embed, file_size_embed = run_embed_tree(arrays, sizes, tree_path_embed, uncompressed_size)

    print("Benchmarking EmbedStore with external arrays...")
    tree_path_external = "large_embed_store_external.b2e"
    arr_prefix = "large_external"
    t_external, mem_external, file_size_external, external_size = (
        run_external_tree(arrays, sizes, tree_path_external, arr_prefix, uncompressed_size))

    print("\nSummary:")
    print(f"Embed arrays: Time = {t_embed:.2f}s, Memory = {mem_embed:.2f} MB, File size = {file_size_embed:.2f} MB")
    print(f"External arrays: Time = {t_external:.2f}s, Memory = {mem_external:.2f} MB,"
          f" File size = {file_size_external:.2f} MB, External files size = {external_size:.2f} MB")

    speedup = t_embed / t_external if t_external > 0 else float('inf')
    mem_ratio = mem_embed / mem_external if mem_external > 0 else float('inf')
    file_ratio = file_size_embed / file_size_external if file_size_external > 0 else float('inf')
    # "Total external" storage is the store file plus the per-array files;
    # dividing by the store file alone would just repeat file_ratio.
    total_external = file_size_external + external_size
    storage_ratio = file_size_embed / total_external if total_external > 0 else float('inf')
    print(f"Time ratio (embed/external): {speedup:.2f}x")
    print(f"Memory ratio (embed/external): {mem_ratio:.2f}x")
    print(f"File size ratio (embed/external tree): {file_ratio:.2f}x")
    print(f"Storage efficiency (embed vs total external): {storage_ratio:.2f}x")

    # Uncomment to delete the files produced by the benchmark.
    # cleanup_files(tree_path_embed, arr_prefix, N)
    # cleanup_files(tree_path_external, arr_prefix, N)

0 commit comments

Comments
 (0)