Blosc
diff --git a/‎ANNOUNCE.rst‎
Lines changed: 12 additions & 10 deletions b/‎ANNOUNCE.rst‎
Lines changed: 12 additions & 10 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎RELEASE_NOTES.md‎
Lines changed: 11 additions & 1 deletion b/‎RELEASE_NOTES.md‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎bench/large-embed-store.py‎
Lines changed: 123 additions & 0 deletions b/‎bench/large-embed-store.py‎
Lines changed: 123 additions & 0 deletions
@@ -1,17 +1,19 @@
-Announcing Python-Blosc2 3.6.1
+Announcing Python-Blosc2 3.7.0
 ==============================
 
 In this release:
 
-✅ Blosc2 now suppports fancy indexing (and orthogonal indexing)
-✅ Added fast path for 1D fancy indexing
-✅ More complex slicing is now supported for lazy expressions
-✅ Blosc2 indexing more consistent with NumPy
-✅ Comprehensive ``squeeze`` function which squeezes only specified dimensions
-✅ Correctly point to most recent C-blosc2 version 2.19.1
-
-We have blogged about the new fancy indexing support:
-https://www.blosc.org/posts/blosc2-fancy-indexing/
+✅ Overhaul of documentation (API reference and Tutorials)
+✅ Improvements to lazy expression indexing and in particular much more efficient
+memory usage when applying non-unit steps
+✅ Extended functionality of ``expand_dims`` to match that of NumPy
+✅ 3(!) new data storage classes (``EmbedStore``, ``DictStore`` and ``TreeStore``)
+which allow for the efficient storage of heterogeneous array data
+
+See `here <https://github.com/Blosc/python-blosc2/pull/451#issuecomment-3178828765>`__
+for plots for the new data storage classes. And
+`here <https://github.com/Blosc/python-blosc2/pull/446#issuecomment-3167060686>`__ for the improved performance
+of lazy expression slicing.
 
 You can think of Python-Blosc2 3.x as an extension of NumPy/numexpr that:
 
 
@@ -50,7 +50,7 @@ else()
     include(FetchContent)
     FetchContent_Declare(blosc2
         GIT_REPOSITORY https://github.com/Blosc/c-blosc2
-        GIT_TAG 2d34ddb0b4832b58d68eae042da00c05a76b72fb  # v2.19.2dev0
+        GIT_TAG d75993535461aaf2ded996f0a625cbec8df9655c  # v2.20.0
     )
     FetchContent_MakeAvailable(blosc2)
     include_directories("${blosc2_SOURCE_DIR}/include")
 
@@ -1,8 +1,18 @@
 # Release notes
-## Changes from 3.6.1 to 3.6.2
+## Changes from 3.7.0 to 3.7.1
 
 XXX version-specific blurb XXX
 
+## Changes from 3.6.1 to 3.7.0
+
+* Overhaul of documentation (API reference and Tutorials)
+
+* Improvements to lazy expression indexing and in particular much more efficient memory usage when applying non-unit steps (PR #446).
+
+* Extended functionality of ``expand_dims`` to match that of NumPy (note that this breaks the previous API) (PR #453).
+
+* The biggest change is in the form of three new data storage classes (``EmbedStore``, ``DictStore`` and ``TreeStore``) which allow for the efficient storage of heterogeneous array data (PR #451). ``EmbedStore`` is essentially an ``SChunk`` wrapper which can be stored on-disk or in-memory; ``DictStore`` allows for mixed storage across memory, disk or indeed remote; and ``TreeStore`` is a hierarchically-formatted version of ``DictStore`` which mimics the HDF5 file format. Write, access and storage performance are all very competitive with other packages - see [plots here](https://github.com/Blosc/python-blosc2/pull/451#issuecomment-3178828765).
+
 ## Changes from 3.6.0 to 3.6.1
 
 * C-Blosc2 internal library updated to latest 2.19.1.
 
@@ -0,0 +1,123 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <[email protected]>
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+import os
+import time
+import numpy as np
+import blosc2
+from blosc2 import EmbedStore
+from memory_profiler import memory_usage
+
+def make_arrays(n, min_size, max_size, dtype="f8"):
+    """Build ``n`` blosc2 arrays whose lengths are evenly spaced.
+
+    Lengths are taken from ``np.linspace(min_size, max_size, n)`` cast to
+    int; each array is ``blosc2.linspace(0, 1, length, dtype=dtype)``.
+    Prints the combined ``nbytes`` of all arrays (in GB) and returns
+    ``(arrays, sizes, uncompressed_size)``.
+    """
+    sizes = np.linspace(min_size, max_size, n).astype(int)
+    # Alternative data sources kept for experimentation:
+    #   blosc2.arange(size, dtype=dtype)
+    #   np.random.randint(0, 100, size=size, dtype=dtype)
+    arrays = []
+    uncompressed_size = 0
+    for length in sizes:
+        arr = blosc2.linspace(0, 1, length, dtype=dtype)
+        arrays.append(arr)
+        uncompressed_size += arr.nbytes
+    print(f"Uncompressed data size: {uncompressed_size / 1e9:.2f} GB")
+    return arrays, sizes, uncompressed_size
+
+def get_file_size(filepath):
+    """Return the size of ``filepath`` in MB (bytes / 2**20), or 0 if the path does not exist."""
+    if not os.path.exists(filepath):
+        return 0
+    size_in_bytes = os.path.getsize(filepath)
+    return size_in_bytes / 2**20
+
+def check_arrays(tree_path, arrays, prefix="node"):
+    """Compare each source array with its counterpart stored under ``tree_path``.
+
+    Opens the EmbedStore read-only, reads the entry ``/{prefix}{i}`` for the
+    i-th array in full, and raises ``ValueError`` on the first entry that is
+    not ``np.allclose`` to the source array.
+    """
+    print("Checking stored arrays...")
+    store = EmbedStore(urlpath=tree_path, mode="r")
+    for idx, expected in enumerate(arrays):
+        actual = store[f"/{prefix}{idx}"][:]
+        if np.allclose(expected, actual):
+            continue
+        raise ValueError(f"Array mismatch at {prefix}{idx}")
+
+def run_embed_tree(arrays, sizes, tree_path, uncompressed_size, check=False):
+    """Benchmark writing ``arrays`` directly into an EmbedStore at ``tree_path``.
+
+    The write runs under ``memory_usage`` (sampled every 0.1 s) and is timed
+    with ``time.time``. Prints a one-line report and returns
+    ``(elapsed_seconds, memory_delta, file_size)``, where the memory delta is
+    the max minus the min sample and the file size comes from
+    ``get_file_size``. ``sizes`` is accepted but not used here. When ``check``
+    is true, the stored arrays are verified with ``check_arrays``.
+    """
+    def embed_process():
+        # Store every array under /node<i> in a freshly created store.
+        store = EmbedStore(urlpath=tree_path, mode="w")
+        for idx, arr in enumerate(arrays):
+            store[f"/node{idx}"] = arr
+        return store
+
+    t0 = time.time()
+    samples = memory_usage((embed_process, ()), interval=0.1)
+    t1 = time.time()
+    peak_mem = max(samples) - min(samples)
+    file_size = get_file_size(tree_path)
+    # file_size is in units of 2**20 bytes; convert back to bytes for the ratio.
+    if file_size > 0:
+        compression_ratio = uncompressed_size / (file_size * 2**20)
+    else:
+        compression_ratio = 0
+    print(f"[Embed] Time: {t1-t0:.2f}s, Memory: {peak_mem:.2f} MB, File size: {file_size:.2f} MB, Compression: {compression_ratio:.1f}x")
+
+    if check:
+        check_arrays(tree_path, arrays, prefix="node")
+
+    return t1-t0, peak_mem, file_size
+
+def run_external_tree(arrays, sizes, tree_path, arr_prefix, uncompressed_size, check=False):
+    """Benchmark an EmbedStore whose entries are arrays saved to their own files.
+
+    Each array is first written to ``{arr_prefix}_node{i}.b2nd`` via
+    ``blosc2.asarray`` and the resulting object is then assigned to
+    ``/node{i}`` in the store at ``tree_path``. The write runs under
+    ``memory_usage`` (sampled every 0.1 s) and is timed with ``time.time``.
+    Prints a one-line report and returns
+    ``(elapsed_seconds, memory_delta, store_file_size, external_files_size)``.
+    ``sizes`` is accepted but not used here. When ``check`` is true, the
+    stored arrays are verified with ``check_arrays``.
+    """
+    def external_process():
+        store = EmbedStore(urlpath=tree_path, mode="w")
+        for idx, arr in enumerate(arrays):
+            on_disk = blosc2.asarray(arr, urlpath=f"{arr_prefix}_node{idx}.b2nd", mode="w")
+            store[f"/node{idx}"] = on_disk
+        return store
+
+    t0 = time.time()
+    samples = memory_usage((external_process, ()), interval=0.1)
+    t1 = time.time()
+    peak_mem = max(samples) - min(samples)
+    file_size = get_file_size(tree_path)
+    # Add up the sizes of the per-array files written next to the store.
+    total_external_size = 0
+    for idx in range(len(arrays)):
+        total_external_size += get_file_size(f"{arr_prefix}_node{idx}.b2nd")
+    total_size_mb = (file_size + total_external_size)
+    if total_size_mb > 0:
+        compression_ratio = uncompressed_size / (total_size_mb * 2**20)
+    else:
+        compression_ratio = 0
+    print(f"[External] Time: {t1-t0:.2f}s, Memory: {peak_mem:.2f} MB, EmbedStore file size: {file_size:.2f} MB, External files size: {total_external_size:.2f} MB, Total: {total_size_mb:.2f} MB, Compression: {compression_ratio:.1f}x")
+
+    if check:
+        check_arrays(tree_path, arrays, prefix="node")
+
+    return t1-t0, peak_mem, file_size, total_external_size
+
+def cleanup_files(tree_path, arr_prefix, n):
+    """Delete ``tree_path`` and the files ``{arr_prefix}_node{i}.b2nd`` for i in 0..n-1.
+
+    Paths that do not exist are skipped silently.
+    """
+    def remove_if_present(path):
+        if os.path.exists(path):
+            os.remove(path)
+
+    remove_if_present(tree_path)
+    for idx in range(n):
+        remove_if_present(f"{arr_prefix}_node{idx}.b2nd")
+
+if __name__ == "__main__":
+    # Benchmark driver: store the same set of arrays in an EmbedStore twice,
+    # once embedded directly and once as separate .b2nd files, then compare
+    # time, memory and on-disk size of the two approaches.
+    N = 10
+    # Sizes are element counts (array lengths), not bytes; make_arrays
+    # defaults to dtype "f8", i.e. 8 bytes per element.
+    min_size = int(1e6)   # 1e6 elements (8 MB as f8)
+    max_size = int(1e8)   # 1e8 elements (800 MB as f8)
+    print(f"Creating {N} arrays with lengths ranging from {min_size / 1e6:.2f} to {max_size / 1e6:.2f} million elements...")
+    arrays, sizes, uncompressed_size = make_arrays(N, min_size, max_size)
+
+    print("Benchmarking EmbedStore with embed arrays...")
+    tree_path_embed = "large_embed_store.b2e"
+    t_embed, mem_embed, file_size_embed = run_embed_tree(arrays, sizes, tree_path_embed, uncompressed_size)
+
+    print("Benchmarking EmbedStore with external arrays...")
+    tree_path_external = "large_embed_store_external.b2e"
+    arr_prefix = "large_external"
+    t_external, mem_external, file_size_external, external_size = (
+        run_external_tree(arrays, sizes, tree_path_external, arr_prefix, uncompressed_size))
+
+    print("\nSummary:")
+    print(f"Embed arrays:   Time = {t_embed:.2f}s, Memory = {mem_embed:.2f} MB, File size = {file_size_embed:.2f} MB")
+    print(f"External arrays:   Time = {t_external:.2f}s, Memory = {mem_external:.2f} MB,"
+          f" File size = {file_size_external:.2f} MB, External files size = {external_size:.2f} MB")
+
+    # Total footprint of the external variant: the store file plus the
+    # per-array files it refers to.
+    total_external = file_size_external + external_size
+
+    # Each ratio falls back to inf when its denominator is zero.
+    speedup = t_embed / t_external if t_external > 0 else float('inf')
+    mem_ratio = mem_embed / mem_external if mem_external > 0 else float('inf')
+    file_ratio = file_size_embed / file_size_external if file_size_external > 0 else float('inf')
+    # Compare against the total external footprint, as the label below says,
+    # rather than against the store file alone (which is file_ratio).
+    storage_ratio = file_size_embed / total_external if total_external > 0 else float('inf')
+    print(f"Time ratio (embed/external): {speedup:.2f}x")
+    print(f"Memory ratio (embed/external): {mem_ratio:.2f}x")
+    print(f"File size ratio (embed/external tree): {file_ratio:.2f}x")
+    print(f"Storage efficiency (embed vs total external): {storage_ratio:.2f}x")
+
+    # Uncomment to delete the generated files after a run.
+    # cleanup_files(tree_path_embed, arr_prefix, N)
+    # cleanup_files(tree_path_external, arr_prefix, N)
Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ else()`
`50`	`50`	`include(FetchContent)`
`51`	`51`	`FetchContent_Declare(blosc2`
`52`	`52`	`GIT_REPOSITORY https://github.com/Blosc/c-blosc2`
`53`		`- GIT_TAG 2d34ddb0b4832b58d68eae042da00c05a76b72fb # v2.19.2dev0`
	`53`	`+ GIT_TAG d75993535461aaf2ded996f0a625cbec8df9655c # v2.20.0`
`54`	`54`	`)`
`55`	`55`	`FetchContent_MakeAvailable(blosc2)`
`56`	`56`	`include_directories("${blosc2_SOURCE_DIR}/include")`