Skip to content

Commit 6808455

Browse files
authored
Merge pull request #451 from Blosc/b2tree
New EmbedStore, DictStore and TreeStore classes
2 parents 9b1d23f + 0d2ee80 commit 6808455

26 files changed

+4520
-10
lines changed

bench/large-embed-store.py

Lines changed: 123 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,123 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
import os
9+
import time
10+
import numpy as np
11+
import blosc2
12+
from blosc2 import EmbedStore
13+
from memory_profiler import memory_usage
14+
15+
def make_arrays(n, min_size, max_size, dtype="f8"):
    """Build ``n`` blosc2 test arrays whose lengths grow linearly.

    The lengths (element counts) are ``n`` evenly spaced values between
    ``min_size`` and ``max_size`` cast to ``int``.  Every array is created
    with ``blosc2.linspace(0, 1, length, dtype=dtype)``.

    The combined uncompressed size (sum of each array's ``nbytes``) is
    printed in GB.

    Returns:
        A tuple ``(arrays, sizes, uncompressed_size)``.
    """
    sizes = np.linspace(min_size, max_size, n).astype(int)

    # Other generators the original code keeps commented out as alternatives:
    #   blosc2.arange(size, dtype=dtype)
    #   np.random.randint(0, 100, size=size, dtype=dtype)
    arrays = []
    uncompressed_size = 0
    for length in sizes:
        block = blosc2.linspace(0, 1, length, dtype=dtype)
        arrays.append(block)
        # Accumulate the uncompressed footprint while building the list.
        uncompressed_size += block.nbytes

    size_in_gb = uncompressed_size / 1e9
    print(f"Uncompressed data size: {size_in_gb:.2f} GB")
    return arrays, sizes, uncompressed_size
24+
25+
def get_file_size(filepath):
    """Return the size of ``filepath`` in MB (bytes / 2**20), or 0 if the path does not exist."""
    if not os.path.exists(filepath):
        return 0
    size_in_bytes = os.path.getsize(filepath)
    return size_in_bytes / 2**20
30+
31+
def check_arrays(tree_path, arrays, prefix="node"):
    """Verify that every array in ``arrays`` matches its stored counterpart.

    Opens the EmbedStore at ``tree_path`` in read mode and, for each index
    ``i``, reads the full contents of the entry ``/{prefix}{i}`` and compares
    them with ``arrays[i]`` using ``np.allclose``.

    Raises:
        ValueError: on the first entry whose values are not all close.
    """
    print("Checking stored arrays...")
    store = EmbedStore(urlpath=tree_path, mode="r")
    for index, expected in enumerate(arrays):
        label = f"{prefix}{index}"
        # ``[:]`` materialises the whole stored entry before comparing.
        actual = store[f"/{label}"][:]
        if np.allclose(expected, actual):
            continue
        raise ValueError(f"Array mismatch at {label}")
38+
39+
def run_embed_tree(arrays, sizes, tree_path, uncompressed_size, check=False):
    """Benchmark storing ``arrays`` directly inside a single EmbedStore file.

    Args:
        arrays: Arrays to store; entry ``i`` is written under ``/node{i}``.
        sizes: Unused here; kept so the signature stays parallel to
            ``run_external_tree``.
        tree_path: Path of the EmbedStore file, opened with ``mode="w"``.
        uncompressed_size: Total uncompressed size in bytes, used for the
            reported compression ratio.
        check: When true, re-read the store afterwards via ``check_arrays``.

    Returns:
        A tuple ``(elapsed_seconds, peak_mem, file_size)`` where ``peak_mem``
        is ``max - min`` of the memory samples and ``file_size`` is the value
        returned by ``get_file_size(tree_path)``.
    """
    def embed_process():
        tree = EmbedStore(urlpath=tree_path, mode="w")
        for i, arr in enumerate(arrays):
            tree[f"/node{i}"] = arr
        return tree

    # perf_counter() is monotonic and high-resolution, so the elapsed time
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # memory_usage runs embed_process and samples memory every 0.1 s.
    mem_usage = memory_usage((embed_process, ()), interval=0.1)
    elapsed = time.perf_counter() - t0

    # Report memory growth over the run rather than the absolute footprint.
    peak_mem = max(mem_usage) - min(mem_usage)
    file_size = get_file_size(tree_path)
    # file_size is in units of 2**20 bytes; convert back to bytes for the ratio.
    compression_ratio = uncompressed_size / (file_size * 2**20) if file_size > 0 else 0
    print(f"[Embed] Time: {elapsed:.2f}s, Memory: {peak_mem:.2f} MB, File size: {file_size:.2f} MB, Compression: {compression_ratio:.1f}x")

    if check:
        check_arrays(tree_path, arrays, prefix="node")

    return elapsed, peak_mem, file_size
58+
59+
def run_external_tree(arrays, sizes, tree_path, arr_prefix, uncompressed_size, check=False):
    """Benchmark an EmbedStore whose entries are arrays saved in their own files.

    Each array ``i`` is first written to ``{arr_prefix}_node{i}.b2nd`` with
    ``blosc2.asarray(..., urlpath=..., mode="w")`` and the resulting on-disk
    array is then assigned to ``/node{i}`` in the store (presumably kept as
    an external reference -- confirm against the EmbedStore documentation).

    Args:
        arrays: Arrays to store.
        sizes: Unused here; kept so the signature stays parallel to
            ``run_embed_tree``.
        tree_path: Path of the EmbedStore file, opened with ``mode="w"``.
        arr_prefix: Filename prefix for the per-array ``.b2nd`` files.
        uncompressed_size: Total uncompressed size in bytes, used for the
            reported compression ratio.
        check: When true, re-read the store afterwards via ``check_arrays``.

    Returns:
        A tuple ``(elapsed_seconds, peak_mem, file_size, total_external_size)``
        with both sizes as returned by ``get_file_size``.
    """
    def external_process():
        tree = EmbedStore(urlpath=tree_path, mode="w")
        for i, arr in enumerate(arrays):
            arr_path = f"{arr_prefix}_node{i}.b2nd"
            arr_b2 = blosc2.asarray(arr, urlpath=arr_path, mode="w")
            tree[f"/node{i}"] = arr_b2
        return tree

    # perf_counter() is monotonic and high-resolution, so the elapsed time
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # memory_usage runs external_process and samples memory every 0.1 s.
    mem_usage = memory_usage((external_process, ()), interval=0.1)
    elapsed = time.perf_counter() - t0

    # Report memory growth over the run rather than the absolute footprint.
    peak_mem = max(mem_usage) - min(mem_usage)
    file_size = get_file_size(tree_path)
    total_external_size = sum(get_file_size(f"{arr_prefix}_node{i}.b2nd") for i in range(len(arrays)))
    total_size_mb = (file_size + total_external_size)
    # Sizes are in units of 2**20 bytes; convert back to bytes for the ratio.
    compression_ratio = uncompressed_size / (total_size_mb * 2**20) if total_size_mb > 0 else 0
    print(f"[External] Time: {elapsed:.2f}s, Memory: {peak_mem:.2f} MB, EmbedStore file size: {file_size:.2f} MB, External files size: {total_external_size:.2f} MB, Total: {total_size_mb:.2f} MB, Compression: {compression_ratio:.1f}x")

    if check:
        check_arrays(tree_path, arrays, prefix="node")

    return elapsed, peak_mem, file_size, total_external_size
82+
83+
def cleanup_files(tree_path, arr_prefix, n):
    """Delete the store file and the ``n`` per-array ``.b2nd`` files, if present.

    Removes ``tree_path`` and then ``{arr_prefix}_node{i}.b2nd`` for every
    ``i`` in ``range(n)``; paths that do not exist are skipped.
    """
    def _discard(path):
        # Only remove what is actually there so repeated calls are harmless.
        if os.path.exists(path):
            os.remove(path)

    _discard(tree_path)
    for index in range(n):
        _discard(f"{arr_prefix}_node{index}.b2nd")
90+
91+
if __name__ == "__main__":
    # Benchmark driver: build the test arrays once, store them both ways,
    # then print a side-by-side summary with embed/external ratios.
    N = 10
    # Sizes are element counts, not bytes: they are passed to make_arrays,
    # whose default dtype "f8" uses 8 bytes per element.
    min_size = int(1e6)  # 1 million elements
    max_size = int(1e8)  # 100 million elements
    print(f"Creating {N} arrays with sizes ranging from {min_size / 1e6:.2f} to {max_size / 1e6:.2f} million elements...")
    arrays, sizes, uncompressed_size = make_arrays(N, min_size, max_size)

    print("Benchmarking EmbedStore with embed arrays...")
    tree_path_embed = "large_embed_store.b2e"
    t_embed, mem_embed, file_size_embed = run_embed_tree(arrays, sizes, tree_path_embed, uncompressed_size)

    print("Benchmarking EmbedStore with external arrays...")
    tree_path_external = "large_embed_store_external.b2e"
    arr_prefix = "large_external"
    t_external, mem_external, file_size_external, external_size = (
        run_external_tree(arrays, sizes, tree_path_external, arr_prefix, uncompressed_size))

    print("\nSummary:")
    print(f"Embed arrays: Time = {t_embed:.2f}s, Memory = {mem_embed:.2f} MB, File size = {file_size_embed:.2f} MB")
    print(f"External arrays: Time = {t_external:.2f}s, Memory = {mem_external:.2f} MB,"
          f" File size = {file_size_external:.2f} MB, External files size = {external_size:.2f} MB")

    speedup = t_embed / t_external if t_external > 0 else float('inf')
    mem_ratio = mem_embed / mem_external if mem_external > 0 else float('inf')
    # Store file vs store file only (external array files not counted).
    file_ratio = file_size_embed / file_size_external if file_size_external > 0 else float('inf')
    # Embedded store vs everything the external variant writes to disk:
    # its store file plus all the per-array .b2nd files.
    total_external = file_size_external + external_size
    storage_ratio = file_size_embed / total_external if total_external > 0 else float('inf')
    print(f"Time ratio (embed/external): {speedup:.2f}x")
    print(f"Memory ratio (embed/external): {mem_ratio:.2f}x")
    print(f"File size ratio (embed/external tree): {file_ratio:.2f}x")
    print(f"Storage efficiency (embed vs total external): {storage_ratio:.2f}x")

    # Cleanup is left disabled so the generated files can be inspected after
    # a run; uncomment to delete them.
    # cleanup_files(tree_path_embed, arr_prefix, N)
    # cleanup_files(tree_path_external, arr_prefix, N)

0 commit comments

Comments
 (0)