|
| 1 | +####################################################################### |
| 2 | +# Copyright (c) 2019-present, Blosc Development Team <[email protected]> |
| 3 | +# All rights reserved. |
| 4 | +# |
| 5 | +# This source code is licensed under a BSD-style license (found in the |
| 6 | +# LICENSE file in the root directory of this source tree) |
| 7 | +####################################################################### |
| 8 | +import os |
| 9 | +import time |
| 10 | +import numpy as np |
| 11 | +import blosc2 |
| 12 | +from blosc2 import EmbedStore |
| 13 | +from memory_profiler import memory_usage |
| 14 | + |
def make_arrays(n, min_size, max_size, dtype="f8"):
    """Create ``n`` blosc2 arrays whose lengths are evenly spaced.

    Args:
        n: Number of arrays to create.
        min_size: Number of elements of the smallest array.
        max_size: Number of elements of the largest array.
        dtype: Data type passed on to ``blosc2.linspace``.

    Returns:
        A tuple ``(arrays, sizes, uncompressed_size)`` with the list of
        arrays, the integer NumPy array of their lengths, and the sum of
        the ``nbytes`` of all arrays.
    """
    sizes = np.linspace(min_size, max_size, n).astype(int)

    # Data comes from blosc2.linspace(0, 1, size); other generators that
    # were tried here: blosc2.arange(size, dtype=dtype) and
    # np.random.randint(0, 100, size=size, dtype=dtype).
    arrays = []
    uncompressed_size = 0
    for size in sizes:
        array = blosc2.linspace(0, 1, size, dtype=dtype)
        arrays.append(array)
        # Accumulate the uncompressed footprint as the arrays are built.
        uncompressed_size += array.nbytes

    print(f"Uncompressed data size: {uncompressed_size / 1e9:.2f} GB")
    return arrays, sizes, uncompressed_size
| 24 | + |
def get_file_size(filepath):
    """Return the size of ``filepath`` in MiB (bytes divided by 2**20).

    Args:
        filepath: Path of the file to measure.

    Returns:
        The size as a float, or ``0.0`` when the path cannot be stat'ed
        (for example because it does not exist).
    """
    # Ask for the size directly instead of checking existence first: this
    # avoids the window in which the file could vanish between two calls.
    try:
        return os.path.getsize(filepath) / 2**20
    except OSError:
        return 0.0
| 30 | + |
def check_arrays(tree_path, arrays, prefix="node"):
    """Compare every array in ``arrays`` with its copy stored in the tree.

    Args:
        tree_path: Path of the EmbedStore file, opened here in read mode.
        arrays: Reference arrays; the i-th one is compared with the node
            stored under ``/{prefix}{i}``.
        prefix: Name prefix of the stored nodes.

    Raises:
        ValueError: If a stored array is not ``np.allclose`` to its
            reference array.
    """
    print("Checking stored arrays...")
    store = EmbedStore(urlpath=tree_path, mode="r")
    for index, expected in enumerate(arrays):
        node_name = f"{prefix}{index}"
        # Slice with [:] to read the stored node's full contents.
        actual = store[f"/{node_name}"][:]
        if np.allclose(expected, actual):
            continue
        raise ValueError(f"Array mismatch at {node_name}")
| 38 | + |
def run_embed_tree(arrays, sizes, tree_path, uncompressed_size, check=False):
    """Benchmark storing ``arrays`` directly inside an EmbedStore file.

    Args:
        arrays: Arrays to store, one node (``/node{i}``) per array.
        sizes: Unused here; kept so the signature parallels
            ``run_external_tree``.
        tree_path: Path of the EmbedStore file, created in write mode.
        uncompressed_size: Total uncompressed size of ``arrays`` in bytes,
            used to compute the compression ratio.
        check: When true, re-open the store and verify its contents.

    Returns:
        A tuple ``(elapsed_seconds, peak_mem, file_size)`` where
        ``peak_mem`` is the spread (max - min) of the memory samples and
        ``file_size`` is what ``get_file_size`` reports for ``tree_path``.
    """
    def embed_process():
        tree = EmbedStore(urlpath=tree_path, mode="w")
        for i, arr in enumerate(arrays):
            tree[f"/node{i}"] = arr
        return tree

    # perf_counter is monotonic and high resolution, so the measured
    # interval cannot be distorted by system clock adjustments.
    t0 = time.perf_counter()
    mem_usage = memory_usage((embed_process, ()), interval=0.1)
    elapsed = time.perf_counter() - t0

    # Report the growth over the run rather than the absolute level.
    peak_mem = max(mem_usage) - min(mem_usage)
    file_size = get_file_size(tree_path)
    # file_size is in units of 2**20 bytes; convert back to bytes for the ratio.
    compression_ratio = uncompressed_size / (file_size * 2**20) if file_size > 0 else 0
    print(f"[Embed] Time: {elapsed:.2f}s, Memory: {peak_mem:.2f} MB, File size: {file_size:.2f} MB, Compression: {compression_ratio:.1f}x")

    if check:
        check_arrays(tree_path, arrays, prefix="node")

    return elapsed, peak_mem, file_size
| 58 | + |
def run_external_tree(arrays, sizes, tree_path, arr_prefix, uncompressed_size, check=False):
    """Benchmark an EmbedStore whose nodes are arrays saved in their own files.

    Each array is first written to ``{arr_prefix}_node{i}.b2nd`` and the
    resulting on-disk array is then assigned to ``/node{i}`` in the store.

    Args:
        arrays: Arrays to store.
        sizes: Unused here; kept so the signature parallels
            ``run_embed_tree``.
        tree_path: Path of the EmbedStore file, created in write mode.
        arr_prefix: Path prefix of the per-array ``.b2nd`` files.
        uncompressed_size: Total uncompressed size of ``arrays`` in bytes,
            used to compute the compression ratio.
        check: When true, re-open the store and verify its contents.

    Returns:
        A tuple ``(elapsed_seconds, peak_mem, file_size,
        total_external_size)`` where ``peak_mem`` is the spread
        (max - min) of the memory samples, ``file_size`` is the size of the
        store file and ``total_external_size`` the summed size of the
        per-array files, both as reported by ``get_file_size``.
    """
    def external_process():
        tree = EmbedStore(urlpath=tree_path, mode="w")
        for i, arr in enumerate(arrays):
            arr_path = f"{arr_prefix}_node{i}.b2nd"
            arr_b2 = blosc2.asarray(arr, urlpath=arr_path, mode="w")
            tree[f"/node{i}"] = arr_b2
        return tree

    # perf_counter is monotonic and high resolution, so the measured
    # interval cannot be distorted by system clock adjustments.
    t0 = time.perf_counter()
    mem_usage = memory_usage((external_process, ()), interval=0.1)
    elapsed = time.perf_counter() - t0

    # Report the growth over the run rather than the absolute level.
    peak_mem = max(mem_usage) - min(mem_usage)
    file_size = get_file_size(tree_path)
    total_external_size = sum(get_file_size(f"{arr_prefix}_node{i}.b2nd") for i in range(len(arrays)))
    # The real storage cost is the store file plus every external array file.
    total_size_mb = file_size + total_external_size
    compression_ratio = uncompressed_size / (total_size_mb * 2**20) if total_size_mb > 0 else 0
    print(f"[External] Time: {elapsed:.2f}s, Memory: {peak_mem:.2f} MB, EmbedStore file size: {file_size:.2f} MB, External files size: {total_external_size:.2f} MB, Total: {total_size_mb:.2f} MB, Compression: {compression_ratio:.1f}x")

    if check:
        check_arrays(tree_path, arrays, prefix="node")

    return elapsed, peak_mem, file_size, total_external_size
| 82 | + |
def cleanup_files(tree_path, arr_prefix, n):
    """Delete the store file and its ``n`` external array files, if present.

    Args:
        tree_path: Path of the EmbedStore file to delete.
        arr_prefix: Path prefix of the external files, which are named
            ``{arr_prefix}_node{i}.b2nd`` for ``i`` in ``range(n)``.
        n: Number of external array files to look for.
    """
    paths = [tree_path]
    paths.extend(f"{arr_prefix}_node{i}.b2nd" for i in range(n))
    for path in paths:
        # Remove directly and tolerate a missing file: checking existence
        # first leaves a window in which the file could disappear.
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
| 90 | + |
if __name__ == "__main__":
    # Benchmark driver: build the test arrays once, store them both ways,
    # then print a side-by-side summary.
    N = 10
    # Sizes are element counts, not bytes: with the default "f8" dtype of
    # make_arrays every element takes 8 bytes.
    min_size = int(1e6)  # 1 M elements
    max_size = int(1e8)  # 100 M elements
    print(f"Creating {N} arrays with sizes ranging from {min_size / 1e6:.2f} to {max_size / 1e6:.2f} M elements...")
    arrays, sizes, uncompressed_size = make_arrays(N, min_size, max_size)

    print("Benchmarking EmbedStore with embed arrays...")
    tree_path_embed = "large_embed_store.b2e"
    t_embed, mem_embed, file_size_embed = run_embed_tree(arrays, sizes, tree_path_embed, uncompressed_size)

    print("Benchmarking EmbedStore with external arrays...")
    tree_path_external = "large_embed_store_external.b2e"
    arr_prefix = "large_external"
    t_external, mem_external, file_size_external, external_size = (
        run_external_tree(arrays, sizes, tree_path_external, arr_prefix, uncompressed_size))

    print("\nSummary:")
    print(f"Embed arrays: Time = {t_embed:.2f}s, Memory = {mem_embed:.2f} MB, File size = {file_size_embed:.2f} MB")
    print(f"External arrays: Time = {t_external:.2f}s, Memory = {mem_external:.2f} MB,"
          f" File size = {file_size_external:.2f} MB, External files size = {external_size:.2f} MB")

    # Every ratio falls back to infinity when its denominator is zero.
    speedup = t_embed / t_external if t_external > 0 else float('inf')
    mem_ratio = mem_embed / mem_external if mem_external > 0 else float('inf')
    file_ratio = file_size_embed / file_size_external if file_size_external > 0 else float('inf')
    # The external variant's true footprint is the store file plus all the
    # per-array files, so compare the embedded file against that total.
    total_external = file_size_external + external_size
    storage_ratio = file_size_embed / total_external if total_external > 0 else float('inf')
    print(f"Time ratio (embed/external): {speedup:.2f}x")
    print(f"Memory ratio (embed/external): {mem_ratio:.2f}x")
    print(f"File size ratio (embed/external tree): {file_ratio:.2f}x")
    print(f"Storage efficiency (embed vs total external): {storage_ratio:.2f}x")

    # Cleanup is disabled so the generated files can be inspected after the
    # run; uncomment to delete them.
    # cleanup_files(tree_path_embed, arr_prefix, N)
    # cleanup_files(tree_path_external, arr_prefix, N)
0 commit comments