|
| 1 | +import argparse |
| 2 | +import asyncio |
| 3 | +import time |
| 4 | + |
| 5 | +import xarray as xr |
| 6 | +from multiformats import CID |
| 7 | + |
| 8 | +from .hamt import HAMT |
| 9 | +from .sharded_zarr_store import ShardedZarrStore |
| 10 | +from .store_httpx import KuboCAS |
| 11 | +from .zarr_hamt_store import ZarrHAMTStore |
| 12 | + |
| 13 | + |
async def convert_hamt_to_sharded(
    cas: KuboCAS, hamt_root_cid: str, chunks_per_shard: int
) -> str:
    """
    Convert a Zarr dataset from a HAMT-based store to a ShardedZarrStore.

    Only the CID pointers are copied key-by-key, so no chunk payloads are
    re-uploaded to the CAS.

    Args:
        cas: An initialized ContentAddressedStore instance (KuboCAS).
        hamt_root_cid: The root CID of the source ZarrHAMTStore.
        chunks_per_shard: The number of chunks to group into a single shard
            in the new store.

    Returns:
        The root CID of the newly created ShardedZarrStore.
    """
    print(f"--- Starting Conversion from HAMT Root {hamt_root_cid} ---")
    start_time = time.perf_counter()

    # 1. Open the source HAMT store for reading.
    print("Opening source HAMT store...")
    hamt_ro = await HAMT.build(
        cas=cas, root_node_id=hamt_root_cid, values_are_bytes=True, read_only=True
    )
    source_store = ZarrHAMTStore(hamt_ro, read_only=True)
    source_dataset = xr.open_zarr(store=source_store, consolidated=True)

    # 2. Introspect the source array to get its configuration.
    print("Reading metadata from source store...")

    # Read the store's metadata to get the array shape and chunk shape.
    # Use the first data variable's dimension order so both tuples line up
    # axis-for-axis.
    data_var_name = next(iter(source_dataset.data_vars))
    ordered_dims = list(source_dataset[data_var_name].dims)
    array_shape = tuple(source_dataset.sizes[dim] for dim in ordered_dims)
    # chunks[dim] holds the per-axis chunk lengths; entry 0 is the regular
    # chunk size (only the trailing chunk may be smaller).
    chunk_shape = tuple(source_dataset.chunks[dim][0] for dim in ordered_dims)

    # 3. Create the destination ShardedZarrStore for writing.
    print(
        f"Initializing new ShardedZarrStore with {chunks_per_shard} chunks per shard..."
    )
    dest_store = await ShardedZarrStore.open(
        cas=cas,
        read_only=False,
        array_shape=array_shape,
        chunk_shape=chunk_shape,
        chunks_per_shard=chunks_per_shard,
    )

    print("Destination store initialized.")

    # 4. Iterate and copy all data from source to destination.
    print("Starting data migration...")
    count = 0
    async for key in hamt_ro.keys():
        count += 1
        # Read the CID for this key (metadata or chunk) from the source.
        cid: CID = await hamt_ro.get_pointer(key)
        cid_base32_str = str(cid.encode("base32"))

        # Write the exact same key -> CID mapping to the destination.
        await dest_store.set_pointer(key, cid_base32_str)
        if count % 200 == 0:  # pragma: no cover
            print(f"Migrated {count} keys...")

    print(f"Migration of {count} total keys complete.")

    # 5. Finalize the new store by flushing it to the CAS.
    print("Flushing new store to get final root CID...")
    new_root_cid = await dest_store.flush()
    end_time = time.perf_counter()

    print("\n--- Conversion Complete! ---")
    print(f"Total time: {end_time - start_time:.2f} seconds")
    print(f"New ShardedZarrStore Root CID: {new_root_cid}")
    return new_root_cid
| 88 | + |
async def sharded_converter_cli():
    """Command-line entry point: convert a Zarr HAMT store to a sharded store.

    Parses the source root CID and connection options from the command line,
    then runs the conversion against the configured Kubo node.

    Raises:
        SystemExit: with status 1 (and the error message on stderr) if the
            conversion fails for any reason.
    """
    parser = argparse.ArgumentParser(
        description="Convert a Zarr HAMT store to a Sharded Zarr store."
    )
    parser.add_argument(
        "hamt_cid", type=str, help="The root CID of the source Zarr HAMT store."
    )
    parser.add_argument(
        "--chunks-per-shard",
        type=int,
        default=6250,
        help="Number of chunk CIDs to store per shard in the new store.",
    )
    parser.add_argument(
        "--rpc-url",
        type=str,
        default="http://127.0.0.1:5001",
        help="The URL of the IPFS Kubo RPC API.",
    )
    parser.add_argument(
        "--gateway-url",
        type=str,
        default="http://127.0.0.1:8080",
        help="The URL of the IPFS Gateway.",
    )
    args = parser.parse_args()

    # Initialize the KuboCAS client with the provided RPC and Gateway URLs.
    async with KuboCAS(
        rpc_base_url=args.rpc_url, gateway_base_url=args.gateway_url
    ) as cas_client:
        try:
            await convert_hamt_to_sharded(
                cas=cas_client,
                hamt_root_cid=args.hamt_cid,
                chunks_per_shard=args.chunks_per_shard,
            )
        except Exception as e:
            # Report the failure and exit non-zero. The previous version
            # printed to stdout and returned normally (exit code 0), which
            # made failures invisible to shell scripts. SystemExit with a
            # string message prints it to stderr and exits with status 1.
            raise SystemExit(f"\nAn error occurred: {e}") from e
| 128 | + |
# Script entry point: run the async CLI inside a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(sharded_converter_cli())  # pragma: no cover
0 commit comments