Add hex-dump and MDIO output reproducer

BrianMichell · BrianMichell · commit 3bb5fe981f88 · 2025-09-19T16:25:03.000Z
diff --git a/disaster_recovery_analysis/bootstrap.sh b/disaster_recovery_analysis/bootstrap.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Bootstrap: install Rust/Blosc if missing, build zarr-hexdump, optionally run it on one path.
+
+set -e  # Exit on any error
+
+# Reject a bad invocation before anything is installed or built.
+if [ $# -gt 1 ]; then
+    echo "Error: Too many arguments provided." >&2
+    echo "Usage: $0 [zarr_array_path]" >&2
+    exit 1
+fi
+
+echo "=== Zarr Hexdump Bootstrap Script ==="
+echo
+
+# Install Rust through rustup when rustc is not on PATH.
+if ! command -v rustc &> /dev/null; then
+    echo "Rust is not installed. Installing Rust..."
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+    source "$HOME/.cargo/env"
+    echo "✓ Rust installed successfully"
+fi
+echo "✓ Rust is installed ($(rustc --version))"
+
+# Make sure cargo is on PATH in this shell as well.
+if ! command -v cargo &> /dev/null; then
+    echo "Sourcing Rust environment..."
+    source "$HOME/.cargo/env"
+fi
+echo "✓ Cargo is available ($(cargo --version))"
+echo
+
+# Install the Blosc development package when pkg-config cannot find it.
+echo "Checking for blosc library..."
+if ! pkg-config --exists blosc; then
+    echo "Installing blosc library..."
+    sudo apt-get update
+    sudo apt-get install -y libblosc-dev pkg-config
+    echo "✓ Blosc library installed"
+else
+    echo "✓ Blosc library already available"
+fi
+echo
+
+# Build the project
+echo "Building the Zarr hexdump tool..."
+if cargo build --release; then
+    echo "✓ Build successful!"
+else
+    echo "✗ Build failed!" >&2
+    exit 1
+fi
+echo
+echo "=== Usage ==="
+echo "To hexdump a Zarr array, run:"
+echo "  ./target/release/zarr-hexdump /path/to/your/zarr/array"
+echo
+echo "Or use cargo run:"
+echo "  cargo run --release -- /path/to/your/zarr/array"
+echo
+
+# Run the tool when a path was given; otherwise just report the finished build.
+if [ $# -eq 1 ]; then
+    ZARR_PATH="$1"
+    echo "=== Running hexdump on provided path: $ZARR_PATH ==="
+    echo
+    if [ -e "$ZARR_PATH" ]; then
+        ./target/release/zarr-hexdump "$ZARR_PATH"
+    else
+        echo "Error: Path '$ZARR_PATH' does not exist!" >&2
+        exit 1
+    fi
+else
+    echo "No Zarr array path provided. Build completed successfully."
+    echo "Use the commands above to run the hexdump tool."
+fi
diff --git a/disaster_recovery_analysis/ingest_both_teapots.py b/disaster_recovery_analysis/ingest_both_teapots.py
@@ -0,0 +1,43 @@
+if __name__ == "__main__":
+    import mdio
+    from segy.standards import get_segy_standard
+    from segy.schema import HeaderField, Endianness
+    import os
+    from mdio.builder.template_registry import TemplateRegistry
+
+    import logging
+
+    logging.getLogger("segy").setLevel(logging.DEBUG)
+
+    os.environ["MDIO__IMPORT__CLOUD_NATIVE"] = "true"
+    os.environ["MDIO__IMPORT__CPU_COUNT"] = "16"
+    os.environ["MDIO__DO_RAW_HEADERS"] = "1"
+
+    custom_headers = [
+        HeaderField(name="inline", byte=181, format="int32"),
+        HeaderField(name="crossline", byte=185, format="int32"),
+        HeaderField(name="cdp_x", byte=81, format="int32"),
+        HeaderField(name="cdp_y", byte=85, format="int32"),
+    ]
+
+    big_endian_spec = get_segy_standard(0)
+    big_endian_spec.endianness = Endianness.BIG
+    little_endian_spec = get_segy_standard(0)
+    little_endian_spec.endianness = Endianness.LITTLE
+    big_endian_spec = big_endian_spec.customize(trace_header_fields=custom_headers)
+    little_endian_spec = little_endian_spec.customize(trace_header_fields=custom_headers)
+
+    mdio.segy_to_mdio(
+        segy_spec=big_endian_spec,
+        mdio_template=TemplateRegistry().get("PostStack3DTime"),
+        input_path="filt_mig_IEEE_BigEndian_Rev1.sgy",
+        output_path="filt_mig_IEEE_BigEndian_Rev1.mdio",
+        overwrite=True,
+    )
+    mdio.segy_to_mdio(
+        segy_spec=little_endian_spec,
+        mdio_template=TemplateRegistry().get("PostStack3DTime"),
+        input_path="filt_mig_IEEE_LittleEndian_Rev1.sgy",
+        output_path="filt_mig_IEEE_LittleEndian_Rev1.mdio",
+        overwrite=True,
+    )
diff --git a/disaster_recovery_analysis/src/main.rs b/disaster_recovery_analysis/src/main.rs
@@ -0,0 +1,200 @@
+use std::env;
+use std::fs;
+use std::path::Path;
+use walkdir::WalkDir;
+/// Decompresses one Blosc frame into a newly allocated `Vec<u8>`.
+///
+/// # Errors
+/// Returns `Err` when the input is shorter than the 16-byte Blosc header, when the header
+/// reports zero decompressed bytes or more compressed bytes than were supplied, or when
+/// `blosc_decompress` returns a negative status.
+fn decompress_blosc(compressed_data: &[u8]) -> Result<Vec<u8>, String> {
+    // Every Blosc frame starts with a 16-byte header; the size query below reads it.
+    const BLOSC_HEADER_LEN: usize = 16;
+    if compressed_data.len() < BLOSC_HEADER_LEN {
+        return Err("Compressed data is shorter than a Blosc header".to_string());
+    }
+    let src = compressed_data.as_ptr() as *const std::ffi::c_void;
+
+    let (mut nbytes, mut cbytes, mut blocksize) = (0usize, 0usize, 0usize);
+    // SAFETY: `src` points at a complete header (checked above) and the three out-pointers
+    // are live, writable locals.
+    unsafe { blosc_sys::blosc_cbuffer_sizes(src, &mut nbytes, &mut cbytes, &mut blocksize) };
+    // A header claiming more compressed bytes than we hold would make Blosc read past `src`.
+    if nbytes == 0 || cbytes > compressed_data.len() {
+        return Err("Invalid compressed data".to_string());
+    }
+
+    let mut decompressed = vec![0u8; nbytes];
+    let dest = decompressed.as_mut_ptr() as *mut std::ffi::c_void;
+    // SAFETY: `dest` is writable for `nbytes` bytes, the size passed as the destination
+    // capacity, and the frame length in the header was bounds-checked above.
+    let result = unsafe { blosc_sys::blosc_decompress(src, dest, nbytes) };
+    if result < 0 {
+        return Err(format!("Blosc decompression failed with code: {}", result));
+    }
+
+    // Keep only the bytes Blosc reports having produced.
+    decompressed.truncate(result as usize);
+    Ok(decompressed)
+}
+
+/// Prints `data` as a hexdump under a `=== chunk_name ===` banner.
+///
+/// Each row shows an 8-digit hex address (`offset` plus the row's position in `data`),
+/// up to 16 bytes as two-digit hex in two groups of eight, and the same bytes as text
+/// between `|` bars, with anything outside printable ASCII (32..=126) shown as `.`.
+/// A short final row is space-padded so the text column stays aligned, and a blank line
+/// follows the last row.
+fn print_hexdump(data: &[u8], offset: usize, chunk_name: &str) {
+    // Bytes shown per output row.
+    const ROW_LEN: usize = 16;
+    println!("=== {} ===", chunk_name);
+
+    for (row_index, row) in data.chunks(ROW_LEN).enumerate() {
+        let mut line = format!("{:08x}  ", offset + row_index * ROW_LEN);
+
+        // Hex column: always ROW_LEN slots wide, blank where the row has run out of bytes.
+        for slot in 0..ROW_LEN {
+            if slot == ROW_LEN / 2 {
+                line.push(' '); // gap between the two groups of eight
+            }
+            match row.get(slot) {
+                Some(byte) => line.push_str(&format!("{:02x} ", byte)),
+                None => line.push_str("   "),
+            }
+        }
+
+        // Text column.
+        line.push_str(" |");
+        line.extend(row.iter().map(|&byte| match byte {
+            32..=126 => byte as char,
+            _ => '.',
+        }));
+        line.push('|');
+
+        println!("{}", line);
+    }
+
+    println!();
+}
+
+/// Hexdumps every chunk of the Zarr v3 array directory given on the command line.
+///
+/// Reads `<path>/zarr.json`, prints the shape, chunk shape and data type it declares, then
+/// collects the files under `c`, decompresses each with `decompress_blosc` and prints it with
+/// `print_hexdump`. A chunk that fails to decompress is dumped raw instead. The process
+/// exits with status 1 on a usage error or when the path or its `zarr.json` is missing.
+///
+/// # Errors
+/// Returns an error if `zarr.json` cannot be read or parsed, if it lacks the `shape` or
+/// `chunk_grid.configuration.chunk_shape` arrays, or if walking the directory fails.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let args: Vec<String> = env::args().collect();
+    if args.len() != 2 {
+        eprintln!("Usage: {} <zarr_array_path>", args[0]);
+        eprintln!("Example: {} /path/to/zarr/array", args[0]);
+        std::process::exit(1);
+    }
+
+    let zarr_path = Path::new(&args[1]);
+    if !zarr_path.exists() {
+        eprintln!("Error: Path '{}' does not exist", zarr_path.display());
+        std::process::exit(1);
+    }
+
+    println!("Reading Zarr array from: {}", zarr_path.display());
+    println!("========================================");
+
+    let zarr_json_path = zarr_path.join("zarr.json");
+    if !zarr_json_path.exists() {
+        eprintln!("Error: zarr.json not found in {}", zarr_path.display());
+        std::process::exit(1);
+    }
+    let metadata_content = fs::read_to_string(&zarr_json_path)?;
+    let metadata: serde_json::Value = serde_json::from_str(&metadata_content)?;
+
+    // Absent or mistyped fields become errors here instead of panicking on `unwrap`.
+    let shape = metadata["shape"].as_array().ok_or("zarr.json has no 'shape' array")?;
+    let chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
+        .as_array()
+        .ok_or("zarr.json has no 'chunk_grid.configuration.chunk_shape' array")?;
+    let length_bytes = metadata["data_type"]["configuration"].get("length_bytes");
+
+    println!("Array shape: {:?}", shape);
+    println!("Chunk shape: {:?}", chunk_shape);
+    println!("Data type: {}", metadata["data_type"]["name"]);
+    if let Some(length_bytes) = length_bytes {
+        println!("Length bytes: {}", length_bytes);
+    }
+    println!();
+
+    // Collect every regular file whose first path component below the array root is `c`.
+    let mut chunk_files = Vec::new();
+    for entry in WalkDir::new(zarr_path) {
+        let entry = entry?;
+        let path = entry.path();
+        let relative_path = path.strip_prefix(zarr_path)?;
+        // `Path::starts_with` matches whole components, independent of the separator character.
+        if path.is_file() && relative_path.starts_with("c") {
+            chunk_files.push((path.to_path_buf(), relative_path.to_string_lossy().into_owned()));
+        }
+    }
+
+    // Order by numeric chunk index so that e.g. `c/2/0` precedes `c/10/0`; name breaks ties.
+    chunk_files.sort_by_cached_key(|(_, name)| {
+        let parts = name.split(|c| c == '/' || c == '\\');
+        (parts.filter_map(|part| part.parse::<u64>().ok()).collect::<Vec<_>>(), name.clone())
+    });
+
+    println!("Found {} chunk files:", chunk_files.len());
+    for (_, chunk_name) in &chunk_files {
+        println!("  {}", chunk_name);
+    }
+    println!();
+
+    // `total_offset` is the address shown in the dumps and also advances over raw fallback
+    // dumps; `decompressed_total` counts only bytes that were actually decompressed.
+    let mut total_offset = 0;
+    let mut decompressed_total = 0;
+    for (chunk_path, chunk_name) in chunk_files {
+        let compressed_data = match fs::read(&chunk_path) {
+            Ok(bytes) => bytes,
+            Err(e) => {
+                eprintln!("Error reading chunk {}: {}", chunk_name, e);
+                continue;
+            }
+        };
+        if compressed_data.is_empty() {
+            println!("=== {} ===", chunk_name);
+            println!("(empty chunk)");
+            println!();
+            continue;
+        }
+        println!("Compressed size: {} bytes", compressed_data.len());
+        match decompress_blosc(&compressed_data) {
+            Ok(decompressed_data) => {
+                println!("Decompressed size: {} bytes", decompressed_data.len());
+                print_hexdump(&decompressed_data, total_offset, &chunk_name);
+                total_offset += decompressed_data.len();
+                decompressed_total += decompressed_data.len();
+            }
+            Err(e) => {
+                eprintln!("Error decompressing chunk {}: {}", chunk_name, e);
+                println!("Showing raw compressed data instead:");
+                print_hexdump(&compressed_data, total_offset, &format!("{} (compressed)", chunk_name));
+                total_offset += compressed_data.len();
+            }
+        }
+    }
+
+    println!("Total decompressed bytes processed: {}", decompressed_total);
+    println!();
+    println!("Note: This shows the decompressed array data as it would appear in memory.");
+    if let Some(length_bytes) = length_bytes {
+        let type_name = metadata["data_type"]["name"].as_str().unwrap_or("unknown");
+        println!("Each element is {0} bytes ({1} with length_bytes: {0}).", length_bytes, type_name);
+    }
+
+    Ok(())
+}