
Commit 18a83b8

label manager and parallel load test
1 parent a121d3e commit 18a83b8

File tree

2 files changed (+335, -0)

apps/test_erc20_parallel_load.py

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Real-world test: Load ERC20 transfers into Snowflake using parallel streaming.

Usage:
    python apps/test_erc20_parallel_load.py [--blocks BLOCKS] [--workers WORKERS]

Example:
    python apps/test_erc20_parallel_load.py --blocks 100000 --workers 8
"""

import argparse
import os
import time
from datetime import datetime

from amp.client import Client
from amp.streaming.parallel import ParallelConfig


def get_recent_block_range(client: Client, num_blocks: int = 100_000):
    """Query the amp server for the recent block range."""
    print(f'\n🔍 Detecting recent block range ({num_blocks:,} blocks)...')

    query = 'SELECT MAX(block_num) as max_block FROM eth_firehose.logs'
    result = client.get_sql(query, read_all=True)

    if result.num_rows == 0:
        raise RuntimeError('No data found in eth_firehose.logs')

    max_block = result.column('max_block')[0].as_py()
    if max_block is None:
        raise RuntimeError('No blocks found in eth_firehose.logs')

    min_block = max(0, max_block - num_blocks)

    print(f'✅ Block range: {min_block:,} to {max_block:,} ({max_block - min_block:,} blocks)')
    return min_block, max_block


def load_erc20_transfers(num_blocks: int = 100_000, num_workers: int = 8):
    """Load ERC20 transfers using parallel streaming."""

    # Initialize client
    server_url = os.getenv('AMP_SERVER_URL', 'grpc://34.27.238.174:80')
    client = Client(server_url)
    print(f'📡 Connected to amp server: {server_url}')

    # Get recent block range
    min_block, max_block = get_recent_block_range(client, num_blocks)

    # Generate unique table name
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    table_name = f'erc20_transfers_{timestamp}'
    print(f'\n📊 Target table: {table_name}')

    # ERC20 Transfer event signature
    transfer_sig = 'Transfer(address indexed from, address indexed to, uint256 value)'

    # ERC20 transfer query. The topic3 IS NULL filter excludes ERC721 Transfer
    # events, which share the same topic0 but index the token id as a third topic.
    erc20_query = f"""
    select
        pc.block_num,
        pc.block_hash,
        pc.timestamp,
        pc.tx_hash,
        pc.tx_index,
        pc.log_index,
        pc.dec['from'] as from_address,
        pc.dec['to'] as to_address,
        pc.dec['value'] as value
    from (
        select
            l.block_num,
            l.block_hash,
            l.tx_hash,
            l.tx_index,
            l.log_index,
            l.timestamp,
            evm_decode(l.topic1, l.topic2, l.topic3, l.data, '{transfer_sig}') as dec
        from eth_firehose.logs l
        where
            l.topic0 = evm_topic('{transfer_sig}') and
            l.topic3 IS NULL) pc
    """

    # Configure Snowflake connection
    snowflake_config = {
        'account': os.getenv('SNOWFLAKE_ACCOUNT'),
        'user': os.getenv('SNOWFLAKE_USER'),
        'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE'),
        'database': os.getenv('SNOWFLAKE_DATABASE'),
        'private_key': os.getenv('SNOWFLAKE_PRIVATE_KEY'),
        'loading_method': 'stage',  # Use fast bulk loading via COPY INTO
    }

    client.configure_connection(name='snowflake_erc20', loader='snowflake', config=snowflake_config)

    # Configure parallel execution
    parallel_config = ParallelConfig(
        num_workers=num_workers,
        table_name='eth_firehose.logs',
        min_block=min_block,
        max_block=max_block,
        block_column='block_num',
    )

    print(f'\n🚀 Starting parallel load with {num_workers} workers...\n')

    start_time = time.time()

    # Load data in parallel (stops after processing the block range)
    results = list(
        client.sql(erc20_query).load(
            connection='snowflake_erc20', destination=table_name, stream=True, parallel_config=parallel_config
        )
    )

    duration = time.time() - start_time

    # Calculate statistics
    total_rows = sum(r.rows_loaded for r in results if r.success)
    rows_per_sec = total_rows / duration if duration > 0 else 0
    partitions = [r for r in results if 'partition_id' in r.metadata]
    successful_workers = len(partitions)
    failed_workers = num_workers - successful_workers

    # Print results
    print(f'\n{"=" * 70}')
    print('🎉 ERC20 Parallel Load Complete!')
    print(f'{"=" * 70}')
    print(f'📊 Table name: {table_name}')
    print(f'📦 Block range: {min_block:,} to {max_block:,}')
    print(f'📈 Rows loaded: {total_rows:,}')
    print(f'⏱️ Duration: {duration:.2f}s')
    print(f'🚀 Throughput: {rows_per_sec:,.0f} rows/sec')
    print(f'👷 Workers: {successful_workers}/{num_workers} succeeded')
    if failed_workers > 0:
        print(f'⚠️ Failed workers: {failed_workers}')
    print(f'📊 Avg rows/block: {total_rows / (max_block - min_block):.0f}')
    print(f'{"=" * 70}')

    print(f'\n✅ Table "{table_name}" is ready in Snowflake for testing!')
    print(f'   Query it with: SELECT * FROM {table_name} LIMIT 10;')

    return table_name, total_rows, duration


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Load ERC20 transfers into Snowflake using parallel streaming')
    parser.add_argument(
        '--blocks', type=int, default=100_000, help='Number of recent blocks to load (default: 100,000)'
    )
    parser.add_argument('--workers', type=int, default=8, help='Number of parallel workers (default: 8)')

    args = parser.parse_args()

    try:
        load_erc20_transfers(num_blocks=args.blocks, num_workers=args.workers)
    except KeyboardInterrupt:
        print('\n\n⚠️ Interrupted by user')
    except Exception as e:
        print(f'\n\n❌ Error: {e}')
        raise
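
For intuition about what the ParallelConfig above asks the loader to do, here is a minimal, self-contained sketch of splitting a block range evenly across workers. The function name and the even-split strategy are illustrative assumptions, not amp's actual partitioning logic.

def split_block_range(min_block: int, max_block: int, num_workers: int):
    """Evenly partition [min_block, max_block] into num_workers contiguous ranges."""
    total = max_block - min_block + 1
    chunk = total // num_workers
    ranges = []
    for i in range(num_workers):
        start = min_block + i * chunk
        # The last worker absorbs the remainder so the full range is covered.
        end = max_block if i == num_workers - 1 else start + chunk - 1
        ranges.append((start, end))
    return ranges

# Example: 8 workers over a 100,000-block window
print(split_block_range(20_000_000, 20_100_000, 8))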

src/amp/config/label_manager.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
"""
Label Manager for CSV-based label datasets.

This module provides functionality to register and manage CSV label datasets
that can be joined with streaming data during loading operations.
"""

import logging
from typing import Dict, List, Optional

import pyarrow as pa
import pyarrow.csv as csv


class LabelManager:
    """
    Manages CSV label datasets for joining with streaming data.

    Labels are registered by name and loaded as PyArrow Tables for efficient
    joining operations. This allows reuse of label datasets across multiple
    queries and loaders.

    Example:
        >>> manager = LabelManager()
        >>> manager.add_label('token_labels', '/path/to/tokens.csv')
        >>> label_table = manager.get_label('token_labels')
    """

    def __init__(self):
        self._labels: Dict[str, pa.Table] = {}
        self.logger = logging.getLogger(__name__)

    def add_label(self, name: str, csv_path: str, binary_columns: Optional[List[str]] = None) -> None:
        """
        Load and register a CSV label dataset with automatic hex-to-binary conversion.

        Hex string columns (such as Ethereum addresses) are automatically converted to
        binary format for efficient storage and joining. This reduces memory usage
        by roughly 50% and improves join performance.

        Args:
            name: Unique name for this label dataset. If the name is already
                registered, the existing dataset is replaced.
            csv_path: Path to the CSV file
            binary_columns: List of column names containing hex addresses to convert to binary.
                If None, auto-detects columns with 'address' in the name.

        Raises:
            FileNotFoundError: If the CSV file doesn't exist
            ValueError: If the CSV cannot be parsed
        """
        if name in self._labels:
            self.logger.warning(f"Label '{name}' already exists, replacing with new data")

        try:
            # Load CSV as a PyArrow Table (initially as strings)
            temp_table = csv.read_csv(csv_path, read_options=csv.ReadOptions(autogenerate_column_names=False))

            # Force all columns to be strings initially
            column_types = {col_name: pa.string() for col_name in temp_table.column_names}
            convert_opts = csv.ConvertOptions(column_types=column_types)
            label_table = csv.read_csv(csv_path, convert_options=convert_opts)

            # Auto-detect or use specified binary columns
            if binary_columns is None:
                # Auto-detect columns with 'address' in the name (case-insensitive)
                binary_columns = [col for col in label_table.column_names if 'address' in col.lower()]

            # Convert hex string columns to binary for efficiency
            converted_columns = []
            for col_name in binary_columns:
                if col_name not in label_table.column_names:
                    self.logger.warning(f"Binary column '{col_name}' not found in CSV, skipping")
                    continue

                hex_col = label_table.column(col_name)

                # Detect hex string format and convert to binary.
                # Sample the first non-null value to determine the format.
                sample_value = None
                for v in hex_col.to_pylist()[:100]:  # Check the first 100 values
                    if v is not None:
                        sample_value = v
                        break

                if sample_value is None:
                    self.logger.warning(f"Column '{col_name}' has no non-null values, skipping conversion")
                    continue

                # Detect whether it's a hex string (with or without the 0x prefix)
                if isinstance(sample_value, str) and all(c in '0123456789abcdefABCDEFx' for c in sample_value):
                    # Determine the binary length from the hex string
                    hex_str = sample_value[2:] if sample_value.startswith('0x') else sample_value
                    binary_length = len(hex_str) // 2

                    # Convert all values to binary (fixed-size to match streaming data)
                    def hex_to_binary(v):
                        if v is None:
                            return None
                        hex_str = v[2:] if v.startswith('0x') else v
                        return bytes.fromhex(hex_str)

                    binary_values = pa.array(
                        [hex_to_binary(v) for v in hex_col.to_pylist()],
                        # Fixed-size binary to match server data (e.g., 20 bytes for addresses)
                        type=pa.binary(binary_length),
                    )

                    # Replace the column
                    label_table = label_table.set_column(
                        label_table.schema.get_field_index(col_name), col_name, binary_values
                    )
                    converted_columns.append(f'{col_name} (hex→fixed_size_binary[{binary_length}])')
                    self.logger.info(f"Converted '{col_name}' from hex string to fixed_size_binary[{binary_length}]")

            self._labels[name] = label_table

            conversion_info = f', converted: {", ".join(converted_columns)}' if converted_columns else ''
            self.logger.info(
                f"Loaded label '{name}' from {csv_path}: "
                f'{label_table.num_rows:,} rows, {len(label_table.schema)} columns '
                f'({", ".join(label_table.schema.names)}){conversion_info}'
            )

        except FileNotFoundError:
            raise FileNotFoundError(f'Label CSV file not found: {csv_path}')
        except Exception as e:
            raise ValueError(f"Failed to load label CSV '{csv_path}': {e}") from e

    def get_label(self, name: str) -> Optional[pa.Table]:
        """
        Get a label table by name.

        Args:
            name: Name of the label dataset

        Returns:
            PyArrow Table containing label data, or None if not found
        """
        return self._labels.get(name)

    def list_labels(self) -> List[str]:
        """
        List all registered label names.

        Returns:
            List of label names
        """
        return list(self._labels.keys())

    def remove_label(self, name: str) -> bool:
        """
        Remove a label dataset.

        Args:
            name: Name of the label to remove

        Returns:
            True if the label was removed, False if it didn't exist
        """
        if name in self._labels:
            del self._labels[name]
            self.logger.info(f"Removed label '{name}'")
            return True
        return False

    def clear(self) -> None:
        """Remove all label datasets."""
        count = len(self._labels)
        self._labels.clear()
        self.logger.info(f'Cleared {count} label dataset(s)')
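
A short usage sketch for LabelManager follows; the module path, CSV paths, and column names are hypothetical and only illustrate the intended call pattern.

from amp.config.label_manager import LabelManager

manager = LabelManager()

# Auto-detection: any column whose name contains 'address' is converted
# from a hex string to fixed-size binary (e.g. 20 bytes for Ethereum addresses).
manager.add_label('token_labels', '/path/to/tokens.csv')

# Or name the hex columns explicitly (hypothetical column name).
manager.add_label('exchange_labels', '/path/to/exchanges.csv', binary_columns=['wallet_address'])

print(manager.list_labels())
table = manager.get_label('token_labels')
if table is not None:
    print(table.schema)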
