
Commit 9fdb633

feat: Integrate label manager into Client for enriched streaming
Add label management to Client class:
- Initialize LabelManager with configurable label directory
- Support loading labels from CSV files
- Pass label_manager to all loader instances
- Enable label joining in streaming queries via load() method

Updates:
- Client now supports label enrichment out of the box
- Loaders inherit label_manager from client
- Add pyarrow.csv dependency for label loading
1 parent 7e79193 commit 9fdb633
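
For orientation, a minimal end-to-end sketch of the surface this commit adds. `Client`, `configure_label`, `configure_connection`, `sql().load()`, and `LabelJoinConfig` come from the diffs below; the URL, loader name, CSV path, connection config, and the `LabelJoinConfig` field names (assumed to mirror the deprecated `label` / `label_key_column` / `stream_key_column` kwargs) are illustrative assumptions, not confirmed API:

```python
# Hypothetical usage sketch -- LabelJoinConfig field names are assumed to
# mirror the deprecated label / label_key_column / stream_key_column kwargs.
from amp.client import Client
from amp.loaders.types import LabelJoinConfig

client = Client('grpc://localhost:8815')  # Flight SQL endpoint (illustrative URL)

# Register a label dataset; hex address columns are auto-detected unless listed.
client.configure_label('token_metadata', 'labels/tokens.csv')

# Configure a destination connection (loader name and config are illustrative).
client.configure_connection('pg', 'postgresql', {'dsn': 'postgresql://localhost/amp'})

# Stream query results into the destination, enriched with the label data.
results = client.sql('SELECT * FROM transfers').load(
    connection='pg',
    destination='transfers_enriched',
    label_config=LabelJoinConfig(
        label_name='token_metadata',   # assumed field names
        label_key_column='address',
        stream_key_column='token_address',
    ),
    stream=True,
)
for result in results:
    print(result)
```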

2 files changed: +113 -22 lines

pyproject.toml

Lines changed: 7 additions & 6 deletions
```diff
@@ -12,21 +12,17 @@ dependencies = [
     "pandas>=2.3.1",
     "pyarrow>=20.0.0",
     "typer>=0.15.2",
-
     # Flight SQL support
     "adbc-driver-manager>=1.5.0",
     "adbc-driver-postgresql>=1.5.0",
     "protobuf>=4.21.0",
-
     # Ethereum/blockchain utilities
     "base58>=2.1.1",
     "eth-hash[pysha3]>=0.7.1",
     "eth-utils>=5.2.0",
-
     # Google Cloud support
     "google-cloud-bigquery>=3.30.0",
     "google-cloud-storage>=3.1.0",
-
     # Arro3 for enhanced PyArrow operations
     "arro3-core>=0.5.1",
     "arro3-compute>=0.5.1",
@@ -58,7 +54,8 @@ iceberg = [
 ]
 
 snowflake = [
-    "snowflake-connector-python>=3.5.0",
+    "snowflake-connector-python>=4.0.0",
+    "snowpipe-streaming>=1.0.0",  # Snowpipe Streaming API
 ]
 
 lmdb = [
@@ -71,7 +68,8 @@ all_loaders = [
     "deltalake>=1.0.2",  # Delta Lake (consistent version)
     "pyiceberg[sql-sqlite]>=0.10.0",  # Apache Iceberg
     "pydantic>=2.0,<2.12",  # PyIceberg 0.10.0 compatibility
-    "snowflake-connector-python>=3.5.0",  # Snowflake
+    "snowflake-connector-python>=4.0.0",  # Snowflake
+    "snowpipe-streaming>=1.0.0",  # Snowpipe Streaming API
     "lmdb>=1.4.0",  # LMDB
 ]
 
@@ -91,6 +89,9 @@ test = [
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
+[tool.hatch.build.targets.wheel]
+packages = ["src/amp"]
+
 [tool.pytest.ini_options]
 pythonpath = ["."]
 testpaths = ["tests"]
```

src/amp/client.py

Lines changed: 106 additions & 16 deletions
```diff
@@ -7,8 +7,9 @@
 
 from . import FlightSql_pb2
 from .config.connection_manager import ConnectionManager
+from .config.label_manager import LabelManager
 from .loaders.registry import create_loader, get_available_loaders
-from .loaders.types import LoadConfig, LoadMode, LoadResult
+from .loaders.types import LabelJoinConfig, LoadConfig, LoadMode, LoadResult
 from .streaming import (
     ParallelConfig,
     ParallelStreamExecutor,
@@ -28,7 +29,12 @@ def __init__(self, client: 'Client', query: str):
         self.logger = logging.getLogger(__name__)
 
     def load(
-        self, connection: str, destination: str, config: Dict[str, Any] = None, **kwargs
+        self,
+        connection: str,
+        destination: str,
+        config: Dict[str, Any] = None,
+        label_config: Optional[LabelJoinConfig] = None,
+        **kwargs
     ) -> Union[LoadResult, Iterator[LoadResult]]:
         """
         Load query results to specified destination
@@ -38,12 +44,16 @@ def load(
             destination: Target destination (table name, key, path, etc.)
             connection: Named connection or connection name for auto-discovery
             config: Inline configuration dict (alternative to connection)
+            label_config: Optional LabelJoinConfig for joining with label data
             **kwargs: Additional loader-specific options including:
                 - read_all: bool = False (if True, loads entire table at once; if False, streams batch by batch)
                 - batch_size: int = 10000 (size of each batch for streaming)
                 - stream: bool = False (if True, enables continuous streaming with reorg detection)
                 - with_reorg_detection: bool = True (enable reorg detection for streaming queries)
                 - resume_watermark: Optional[ResumeWatermark] = None (resume streaming from specific point)
+                - label: str (deprecated, use label_config instead)
+                - label_key_column: str (deprecated, use label_config instead)
+                - stream_key_column: str (deprecated, use label_config instead)
 
         Returns:
             - If read_all=True: Single LoadResult with operation details
@@ -58,7 +68,12 @@ def load(
             # TODO: Add validation that the specific query uses features supported by streaming
             streaming_query = self._ensure_streaming_query(self.query)
             return self.client.query_and_load_streaming(
-                query=streaming_query, destination=destination, connection_name=connection, config=config, **kwargs
+                query=streaming_query,
+                destination=destination,
+                connection_name=connection,
+                config=config,
+                label_config=label_config,
+                **kwargs,
             )
 
         # Validate that parallel_config is only used with stream=True
@@ -69,7 +84,12 @@ def load(
         kwargs.setdefault('read_all', False)
 
         return self.client.query_and_load(
-            query=self.query, destination=destination, connection_name=connection, config=config, **kwargs
+            query=self.query,
+            destination=destination,
+            connection_name=connection,
+            config=config,
+            label_config=label_config,
+            **kwargs,
         )
 
     def _ensure_streaming_query(self, query: str) -> str:
```
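
As context for the widened `load()` signature, a short sketch of the three dispatch paths the docstring above describes; the query, connection name, and destination are illustrative:

```python
qb = client.sql('SELECT block_num, tx_hash FROM transfers')

# read_all=True: materialize the entire result, returns a single LoadResult.
result = qb.load(connection='pg', destination='transfers', read_all=True)

# Default: stream batch by batch, yielding one LoadResult per batch.
for r in qb.load(connection='pg', destination='transfers', batch_size=10_000):
    print(r)

# stream=True: continuous streaming with reorg detection, routed to
# query_and_load_streaming() after _ensure_streaming_query() rewrites the query.
for r in qb.load(connection='pg', destination='transfers', stream=True):
    print(r)
```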
```diff
@@ -105,6 +125,7 @@ class Client:
     def __init__(self, url):
         self.conn = flight.connect(url)
         self.connection_manager = ConnectionManager()
+        self.label_manager = LabelManager()
         self.logger = logging.getLogger(__name__)
 
     def sql(self, query: str) -> QueryBuilder:
@@ -123,6 +144,18 @@ def configure_connection(self, name: str, loader: str, config: Dict[str, Any]) -
         """Configure a named connection for reuse"""
         self.connection_manager.add_connection(name, loader, config)
 
+    def configure_label(self, name: str, csv_path: str, binary_columns: Optional[List[str]] = None) -> None:
+        """
+        Configure a label dataset from a CSV file for joining with streaming data.
+
+        Args:
+            name: Unique name for this label dataset
+            csv_path: Path to the CSV file
+            binary_columns: List of column names containing hex addresses to convert to binary.
+                If None, auto-detects columns with 'address' in the name.
+        """
+        self.label_manager.add_label(name, csv_path, binary_columns)
+
     def list_connections(self) -> Dict[str, str]:
         """List all configured connections"""
         return self.connection_manager.list_connections()
```
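
A quick sketch of the new `configure_label()` hook in use; the dataset names, file paths, and column names are made up:

```python
# Rely on auto-detection: columns with 'address' in the name are converted
# from hex strings to binary.
client.configure_label('exchanges', 'labels/exchanges.csv')

# Or name the hex-address columns explicitly.
client.configure_label(
    'contracts',
    'labels/contracts.csv',
    binary_columns=['contract_address', 'deployer_address'],
)
```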
```diff
@@ -162,7 +195,13 @@ def _batch_generator(self, reader):
                 break
 
     def query_and_load(
-        self, query: str, destination: str, connection_name: str, config: Optional[Dict[str, Any]] = None, **kwargs
+        self,
+        query: str,
+        destination: str,
+        connection_name: str,
+        config: Optional[Dict[str, Any]] = None,
+        label_config: Optional[LabelJoinConfig] = None,
+        **kwargs,
     ) -> Union[LoadResult, Iterator[LoadResult]]:
         """
         Execute query and load results directly into target system
@@ -211,6 +250,13 @@ def query_and_load(
             **{k: v for k, v in kwargs.items() if k in ['max_retries', 'retry_delay']},
         )
 
+        # Remove known LoadConfig params from kwargs, leaving loader-specific params
+        for key in ['max_retries', 'retry_delay']:
+            kwargs.pop(key, None)
+
+        # Remaining kwargs are loader-specific (e.g., channel_suffix for Snowflake)
+        loader_specific_kwargs = kwargs
+
         if read_all:
             self.logger.info(f'Loading entire query result to {loader_type}:{destination}')
         else:
@@ -221,20 +267,36 @@ def query_and_load(
         # Get the data and load
         if read_all:
             table = self.get_sql(query, read_all=True)
-            return self._load_table(table, loader_type, destination, loader_config, load_config)
+            return self._load_table(
+                table,
+                loader_type,
+                destination,
+                loader_config,
+                load_config,
+                label_config=label_config,
+                **loader_specific_kwargs,
+            )
         else:
             batch_stream = self.get_sql(query, read_all=False)
-            return self._load_stream(batch_stream, loader_type, destination, loader_config, load_config)
+            return self._load_stream(
+                batch_stream,
+                loader_type,
+                destination,
+                loader_config,
+                load_config,
+                label_config=label_config,
+                **loader_specific_kwargs,
+            )
 
     def _load_table(
-        self, table: pa.Table, loader: str, table_name: str, config: Dict[str, Any], load_config: LoadConfig
+        self, table: pa.Table, loader: str, table_name: str, config: Dict[str, Any], load_config: LoadConfig, **kwargs
     ) -> LoadResult:
         """Load a complete Arrow Table"""
         try:
-            loader_instance = create_loader(loader, config)
+            loader_instance = create_loader(loader, config, label_manager=self.label_manager)
 
             with loader_instance:
-                return loader_instance.load_table(table, table_name, **load_config.__dict__)
+                return loader_instance.load_table(table, table_name, **load_config.__dict__, **kwargs)
         except Exception as e:
             self.logger.error(f'Failed to load table: {e}')
             return LoadResult(
@@ -254,13 +316,14 @@ def _load_stream(
         table_name: str,
         config: Dict[str, Any],
         load_config: LoadConfig,
+        **kwargs,
     ) -> Iterator[LoadResult]:
         """Load from a stream of batches"""
         try:
-            loader_instance = create_loader(loader, config)
+            loader_instance = create_loader(loader, config, label_manager=self.label_manager)
 
             with loader_instance:
-                yield from loader_instance.load_stream(batch_stream, table_name, **load_config.__dict__)
+                yield from loader_instance.load_stream(batch_stream, table_name, **load_config.__dict__, **kwargs)
         except Exception as e:
             self.logger.error(f'Failed to load stream: {e}')
             yield LoadResult(
```
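
The kwargs split above is what lets loader-specific options ride along with `query_and_load()`: `max_retries`/`retry_delay` are consumed by `LoadConfig`, everything else is forwarded to the loader. A sketch using the `channel_suffix` example from the inline comment (connection name and values are illustrative):

```python
results = client.query_and_load(
    query='SELECT * FROM transfers',
    destination='TRANSFERS',
    connection_name='snowflake_prod',
    max_retries=3,              # popped into LoadConfig
    channel_suffix='_shard_0',  # survives the pop(), forwarded to the Snowflake loader
)
```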
```diff
@@ -279,6 +342,7 @@ def query_and_load_streaming(
         destination: str,
         connection_name: str,
         config: Optional[Dict[str, Any]] = None,
+        label_config: Optional[LabelJoinConfig] = None,
         with_reorg_detection: bool = True,
         resume_watermark: Optional[ResumeWatermark] = None,
         parallel_config: Optional[ParallelConfig] = None,
@@ -315,6 +379,10 @@ def query_and_load_streaming(
                 **{k: v for k, v in kwargs.items() if k in ['max_retries', 'retry_delay']},
             }
 
+            # Add label_config if provided
+            if label_config:
+                load_config_dict['label_config'] = label_config
+
             yield from executor.execute_parallel_stream(query, destination, connection_name, load_config_dict)
             return
 
@@ -346,6 +414,27 @@ def query_and_load_streaming(
 
         self.logger.info(f'Starting streaming query to {loader_type}:{destination}')
 
+        # Create loader instance early to access checkpoint store
+        loader_instance = create_loader(loader_type, loader_config, label_manager=self.label_manager)
+
+        # Load checkpoint and create resume watermark if enabled (default: enabled)
+        if resume_watermark is None and kwargs.get('resume', True):
+            try:
+                checkpoint = loader_instance.checkpoint_store.load(connection_name, destination)
+
+                if checkpoint:
+                    resume_watermark = checkpoint.to_resume_watermark()
+                    checkpoint_type = 'reorg checkpoint' if checkpoint.is_reorg else 'checkpoint'
+                    self.logger.info(
+                        f'Resuming from {checkpoint_type}: {len(checkpoint.ranges)} ranges, '
+                        f'timestamp {checkpoint.timestamp}'
+                    )
+                    if checkpoint.is_reorg:
+                        resume_points = ', '.join(f'{r.network}:{r.start}' for r in checkpoint.ranges)
+                        self.logger.info(f'Reorg resume points: {resume_points}')
+            except Exception as e:
+                self.logger.warning(f'Failed to load checkpoint, starting from beginning: {e}')
+
         try:
             # Execute streaming query with Flight SQL
             # Create a CommandStatementQuery message
@@ -376,12 +465,13 @@ def query_and_load_streaming(
                 stream_iterator = ReorgAwareStream(stream_iterator)
                 self.logger.info('Reorg detection enabled for streaming query')
 
-            # Create loader instance and start continuous loading
-            loader_instance = create_loader(loader_type, loader_config)
-
+            # Start continuous loading with checkpoint support
             with loader_instance:
                 self.logger.info(f'Starting continuous load to {destination}. Press Ctrl+C to stop.')
-                yield from loader_instance.load_stream_continuous(stream_iterator, destination, **load_config.__dict__)
+                # Pass connection_name for checkpoint saving
+                yield from loader_instance.load_stream_continuous(
+                    stream_iterator, destination, connection_name=connection_name, **load_config.__dict__
+                )
 
         except Exception as e:
             self.logger.error(f'Streaming query failed: {e}')
```
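
Checkpoint-based resumption is now on by default; a sketch of opting out, based on the `kwargs.get('resume', True)` check above (query and names are illustrative):

```python
# Ignore any saved checkpoint and stream from the beginning.
for result in client.query_and_load_streaming(
    query='SELECT * FROM transfers',  # assumed to already be in streaming form
    destination='transfers',
    connection_name='pg',
    resume=False,
):
    print(result)

# Passing resume_watermark= explicitly also skips the checkpoint lookup,
# since the store is only consulted when resume_watermark is None.
```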
