Skip to content

Commit 216448f

Browse files
committed
Fix deduplication bug
1 parent ba13ba3 commit 216448f

File tree

3 files changed

+11
-31
lines changed

3 files changed

+11
-31
lines changed

apps/kafka_streaming_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313

1414
def get_block_hash(client: Client, raw_dataset: str, block_num: int) -> str:
1515
"""Get block hash from dataset.blocks table."""
16-
query = f'SELECT hash FROM {raw_dataset}.blocks WHERE block_num = {block_num} LIMIT 1'
16+
query = f'SELECT hash FROM "{raw_dataset}".blocks WHERE block_num = {block_num} LIMIT 1'
1717
result = client.get_sql(query, read_all=True)
1818
hash_val = result.to_pydict()['hash'][0]
1919
return '0x' + hash_val.hex() if isinstance(hash_val, bytes) else hash_val
2020

2121

2222
def get_latest_block(client: Client, raw_dataset: str) -> int:
2323
"""Get latest block number from dataset.blocks table."""
24-
query = f'SELECT block_num FROM {raw_dataset}.blocks ORDER BY block_num DESC LIMIT 1'
24+
query = f'SELECT block_num FROM "{raw_dataset}".blocks ORDER BY block_num DESC LIMIT 1'
2525
result = client.get_sql(query, read_all=True)
2626
return result.to_pydict()['block_num'][0]
2727

apps/queries/erc20_transfers.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ from (
3939
l.timestamp,
4040
l.address,
4141
evm_decode(l.topic1, l.topic2, l.topic3, l.data, 'Transfer(address indexed from, address indexed to, uint256 value)') as dec
42-
from eth_firehose.logs l
42+
from 'edgeandnode/ethereum_mainnet'.logs l
4343
where
4444
l.topic0 = evm_topic('Transfer(address indexed from, address indexed to, uint256 value)') and
4545
l.topic3 IS NULL

src/amp/loaders/base.py

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ def load_stream_continuous(
494494
table_name,
495495
connection_name,
496496
response.metadata.ranges,
497+
response.metadata.ranges_complete,
497498
**filtered_kwargs,
498499
)
499500

@@ -670,6 +671,7 @@ def _process_batch_non_transactional(
670671
table_name: str,
671672
connection_name: str,
672673
ranges: Optional[List[BlockRange]],
674+
ranges_complete: bool,
673675
**kwargs,
674676
) -> Optional[LoadResult]:
675677
"""
@@ -682,46 +684,24 @@ def _process_batch_non_transactional(
682684
table_name: Target table name
683685
connection_name: Connection identifier
684686
ranges: Block ranges for this batch (if available)
687+
ranges_complete: Whether this batch marks a watermark boundary
685688
**kwargs: Additional options passed to load_batch
686689
687690
Returns:
688691
LoadResult, or None if batch was skipped as duplicate
689692
"""
690-
# Check if batch already processed (idempotency / exactly-once)
691-
if ranges and self.state_enabled:
692-
try:
693-
batch_ids = [BatchIdentifier.from_block_range(br) for br in ranges]
694-
is_duplicate = self.state_store.is_processed(connection_name, table_name, batch_ids)
695-
696-
if is_duplicate:
697-
# Skip this batch - already processed
698-
self.logger.info(
699-
f'Skipping duplicate batch: {len(ranges)} ranges already processed for {table_name}'
700-
)
701-
return LoadResult(
702-
rows_loaded=0,
703-
duration=0.0,
704-
ops_per_second=0.0,
705-
table_name=table_name,
706-
loader_type=self.__class__.__name__.replace('Loader', '').lower(),
707-
success=True,
708-
metadata={'operation': 'skip_duplicate', 'ranges': [r.to_dict() for r in ranges]},
709-
)
710-
except ValueError as e:
711-
# BlockRange missing hash - log and continue without idempotency check
712-
self.logger.warning(f'Cannot check for duplicates: {e}. Processing batch anyway.')
713-
714693
# Load batch
715694
result = self.load_batch(batch_data, table_name, **kwargs)
716695

717-
if result.success and ranges and self.state_enabled:
718-
# Mark batch as processed (for exactly-once semantics)
696+
if result.success and ranges and self.state_enabled and ranges_complete:
697+
# Only mark ranges at watermark boundaries (ranges_complete=true)
698+
# Multiple batches can have the same BlockRange, so we only checkpoint at watermarks
719699
try:
720700
batch_ids = [BatchIdentifier.from_block_range(br) for br in ranges]
721701
self.state_store.mark_processed(connection_name, table_name, batch_ids)
702+
self.logger.debug(f'Marked watermark as processed: {len(ranges)} ranges for {table_name}')
722703
except Exception as e:
723-
self.logger.error(f'Failed to mark batches as processed: {e}')
724-
# Continue anyway - state store provides resume capability
704+
self.logger.error(f'Failed to mark watermark as processed: {e}')
725705

726706
return result
727707

Comments: 0 commit comments