
Commit c8c0b0e

fordN and incrypto32 authored
base loader: fix micro batch is_processed marking, add tests (#31)
* base loader: fix micro batch is_processed marking, add tests

* fix: update reorg tests to set ranges_complete=True for proper state tracking

  The recent microbatch processing changes require ranges_complete=True for
  batches to be tracked in the state store. This fixes all reorg handling
  tests by ensuring test batches are properly marked as complete, allowing
  the reorg deletion logic to find and remove the appropriate data.

  - Updated 16 reorg-related tests across 4 loader implementations
  - All test batches now set ranges_complete=True in BatchMetadata
  - Ensures accurate testing of real-world reorg handling behavior

* fix: update unit tests for ranges_complete parameter

  Updated unit tests to account for the ranges_complete parameter, which
  controls when batches are marked as processed and when duplicate checking
  occurs. Tests now correctly pass ranges_complete=True when testing
  duplicate detection and state management.

---------

Co-authored-by: Krishnanand V P <[email protected]>
1 parent 710c4e3 commit c8c0b0e
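For context on the fix: the failure mode is a microbatch that the server splits across several Arrow RecordBatches sharing one BlockRange. If the loader marks the range as processed after the first RecordBatch, the duplicate check then discards the rest of the microbatch. A minimal, self-contained repro of that logic (plain Python stand-ins, not amp's actual classes):

```python
def run(mark_every_record_batch: bool) -> list:
    """Replay one microbatch split into three RecordBatches sharing one ID."""
    processed, sink = set(), []
    deliveries = [([1, 2], False), ([3, 4], False), ([5], True)]  # (rows, ranges_complete)
    for rows, ranges_complete in deliveries:
        batch_id = 'ethereum:100-110@0xabc'  # same microbatch ID every time
        # Old behavior checked/marked on every RecordBatch; the fix gates
        # both the duplicate check and the marking on ranges_complete.
        gate = True if mark_every_record_batch else ranges_complete
        if gate and batch_id in processed:
            continue  # skipped as a "duplicate"
        sink.extend(rows)
        if gate:
            processed.add(batch_id)
    return sink

assert run(mark_every_record_batch=True) == [1, 2]           # bug: rows 3-5 dropped
assert run(mark_every_record_batch=False) == [1, 2, 3, 4, 5]  # fix: full microbatch
```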

8 files changed (+439, -58 lines)


src/amp/loaders/base.py

Lines changed: 24 additions & 8 deletions
```diff
@@ -484,6 +484,7 @@ def load_stream_continuous(
                     table_name,
                     connection_name,
                     response.metadata.ranges,
+                    ranges_complete=response.metadata.ranges_complete,
                 )
             else:
                 # Non-transactional loading (separate check, load, mark)
@@ -494,6 +495,7 @@ def load_stream_continuous(
                     table_name,
                     connection_name,
                     response.metadata.ranges,
+                    ranges_complete=response.metadata.ranges_complete,
                     **filtered_kwargs,
                 )
 
@@ -611,6 +613,7 @@ def _process_batch_transactional(
         table_name: str,
         connection_name: str,
         ranges: List[BlockRange],
+        ranges_complete: bool = False,
     ) -> LoadResult:
         """
         Process a data batch using transactional exactly-once semantics.
@@ -622,6 +625,7 @@ def _process_batch_transactional(
             table_name: Target table name
             connection_name: Connection identifier
             ranges: Block ranges for this batch
+            ranges_complete: True when this RecordBatch completes a microbatch (streaming only)
 
         Returns:
             LoadResult with operation outcome
@@ -630,13 +634,17 @@ def _process_batch_transactional(
         try:
             # Delegate to loader-specific transactional implementation
             # Loaders that support transactions implement load_batch_transactional()
-            rows_loaded_batch = self.load_batch_transactional(batch_data, table_name, connection_name, ranges)
+            rows_loaded_batch = self.load_batch_transactional(
+                batch_data, table_name, connection_name, ranges, ranges_complete
+            )
             duration = time.time() - start_time
 
-            # Mark batches as processed in state store after successful transaction
-            if ranges:
+            # Mark batches as processed ONLY when the microbatch is complete;
+            # multiple RecordBatches can share the same microbatch ID
+            if ranges and ranges_complete:
                 batch_ids = [BatchIdentifier.from_block_range(br) for br in ranges]
                 self.state_store.mark_processed(connection_name, table_name, batch_ids)
+                self.logger.debug(f'Marked microbatch as processed: {len(batch_ids)} batch IDs')
 
             return LoadResult(
                 rows_loaded=rows_loaded_batch,
@@ -648,6 +656,7 @@ def _process_batch_transactional(
                 metadata={
                     'operation': 'transactional_load' if rows_loaded_batch > 0 else 'skip_duplicate',
                     'ranges': [r.to_dict() for r in ranges],
+                    'ranges_complete': ranges_complete,
                 },
             )
 
@@ -670,6 +679,7 @@ def _process_batch_non_transactional(
         table_name: str,
         connection_name: str,
         ranges: Optional[List[BlockRange]],
+        ranges_complete: bool = False,
         **kwargs,
     ) -> Optional[LoadResult]:
         """
@@ -682,21 +692,25 @@ def _process_batch_non_transactional(
             table_name: Target table name
             connection_name: Connection identifier
             ranges: Block ranges for this batch (if available)
+            ranges_complete: True when this RecordBatch completes a microbatch (streaming only)
             **kwargs: Additional options passed to load_batch
 
         Returns:
             LoadResult, or None if batch was skipped as duplicate
         """
         # Check if batch already processed (idempotency / exactly-once)
-        if ranges and self.state_enabled:
+        # For streaming: only check when ranges_complete=True (end of microbatch)
+        # Multiple RecordBatches can share the same microbatch ID, so we must wait
+        # until the entire microbatch is delivered before checking/marking as processed
+        if ranges and self.state_enabled and ranges_complete:
             try:
                 batch_ids = [BatchIdentifier.from_block_range(br) for br in ranges]
                 is_duplicate = self.state_store.is_processed(connection_name, table_name, batch_ids)
 
                 if is_duplicate:
                     # Skip this batch - already processed
                     self.logger.info(
-                        f'Skipping duplicate batch: {len(ranges)} ranges already processed for {table_name}'
+                        f'Skipping duplicate microbatch: {len(ranges)} ranges already processed for {table_name}'
                     )
                     return LoadResult(
                         rows_loaded=0,
@@ -711,14 +725,16 @@ def _process_batch_non_transactional(
                 # BlockRange missing hash - log and continue without idempotency check
                 self.logger.warning(f'Cannot check for duplicates: {e}. Processing batch anyway.')
 
-        # Load batch
+        # Load batch (always load, even if part of larger microbatch)
         result = self.load_batch(batch_data, table_name, **kwargs)
 
-        if result.success and ranges and self.state_enabled:
-            # Mark batch as processed (for exactly-once semantics)
+        # Mark batch as processed ONLY when microbatch is complete
+        # This ensures we don't skip subsequent RecordBatches within the same microbatch
+        if result.success and ranges and self.state_enabled and ranges_complete:
             try:
                 batch_ids = [BatchIdentifier.from_block_range(br) for br in ranges]
                 self.state_store.mark_processed(connection_name, table_name, batch_ids)
+                self.logger.debug(f'Marked microbatch as processed: {len(batch_ids)} batch IDs')
             except Exception as e:
                 self.logger.error(f'Failed to mark batches as processed: {e}')
                 # Continue anyway - state store provides resume capability
```
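The non-transactional path above keeps a strict check → load → mark order, with the check and the mark both gated on ranges_complete. A runnable toy of that order and its replay behavior (the state store and string batch IDs are simplified stand-ins for amp's BatchIdentifier and state-store machinery, not the real API):

```python
import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')
log = logging.getLogger('sketch')

class ToyStateStore:
    """Stand-in for the loader's state store, keyed by (connection, table, id)."""
    def __init__(self):
        self._seen = set()

    def is_processed(self, conn, table, batch_ids):
        return all((conn, table, b) in self._seen for b in batch_ids)

    def mark_processed(self, conn, table, batch_ids):
        self._seen.update((conn, table, b) for b in batch_ids)

def process_non_transactional(store, rows, table, conn, batch_ids, ranges_complete, sink):
    """Mirrors the check -> load -> mark order above, gated on ranges_complete."""
    if batch_ids and ranges_complete and store.is_processed(conn, table, batch_ids):
        log.info(f'Skipping duplicate microbatch: {len(batch_ids)} ranges for {table}')
        return 0
    sink.extend(rows)  # the load_batch(...) step: always runs mid-microbatch
    if batch_ids and ranges_complete:
        store.mark_processed(conn, table, batch_ids)
        log.info(f'Marked microbatch as processed: {len(batch_ids)} batch IDs')
    return len(rows)

store, sink = ToyStateStore(), []
ids = ['ethereum:100-110@0xabc']
# First delivery: two RecordBatches; only the second completes the microbatch.
process_non_transactional(store, [1, 2], 't', 'c', ids, False, sink)
process_non_transactional(store, [3], 't', 'c', ids, True, sink)
# Replay after a restart: the completed microbatch is detected and skipped.
assert process_non_transactional(store, [1, 2, 3], 't', 'c', ids, True, sink) == 0
assert sink == [1, 2, 3]
```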

src/amp/loaders/implementations/postgresql_loader.py

Lines changed: 10 additions & 5 deletions
```diff
@@ -119,6 +119,7 @@ def load_batch_transactional(
         table_name: str,
         connection_name: str,
         ranges: List[BlockRange],
+        ranges_complete: bool = False,
     ) -> int:
         """
         Load a batch with transactional exactly-once semantics using in-memory state.
@@ -135,6 +136,7 @@ def load_batch_transactional(
             table_name: Target table name
             connection_name: Connection identifier for tracking
             ranges: Block ranges covered by this batch
+            ranges_complete: True when this RecordBatch completes a microbatch (streaming only)
 
         Returns:
             Number of rows loaded (0 if duplicate)
@@ -149,24 +151,27 @@ def load_batch_transactional(
             self.logger.warning(f'Cannot create batch identifiers: {e}. Loading without duplicate check.')
             batch_ids = []
 
-        # Check if already processed (using in-memory state)
-        if batch_ids and self.state_store.is_processed(connection_name, table_name, batch_ids):
+        # Check if already processed ONLY when microbatch is complete
+        # Multiple RecordBatches can share the same microbatch ID (BlockRange)
+        if batch_ids and ranges_complete and self.state_store.is_processed(connection_name, table_name, batch_ids):
             self.logger.info(
                 f'Batch already processed (ranges: {[f"{r.network}:{r.start}-{r.end}" for r in ranges]}), '
                 f'skipping (state check)'
             )
             return 0
 
-        # Load data
+        # Load data (always load, even if part of larger microbatch)
         conn = self.pool.getconn()
         try:
             with conn.cursor() as cur:
                 self._copy_arrow_data(cur, batch, table_name)
                 conn.commit()
 
-            # Mark as processed after successful load
-            if batch_ids:
+            # Mark as processed ONLY when microbatch is complete
+            # This ensures we don't skip subsequent RecordBatches within the same microbatch
+            if batch_ids and ranges_complete:
                 self.state_store.mark_processed(connection_name, table_name, batch_ids)
+                self.logger.debug(f'Marked microbatch as processed: {len(batch_ids)} batch IDs')
 
             self.logger.debug(
                 f'Batch load committed: {batch.num_rows} rows, '
```
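For concrete loaders, the contract change is just the new trailing parameter on load_batch_transactional. A toy analogue of the PostgreSQL path's skip/load/mark ordering, with sqlite3 standing in for the psycopg connection pool and COPY, and a plain set for the in-memory state store; this sketches the ordering only, not the project's implementation:

```python
import sqlite3

class ToyTransactionalLoader:
    """Toy analogue of the PostgreSQL loader's transactional path."""

    def __init__(self):
        self.db = sqlite3.connect(':memory:')  # stand-in for the psycopg pool
        self.db.execute('CREATE TABLE blocks (n INTEGER)')
        self.processed = set()                 # stand-in for the state store

    def load_batch_transactional(self, rows, table, connection_name, ranges,
                                 ranges_complete=False):
        batch_ids = [f'{net}:{s}-{e}@{h}' for (net, s, e, h) in ranges]
        # Skip only when the *complete* microbatch was already processed.
        if batch_ids and ranges_complete and all(b in self.processed for b in batch_ids):
            return 0
        # Load data (always load, even if part of a larger microbatch).
        self.db.executemany(f'INSERT INTO {table} (n) VALUES (?)',
                            [(r,) for r in rows])
        self.db.commit()
        # Mark as processed only at the microbatch boundary.
        if batch_ids and ranges_complete:
            self.processed.update(batch_ids)
        return len(rows)

loader = ToyTransactionalLoader()
rng = [('ethereum', 100, 110, '0xabc')]
loader.load_batch_transactional([1, 2], 'blocks', 'c', rng, ranges_complete=False)
loader.load_batch_transactional([3], 'blocks', 'c', rng, ranges_complete=True)
# Replaying the completed microbatch is now a no-op.
assert loader.load_batch_transactional([1, 2, 3], 'blocks', 'c', rng, True) == 0
assert loader.db.execute('SELECT COUNT(*) FROM blocks').fetchone()[0] == 3
```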

tests/integration/test_deltalake_loader.py

Lines changed: 60 additions & 15 deletions
```diff
@@ -586,15 +586,24 @@ def test_handle_reorg_single_network(self, delta_temp_config):
         # Create response batches with hashes
         response1 = ResponseBatch.data_batch(
             data=batch1,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xabc')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xabc')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response2 = ResponseBatch.data_batch(
             data=batch2,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=150, end=160, hash='0xdef')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=150, end=160, hash='0xdef')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response3 = ResponseBatch.data_batch(
             data=batch3,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=200, end=210, hash='0x123')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=200, end=210, hash='0x123')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
 
         # Load via streaming API
@@ -637,19 +646,31 @@ def test_handle_reorg_multi_network(self, delta_temp_config):
         # Create response batches with network-specific ranges
         response1 = ResponseBatch.data_batch(
             data=batch1,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xaaa')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xaaa')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response2 = ResponseBatch.data_batch(
             data=batch2,
-            metadata=BatchMetadata(ranges=[BlockRange(network='polygon', start=100, end=110, hash='0xbbb')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='polygon', start=100, end=110, hash='0xbbb')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response3 = ResponseBatch.data_batch(
             data=batch3,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=150, end=160, hash='0xccc')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=150, end=160, hash='0xccc')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response4 = ResponseBatch.data_batch(
             data=batch4,
-            metadata=BatchMetadata(ranges=[BlockRange(network='polygon', start=150, end=160, hash='0xddd')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='polygon', start=150, end=160, hash='0xddd')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
 
         # Load via streaming API
@@ -689,15 +710,24 @@ def test_handle_reorg_overlapping_ranges(self, delta_temp_config):
         # Batch 3: 170-190 (after reorg, but should be deleted as 170 >= 150)
         response1 = ResponseBatch.data_batch(
             data=batch1,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=90, end=110, hash='0xaaa')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=90, end=110, hash='0xaaa')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response2 = ResponseBatch.data_batch(
             data=batch2,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=140, end=160, hash='0xbbb')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=140, end=160, hash='0xbbb')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response3 = ResponseBatch.data_batch(
             data=batch3,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=170, end=190, hash='0xccc')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=170, end=190, hash='0xccc')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
 
         # Load via streaming API
@@ -733,15 +763,24 @@ def test_handle_reorg_version_history(self, delta_temp_config):
 
         response1 = ResponseBatch.data_batch(
             data=batch1,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=0, end=10, hash='0xaaa')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=0, end=10, hash='0xaaa')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response2 = ResponseBatch.data_batch(
             data=batch2,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=50, end=60, hash='0xbbb')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=50, end=60, hash='0xbbb')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
         response3 = ResponseBatch.data_batch(
             data=batch3,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xccc')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xccc')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
 
         # Load via streaming API
@@ -792,12 +831,18 @@ def test_streaming_with_reorg(self, delta_temp_config):
         # Create response batches using factory methods (with hashes for proper state management)
         response1 = ResponseBatch.data_batch(
             data=data1,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xabc123')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=100, end=110, hash='0xabc123')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
 
         response2 = ResponseBatch.data_batch(
             data=data2,
-            metadata=BatchMetadata(ranges=[BlockRange(network='ethereum', start=150, end=160, hash='0xdef456')]),
+            metadata=BatchMetadata(
+                ranges=[BlockRange(network='ethereum', start=150, end=160, hash='0xdef456')],
+                ranges_complete=True,  # Mark as complete so it gets tracked in state store
+            ),
         )
 
         # Simulate reorg event using factory method
```
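The same edit pattern repeats across all 16 updated tests: a batch only becomes visible to duplicate detection and reorg deletion once it carries ranges_complete=True. Helpers like the following can cut that repetition in new tests; the import path is an assumption (the diff does not show where BatchMetadata, BlockRange, and ResponseBatch live):

```python
# Hypothetical import path; adjust to wherever these classes live in amp.
from amp.types import BatchMetadata, BlockRange, ResponseBatch

def complete_batch(data, network, start, end, block_hash):
    """One self-contained microbatch: tracked in the state store."""
    return ResponseBatch.data_batch(
        data=data,
        metadata=BatchMetadata(
            ranges=[BlockRange(network=network, start=start, end=end, hash=block_hash)],
            ranges_complete=True,  # completes the microbatch -> tracked in state store
        ),
    )

def split_microbatch(pieces, network, start, end, block_hash):
    """One microbatch delivered as several RecordBatches sharing one BlockRange."""
    rng = [BlockRange(network=network, start=start, end=end, hash=block_hash)]
    return [
        ResponseBatch.data_batch(
            data=piece,
            # Only the final piece closes the microbatch; earlier pieces must
            # not trigger duplicate checks or state-store marking.
            metadata=BatchMetadata(ranges=rng, ranges_complete=(i == len(pieces) - 1)),
        )
        for i, piece in enumerate(pieces)
    ]
```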
