 import pyarrow as pa
 from psycopg2.pool import ThreadedConnectionPool
 
+from ...streaming.idempotency import DatabaseProcessedRangesStore, IdempotencyConfig
 from ...streaming.types import BlockRange
 from ..base import DataLoader, LoadMode
 from ._postgres_helpers import has_binary_columns, prepare_csv_data, prepare_insert_data
@@ -84,6 +85,22 @@ def connect(self) -> None:
             finally:
                 self.pool.putconn(conn)
 
+            # Replace NullStores with database-backed implementations
+            # This enables persistent checkpointing and idempotency
+            conn = self.pool.getconn()
+            try:
+                if self.checkpoint_config.enabled:
+                    from ...streaming.checkpoint import DatabaseCheckpointStore
+
+                    self.checkpoint_store = DatabaseCheckpointStore(self.checkpoint_config, conn)
+                    self.logger.info('Enabled database-backed checkpoint store')
+
+                if self.idempotency_config.enabled:
+                    self.processed_ranges_store = DatabaseProcessedRangesStore(self.idempotency_config, conn)
+                    self.logger.info('Enabled database-backed idempotency store')
+            finally:
+                self.pool.putconn(conn)
+
             self._is_connected = True
 
         except Exception as e:
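The "NullStores" referenced in the new comment above are the no-op defaults a loader falls back to when checkpointing or idempotency is disabled; connect() swaps them for the database-backed implementations. A minimal sketch of that null-object default follows, assuming class and method names that are illustrative only and not taken from this diff.

# Illustrative sketch only: the real NullStore classes live in the streaming
# package and may differ; these names and signatures are assumptions.
class NullProcessedRangesStore:
    """No-op idempotency store used while idempotency is disabled."""

    def is_processed(self, connection_name, table_name, ranges):
        return False  # never reports a duplicate, so every batch is loaded

    def mark_processed(self, connection_name, table_name, ranges, batch_hash=None):
        pass  # nothing is persisted

# After connect(), a loader with idempotency enabled holds a
# DatabaseProcessedRangesStore instead, so the same calls are backed by PostgreSQL.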
@@ -109,6 +126,90 @@ def _load_batch_impl(self, batch: pa.RecordBatch, table_name: str, **kwargs) ->
         finally:
             self.pool.putconn(conn)
 
+    def load_batch_transactional(
+        self,
+        batch: pa.RecordBatch,
+        table_name: str,
+        connection_name: str,
+        ranges: List[BlockRange],
+        batch_hash: Optional[str] = None,
+    ) -> int:
+        """
+        Load a batch with transactional exactly-once semantics.
+
+        This method wraps the duplicate check, data loading, and processed marking
+        in a single PostgreSQL transaction, ensuring atomic exactly-once processing.
+
+        The transaction flow:
+        1. BEGIN TRANSACTION
+        2. Check if batch already processed (with SELECT FOR UPDATE lock)
+        3. If not processed:
+           - Load data into target table
+           - Mark ranges as processed in processed_ranges table
+        4. COMMIT (or ROLLBACK on error)
+
+        This guarantees that either both operations succeed or both fail,
+        preventing duplicate data even in case of crashes between operations.
+
+        Args:
+            batch: PyArrow RecordBatch to load
+            table_name: Target table name
+            connection_name: Connection identifier for tracking
+            ranges: Block ranges covered by this batch
+            batch_hash: Optional hash for additional validation
+
+        Returns:
+            Number of rows loaded (0 if duplicate)
+        """
+        if not self.idempotency_config.enabled:
+            raise ValueError('Transactional loading requires idempotency to be enabled')
+
+        conn = self.pool.getconn()
+        try:
+            # Create processed ranges store with this connection for transactional operations
+            store = DatabaseProcessedRangesStore(self.idempotency_config, conn)
+
+            # Disable autocommit to manage the transaction manually
+            original_autocommit = conn.autocommit
+            conn.autocommit = False
+
+            try:
+                # Check if already processed (within the transaction)
+                if store.is_processed(connection_name, table_name, ranges):
+                    self.logger.info(
+                        f'Batch already processed (ranges: {[f"{r.network}:{r.start}-{r.end}" for r in ranges]}), '
+                        f'skipping (transactional check)'
+                    )
+                    conn.rollback()
+                    return 0
+
+                # Load data within the transaction
+                with conn.cursor() as cur:
+                    self._copy_arrow_data(cur, batch, table_name)
+
+                # Mark as processed within the same transaction
+                store.mark_processed(connection_name, table_name, ranges, batch_hash)
+
+                # Commit transaction - both data load and processed marking succeed atomically
+                conn.commit()
+                self.logger.debug(
+                    f'Transactional batch load committed: {batch.num_rows} rows, '
+                    f'ranges: {[f"{r.network}:{r.start}-{r.end}" for r in ranges]}'
+                )
+                return batch.num_rows
+
+            except Exception as e:
+                # Rollback on any error - ensures no partial state
+                conn.rollback()
+                self.logger.error(f'Transactional batch load failed, rolled back: {e}')
+                raise
+            finally:
+                # Restore the original autocommit setting
+                conn.autocommit = original_autocommit
+
+        finally:
+            self.pool.putconn(conn)
+
     def _clear_table(self, table_name: str) -> None:
         """Clear table for overwrite mode"""
         conn = self.pool.getconn()
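For illustration, the streaming side might drive load_batch_transactional roughly as in the sketch below. Only the method signature comes from the diff; the loader construction, the BlockRange keyword arguments, and the table/connection names are assumptions.

import pyarrow as pa

# Assumed: `loader` is an already-connected PostgreSQL loader with idempotency
# enabled, and BlockRange is imported from the project's streaming package.
batch = pa.RecordBatch.from_pydict({'block_number': [100, 101], 'tx_count': [42, 7]})
ranges = [BlockRange(network='ethereum', start=100, end=101)]  # assumed constructor

rows = loader.load_batch_transactional(
    batch,
    table_name='blocks',
    connection_name='eth_mainnet_stream',
    ranges=ranges,
)
if rows == 0:
    # The ranges were already marked processed: the transaction rolled back
    # without writing, so replaying the same batch after a crash is safe.
    ...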
@@ -208,11 +309,9 @@ def _create_table_from_schema(self, schema: pa.Schema, table_name: str) -> None:
 
         # Build CREATE TABLE statement
         columns = []
-        # Check if this is streaming data with metadata columns
-        has_metadata = any(field.name.startswith('_meta_') for field in schema)
 
         for field in schema:
-            # Skip generic metadata columns - we'll use _meta_block_range instead
+            # Skip generic metadata columns - we'll use _meta_block_ranges instead
             if field.name in ('_meta_range_start', '_meta_range_end'):
                 continue
             # Special handling for JSONB metadata column
@@ -258,13 +357,14 @@ def _create_table_from_schema(self, schema: pa.Schema, table_name: str) -> None:
             # Quote column name for safety (important for blockchain field names)
             columns.append(f'"{field.name}" {pg_type} {nullable}')
 
-        # Add metadata columns for streaming/reorg support if this is streaming data
-        # but only if they don't already exist in the schema
-        if has_metadata:
-            schema_field_names = [field.name for field in schema]
-            if '_meta_block_ranges' not in schema_field_names:
-                # Use JSONB for multi-network block ranges with GIN index support
-                columns.append('"_meta_block_ranges" JSONB')
+        # Always add metadata column for streaming/reorg support
+        # This supports hybrid streaming (parallel catch-up → continuous streaming)
+        # where initial batches don't have metadata but later ones do
+        schema_field_names = [field.name for field in schema]
+        if '_meta_block_ranges' not in schema_field_names:
+            # Use JSONB for multi-network block ranges with GIN index support
+            # This column is optional and can be NULL for non-streaming loads
+            columns.append('"_meta_block_ranges" JSONB')
 
         # Create the table - Fixed: use proper identifier quoting
         create_sql = f"""
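The "_meta_block_ranges" JSONB column added above pairs naturally with a GIN index so reorg handling can locate affected rows with containment queries. The snippet below is a hedged sketch: the table name, index name, DSN, and the assumed JSON shape (a list of objects with network/start/end keys mirroring the BlockRange fields) are illustrative and not taken from this diff.

import psycopg2

conn = psycopg2.connect('dbname=analytics')  # placeholder DSN
with conn, conn.cursor() as cur:
    # A GIN index makes jsonb containment (@>) lookups on the metadata column fast.
    cur.execute(
        'CREATE INDEX IF NOT EXISTS idx_blocks_meta_block_ranges '
        'ON "blocks" USING GIN ("_meta_block_ranges")'
    )
    # Example containment query: rows whose recorded ranges mention a network,
    # e.g. as a starting point for invalidating data after a reorg.
    cur.execute(
        'SELECT count(*) FROM "blocks" '
        """WHERE "_meta_block_ranges" @> '[{"network": "ethereum"}]'"""
    )
    print(cur.fetchone()[0])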