Skip to content

Commit 438a11d

Browse files
author
maxime.c
committed
migrate client side filtering to concurrent cursor
1 parent e4cbaaf commit 438a11d

File tree

10 files changed

+258
-65
lines changed

10 files changed

+258
-65
lines changed

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -481,16 +481,6 @@ def _get_retriever(
481481
if retriever.cursor:
482482
retriever.cursor.set_initial_state(stream_state=stream_state)
483483

484-
# Similar to above, the ClientSideIncrementalRecordFilterDecorator cursor is a separate instance
485-
# from the one initialized on the SimpleRetriever, so it also must also have state initialized
486-
# for semi-incremental streams using is_client_side_incremental to filter properly
487-
if isinstance(retriever.record_selector, RecordSelector) and isinstance(
488-
retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator
489-
):
490-
retriever.record_selector.record_filter._cursor.set_initial_state(
491-
stream_state=stream_state
492-
) # type: ignore # After non-concurrent cursors are deprecated we can remove these cursor workarounds
493-
494484
# We zero it out here, but since this is a cursor reference, the state is still properly
495485
# instantiated for the other components that reference it
496486
retriever.cursor = None

airbyte_cdk/sources/declarative/extractors/record_filter.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,7 @@
44
from dataclasses import InitVar, dataclass
55
from typing import Any, Iterable, Mapping, Optional, Union
66

7-
from airbyte_cdk.sources.declarative.incremental import (
8-
DatetimeBasedCursor,
9-
GlobalSubstreamCursor,
10-
PerPartitionWithGlobalCursor,
11-
)
7+
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
128
from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean
139
from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
1410

@@ -53,13 +49,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
5349
"""
5450
Applies a filter to a list of records to exclude those that are older than the stream_state/start_date.
5551
56-
:param DatetimeBasedCursor date_time_based_cursor: Cursor used to extract datetime values
52+
:param Cursor cursor: Cursor used to filter out values
5753
:param PerPartitionCursor per_partition_cursor: Optional Cursor used for mapping cursor value in nested stream_state
5854
"""
5955

6056
def __init__(
6157
self,
62-
cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
58+
cursor: Union[Cursor],
6359
**kwargs: Any,
6460
):
6561
super().__init__(**kwargs)
@@ -77,7 +73,7 @@ def filter_records(
7773
for record in records
7874
if self._cursor.should_be_synced(
7975
# Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
80-
# Record stream name is empty cause it is not used durig the filtering
76+
# Record stream name is empty because it is not used during the filtering
8177
Record(data=record, associated_slice=stream_slice, stream_name="")
8278
)
8379
)

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def __init__(
8181
connector_state_converter: AbstractStreamStateConverter,
8282
cursor_field: CursorField,
8383
use_global_cursor: bool = False,
84+
attempt_to_create_cursor_if_not_provided: bool = False
8485
) -> None:
8586
self._global_cursor: Optional[StreamState] = {}
8687
self._stream_name = stream_name
@@ -125,6 +126,9 @@ def __init__(
125126

126127
self._set_initial_state(stream_state)
127128

129+
# FIXME this is a temporary field for the duration of the migration from declarative cursors to concurrent ones
130+
self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
131+
128132
@property
129133
def cursor_field(self) -> CursorField:
130134
return self._cursor_field
@@ -513,12 +517,17 @@ def _get_cursor(self, record: Record) -> ConcurrentCursor:
513517
"Invalid state as stream slices that are emitted should refer to an existing cursor"
514518
)
515519
partition_key = self._to_partition_key(record.associated_slice.partition)
516-
if partition_key not in self._cursor_per_partition:
520+
if partition_key not in self._cursor_per_partition and not self._attempt_to_create_cursor_if_not_provided:
517521
raise ValueError(
518522
"Invalid state as stream slices that are emitted should refer to an existing cursor"
519523
)
520-
cursor = self._cursor_per_partition[partition_key]
521-
return cursor
524+
elif partition_key not in self._cursor_per_partition:
525+
return self._create_cursor(
526+
self._global_cursor,
527+
self._lookback_window if self._global_cursor else 0,
528+
)
529+
else:
530+
return self._cursor_per_partition[partition_key]
522531

523532
def limit_reached(self) -> bool:
524533
return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
)
3535
from airbyte_cdk.models import FailureType, Level
3636
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
37-
from airbyte_cdk.sources.declarative import transformations
3837
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
3938
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
4039
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
@@ -1561,6 +1560,8 @@ def create_concurrent_cursor_from_perpartition_cursor(
15611560
stream_state: MutableMapping[str, Any],
15621561
partition_router: PartitionRouter,
15631562
stream_state_migrations: Optional[List[Any]] = None,
1563+
attempt_to_create_cursor_if_not_provided: bool = False,
1564+
15641565
**kwargs: Any,
15651566
) -> ConcurrentPerPartitionCursor:
15661567
component_type = component_definition.get("type")
@@ -1631,6 +1632,7 @@ def create_concurrent_cursor_from_perpartition_cursor(
16311632
connector_state_converter=connector_state_converter,
16321633
cursor_field=cursor_field,
16331634
use_global_cursor=use_global_cursor,
1635+
attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
16341636
)
16351637

16361638
@staticmethod
@@ -1937,23 +1939,10 @@ def create_declarative_stream(
19371939
and hasattr(model.incremental_sync, "is_client_side_incremental")
19381940
and model.incremental_sync.is_client_side_incremental
19391941
):
1940-
supported_slicers = (
1941-
DatetimeBasedCursor,
1942-
GlobalSubstreamCursor,
1943-
PerPartitionWithGlobalCursor,
1944-
)
1945-
if combined_slicers and not isinstance(combined_slicers, supported_slicers):
1946-
raise ValueError(
1947-
"Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
1948-
)
1949-
cursor = (
1950-
combined_slicers
1951-
if isinstance(
1952-
combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
1953-
)
1954-
else self._create_component_from_model(model=model.incremental_sync, config=config)
1942+
stream_slicer = self._build_stream_slicer_from_partition_router(
1943+
model.retriever, config, stream_name=model.name
19551944
)
1956-
1945+
cursor = self._build_concurrent_cursor(model, stream_slicer, config)
19571946
client_side_incremental_sync = {"cursor": cursor}
19581947

19591948
if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
@@ -2185,6 +2174,63 @@ def _build_incremental_cursor(
21852174
return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync
21862175
return None
21872176

2177+
def _build_concurrent_cursor(
2178+
self,
2179+
model: DeclarativeStreamModel,
2180+
stream_slicer: Optional[PartitionRouter],
2181+
config: Config,
2182+
) -> Optional[StreamSlicer]:
2183+
stream_state = self._connector_state_manager.get_stream_state(
2184+
stream_name=model.name, namespace=None
2185+
)
2186+
2187+
if model.incremental_sync and stream_slicer:
2188+
# FIXME should this be in create_concurrent_cursor_from_perpartition_cursor?
2189+
if model.state_migrations:
2190+
state_transformations = [
2191+
self._create_component_from_model(state_migration, config, declarative_stream=model)
2192+
for state_migration in model.state_migrations
2193+
]
2194+
else:
2195+
state_transformations = []
2196+
2197+
return self.create_concurrent_cursor_from_perpartition_cursor(
2198+
# type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
2199+
state_manager=self._connector_state_manager,
2200+
model_type=DatetimeBasedCursorModel,
2201+
component_definition=model.incremental_sync.__dict__,
2202+
stream_name=model.name,
2203+
stream_namespace=None,
2204+
config=config or {},
2205+
stream_state=stream_state,
2206+
stream_state_migrations=state_transformations,
2207+
partition_router=stream_slicer,
2208+
attempt_to_create_cursor_if_not_provided=True,
2209+
)
2210+
elif model.incremental_sync:
2211+
if type(model.incremental_sync) == IncrementingCountCursorModel:
2212+
return self.create_concurrent_cursor_from_incrementing_count_cursor(
2213+
model_type=IncrementingCountCursorModel,
2214+
component_definition=model.incremental_sync.__dict__,
2215+
stream_name=model.name or "",
2216+
stream_namespace=None,
2217+
config=config or {},
2218+
stream_state_migrations=model.state_migrations,
2219+
)
2220+
elif type(model.incremental_sync) == DatetimeBasedCursorModel:
2221+
return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
2222+
model_type=type(model.incremental_sync),
2223+
component_definition=model.incremental_sync.__dict__,
2224+
stream_name=model.name or "",
2225+
stream_namespace=None,
2226+
config=config or {},
2227+
stream_state_migrations=model.state_migrations,
2228+
attempt_to_create_cursor_if_not_provided=True,
2229+
)
2230+
else:
2231+
raise ValueError(f"Incremental sync of type {type(model.incremental_sync)} is not supported")
2232+
return None
2233+
21882234
def _build_resumable_cursor(
21892235
self,
21902236
model: Union[

airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929

3030
class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
31+
3132
CURSOR_FIELD = "_ab_source_file_last_modified"
3233
DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = (
3334
DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
@@ -311,3 +312,6 @@ def set_initial_state(self, value: StreamState) -> None:
311312

312313
def ensure_at_least_one_state_emitted(self) -> None:
313314
self.emit_state_message()
315+
316+
def should_be_synced(self, record: Record) -> bool:
317+
return True

airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,6 @@ def ensure_at_least_one_state_emitted(self) -> None:
8181
self._stream_name, self._stream_namespace
8282
)
8383
self._message_repository.emit_message(state_message)
84+
85+
def should_be_synced(self, record: Record) -> bool:
86+
return True

airbyte_cdk/sources/streams/concurrent/cursor.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ def ensure_at_least_one_state_emitted(self) -> None:
7474
"""
7575
raise NotImplementedError()
7676

77+
@abstractmethod
78+
def should_be_synced(self, record: Record) -> bool:
79+
pass
80+
7781
def stream_slices(self) -> Iterable[StreamSlice]:
7882
"""
7983
Default placeholder implementation of generate_slices.
@@ -123,6 +127,9 @@ def ensure_at_least_one_state_emitted(self) -> None:
123127
)
124128
self._message_repository.emit_message(state_message)
125129

130+
def should_be_synced(self, record: Record) -> bool:
131+
return True
132+
126133

127134
class ConcurrentCursor(Cursor):
128135
_START_BOUNDARY = 0
@@ -192,9 +199,17 @@ def _get_concurrent_state(
192199
self, state: MutableMapping[str, Any]
193200
) -> Tuple[CursorValueType, MutableMapping[str, Any]]:
194201
if self._connector_state_converter.is_state_message_compatible(state):
202+
partitioned_state = self._connector_state_converter.deserialize(state)
203+
slices_from_partitioned_state = partitioned_state.get("slices", [])
204+
205+
value_from_partitioned_state = None
206+
if slices_from_partitioned_state:
207+
# We assume here that the slices have been already merged
208+
first_slice = slices_from_partitioned_state[0]
209+
value_from_partitioned_state = first_slice[self._connector_state_converter.MOST_RECENT_RECORD_KEY] if self._connector_state_converter.MOST_RECENT_RECORD_KEY in first_slice else first_slice[self._connector_state_converter.END_KEY]
195210
return (
196-
self._start or self._connector_state_converter.zero_value,
197-
self._connector_state_converter.deserialize(state),
211+
value_from_partitioned_state or self._start or self._connector_state_converter.zero_value,
212+
partitioned_state,
198213
)
199214
return self._connector_state_converter.convert_from_sequential_state(
200215
self._cursor_field, state, self._start
@@ -471,7 +486,7 @@ def should_be_synced(self, record: Record) -> bool:
471486
except ValueError:
472487
self._log_for_record_without_cursor_value()
473488
return True
474-
return self.start <= record_cursor_value <= self._end_provider()
489+
return self.start <= record_cursor_value
475490

476491
def _log_for_record_without_cursor_value(self) -> None:
477492
if not self._should_be_synced_logger_triggered:

unit_tests/sources/declarative/parsers/test_model_to_component_factory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1424,7 +1424,7 @@ def test_client_side_incremental_with_partition_router():
14241424
assert stream.retriever.record_selector.transform_before_filtering == True
14251425
assert isinstance(
14261426
stream.retriever.record_selector.record_filter._cursor,
1427-
PerPartitionWithGlobalCursor,
1427+
ConcurrentPerPartitionCursor,
14281428
)
14291429

14301430

unit_tests/sources/declarative/test_concurrent_declarative_source.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,9 +1891,7 @@ def test_stream_using_is_client_side_incremental_has_cursor_state():
18911891
simple_retriever = locations_stream._stream_partition_generator._partition_factory._retriever
18921892
record_filter = simple_retriever.record_selector.record_filter
18931893
assert isinstance(record_filter, ClientSideIncrementalRecordFilterDecorator)
1894-
client_side_incremental_cursor_state = record_filter._cursor._cursor
1895-
1896-
assert client_side_incremental_cursor_state == expected_cursor_value
1894+
assert list(record_filter._cursor.state.values()) == [expected_cursor_value]
18971895

18981896

18991897
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)