Skip to content

Commit 4459243

Browse files
tolik0 and maxi297 authored
feat(Low-Code Concurrent CDK): Add ConcurrentPerPartitionCursor (#111)
Co-authored-by: Maxime Carbonneau-Leclerc <[email protected]>
1 parent 8963a3c commit 4459243

File tree

14 files changed

+2910
-86
lines changed

14 files changed

+2910
-86
lines changed

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

Lines changed: 80 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
ClientSideIncrementalRecordFilterDecorator,
2121
)
2222
from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
23+
from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
24+
PerPartitionWithGlobalCursor,
25+
)
2326
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
2427
from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
2528
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -32,7 +35,7 @@
3235
ModelToComponentFactory,
3336
)
3437
from airbyte_cdk.sources.declarative.requesters import HttpRequester
35-
from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever
38+
from airbyte_cdk.sources.declarative.retrievers import Retriever, SimpleRetriever
3639
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
3740
DeclarativePartitionFactory,
3841
StreamSlicerPartitionGenerator,
@@ -231,21 +234,7 @@ def _group_streams(
231234
stream_state=stream_state,
232235
)
233236

234-
retriever = declarative_stream.retriever
235-
236-
# This is an optimization so that we don't invoke any cursor or state management flows within the
237-
# low-code framework because state management is handled through the ConcurrentCursor.
238-
if declarative_stream and isinstance(retriever, SimpleRetriever):
239-
# Also a temporary hack. In the legacy Stream implementation, as part of the read,
240-
# set_initial_state() is called to instantiate incoming state on the cursor. Although we no
241-
# longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
242-
# like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
243-
# still rely on a DatetimeBasedCursor that is properly initialized with state.
244-
if retriever.cursor:
245-
retriever.cursor.set_initial_state(stream_state=stream_state)
246-
# We zero it out here, but since this is a cursor reference, the state is still properly
247-
# instantiated for the other components that reference it
248-
retriever.cursor = None
237+
retriever = self._get_retriever(declarative_stream, stream_state)
249238

250239
partition_generator = StreamSlicerPartitionGenerator(
251240
DeclarativePartitionFactory(
@@ -305,6 +294,60 @@ def _group_streams(
305294
cursor=final_state_cursor,
306295
)
307296
)
297+
elif (
298+
incremental_sync_component_definition
299+
and incremental_sync_component_definition.get("type", "")
300+
== DatetimeBasedCursorModel.__name__
301+
and self._stream_supports_concurrent_partition_processing(
302+
declarative_stream=declarative_stream
303+
)
304+
and hasattr(declarative_stream.retriever, "stream_slicer")
305+
and isinstance(
306+
declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
307+
)
308+
):
309+
stream_state = state_manager.get_stream_state(
310+
stream_name=declarative_stream.name, namespace=declarative_stream.namespace
311+
)
312+
partition_router = declarative_stream.retriever.stream_slicer._partition_router
313+
314+
perpartition_cursor = (
315+
self._constructor.create_concurrent_cursor_from_perpartition_cursor(
316+
state_manager=state_manager,
317+
model_type=DatetimeBasedCursorModel,
318+
component_definition=incremental_sync_component_definition,
319+
stream_name=declarative_stream.name,
320+
stream_namespace=declarative_stream.namespace,
321+
config=config or {},
322+
stream_state=stream_state,
323+
partition_router=partition_router,
324+
)
325+
)
326+
327+
retriever = self._get_retriever(declarative_stream, stream_state)
328+
329+
partition_generator = StreamSlicerPartitionGenerator(
330+
DeclarativePartitionFactory(
331+
declarative_stream.name,
332+
declarative_stream.get_json_schema(),
333+
retriever,
334+
self.message_repository,
335+
),
336+
perpartition_cursor,
337+
)
338+
339+
concurrent_streams.append(
340+
DefaultStream(
341+
partition_generator=partition_generator,
342+
name=declarative_stream.name,
343+
json_schema=declarative_stream.get_json_schema(),
344+
availability_strategy=AlwaysAvailableAvailabilityStrategy(),
345+
primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
346+
cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
347+
logger=self.logger,
348+
cursor=perpartition_cursor,
349+
)
350+
)
308351
else:
309352
synchronous_streams.append(declarative_stream)
310353
else:
@@ -395,6 +438,27 @@ def _stream_supports_concurrent_partition_processing(
395438
return False
396439
return True
397440

441+
def _get_retriever(
442+
self, declarative_stream: DeclarativeStream, stream_state: Mapping[str, Any]
443+
) -> Retriever:
444+
retriever = declarative_stream.retriever
445+
446+
# This is an optimization so that we don't invoke any cursor or state management flows within the
447+
# low-code framework because state management is handled through the ConcurrentCursor.
448+
if declarative_stream and isinstance(retriever, SimpleRetriever):
449+
# Also a temporary hack. In the legacy Stream implementation, as part of the read,
450+
# set_initial_state() is called to instantiate incoming state on the cursor. Although we no
451+
# longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
452+
# like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
453+
# still rely on a DatetimeBasedCursor that is properly initialized with state.
454+
if retriever.cursor:
455+
retriever.cursor.set_initial_state(stream_state=stream_state)
456+
# We zero it out here, but since this is a cursor reference, the state is still properly
457+
# instantiated for the other components that reference it
458+
retriever.cursor = None
459+
460+
return retriever
461+
398462
@staticmethod
399463
def _select_streams(
400464
streams: List[AbstractStream], configured_catalog: ConfiguredAirbyteCatalog

airbyte_cdk/sources/declarative/extractors/record_filter.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,11 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
5959

6060
def __init__(
6161
self,
62-
date_time_based_cursor: DatetimeBasedCursor,
63-
substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
62+
cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
6463
**kwargs: Any,
6564
):
6665
super().__init__(**kwargs)
67-
self._date_time_based_cursor = date_time_based_cursor
68-
self._substream_cursor = substream_cursor
66+
self._cursor = cursor
6967

7068
def filter_records(
7169
self,
@@ -77,7 +75,7 @@ def filter_records(
7775
records = (
7876
record
7977
for record in records
80-
if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
78+
if self._cursor.should_be_synced(
8179
# Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
8280
# Record stream name is empty because it is not used during the filtering
8381
Record(data=record, associated_slice=stream_slice, stream_name="")

airbyte_cdk/sources/declarative/incremental/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
33
#
44

5+
from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
6+
ConcurrentCursorFactory,
7+
ConcurrentPerPartitionCursor,
8+
)
59
from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
610
from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
711
from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -21,6 +25,8 @@
2125

2226
__all__ = [
2327
"CursorFactory",
28+
"ConcurrentCursorFactory",
29+
"ConcurrentPerPartitionCursor",
2430
"DatetimeBasedCursor",
2531
"DeclarativeCursor",
2632
"GlobalSubstreamCursor",

0 commit comments

Comments
 (0)