From 1d5b468c6b707ac446667f96b5b1ca7a476e8d16 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 09:37:25 -0400 Subject: [PATCH 01/68] remove --- .../concurrent_declarative_source.py | 6 --- .../availability_strategy/__init__.py | 6 +-- ...stract_file_based_availability_strategy.py | 28 +----------- .../stream/abstract_file_based_stream.py | 1 - .../file_based/stream/concurrent/adapters.py | 6 +-- .../sources/streams/availability_strategy.py | 1 + .../streams/concurrent/abstract_stream.py | 7 --- .../sources/streams/concurrent/adapters.py | 43 ------------------- .../streams/concurrent/default_stream.py | 9 ---- ...hread_based_concurrent_stream_scenarios.py | 10 ----- .../streams/concurrent/test_adapters.py | 32 -------------- .../streams/concurrent/test_default_stream.py | 13 ------ 12 files changed, 4 insertions(+), 158 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 1d629f0c7..cc59b1554 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -52,9 +52,6 @@ from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AlwaysAvailableAvailabilityStrategy, -) from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream @@ -325,7 +322,6 @@ def _group_streams( partition_generator=partition_generator, name=declarative_stream.name, json_schema=declarative_stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=get_primary_key_from_stream(declarative_stream.primary_key), cursor_field=cursor.cursor_field.cursor_field_key if hasattr(cursor, "cursor_field") @@ -362,7 +358,6 @@ def _group_streams( partition_generator=partition_generator, name=declarative_stream.name, json_schema=declarative_stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=get_primary_key_from_stream(declarative_stream.primary_key), cursor_field=None, logger=self.logger, @@ -417,7 +412,6 @@ def _group_streams( partition_generator=partition_generator, name=declarative_stream.name, json_schema=declarative_stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=get_primary_key_from_stream(declarative_stream.primary_key), cursor_field=perpartition_cursor.cursor_field.cursor_field_key, logger=self.logger, diff --git a/airbyte_cdk/sources/file_based/availability_strategy/__init__.py b/airbyte_cdk/sources/file_based/availability_strategy/__init__.py index 8134a89e0..ee3c802df 100644 --- a/airbyte_cdk/sources/file_based/availability_strategy/__init__.py +++ b/airbyte_cdk/sources/file_based/availability_strategy/__init__.py @@ -1,11 +1,7 @@ -from .abstract_file_based_availability_strategy import ( - AbstractFileBasedAvailabilityStrategy, - AbstractFileBasedAvailabilityStrategyWrapper, -) +from .abstract_file_based_availability_strategy import AbstractFileBasedAvailabilityStrategy from .default_file_based_availability_strategy import 
DefaultFileBasedAvailabilityStrategy __all__ = [ "AbstractFileBasedAvailabilityStrategy", - "AbstractFileBasedAvailabilityStrategyWrapper", "DefaultFileBasedAvailabilityStrategy", ] diff --git a/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py b/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py index 12e1740b6..c7ae6ff43 100644 --- a/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +++ b/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py @@ -10,12 +10,6 @@ from airbyte_cdk.sources import Source from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AbstractAvailabilityStrategy, - StreamAvailability, - StreamAvailable, - StreamUnavailable, -) from airbyte_cdk.sources.streams.core import Stream if TYPE_CHECKING: @@ -28,7 +22,7 @@ def check_availability( # type: ignore[override] # Signature doesn't match bas self, stream: Stream, logger: logging.Logger, - _: Optional[Source], + source: Optional[Source] = None, ) -> Tuple[bool, Optional[str]]: """ Perform a connection check for the stream. @@ -51,23 +45,3 @@ def check_availability_and_parsability( Returns (True, None) if successful, otherwise (False, ). """ ... - - -class AbstractFileBasedAvailabilityStrategyWrapper(AbstractAvailabilityStrategy): - def __init__(self, stream: AbstractFileBasedStream) -> None: - self.stream = stream - - def check_availability(self, logger: logging.Logger) -> StreamAvailability: - is_available, reason = self.stream.availability_strategy.check_availability( - self.stream, logger, None - ) - if is_available: - return StreamAvailable() - return StreamUnavailable(reason or "") - - def check_availability_and_parsability( - self, logger: logging.Logger - ) -> Tuple[bool, Optional[str]]: - return self.stream.availability_strategy.check_availability_and_parsability( - self.stream, logger, None - ) diff --git a/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py index ef258b34d..e3fb0179e 100644 --- a/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py @@ -179,7 +179,6 @@ def record_passes_validation_policy(self, record: Mapping[str, Any]) -> bool: ) @cached_property - @deprecated("Deprecated as of CDK version 3.7.0.") def availability_strategy(self) -> AbstractFileBasedAvailabilityStrategy: return self._availability_strategy diff --git a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py index c36e5179d..67d0922f1 100644 --- a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +++ b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py @@ -19,10 +19,7 @@ ) from airbyte_cdk.sources import AbstractSource from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager -from airbyte_cdk.sources.file_based.availability_strategy import ( - AbstractFileBasedAvailabilityStrategy, - AbstractFileBasedAvailabilityStrategyWrapper, -) +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType from 
airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile @@ -97,7 +94,6 @@ def create_from_stream( ), name=stream.name, json_schema=stream.get_json_schema(), - availability_strategy=AbstractFileBasedAvailabilityStrategyWrapper(stream), primary_key=pk, cursor_field=cursor_field, logger=logger, diff --git a/airbyte_cdk/sources/streams/availability_strategy.py b/airbyte_cdk/sources/streams/availability_strategy.py index 312ddae19..96a2c9bc9 100644 --- a/airbyte_cdk/sources/streams/availability_strategy.py +++ b/airbyte_cdk/sources/streams/availability_strategy.py @@ -14,6 +14,7 @@ from airbyte_cdk.sources import Source +# FIXME this class AvailabilityStrategy(ABC): """ Abstract base class for checking stream availability. diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 26e6f09d4..33e7c4d10 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -9,7 +9,6 @@ from airbyte_cdk.models import AirbyteStream from airbyte_cdk.sources.source import ExperimentalClassWarning -from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -64,12 +63,6 @@ def cursor_field(self) -> Optional[str]: :return: The name of the field used as a cursor. Nested cursor fields are not supported. """ - @abstractmethod - def check_availability(self) -> StreamAvailability: - """ - :return: The stream's availability - """ - @abstractmethod def get_json_schema(self) -> Mapping[str, Any]: """ diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 7da594155..949f0545b 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -24,12 +24,7 @@ from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.streams import Stream -from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AbstractAvailabilityStrategy, - AlwaysAvailableAvailabilityStrategy, -) from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage @@ -101,7 +96,6 @@ def create_from_stream( name=stream.name, namespace=stream.namespace, json_schema=stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=pk, cursor_field=cursor_field, logger=logger, @@ -210,18 +204,6 @@ def get_json_schema(self) -> Mapping[str, Any]: def supports_incremental(self) -> bool: return self._legacy_stream.supports_incremental - def check_availability( - self, logger: logging.Logger, source: Optional["Source"] = None - ) -> Tuple[bool, Optional[str]]: - """ - Verifies the stream is available. 
Delegates to the underlying AbstractStream and ignores the parameters - :param logger: (ignored) - :param source: (ignored) - :return: - """ - availability = self._abstract_stream.check_availability() - return availability.is_available(), availability.message() - def as_airbyte_stream(self) -> AirbyteStream: return self._abstract_stream.as_airbyte_stream() @@ -370,28 +352,3 @@ def generate(self) -> Iterable[Partition]: self._cursor_field, self._state, ) - - -@deprecated( - "Availability strategy has been soft deprecated. Do not use. Class is subject to removal", - category=ExperimentalClassWarning, -) -class AvailabilityStrategyFacade(AvailabilityStrategy): - def __init__(self, abstract_availability_strategy: AbstractAvailabilityStrategy): - self._abstract_availability_strategy = abstract_availability_strategy - - def check_availability( - self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None - ) -> Tuple[bool, Optional[str]]: - """ - Checks stream availability. - - Important to note that the stream and source parameters are not used by the underlying AbstractAvailabilityStrategy. - - :param stream: (unused) - :param logger: logger object to use - :param source: (unused) - :return: A tuple of (boolean, str). If boolean is true, then the stream - """ - stream_availability = self._abstract_availability_strategy.check_availability(logger) - return stream_availability.is_available(), stream_availability.message() diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 54600d635..70ddd7d16 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -8,10 +8,6 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AbstractAvailabilityStrategy, - StreamAvailability, -) from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator @@ -23,7 +19,6 @@ def __init__( partition_generator: PartitionGenerator, name: str, json_schema: Mapping[str, Any], - availability_strategy: AbstractAvailabilityStrategy, primary_key: List[str], cursor_field: Optional[str], logger: Logger, @@ -34,7 +29,6 @@ def __init__( self._stream_partition_generator = partition_generator self._name = name self._json_schema = json_schema - self._availability_strategy = availability_strategy self._primary_key = primary_key self._cursor_field = cursor_field self._logger = logger @@ -53,9 +47,6 @@ def name(self) -> str: def namespace(self) -> Optional[str]: return self._namespace - def check_availability(self) -> StreamAvailability: - return self._availability_strategy.check_availability(self._logger) - @property def cursor_field(self) -> Optional[str]: return self._cursor_field diff --git a/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py b/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py index 185c5dceb..7db65b53d 100644 --- a/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +++ b/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py @@ -5,9 +5,6 @@ 
import logging from airbyte_cdk.sources.message import InMemoryMessageRepository -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AlwaysAvailableAvailabilityStrategy, -) from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.types import Record @@ -48,7 +45,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -84,7 +80,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -120,7 +115,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=["id"], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -171,7 +165,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -222,7 +215,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -255,7 +247,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -397,7 +388,6 @@ "key": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), diff --git a/unit_tests/sources/streams/concurrent/test_adapters.py b/unit_tests/sources/streams/concurrent/test_adapters.py index 66f48a9e0..82c5c91cb 100644 --- a/unit_tests/sources/streams/concurrent/test_adapters.py +++ b/unit_tests/sources/streams/concurrent/test_adapters.py @@ -18,7 +18,6 @@ from airbyte_cdk.models import Type as MessageType from airbyte_cdk.sources.message import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.adapters import ( - AvailabilityStrategyFacade, StreamFacade, StreamPartition, StreamPartitionGenerator, @@ -42,28 +41,6 @@ _ANY_CURSOR = Mock(spec=Cursor) -@pytest.mark.parametrize( - "stream_availability, expected_available, expected_message", - [ - pytest.param(StreamAvailable(), True, None, id="test_stream_is_available"), - pytest.param(STREAM_AVAILABLE, True, None, id="test_stream_is_available_using_singleton"), - pytest.param(StreamUnavailable("message"), False, "message", id="test_stream_is_available"), - ], -) -def test_availability_strategy_facade(stream_availability, expected_available, expected_message): - strategy = Mock() - strategy.check_availability.return_value = stream_availability - facade = AvailabilityStrategyFacade(strategy) - - logger = Mock() - available, message = facade.check_availability(Mock(), logger, Mock()) - - assert available == expected_available - assert message == expected_message - - strategy.check_availability.assert_called_once_with(logger) - - @pytest.mark.parametrize( "sync_mode", [ @@ -319,15 +296,6 @@ def test_given_cursor_is_not_noop_when_supports_incremental_then_return_true(sel Mock(spec=logging.Logger), ).supports_incremental - def test_check_availability_is_delegated_to_wrapped_stream(self): - availability = 
StreamAvailable() - self._abstract_stream.check_availability.return_value = availability - assert self._facade.check_availability(Mock(), Mock()) == ( - availability.is_available(), - availability.message(), - ) - self._abstract_stream.check_availability.assert_called_once_with() - def test_full_refresh(self): expected_stream_data = [{"data": 1}, {"data": 2}] records = [Record(data, "stream") for data in expected_stream_data] diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 2c9afe4da..dc2624eee 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -16,7 +16,6 @@ def setUp(self): self._partition_generator = Mock() self._name = "name" self._json_schema = {} - self._availability_strategy = Mock() self._primary_key = [] self._cursor_field = None self._logger = Mock() @@ -26,7 +25,6 @@ def setUp(self): self._partition_generator, self._name, self._json_schema, - self._availability_strategy, self._primary_key, self._cursor_field, self._logger, @@ -41,12 +39,6 @@ def test_get_json_schema(self): json_schema = self._stream.get_json_schema() assert json_schema == self._json_schema - def test_check_availability(self): - self._availability_strategy.check_availability.return_value = STREAM_AVAILABLE - availability = self._stream.check_availability() - assert availability == STREAM_AVAILABLE - self._availability_strategy.check_availability.assert_called_once_with(self._logger) - def test_check_for_error_raises_an_exception_if_any_of_the_futures_are_not_done(self): futures = [Mock() for _ in range(3)] for f in futures: @@ -93,7 +85,6 @@ def test_as_airbyte_stream_with_primary_key(self): self._partition_generator, self._name, json_schema, - self._availability_strategy, ["composite_key_1", "composite_key_2"], self._cursor_field, self._logger, @@ -131,7 +122,6 @@ def test_as_airbyte_stream_with_composite_primary_key(self): self._partition_generator, self._name, json_schema, - self._availability_strategy, ["id_a", "id_b"], self._cursor_field, self._logger, @@ -169,7 +159,6 @@ def test_as_airbyte_stream_with_a_cursor(self): self._partition_generator, self._name, json_schema, - self._availability_strategy, self._primary_key, "date", self._logger, @@ -200,7 +189,6 @@ def test_as_airbyte_stream_with_namespace(self): self._partition_generator, self._name, self._json_schema, - self._availability_strategy, self._primary_key, self._cursor_field, self._logger, @@ -231,7 +219,6 @@ def test_as_airbyte_stream_with_file_transfer_support(self): self._partition_generator, self._name, self._json_schema, - self._availability_strategy, self._primary_key, self._cursor_field, self._logger, From 76ac6f7ae6dda9f28da2f43b6a4de8b085d33e8a Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 31 Jul 2025 13:38:38 +0000 Subject: [PATCH 02/68] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py index 67d0922f1..fd8eef9b0 100644 --- a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +++ b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py @@ -19,7 +19,9 @@ ) from airbyte_cdk.sources import AbstractSource from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager -from 
airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.availability_strategy import ( + AbstractFileBasedAvailabilityStrategy, +) from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile From 2d1e2f43396cabe792a9101af3841b5e7acc79ce Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 12:08:24 -0400 Subject: [PATCH 03/68] remove unused file --- .../concurrent/availability_strategy.py | 94 ------------------- .../streams/concurrent/test_adapters.py | 5 - .../streams/concurrent/test_default_stream.py | 1 - 3 files changed, 100 deletions(-) delete mode 100644 airbyte_cdk/sources/streams/concurrent/availability_strategy.py diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py deleted file mode 100644 index 118a7d0bb..000000000 --- a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# - -import logging -from abc import ABC, abstractmethod -from typing import Optional - -from typing_extensions import deprecated - -from airbyte_cdk.sources.source import ExperimentalClassWarning - - -class StreamAvailability(ABC): - @abstractmethod - def is_available(self) -> bool: - """ - :return: True if the stream is available. False if the stream is not - """ - - @abstractmethod - def message(self) -> Optional[str]: - """ - :return: A message describing why the stream is not available. If the stream is available, this should return None. - """ - - -class StreamAvailable(StreamAvailability): - def is_available(self) -> bool: - return True - - def message(self) -> Optional[str]: - return None - - -class StreamUnavailable(StreamAvailability): - def __init__(self, message: str): - self._message = message - - def is_available(self) -> bool: - return False - - def message(self) -> Optional[str]: - return self._message - - -# Singleton instances of StreamAvailability to avoid the overhead of creating new dummy objects -STREAM_AVAILABLE = StreamAvailable() - - -@deprecated( - "This class is experimental. Use at your own risk.", - category=ExperimentalClassWarning, -) -class AbstractAvailabilityStrategy(ABC): - """ - AbstractAvailabilityStrategy is an experimental interface developed as part of the Concurrent CDK. - This interface is not yet stable and may change in the future. Use at your own risk. - - Why create a new interface instead of using the existing AvailabilityStrategy? - The existing AvailabilityStrategy is tightly coupled with Stream and Source, which yields to circular dependencies and makes it difficult to move away from the Stream interface to AbstractStream. - """ - - @abstractmethod - def check_availability(self, logger: logging.Logger) -> StreamAvailability: - """ - Checks stream availability. - - :param logger: logger object to use - :return: A StreamAvailability object describing the stream's availability - """ - - -@deprecated( - "This class is experimental. Use at your own risk.", - category=ExperimentalClassWarning, -) -class AlwaysAvailableAvailabilityStrategy(AbstractAvailabilityStrategy): - """ - An availability strategy that always indicates a stream is available. 
- - This strategy is used to avoid breaking changes and serves as a soft - deprecation of the availability strategy, allowing a smoother transition - without disrupting existing functionality. - """ - - def check_availability(self, logger: logging.Logger) -> StreamAvailability: - """ - Checks stream availability. - - :param logger: logger object to use - :return: A StreamAvailability object describing the stream's availability - """ - return StreamAvailable() diff --git a/unit_tests/sources/streams/concurrent/test_adapters.py b/unit_tests/sources/streams/concurrent/test_adapters.py index 82c5c91cb..68efbc941 100644 --- a/unit_tests/sources/streams/concurrent/test_adapters.py +++ b/unit_tests/sources/streams/concurrent/test_adapters.py @@ -22,11 +22,6 @@ StreamPartition, StreamPartitionGenerator, ) -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - STREAM_AVAILABLE, - StreamAvailable, - StreamUnavailable, -) from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage from airbyte_cdk.sources.streams.core import Stream diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index dc2624eee..7cfc3ac05 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -6,7 +6,6 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.message import InMemoryMessageRepository -from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream From b4a5fecb7f8f5f7572076341cda7eec90ad3524c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 16:43:19 -0400 Subject: [PATCH 04/68] have declarative availability check support AbstractStream --- .../checks/check_dynamic_stream.py | 15 ++-- .../declarative/checks/check_stream.py | 35 ++++++---- .../streams/concurrent/abstract_stream.py | 7 ++ .../concurrent/availability_strategy.py | 38 +++++++++++ .../streams/concurrent/default_stream.py | 29 ++++++++ .../declarative/checks/test_check_stream.py | 7 +- .../streams/concurrent/test_default_stream.py | 68 ++++++++++++++++++- 7 files changed, 174 insertions(+), 25 deletions(-) create mode 100644 airbyte_cdk/sources/streams/concurrent/availability_strategy.py diff --git a/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py b/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py index 64d90de19..876750e4a 100644 --- a/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +++ b/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py @@ -3,12 +3,13 @@ # import logging -import traceback from dataclasses import InitVar, dataclass -from typing import Any, List, Mapping, Tuple +from typing import Any, List, Mapping, Tuple, Union -from airbyte_cdk import AbstractSource +from airbyte_cdk.sources.abstract_source import AbstractSource +from airbyte_cdk.sources.declarative.checks.check_stream import evaluate_availability from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy @@ -34,20 
+35,16 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: def check_connection( self, source: AbstractSource, logger: logging.Logger, config: Mapping[str, Any] ) -> Tuple[bool, Any]: - streams = source.streams(config=config) + streams: List[Union[Stream, AbstractStream]] = source.streams(config=config) # type: ignore # this is a migration step and we expect the declarative CDK to migrate off of ConnectionChecker if len(streams) == 0: return False, f"No streams to connect to from source {source}" if not self.use_check_availability: return True, None - availability_strategy = HttpAvailabilityStrategy() - try: for stream in streams[: min(self.stream_count, len(streams))]: - stream_is_available, reason = availability_strategy.check_availability( - stream, logger - ) + stream_is_available, reason = evaluate_availability(stream, logger) if not stream_is_available: logger.warning(f"Stream {stream.name} is not available: {reason}") return False, reason diff --git a/airbyte_cdk/sources/declarative/checks/check_stream.py b/airbyte_cdk/sources/declarative/checks/check_stream.py index 1123349cb..db97098ef 100644 --- a/airbyte_cdk/sources/declarative/checks/check_stream.py +++ b/airbyte_cdk/sources/declarative/checks/check_stream.py @@ -5,13 +5,28 @@ import logging import traceback from dataclasses import InitVar, dataclass -from typing import Any, Dict, List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union -from airbyte_cdk import AbstractSource +from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.abstract_source import AbstractSource from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy +def evaluate_availability(stream: Union[Stream, AbstractStream], logger: logging.Logger) -> Tuple[bool, Optional[str]]: + """ + As a transition period, we want to support both Stream and AbstractStream until we migrate everything to AbstractStream. + """ + if isinstance(stream, Stream): + return HttpAvailabilityStrategy().check_availability(stream, logger) + elif isinstance(stream, AbstractStream): + availability = stream.check_availability() + return availability.is_available, availability.reason + else: + raise ValueError(f"Unsupported stream type {type(stream)}") + + @dataclass(frozen=True) class DynamicStreamCheckConfig: """Defines the configuration for dynamic stream during connection checking. 
This class specifies @@ -51,7 +66,7 @@ def check_connection( ) -> Tuple[bool, Any]: """Checks the connection to the source and its streams.""" try: - streams = source.streams(config=config) + streams: List[Union[Stream, AbstractStream]] = source.streams(config=config) # type: ignore # this is a migration step and we expect the declarative CDK to migrate off of ConnectionChecker if not streams: return False, f"No streams to connect to from source {source}" except Exception as error: @@ -82,13 +97,12 @@ def check_connection( return True, None def _check_stream_availability( - self, stream_name_to_stream: Dict[str, Any], stream_name: str, logger: logging.Logger + self, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], stream_name: str, logger: logging.Logger ) -> Tuple[bool, Any]: """Checks if streams are available.""" - availability_strategy = HttpAvailabilityStrategy() try: stream = stream_name_to_stream[stream_name] - stream_is_available, reason = availability_strategy.check_availability(stream, logger) + stream_is_available, reason = evaluate_availability(stream, logger) if not stream_is_available: message = f"Stream {stream_name} is not available: {reason}" logger.warning(message) @@ -98,7 +112,7 @@ def _check_stream_availability( return True, None def _check_dynamic_streams_availability( - self, source: AbstractSource, stream_name_to_stream: Dict[str, Any], logger: logging.Logger + self, source: AbstractSource, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], logger: logging.Logger ) -> Tuple[bool, Any]: """Checks the availability of dynamic streams.""" dynamic_streams = source.resolved_manifest.get("dynamic_streams", []) # type: ignore[attr-defined] # The source's resolved_manifest manifest is checked before calling this method @@ -135,18 +149,15 @@ def _map_generated_streams( def _check_generated_streams_availability( self, generated_streams: List[Dict[str, Any]], - stream_name_to_stream: Dict[str, Any], + stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], logger: logging.Logger, max_count: int, ) -> Tuple[bool, Any]: """Checks availability of generated dynamic streams.""" - availability_strategy = HttpAvailabilityStrategy() for declarative_stream in generated_streams[: min(max_count, len(generated_streams))]: stream = stream_name_to_stream[declarative_stream["name"]] try: - stream_is_available, reason = availability_strategy.check_availability( - stream, logger - ) + stream_is_available, reason = evaluate_availability(stream, logger) if not stream_is_available: message = f"Dynamic Stream {stream.name} is not available: {reason}" logger.warning(message) diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 33e7c4d10..53fa9450e 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -9,6 +9,7 @@ from airbyte_cdk.models import AirbyteStream from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -87,3 +88,9 @@ def cursor(self) -> Cursor: """ :return: The cursor associated with this stream. 
""" + + @abstractmethod + def check_availability(self) -> StreamAvailability: + """ + :return: If the stream is available and if not, why + """ diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py new file mode 100644 index 000000000..5b5288bf3 --- /dev/null +++ b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Optional + + +class StreamAvailability: + + @classmethod + def available(cls) -> "StreamAvailability": + return cls(True) + + @classmethod + def unavailable(cls, reason: str) -> "StreamAvailability": + return StreamAvailability(False, reason) + + def __init__(self, available: bool, reason: Optional[str] = None) -> None: + self._available = available + self._reason = reason + + if not available: + assert reason, "A reason needs to be provided if the stream is not available" + + @property + def is_available(self) -> bool: + """ + :return: True if the stream is available. False if the stream is not + """ + return self._available + + @property + def reason(self) -> Optional[str]: + """ + :return: A message describing why the stream is not available. If the stream is available, this should return None. + """ + return self._reason diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 70ddd7d16..7fa72d522 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -8,12 +8,15 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.utils.traced_exception import AirbyteTracedException class DefaultStream(AbstractStream): + def __init__( self, partition_generator: PartitionGenerator, @@ -91,3 +94,29 @@ def log_stream_sync_configuration(self) -> None: @property def cursor(self) -> Cursor: return self._cursor + + def check_availability(self) -> StreamAvailability: + """ + Check stream availability by attempting to read the first record of the stream. + """ + try: + partition = next(iter(self.generate_partitions())) + except StopIteration: + # NOTE: The following comment was copied from legacy stuff and I don't know how relevant it is: + # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!) + # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield ` + # without accounting for the case in which the parent stream is empty. 
+ return StreamAvailability.unavailable( + f"Cannot attempt to connect to stream {self.name} - no stream slices were found" + ) + except AirbyteTracedException as error: + return StreamAvailability.unavailable(error.message) + + try: + next(iter(partition.read())) + return StreamAvailability.available() + except StopIteration: + self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") + return StreamAvailability.available() + except AirbyteTracedException as error: + return StreamAvailability.unavailable(error.message) diff --git a/unit_tests/sources/declarative/checks/test_check_stream.py b/unit_tests/sources/declarative/checks/test_check_stream.py index 3cbaf8fd8..49dc8ef9a 100644 --- a/unit_tests/sources/declarative/checks/test_check_stream.py +++ b/unit_tests/sources/declarative/checks/test_check_stream.py @@ -17,6 +17,7 @@ ConcurrentDeclarativeSource, ) from airbyte_cdk.sources.streams.http import HttpStream +from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse logger = logging.getLogger("test") @@ -45,7 +46,7 @@ def test_check_stream_with_slices_as_list( test_name, record, streams_to_check, stream_slice, expectation, slices_as_list ): - stream = MagicMock() + stream = MagicMock(spec=Stream) stream.name = "s1" stream.availability_strategy = None if slices_as_list: @@ -77,7 +78,7 @@ def mock_read_records(responses, default_response=None, **kwargs): def test_check_empty_stream(): - stream = MagicMock() + stream = MagicMock(spec=Stream) stream.name = "s1" stream.read_records.return_value = iter([]) stream.stream_slices.return_value = iter([None]) @@ -91,7 +92,7 @@ def test_check_empty_stream(): def test_check_stream_with_no_stream_slices_aborts(): - stream = MagicMock() + stream = MagicMock(spec=Stream) stream.name = "s1" stream.stream_slices.return_value = iter([]) diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 7cfc3ac05..6159ea1e6 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -4,15 +4,22 @@ import unittest from unittest.mock import Mock +import pytest + from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.message import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.sources.types import Record +from airbyte_cdk.utils.traced_exception import AirbyteTracedException class ThreadBasedConcurrentStreamTest(unittest.TestCase): def setUp(self): - self._partition_generator = Mock() + self._partition_generator = Mock(spec=PartitionGenerator) + self._partition = Mock(spec=Partition) self._name = "name" self._json_schema = {} self._primary_key = [] @@ -243,3 +250,62 @@ def test_as_airbyte_stream_with_file_transfer_support(self): actual_airbyte_stream = stream.as_airbyte_stream() assert actual_airbyte_stream == expected_airbyte_stream + + def test_given_no_partitions_when_get_availability_then_unavailable(self) -> None: + self._partition_generator.generate.return_value = [] + + availability = self._stream.check_availability() + + assert 
availability.is_available == False + assert "no stream slices were found" in availability.reason + + def test_given_AirbyteTracedException_when_generating_partitions_when_get_availability_then_unavailable(self) -> None: + error_message = "error while generating partitions" + self._partition_generator.generate.side_effect = AirbyteTracedException(message=error_message) + + availability = self._stream.check_availability() + + assert availability.is_available == False + assert error_message in availability.reason + + def test_given_unknown_error_when_generating_partitions_when_get_availability_then_raise(self) -> None: + """ + I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy + """ + self._partition_generator.generate.side_effect = ValueError() + with pytest.raises(ValueError): + self._stream.check_availability() + + def test_given_no_records_when_get_availability_then_available(self) -> None: + self._partition_generator.generate.return_value = [self._partition] + self._partition.read.return_value = [] + + availability = self._stream.check_availability() + + assert availability.is_available == True + + def test_given_records_when_get_availability_then_available(self) -> None: + self._partition_generator.generate.return_value = [self._partition] + self._partition.read.return_value = [Mock(spec=Record)] + + availability = self._stream.check_availability() + + assert availability.is_available == True + + def test_given_AirbyteTracedException_when_reading_records_when_get_availability_then_unavailable(self) -> None: + self._partition_generator.generate.return_value = [self._partition] + error_message = "error while reading records" + self._partition.read.side_effect = AirbyteTracedException(message=error_message) + + availability = self._stream.check_availability() + + assert availability.is_available == False + + def test_given_unknown_error_when_reading_record_when_get_availability_then_raise(self) -> None: + """ + I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy + """ + self._partition_generator.generate.side_effect = ValueError() + self._partition.read.return_value = [] + with pytest.raises(ValueError): + self._stream.check_availability() From fc6c6b6128bb7ff8c1d5841c884ac25943f33028 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 31 Jul 2025 20:53:03 +0000 Subject: [PATCH 05/68] Auto-fix lint and format issues --- .../sources/declarative/checks/check_stream.py | 16 ++++++++++++---- .../streams/concurrent/availability_strategy.py | 1 - .../sources/streams/concurrent/default_stream.py | 1 - .../declarative/checks/test_check_stream.py | 2 +- .../streams/concurrent/test_default_stream.py | 16 ++++++++++++---- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/airbyte_cdk/sources/declarative/checks/check_stream.py b/airbyte_cdk/sources/declarative/checks/check_stream.py index db97098ef..73940d382 100644 --- a/airbyte_cdk/sources/declarative/checks/check_stream.py +++ b/airbyte_cdk/sources/declarative/checks/check_stream.py @@ -7,14 +7,16 @@ from dataclasses import InitVar, dataclass from typing import Any, Dict, List, Mapping, Optional, Tuple, Union -from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.abstract_source import AbstractSource from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker from 
airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy -def evaluate_availability(stream: Union[Stream, AbstractStream], logger: logging.Logger) -> Tuple[bool, Optional[str]]: +def evaluate_availability( + stream: Union[Stream, AbstractStream], logger: logging.Logger +) -> Tuple[bool, Optional[str]]: """ As a transition period, we want to support both Stream and AbstractStream until we migrate everything to AbstractStream. """ @@ -97,7 +99,10 @@ def check_connection( return True, None def _check_stream_availability( - self, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], stream_name: str, logger: logging.Logger + self, + stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], + stream_name: str, + logger: logging.Logger, ) -> Tuple[bool, Any]: """Checks if streams are available.""" try: @@ -112,7 +117,10 @@ def _check_stream_availability( return True, None def _check_dynamic_streams_availability( - self, source: AbstractSource, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], logger: logging.Logger + self, + source: AbstractSource, + stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], + logger: logging.Logger, ) -> Tuple[bool, Any]: """Checks the availability of dynamic streams.""" dynamic_streams = source.resolved_manifest.get("dynamic_streams", []) # type: ignore[attr-defined] # The source's resolved_manifest manifest is checked before calling this method diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py index 5b5288bf3..3be77ff05 100644 --- a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py +++ b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -7,7 +7,6 @@ class StreamAvailability: - @classmethod def available(cls) -> "StreamAvailability": return cls(True) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 7fa72d522..d8814541f 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -16,7 +16,6 @@ class DefaultStream(AbstractStream): - def __init__( self, partition_generator: PartitionGenerator, diff --git a/unit_tests/sources/declarative/checks/test_check_stream.py b/unit_tests/sources/declarative/checks/test_check_stream.py index 49dc8ef9a..21f036440 100644 --- a/unit_tests/sources/declarative/checks/test_check_stream.py +++ b/unit_tests/sources/declarative/checks/test_check_stream.py @@ -16,8 +16,8 @@ from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( ConcurrentDeclarativeSource, ) -from airbyte_cdk.sources.streams.http import HttpStream from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.streams.http import HttpStream from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse logger = logging.getLogger("test") diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 6159ea1e6..98255bfe5 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -259,16 +259,22 @@ def test_given_no_partitions_when_get_availability_then_unavailable(self) -> Non 
assert availability.is_available == False assert "no stream slices were found" in availability.reason - def test_given_AirbyteTracedException_when_generating_partitions_when_get_availability_then_unavailable(self) -> None: + def test_given_AirbyteTracedException_when_generating_partitions_when_get_availability_then_unavailable( + self, + ) -> None: error_message = "error while generating partitions" - self._partition_generator.generate.side_effect = AirbyteTracedException(message=error_message) + self._partition_generator.generate.side_effect = AirbyteTracedException( + message=error_message + ) availability = self._stream.check_availability() assert availability.is_available == False assert error_message in availability.reason - def test_given_unknown_error_when_generating_partitions_when_get_availability_then_raise(self) -> None: + def test_given_unknown_error_when_generating_partitions_when_get_availability_then_raise( + self, + ) -> None: """ I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy """ @@ -292,7 +298,9 @@ def test_given_records_when_get_availability_then_available(self) -> None: assert availability.is_available == True - def test_given_AirbyteTracedException_when_reading_records_when_get_availability_then_unavailable(self) -> None: + def test_given_AirbyteTracedException_when_reading_records_when_get_availability_then_unavailable( + self, + ) -> None: self._partition_generator.generate.return_value = [self._partition] error_message = "error while reading records" self._partition.read.side_effect = AirbyteTracedException(message=error_message) From 5fe2e02054594f689a392d2ce706453b00e58168 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 17:03:30 -0400 Subject: [PATCH 06/68] mypy --- airbyte_cdk/sources/streams/concurrent/default_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index d8814541f..86eaaf9c1 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -109,7 +109,7 @@ def check_availability(self) -> StreamAvailability: f"Cannot attempt to connect to stream {self.name} - no stream slices were found" ) except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message) + return StreamAvailability.unavailable(error.message or error.internal_message or "") try: next(iter(partition.read())) @@ -118,4 +118,4 @@ def check_availability(self) -> StreamAvailability: self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") return StreamAvailability.available() except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message) + return StreamAvailability.unavailable(error.message or error.internal_message or "") From 1e8e9681672e05dd28e7f4fcecf53c8687b38337 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 31 Jul 2025 21:18:24 +0000 Subject: [PATCH 07/68] Auto-fix lint and format issues --- airbyte_cdk/sources/streams/concurrent/default_stream.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 86eaaf9c1..10f04e6ba 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ 
b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -109,7 +109,9 @@ def check_availability(self) -> StreamAvailability: f"Cannot attempt to connect to stream {self.name} - no stream slices were found" ) except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) try: next(iter(partition.read())) @@ -118,4 +120,6 @@ def check_availability(self) -> StreamAvailability: self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") return StreamAvailability.available() except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) From 689e7929f33366c2123961e10dd1fee4207e2764 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Fri, 1 Aug 2025 14:22:58 -0400 Subject: [PATCH 08/68] Remove RFR stuff --- .../concurrent_declarative_source.py | 1 - .../parsers/model_to_component_factory.py | 65 +++----- .../test_model_to_component_factory.py | 148 +----------------- .../test_manifest_declarative_source.py | 32 ++-- 4 files changed, 28 insertions(+), 218 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index cc59b1554..8e49b9b2c 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -84,7 +84,6 @@ def __init__( # incremental streams running in full refresh. component_factory = component_factory or ModelToComponentFactory( emit_connector_builder_messages=emit_connector_builder_messages, - disable_resumable_full_refresh=True, connector_state_manager=self._connector_state_manager, max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"), ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 628bea575..7f953f14d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -7,6 +7,7 @@ import datetime import importlib import inspect +import logging import re from functools import partial from typing import ( @@ -544,6 +545,8 @@ StreamSlicer, StreamSlicerTestReadDecorator, ) +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import \ + StreamSlicerPartitionGenerator, DeclarativePartitionFactory from airbyte_cdk.sources.declarative.transformations import ( AddFields, RecordTransformation, @@ -604,7 +607,9 @@ WeekClampingStrategy, Weekday, ) -from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter, @@ -634,7 +639,6 @@ def __init__( emit_connector_builder_messages: bool = False, disable_retries: bool = False, 
disable_cache: bool = False, - disable_resumable_full_refresh: bool = False, message_repository: Optional[MessageRepository] = None, connector_state_manager: Optional[ConnectorStateManager] = None, max_concurrent_async_job_count: Optional[int] = None, @@ -645,7 +649,6 @@ def __init__( self._emit_connector_builder_messages = emit_connector_builder_messages self._disable_retries = disable_retries self._disable_cache = disable_cache - self._disable_resumable_full_refresh = disable_resumable_full_refresh self._message_repository = message_repository or InMemoryMessageRepository( self._evaluate_log_level(emit_connector_builder_messages) ) @@ -2035,15 +2038,6 @@ def create_declarative_stream( file_uploader=file_uploader, incremental_sync=model.incremental_sync, ) - cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None - - if model.state_migrations: - state_transformations = [ - self._create_component_from_model(state_migration, config, declarative_stream=model) - for state_migration in model.state_migrations - ] - else: - state_transformations = [] schema_loader: Union[ CompositeSchemaLoader, @@ -2071,6 +2065,15 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) + cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None + + if model.state_migrations: + state_transformations = [ + self._create_component_from_model(state_migration, config, declarative_stream=model) + for state_migration in model.state_migrations + ] + else: + state_transformations = [] return DeclarativeStream( name=model.name or "", primary_key=primary_key, @@ -2185,28 +2188,6 @@ def _build_incremental_cursor( return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync return None - def _build_resumable_cursor( - self, - model: Union[ - AsyncRetrieverModel, - CustomRetrieverModel, - SimpleRetrieverModel, - ], - stream_slicer: Optional[PartitionRouter], - ) -> Optional[StreamSlicer]: - if hasattr(model, "paginator") and model.paginator and not stream_slicer: - # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` - return ResumableFullRefreshCursor(parameters={}) - elif stream_slicer: - # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` - return PerPartitionCursor( - cursor_factory=CursorFactory( - create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) - ), - partition_router=stream_slicer, - ) - return None - def _merge_stream_slicers( self, model: DeclarativeStreamModel, config: Config ) -> Optional[StreamSlicer]: @@ -2243,11 +2224,7 @@ def _merge_stream_slicers( if model.incremental_sync: return self._build_incremental_cursor(model, stream_slicer, config) - return ( - stream_slicer - if self._disable_resumable_full_refresh - else self._build_resumable_cursor(retriever_model, stream_slicer) - ) + return stream_slicer def create_default_error_handler( self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any @@ -2529,9 +2506,6 @@ def create_schema_type_identifier( def create_dynamic_schema_loader( self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any ) -> DynamicSchemaLoader: - stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) - combined_slicers = self._build_resumable_cursor(model.retriever, 
stream_slicer) - schema_transformations = [] if model.schema_transformations: for transformation_model in model.schema_transformations: @@ -2544,7 +2518,7 @@ def create_dynamic_schema_loader( config=config, name=name, primary_key=None, - stream_slicer=combined_slicers, + stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), transformations=[], use_cache=True, log_formatter=( @@ -3808,15 +3782,12 @@ def create_components_mapping_definition( def create_http_components_resolver( self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None ) -> Any: - stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) - combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) - retriever = self._create_component_from_model( model=model.retriever, config=config, name=f"{stream_name if stream_name else '__http_components_resolver'}", primary_key=None, - stream_slicer=stream_slicer if stream_slicer else combined_slicers, + stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), transformations=[], ) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 4ac0b11e7..17a36c3b0 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -1055,152 +1055,6 @@ def test_stream_with_incremental_and_async_retriever_with_partition_router(use_l assert stream_slices == expected_stream_slices -def test_resumable_full_refresh_stream(): - content = """ -decoder: - type: JsonDecoder -extractor: - type: DpathExtractor -selector: - type: RecordSelector - record_filter: - type: RecordFilter - condition: "{{ record['id'] > stream_state['id'] }}" -metadata_paginator: - type: DefaultPaginator - page_size_option: - type: RequestOption - inject_into: body_json - field_path: ["variables", "page_size"] - page_token_option: - type: RequestPath - pagination_strategy: - type: "CursorPagination" - cursor_value: "{{ response._metadata.next }}" - page_size: 10 -requester: - type: HttpRequester - url_base: "https://api.sendgrid.com/v3/" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['apikey'] }}" - request_parameters: - unit: "day" -retriever: - paginator: - type: NoPagination - decoder: - $ref: "#/decoder" -partial_stream: - type: DeclarativeStream - schema_loader: - type: JsonFileSchemaLoader - file_path: "./source_sendgrid/schemas/{{ parameters.name }}.json" -list_stream: - $ref: "#/partial_stream" - $parameters: - name: "lists" - extractor: - $ref: "#/extractor" - field_path: ["{{ parameters['name'] }}"] - name: "lists" - primary_key: "id" - retriever: - $ref: "#/retriever" - requester: - $ref: "#/requester" - path: "{{ next_page_token['next_page_url'] }}" - paginator: - $ref: "#/metadata_paginator" - record_selector: - $ref: "#/selector" - transformations: - - type: AddFields - fields: - - path: ["extra"] - value: "{{ response.to_add }}" -check: - type: CheckStream - stream_names: ["list_stream"] -spec: - type: Spec - documentation_url: https://airbyte.com/#yaml-from-manifest - connection_specification: - title: Test Spec - type: object - required: - - api_key - additionalProperties: false - properties: - api_key: - type: string - airbyte_secret: true - title: API Key - description: Test API Key - order: 0 - advanced_auth: - 
auth_flow_type: "oauth2.0" - """ - parsed_manifest = YamlDeclarativeSource._parse(content) - resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - resolved_manifest["type"] = "DeclarativeSource" - manifest = transformer.propagate_types_and_parameters("", resolved_manifest, {}) - - stream_manifest = manifest["list_stream"] - assert stream_manifest["type"] == "DeclarativeStream" - stream = factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config - ) - - assert isinstance(stream, DeclarativeStream) - assert stream.primary_key == "id" - assert stream.name == "lists" - assert stream._stream_cursor_field.string == "" - - assert isinstance(stream.retriever, SimpleRetriever) - assert stream.retriever.primary_key == stream.primary_key - assert stream.retriever.name == stream.name - - assert isinstance(stream.retriever.record_selector, RecordSelector) - - assert isinstance(stream.retriever.stream_slicer, ResumableFullRefreshCursor) - assert isinstance(stream.retriever.cursor, ResumableFullRefreshCursor) - - assert isinstance(stream.retriever.paginator, DefaultPaginator) - assert isinstance(stream.retriever.paginator.decoder, PaginationDecoderDecorator) - for string in stream.retriever.paginator.page_size_option.field_path: - assert isinstance(string, InterpolatedString) - assert len(stream.retriever.paginator.page_size_option.field_path) == 2 - assert stream.retriever.paginator.page_size_option.inject_into == RequestOptionType.body_json - assert isinstance(stream.retriever.paginator.page_token_option, RequestPath) - assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" - assert stream.retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" - - assert isinstance(stream.retriever.paginator.pagination_strategy, CursorPaginationStrategy) - assert isinstance( - stream.retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator - ) - assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.string - == "{{ response._metadata.next }}" - ) - assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.default - == "{{ response._metadata.next }}" - ) - assert stream.retriever.paginator.pagination_strategy.page_size == 10 - - checker = factory.create_component( - model_type=CheckStreamModel, component_definition=manifest["check"], config=input_config - ) - - assert isinstance(checker, CheckStream) - streams_to_check = checker.stream_names - assert len(streams_to_check) == 1 - assert list(streams_to_check)[0] == "list_stream" - - def test_incremental_data_feed(): content = """ selector: @@ -2592,7 +2446,7 @@ def test_default_schema_loader(self): "values": "{{config['repos']}}", "cursor_field": "a_key", }, - PerPartitionCursor, + ListPartitionRouter, id="test_create_simple_retriever_with_partition_router", ), pytest.param( diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 6753e8e4e..8f72cc6a6 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -1818,8 +1818,8 @@ def _create_page(response_body): [ call({}, {}, None), call( - {"next_page_token": "next"}, - {"next_page_token": "next"}, + {}, + {}, {"next_page_token": "next"}, ), ], @@ -1907,16 +1907,9 @@ def _create_page(response_body): ), [{"ABC": 0, "partition": 0}, {"AED": 1, "partition": 0}, 
{"ABC": 2, "partition": 1}], [ - call({"states": []}, {"partition": "0"}, None), + call({}, {"partition": "0"}, None), call( - { - "states": [ - { - "partition": {"partition": "0"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - } - ] - }, + {}, {"partition": "1"}, None, ), @@ -2022,17 +2015,10 @@ def _create_page(response_body): {"ABC": 2, "partition": 1}, ], [ - call({"states": []}, {"partition": "0"}, None), - call({"states": []}, {"partition": "0"}, {"next_page_token": "next"}), + call({}, {"partition": "0"}, None), + call({}, {"partition": "0"}, {"next_page_token": "next"}), call( - { - "states": [ - { - "partition": {"partition": "0"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - } - ] - }, + {}, {"partition": "1"}, None, ), @@ -2201,12 +2187,12 @@ def test_only_parent_streams_use_cache(): # Parent stream created for substream assert ( - streams[1].retriever.stream_slicer._partition_router.parent_stream_configs[0].stream.name + streams[1].retriever.stream_slicer.parent_stream_configs[0].stream.name == "applications" ) assert ( streams[1] - .retriever.stream_slicer._partition_router.parent_stream_configs[0] + .retriever.stream_slicer.parent_stream_configs[0] .stream.retriever.requester.use_cache ) From 5399436280b5eef242cec592c7460f69beb63122 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 10:41:32 -0400 Subject: [PATCH 09/68] have bland stream be instantiated as DefaultStream --- .../concurrent_declarative_source.py | 12 +- .../manifest_declarative_source.py | 10 +- .../parsers/model_to_component_factory.py | 45 +++++-- .../declarative_partition_generator.py | 25 +++- .../sources/streams/concurrent/adapters.py | 4 +- .../streams/concurrent/default_stream.py | 10 +- .../test_connector_builder_handler.py | 2 +- .../test_model_to_component_factory.py | 96 +++++++++------ .../test_config_components_resolver.py | 2 +- .../retrievers/test_simple_retriever.py | 114 ------------------ .../test_declarative_partition_generator.py | 8 +- .../test_manifest_declarative_source.py | 18 +-- .../streams/concurrent/test_default_stream.py | 21 ++++ 13 files changed, 174 insertions(+), 193 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 8e49b9b2c..0ac6299f2 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -209,6 +209,10 @@ def _group_streams( # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible, # so we need to treat them as synchronous + if isinstance(declarative_stream, AbstractStream): + concurrent_streams.append(declarative_stream) + continue + supports_file_transfer = ( isinstance(declarative_stream, DeclarativeStream) and "file_uploader" in name_to_stream_mapping[declarative_stream.name] @@ -278,7 +282,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, retriever, self.message_repository, ), @@ -309,7 +313,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, retriever, self.message_repository, ), @@ -339,7 +343,7 @@ def _group_streams( 
partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, declarative_stream.retriever, self.message_repository, ), @@ -399,7 +403,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, retriever, self.message_repository, ), diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index e962f3813..303d12ba4 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -8,7 +8,7 @@ from copy import deepcopy from importlib import metadata from types import ModuleType -from typing import Any, Dict, Iterator, List, Mapping, Optional, Set +from typing import Any, Dict, Iterator, List, Mapping, Optional, Set, Union import orjson import yaml @@ -66,6 +66,7 @@ from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING from airbyte_cdk.sources.declarative.spec.spec import Spec from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.types import Config, ConnectionDefinition from airbyte_cdk.sources.utils.slice_logger import ( @@ -297,7 +298,12 @@ def connection_checker(self) -> ConnectionChecker: f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}" ) - def streams(self, config: Mapping[str, Any]) -> List[Stream]: + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: + """ + As a migration step, this method will return both legacy stream (Stream) and concurrent stream (AbstractStream). + Once the migration is done, we can probably have this method throw "not implemented" as we figure out how to + fully decouple this from the AbstractSource. + """ if self._spec_component: self._spec_component.validate_config(config) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 7f953f14d..32c14873a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -598,6 +598,7 @@ Rate, UnlimitedCallRatePolicy, ) +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, ClampingStrategy, @@ -1920,8 +1921,8 @@ def create_datetime_based_cursor( ) def create_declarative_stream( - self, model: DeclarativeStreamModel, config: Config, **kwargs: Any - ) -> DeclarativeStream: + self, model: DeclarativeStreamModel, config: Config, is_parent=False, **kwargs: Any + ) -> Union[DeclarativeStream, AbstractStream]: # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. 
The custom create methods in @@ -2065,8 +2066,38 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None + if isinstance(combined_slicers, PartitionRouter) and not is_parent and not self._emit_connector_builder_messages: + # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the + # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: + # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter + # * Streams without partition router but with cursor + # * Streams with both partition router and cursor + # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet + # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway + stream_name = model.name or "" + partition_generator = StreamSlicerPartitionGenerator( + DeclarativePartitionFactory( + stream_name, + schema_loader, + retriever, + self._message_repository, + ), + combined_slicers, + ) + FinalStateCursor(stream_name, None, self._message_repository) + return DefaultStream( + partition_generator=partition_generator, + name=stream_name, + json_schema=schema_loader.get_json_schema, + primary_key=get_primary_key_from_stream(primary_key), + cursor_field=None, + # FIXME we should have the cursor field has part of the interface of cursor + logger=logging.getLogger(f"airbyte.{stream_name}"), + # FIXME this is a breaking change compared to the old implementation, + cursor=FinalStateCursor(stream_name, None, self._message_repository), + ) + cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None if model.state_migrations: state_transformations = [ self._create_component_from_model(state_migration, config, declarative_stream=model) @@ -2094,7 +2125,7 @@ def _build_stream_slicer_from_partition_router( ], config: Config, stream_name: Optional[str] = None, - ) -> Optional[PartitionRouter]: + ) -> PartitionRouter: if ( hasattr(model, "partition_router") and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) @@ -2115,7 +2146,7 @@ def _build_stream_slicer_from_partition_router( return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router model=stream_slicer_model, config=config, stream_name=stream_name or "" ) - return None + return SinglePartitionRouter(parameters={}) def _build_incremental_cursor( self, @@ -2123,7 +2154,7 @@ def _build_incremental_cursor( stream_slicer: Optional[PartitionRouter], config: Config, ) -> Optional[StreamSlicer]: - if model.incremental_sync and stream_slicer: + if model.incremental_sync and (stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter)): if model.retriever.type == "AsyncRetriever": stream_name = model.name or "" stream_namespace = None @@ -2871,7 +2902,7 @@ def create_parent_stream_config( self, model: ParentStreamConfigModel, config: Config, **kwargs: Any ) -> ParentStreamConfig: declarative_stream = self._create_component_from_model( - model.stream, config=config, **kwargs + model.stream, config=config, is_parent=True, **kwargs, ) request_option = ( 
self._create_component_from_model(model.request_option, config=config) diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 94ee03a56..fe76e7ee2 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -3,6 +3,7 @@ from typing import Any, Iterable, Mapping, Optional from airbyte_cdk.sources.declarative.retrievers import Retriever +from airbyte_cdk.sources.declarative.schema import SchemaLoader from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator @@ -11,11 +12,23 @@ from airbyte_cdk.utils.slice_hasher import SliceHasher +class SchemaLoaderCachingDecorator(SchemaLoader): + + def __init__(self, schema_loader: SchemaLoader): + self._decorated = schema_loader + self._loaded_schema = None + + def get_json_schema(self) -> Mapping[str, Any]: + if self._loaded_schema is None: + self._loaded_schema = self._decorated.get_json_schema() + return self._loaded_schema + + class DeclarativePartitionFactory: def __init__( self, stream_name: str, - json_schema: Mapping[str, Any], + schema_loader: SchemaLoader, retriever: Retriever, message_repository: MessageRepository, ) -> None: @@ -25,14 +38,14 @@ def __init__( In order to avoid these problems, we will create one retriever per thread which should make the processing thread-safe. """ self._stream_name = stream_name - self._json_schema = json_schema + self._schema_loader = SchemaLoaderCachingDecorator(schema_loader) self._retriever = retriever self._message_repository = message_repository def create(self, stream_slice: StreamSlice) -> Partition: return DeclarativePartition( self._stream_name, - self._json_schema, + self._schema_loader, self._retriever, self._message_repository, stream_slice, @@ -43,20 +56,20 @@ class DeclarativePartition(Partition): def __init__( self, stream_name: str, - json_schema: Mapping[str, Any], + schema_loader: SchemaLoader, retriever: Retriever, message_repository: MessageRepository, stream_slice: StreamSlice, ): self._stream_name = stream_name - self._json_schema = json_schema + self._schema_loader = schema_loader self._retriever = retriever self._message_repository = message_repository self._stream_slice = stream_slice self._hash = SliceHasher.hash(self._stream_name, self._stream_slice) def read(self) -> Iterable[Record]: - for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice): + for stream_data in self._retriever.read_records(self._schema_loader.get_json_schema(), self._stream_slice): if isinstance(stream_data, Mapping): record = ( stream_data diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 949f0545b..6a4682605 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -5,8 +5,7 @@ import copy import json import logging -from functools import lru_cache -from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union from typing_extensions import deprecated @@ -196,7 +195,6 @@ def cursor_field(self) -> Union[str, 
List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor - @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: return self._abstract_stream.get_json_schema() diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 86eaaf9c1..bceed08b2 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -2,9 +2,8 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from functools import lru_cache from logging import Logger -from typing import Any, Iterable, List, Mapping, Optional +from typing import Any, Iterable, List, Mapping, Optional, Union, Callable from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -20,7 +19,7 @@ def __init__( self, partition_generator: PartitionGenerator, name: str, - json_schema: Mapping[str, Any], + json_schema: Union[Mapping[str, Any], Callable[[], Mapping[str, Any]]], primary_key: List[str], cursor_field: Optional[str], logger: Logger, @@ -53,14 +52,15 @@ def namespace(self) -> Optional[str]: def cursor_field(self) -> Optional[str]: return self._cursor_field - @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: + if isinstance(self._json_schema, Callable): + return self._json_schema() return self._json_schema def as_airbyte_stream(self) -> AirbyteStream: stream = AirbyteStream( name=self.name, - json_schema=dict(self._json_schema), + json_schema=dict(self.get_json_schema()), supported_sync_modes=[SyncMode.full_refresh], is_resumable=False, is_file_based=self._supports_file_transfer, diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index 2587fb95a..98b42a737 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -780,7 +780,7 @@ def test_config_update() -> None: "client_secret": "a client secret", "refresh_token": "a refresh token", } - source = ManifestDeclarativeSource(source_config=manifest) + source = ManifestDeclarativeSource(source_config=manifest, emit_connector_builder_messages=True) refresh_request_response = { "access_token": "an updated access token", diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 17a36c3b0..c7d2f8d7a 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -157,6 +157,7 @@ from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import SchemaLoaderCachingDecorator from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource @@ -168,6 +169,7 @@ WeekClampingStrategy, ) from 
airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, ) @@ -1757,38 +1759,39 @@ def test_config_with_defaults(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert stream.primary_key == "id" + assert isinstance(stream, DefaultStream) assert stream.name == "lists" - assert isinstance(stream.retriever, SimpleRetriever) - assert stream.retriever.name == stream.name - assert stream.retriever.primary_key == stream.primary_key + retriever = stream._stream_partition_generator._partition_factory._retriever + assert isinstance(retriever, SimpleRetriever) + assert retriever.name == stream.name + assert retriever.primary_key == "id" - assert isinstance(stream.schema_loader, JsonFileSchemaLoader) + schema_loader = get_schema_loader(stream) + assert isinstance(schema_loader, JsonFileSchemaLoader) assert ( - stream.schema_loader.file_path.string + schema_loader.file_path.string == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" ) assert ( - stream.schema_loader.file_path.default + schema_loader.file_path.default == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" ) - assert isinstance(stream.retriever.requester, HttpRequester) - assert stream.retriever.requester.http_method == HttpMethod.GET + assert isinstance(retriever.requester, HttpRequester) + assert retriever.requester.http_method == HttpMethod.GET - assert isinstance(stream.retriever.requester.authenticator, BearerAuthenticator) - assert stream.retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" + assert isinstance(retriever.requester.authenticator, BearerAuthenticator) + assert retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" - assert isinstance(stream.retriever.record_selector, RecordSelector) - assert isinstance(stream.retriever.record_selector.extractor, DpathExtractor) + assert isinstance(retriever.record_selector, RecordSelector) + assert isinstance(retriever.record_selector.extractor, DpathExtractor) assert [ - fp.eval(input_config) for fp in stream.retriever.record_selector.extractor._field_path + fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path ] == ["result"] - assert isinstance(stream.retriever.paginator, DefaultPaginator) - assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com" - assert stream.retriever.paginator.pagination_strategy.get_page_size() == 10 + assert isinstance(retriever.paginator, DefaultPaginator) + assert retriever.paginator.url_base.string == "https://api.sendgrid.com" + assert retriever.paginator.pagination_strategy.get_page_size() == 10 def test_create_default_paginator(): @@ -2184,8 +2187,8 @@ def test_no_transformations(self): config=input_config, ) - assert isinstance(stream, DeclarativeStream) - assert [] == stream.retriever.record_selector.transformations + assert isinstance(stream, DefaultStream) + assert [] == get_retriever(stream).record_selector.transformations def test_remove_fields(self): content = f""" @@ -2212,11 +2215,11 @@ def test_remove_fields(self): config=input_config, ) - assert isinstance(stream, DeclarativeStream) + assert isinstance(stream, DefaultStream) expected = [ RemoveFields(field_pointers=[["path", "to", 
"field1"], ["path2"]], parameters={}) ] - assert stream.retriever.record_selector.transformations == expected + assert get_retriever(stream).record_selector.transformations == expected def test_add_fields_no_value_type(self): content = f""" @@ -2375,8 +2378,8 @@ def _test_add_fields(self, content, expected): config=input_config, ) - assert isinstance(stream, DeclarativeStream) - assert stream.retriever.record_selector.transformations == expected + assert isinstance(stream, DefaultStream) + assert get_retriever(stream).record_selector.transformations == expected def test_default_schema_loader(self): component_definition = { @@ -2415,7 +2418,7 @@ def test_default_schema_loader(self): component_definition=propagated_source_config, config=input_config, ) - schema_loader = stream.schema_loader + schema_loader = get_schema_loader(stream) assert ( schema_loader.default_loader._get_json_filepath().split("/")[-1] == f"{stream.name}.json" @@ -2423,7 +2426,7 @@ def test_default_schema_loader(self): @pytest.mark.parametrize( - "incremental, partition_router, expected_type", + "incremental, partition_router, expected_router_type, expected_stream_type", [ pytest.param( { @@ -2437,6 +2440,7 @@ def test_default_schema_loader(self): }, None, DatetimeBasedCursor, + DeclarativeStream, id="test_create_simple_retriever_with_incremental", ), pytest.param( @@ -2447,6 +2451,7 @@ def test_default_schema_loader(self): "cursor_field": "a_key", }, ListPartitionRouter, + DefaultStream, id="test_create_simple_retriever_with_partition_router", ), pytest.param( @@ -2465,6 +2470,7 @@ def test_default_schema_loader(self): "cursor_field": "a_key", }, PerPartitionWithGlobalCursor, + DeclarativeStream, id="test_create_simple_retriever_with_incremental_and_partition_router", ), pytest.param( @@ -2490,17 +2496,19 @@ def test_default_schema_loader(self): }, ], PerPartitionWithGlobalCursor, + DeclarativeStream, id="test_create_simple_retriever_with_partition_routers_multiple_components", ), pytest.param( None, None, SinglePartitionRouter, + DefaultStream, id="test_create_simple_retriever_with_no_incremental_or_partition_router", ), ], ) -def test_merge_incremental_and_partition_router(incremental, partition_router, expected_type): +def test_merge_incremental_and_partition_router(incremental, partition_router, expected_router_type, expected_stream_type): stream_model = { "type": "DeclarativeStream", "retriever": { @@ -2531,22 +2539,21 @@ def test_merge_incremental_and_partition_router(incremental, partition_router, e model_type=DeclarativeStreamModel, component_definition=stream_model, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert isinstance(stream.retriever, SimpleRetriever) - assert isinstance(stream.retriever.stream_slicer, expected_type) + assert isinstance(stream, expected_stream_type) + retriever = get_retriever(stream) + assert isinstance(retriever, SimpleRetriever) + stream_slicer = retriever.stream_slicer if expected_stream_type == DeclarativeStream else stream._stream_partition_generator._stream_slicer + assert isinstance(stream_slicer, expected_router_type) if incremental and partition_router: - assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) + assert isinstance(retriever.stream_slicer, PerPartitionWithGlobalCursor) if isinstance(partition_router, list) and len(partition_router) > 1: assert isinstance( - stream.retriever.stream_slicer._partition_router, CartesianProductStreamSlicer + retriever.stream_slicer._partition_router, CartesianProductStreamSlicer ) 
- assert len(stream.retriever.stream_slicer._partition_router.stream_slicers) == len( + assert len(retriever.stream_slicer._partition_router.stream_slicers) == len( partition_router ) - elif partition_router and isinstance(partition_router, list) and len(partition_router) > 1: - assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) - assert len(stream.retriever.stream_slicer.stream_slicerS) == len(partition_router) def test_simple_retriever_emit_log_messages(): @@ -2714,8 +2721,8 @@ def test_create_custom_retriever(): model_type=DeclarativeStreamModel, component_definition=stream_model, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert isinstance(stream.retriever, MyCustomRetriever) + assert isinstance(stream, DefaultStream) + assert isinstance(stream._stream_partition_generator._partition_factory._retriever, MyCustomRetriever) @freezegun.freeze_time("2021-01-01 00:00:00") @@ -4646,14 +4653,23 @@ def test_create_stream_with_multiple_schema_loaders(): "", resolved_manifest["stream_A"], {} ) - declarative_stream = factory.create_component( + stream = factory.create_component( model_type=DeclarativeStreamModel, component_definition=partition_router_manifest, config=input_config, ) - schema_loader = declarative_stream.schema_loader + schema_loader = get_schema_loader(stream) assert isinstance(schema_loader, CompositeSchemaLoader) assert len(schema_loader.schema_loaders) == 2 assert isinstance(schema_loader.schema_loaders[0], InlineSchemaLoader) assert isinstance(schema_loader.schema_loaders[1], InlineSchemaLoader) + + +def get_schema_loader(stream: DefaultStream): + assert isinstance(stream._stream_partition_generator._partition_factory._schema_loader, SchemaLoaderCachingDecorator) + return stream._stream_partition_generator._partition_factory._schema_loader._decorated + + +def get_retriever(stream: Union[DeclarativeStream, DefaultStream]): + return stream.retriever if isinstance(stream, DeclarativeStream) else stream._stream_partition_generator._partition_factory._retriever diff --git a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py index 2f2cbca5b..7e9ae2150 100644 --- a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py @@ -383,5 +383,5 @@ def test_component_mapping_conditions(manifest, config, expected_conditional_par for stream in source.streams(config): if stream.name in expected_conditional_params: assert ( - stream.retriever.requester._parameters == expected_conditional_params[stream.name] + stream._stream_partition_generator._partition_factory._retriever.requester._parameters == expected_conditional_params[stream.name] ) diff --git a/unit_tests/sources/declarative/retrievers/test_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_simple_retriever.py index a1e390177..44f307a32 100644 --- a/unit_tests/sources/declarative/retrievers/test_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_simple_retriever.py @@ -265,120 +265,6 @@ def test_simple_retriever_resumable_full_refresh_cursor_page_increment( assert retriever.state == {"__ab_full_refresh_sync_complete": True} -@pytest.mark.parametrize( - "initial_state, expected_reset_value, expected_next_page", - [ - pytest.param(None, None, 1, id="test_initial_sync_no_state"), - pytest.param( - { - "next_page_token": 
"https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=tracy_stevens" - }, - "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=tracy_stevens", - "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", - id="test_reset_with_next_page_token", - ), - ], -) -def test_simple_retriever_resumable_full_refresh_cursor_reset_cursor_pagination( - initial_state, expected_reset_value, expected_next_page, requests_mock -): - expected_records = [ - Record(data={"name": "ed_baldwin"}, associated_slice=None, stream_name="users"), - Record(data={"name": "danielle_poole"}, associated_slice=None, stream_name="users"), - Record(data={"name": "tracy_stevens"}, associated_slice=None, stream_name="users"), - Record(data={"name": "deke_slayton"}, associated_slice=None, stream_name="users"), - Record(data={"name": "molly_cobb"}, associated_slice=None, stream_name="users"), - Record(data={"name": "gordo_stevens"}, associated_slice=None, stream_name="users"), - Record(data={"name": "margo_madison"}, associated_slice=None, stream_name="users"), - Record(data={"name": "ellen_waverly"}, associated_slice=None, stream_name="users"), - ] - - content = """ -name: users -type: DeclarativeStream -retriever: - type: SimpleRetriever - decoder: - type: JsonDecoder - paginator: - type: "DefaultPaginator" - page_token_option: - type: RequestPath - pagination_strategy: - type: "CursorPagination" - cursor_value: "{{ response.next_page }}" - requester: - path: /astronauts - type: HttpRequester - url_base: "https://for-all-mankind.nasa.com/api/v1" - http_method: GET - authenticator: - type: ApiKeyAuthenticator - api_token: "{{ config['api_key'] }}" - inject_into: - type: RequestOption - field_name: Api-Key - inject_into: header - request_headers: {} - request_body_json: {} - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: ["data"] - partition_router: [] -primary_key: [] - """ - - factory = ModelToComponentFactory() - stream_manifest = YamlDeclarativeSource._parse(content) - stream = factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config={} - ) - response_body = { - "data": [r.data for r in expected_records[:5]], - "next_page": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", - } - requests_mock.get("https://for-all-mankind.nasa.com/api/v1/astronauts", json=response_body) - requests_mock.get( - "https://for-all-mankind.nasa.com/astronauts?next_page=tracy_stevens", json=response_body - ) - response_body_2 = { - "data": [r.data for r in expected_records[5:]], - } - requests_mock.get( - "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", - json=response_body_2, - ) - stream_slicer = ResumableFullRefreshCursor(parameters={}) - if initial_state: - stream_slicer.set_initial_state(initial_state) - stream.retriever.stream_slices = stream_slicer - stream.retriever.cursor = stream_slicer - stream_slice = list(stream_slicer.stream_slices())[0] - actual_records = [ - r for r in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice) - ] - - assert len(actual_records) == 5 - assert actual_records == expected_records[:5] - assert stream.retriever.state == { - "next_page_token": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens" - } - requests_mock.get( - "https://for-all-mankind.nasa.com/astronauts?next_page=tracy_stevens", json=response_body - ) - requests_mock.get( - 
"https://for-all-mankind.nasa.com/astronauts?next_page=gordo_stevens", json=response_body_2 - ) - actual_records = [ - r for r in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice) - ] - assert len(actual_records) == 3 - assert actual_records == expected_records[5:] - assert stream.retriever.state == {"__ab_full_refresh_sync_complete": True} - - def test_simple_retriever_resumable_full_refresh_cursor_reset_skip_completed_stream(): expected_records = [ Record(data={"id": "abc"}, associated_slice=None, stream_name="test_stream"), diff --git a/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py b/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py index 3ced03a69..ba7b5c478 100644 --- a/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py +++ b/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py @@ -6,6 +6,7 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type from airbyte_cdk.sources.declarative.retrievers import Retriever +from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( DeclarativePartitionFactory, ) @@ -15,6 +16,7 @@ _STREAM_NAME = "a_stream_name" _JSON_SCHEMA = {"type": "object", "properties": {}} +_SCHEMA_LOADER = InlineSchemaLoader(_JSON_SCHEMA, {}) _A_STREAM_SLICE = StreamSlice( partition={"partition_key": "partition_value"}, cursor_slice={"cursor_key": "cursor_value"} ) @@ -34,7 +36,7 @@ def test_given_multiple_slices_partition_generator_uses_the_same_retriever(self) message_repository = Mock(spec=MessageRepository) partition_factory = DeclarativePartitionFactory( _STREAM_NAME, - _JSON_SCHEMA, + _SCHEMA_LOADER, retriever, message_repository, ) @@ -49,7 +51,7 @@ def test_given_a_mapping_when_read_then_yield_record(self) -> None: message_repository = Mock(spec=MessageRepository) partition_factory = DeclarativePartitionFactory( _STREAM_NAME, - _JSON_SCHEMA, + _SCHEMA_LOADER, retriever, message_repository, ) @@ -67,7 +69,7 @@ def test_given_not_a_record_when_read_then_send_to_message_repository(self) -> N message_repository = Mock(spec=MessageRepository) partition_factory = DeclarativePartitionFactory( _STREAM_NAME, - _JSON_SCHEMA, + _SCHEMA_LOADER, retriever, message_repository, ) diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 8f72cc6a6..51038095d 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -28,12 +28,14 @@ SyncMode, Type, ) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream logger = logging.getLogger("airbyte") @@ -280,8 +282,8 @@ def test_valid_manifest(self): streams = source.streams({}) assert len(streams) == 2 - assert isinstance(streams[0], DeclarativeStream) - assert 
isinstance(streams[1], DeclarativeStream) + assert isinstance(streams[0], DefaultStream) + assert isinstance(streams[1], DefaultStream) assert ( source.resolved_manifest["description"] == "This is a sample source connector that is very valid." @@ -1289,13 +1291,13 @@ def test_conditional_streams_manifest(self, is_sandbox, expected_stream_count): actual_streams = source.streams(config=config) assert len(actual_streams) == expected_stream_count - assert isinstance(actual_streams[0], DeclarativeStream) + assert isinstance(actual_streams[0], DefaultStream) assert actual_streams[0].name == "students" if is_sandbox: - assert isinstance(actual_streams[1], DeclarativeStream) + assert isinstance(actual_streams[1], DefaultStream) assert actual_streams[1].name == "classrooms" - assert isinstance(actual_streams[2], DeclarativeStream) + assert isinstance(actual_streams[2], DefaultStream) assert actual_streams[2].name == "clubs" assert ( @@ -2202,7 +2204,6 @@ def test_only_parent_streams_use_cache(): def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMessage]: - source = ManifestDeclarativeSource(source_config=manifest) catalog = ConfiguredAirbyteCatalog( streams=[ ConfiguredAirbyteStream( @@ -2214,7 +2215,10 @@ def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMess ) ] ) - return list(source.read(logger, {}, catalog, {})) + config = {} + state = {} + source = ConcurrentDeclarativeSource(catalog, config, state, manifest) + return list(source.read(logger, {}, catalog, state)) def test_declarative_component_schema_valid_ref_links(): diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 98255bfe5..129dde27f 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -45,6 +45,27 @@ def test_get_json_schema(self): json_schema = self._stream.get_json_schema() assert json_schema == self._json_schema + def test_json_schema_is_callable(self): + expected = {"schema": "is callable"} + json_schema_callable = lambda: expected + stream = DefaultStream( + self._partition_generator, + self._name, + json_schema_callable, + self._primary_key, + self._cursor_field, + self._logger, + FinalStateCursor( + stream_name=self._name, + stream_namespace=None, + message_repository=self._message_repository, + ), + ) + + result = stream.get_json_schema() + + assert result == expected + def test_check_for_error_raises_an_exception_if_any_of_the_futures_are_not_done(self): futures = [Mock() for _ in range(3)] for f in futures: From dff25594f0b1310609bffc4646f35b775bd3d08a Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 11:01:36 -0400 Subject: [PATCH 10/68] fix test --- .../test_manifest_declarative_source.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 51038095d..97f572510 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -36,6 +36,7 @@ ) from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from unit_tests.sources.declarative.parsers.test_model_to_component_factory import get_retriever logger = 
logging.getLogger("airbyte") @@ -2181,26 +2182,27 @@ def test_only_parent_streams_use_cache(): # Main stream with caching (parent for substream `applications_interviews`) assert streams[0].name == "applications" - assert streams[0].retriever.requester.use_cache + assert get_retriever(streams[0]).requester.use_cache # Substream assert streams[1].name == "applications_interviews" - assert not streams[1].retriever.requester.use_cache + + stream_1_retriever = get_retriever(streams[1]) + assert not stream_1_retriever.requester.use_cache # Parent stream created for substream assert ( - streams[1].retriever.stream_slicer.parent_stream_configs[0].stream.name - == "applications" + stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name + == "applications" ) assert ( - streams[1] - .retriever.stream_slicer.parent_stream_configs[0] + stream_1_retriever.stream_slicer.parent_stream_configs[0] .stream.retriever.requester.use_cache ) # Main stream without caching assert streams[2].name == "jobs" - assert not streams[2].retriever.requester.use_cache + assert not get_retriever(streams[2]).requester.use_cache def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMessage]: From 7dc2164e85f6d0513d891ef18f1873e94ceaadb5 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 12:50:39 -0400 Subject: [PATCH 11/68] fix test, format, lint and a bit of mypy --- .../manifest_declarative_source.py | 2 +- .../parsers/model_to_component_factory.py | 28 +++++++++--- .../declarative_partition_generator.py | 8 +++- .../sources/streams/concurrent/adapters.py | 3 ++ .../streams/concurrent/default_stream.py | 10 +++-- .../decoders/test_decoders_memory_usage.py | 7 ++- .../test_model_to_component_factory.py | 45 ++++++++++++------- .../test_config_components_resolver.py | 3 +- .../test_manifest_declarative_source.py | 16 +++---- 9 files changed, 79 insertions(+), 43 deletions(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 303d12ba4..b1736f371 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -298,7 +298,7 @@ def connection_checker(self) -> ConnectionChecker: f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}" ) - def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder """ As a migration step, this method will return both legacy stream (Stream) and concurrent stream (AbstractStream). 
Once the migration is done, we can probably have this method throw "not implemented" as we figure out how to diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 32c14873a..d53dcd79e 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -545,8 +545,10 @@ StreamSlicer, StreamSlicerTestReadDecorator, ) -from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import \ - StreamSlicerPartitionGenerator, DeclarativePartitionFactory +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, +) from airbyte_cdk.sources.declarative.transformations import ( AddFields, RecordTransformation, @@ -608,7 +610,12 @@ WeekClampingStrategy, Weekday, ) -from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.cursor import ( + ConcurrentCursor, + Cursor, + CursorField, + FinalStateCursor, +) from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( @@ -2066,7 +2073,11 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - if isinstance(combined_slicers, PartitionRouter) and not is_parent and not self._emit_connector_builder_messages: + if ( + isinstance(combined_slicers, PartitionRouter) + and not is_parent + and not self._emit_connector_builder_messages + ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. 
This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter @@ -2154,7 +2165,9 @@ def _build_incremental_cursor( stream_slicer: Optional[PartitionRouter], config: Config, ) -> Optional[StreamSlicer]: - if model.incremental_sync and (stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter)): + if model.incremental_sync and ( + stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) + ): if model.retriever.type == "AsyncRetriever": stream_name = model.name or "" stream_namespace = None @@ -2902,7 +2915,10 @@ def create_parent_stream_config( self, model: ParentStreamConfigModel, config: Config, **kwargs: Any ) -> ParentStreamConfig: declarative_stream = self._create_component_from_model( - model.stream, config=config, is_parent=True, **kwargs, + model.stream, + config=config, + is_parent=True, + **kwargs, ) request_option = ( self._create_component_from_model(model.request_option, config=config) diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index fe76e7ee2..985f2d104 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -13,7 +13,6 @@ class SchemaLoaderCachingDecorator(SchemaLoader): - def __init__(self, schema_loader: SchemaLoader): self._decorated = schema_loader self._loaded_schema = None @@ -21,6 +20,9 @@ def __init__(self, schema_loader: SchemaLoader): def get_json_schema(self) -> Mapping[str, Any]: if self._loaded_schema is None: self._loaded_schema = self._decorated.get_json_schema() + + if self._loaded_schema is None: + raise ValueError("Could not load schema") return self._loaded_schema @@ -69,7 +71,9 @@ def __init__( self._hash = SliceHasher.hash(self._stream_name, self._stream_slice) def read(self) -> Iterable[Record]: - for stream_data in self._retriever.read_records(self._schema_loader.get_json_schema(), self._stream_slice): + for stream_data in self._retriever.read_records( + self._schema_loader.get_json_schema(), self._stream_slice + ): if isinstance(stream_data, Mapping): record = ( stream_data diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 6a4682605..c1dea49de 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -5,6 +5,7 @@ import copy import json import logging +from functools import lru_cache from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union from typing_extensions import deprecated @@ -195,6 +196,8 @@ def cursor_field(self) -> Union[str, List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor + # FIXME the lru_cache seems to be mostly there because of typing issue + @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: return self._abstract_stream.get_json_schema() diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index bceed08b2..fbbd2b613 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,7 +3,7 @@ # from logging import Logger -from typing 
import Any, Iterable, List, Mapping, Optional, Union, Callable +from typing import Any, Callable, Iterable, List, Mapping, Optional, Union from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -109,7 +109,9 @@ def check_availability(self) -> StreamAvailability: f"Cannot attempt to connect to stream {self.name} - no stream slices were found" ) except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) try: next(iter(partition.read())) @@ -118,4 +120,6 @@ def check_availability(self) -> StreamAvailability: self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") return StreamAvailability.available() except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) diff --git a/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py b/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py index 6901c6382..2960c5802 100644 --- a/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py +++ b/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py @@ -93,9 +93,8 @@ def get_body(): requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users3", body=get_body()) requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users4", body=get_body()) - stream_slices = list(stream.stream_slices(sync_mode=SyncMode.full_refresh)) - for stream_slice in stream_slices: - for _ in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice): + for partition in stream.generate_partitions(): + for _ in partition.read(): counter += 1 - assert counter == lines_in_response * len(stream_slices) + assert counter == lines_in_response * 4 # 4 partitions diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index c7d2f8d7a..a1c2da8eb 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -157,7 +157,9 @@ from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator -from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import SchemaLoaderCachingDecorator +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + SchemaLoaderCachingDecorator, +) from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource @@ -1768,14 +1770,8 @@ def test_config_with_defaults(): schema_loader = get_schema_loader(stream) assert isinstance(schema_loader, JsonFileSchemaLoader) - assert ( - schema_loader.file_path.string - == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" - ) - assert ( - schema_loader.file_path.default - == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" - ) + 
assert schema_loader.file_path.string == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + assert schema_loader.file_path.default == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" assert isinstance(retriever.requester, HttpRequester) assert retriever.requester.http_method == HttpMethod.GET @@ -1785,9 +1781,9 @@ def test_config_with_defaults(): assert isinstance(retriever.record_selector, RecordSelector) assert isinstance(retriever.record_selector.extractor, DpathExtractor) - assert [ - fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path - ] == ["result"] + assert [fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path] == [ + "result" + ] assert isinstance(retriever.paginator, DefaultPaginator) assert retriever.paginator.url_base.string == "https://api.sendgrid.com" @@ -2508,7 +2504,9 @@ def test_default_schema_loader(self): ), ], ) -def test_merge_incremental_and_partition_router(incremental, partition_router, expected_router_type, expected_stream_type): +def test_merge_incremental_and_partition_router( + incremental, partition_router, expected_router_type, expected_stream_type +): stream_model = { "type": "DeclarativeStream", "retriever": { @@ -2542,7 +2540,11 @@ def test_merge_incremental_and_partition_router(incremental, partition_router, e assert isinstance(stream, expected_stream_type) retriever = get_retriever(stream) assert isinstance(retriever, SimpleRetriever) - stream_slicer = retriever.stream_slicer if expected_stream_type == DeclarativeStream else stream._stream_partition_generator._stream_slicer + stream_slicer = ( + retriever.stream_slicer + if expected_stream_type == DeclarativeStream + else stream._stream_partition_generator._stream_slicer + ) assert isinstance(stream_slicer, expected_router_type) if incremental and partition_router: @@ -2722,7 +2724,9 @@ def test_create_custom_retriever(): ) assert isinstance(stream, DefaultStream) - assert isinstance(stream._stream_partition_generator._partition_factory._retriever, MyCustomRetriever) + assert isinstance( + stream._stream_partition_generator._partition_factory._retriever, MyCustomRetriever + ) @freezegun.freeze_time("2021-01-01 00:00:00") @@ -4667,9 +4671,16 @@ def test_create_stream_with_multiple_schema_loaders(): def get_schema_loader(stream: DefaultStream): - assert isinstance(stream._stream_partition_generator._partition_factory._schema_loader, SchemaLoaderCachingDecorator) + assert isinstance( + stream._stream_partition_generator._partition_factory._schema_loader, + SchemaLoaderCachingDecorator, + ) return stream._stream_partition_generator._partition_factory._schema_loader._decorated def get_retriever(stream: Union[DeclarativeStream, DefaultStream]): - return stream.retriever if isinstance(stream, DeclarativeStream) else stream._stream_partition_generator._partition_factory._retriever + return ( + stream.retriever + if isinstance(stream, DeclarativeStream) + else stream._stream_partition_generator._partition_factory._retriever + ) diff --git a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py index 7e9ae2150..c9ca1ecd5 100644 --- a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py @@ -383,5 +383,6 @@ def test_component_mapping_conditions(manifest, config, expected_conditional_par for stream in source.streams(config): if stream.name in 
expected_conditional_params: assert ( - stream._stream_partition_generator._partition_factory._retriever.requester._parameters == expected_conditional_params[stream.name] + stream._stream_partition_generator._partition_factory._retriever.requester._parameters + == expected_conditional_params[stream.name] ) diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 97f572510..24258f193 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -28,7 +28,9 @@ SyncMode, Type, ) -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( @@ -2191,14 +2193,10 @@ def test_only_parent_streams_use_cache(): assert not stream_1_retriever.requester.use_cache # Parent stream created for substream - assert ( - stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name - == "applications" - ) - assert ( - stream_1_retriever.stream_slicer.parent_stream_configs[0] - .stream.retriever.requester.use_cache - ) + assert stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name == "applications" + assert stream_1_retriever.stream_slicer.parent_stream_configs[ + 0 + ].stream.retriever.requester.use_cache # Main stream without caching assert streams[2].name == "jobs" From 0bfbdfe1ee016e456768a09826dac23fb5d3c441 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 14:12:59 -0400 Subject: [PATCH 12/68] mypy --- .../declarative/concurrent_declarative_source.py | 13 ++++++------- .../parsers/model_to_component_factory.py | 2 +- .../declarative_partition_generator.py | 6 ++---- .../sources/streams/concurrent/default_stream.py | 4 +--- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 0ac6299f2..ba4ba1fe2 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -3,7 +3,7 @@ # import logging -from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple +from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union from airbyte_cdk.models import ( AirbyteCatalog, @@ -28,7 +28,6 @@ PerPartitionWithGlobalCursor, ) from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource -from airbyte_cdk.sources.declarative.models import FileUploader from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ConcurrencyLevel as ConcurrencyLevelModel, ) @@ -179,7 +178,7 @@ def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> Airbyte ] ) - def streams(self, config: Mapping[str, Any]) -> List[Stream]: + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by 
ConcurrentDeclarativeSource or the Connector Builder """ The `streams` method is used as part of the AbstractSource in the following cases: * ConcurrentDeclarativeSource.check -> ManifestDeclarativeSource.check -> AbstractSource.check -> DeclarativeSource.check_connection -> CheckStream.check_connection -> streams @@ -282,7 +281,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish retriever, self.message_repository, ), @@ -313,7 +312,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish retriever, self.message_repository, ), @@ -343,7 +342,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish declarative_stream.retriever, self.message_repository, ), @@ -403,7 +402,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish retriever, self.message_repository, ), diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index d53dcd79e..45b75b175 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1928,7 +1928,7 @@ def create_datetime_based_cursor( ) def create_declarative_stream( - self, model: DeclarativeStreamModel, config: Config, is_parent=False, **kwargs: Any + self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any ) -> Union[DeclarativeStream, AbstractStream]: # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field # components if they exist into a single CartesianProductStreamSlicer. 
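Note on the `# type: ignore` added to the widened `streams()` return type earlier in this patch: mypy only lets an override narrow its return type, and `List` is invariant, so returning `List[Union[Stream, AbstractStream]]` against a base `List[Stream]` is rejected without the ignore. A minimal sketch with stand-in classes (not the real CDK types) that reproduces the situation the ignore suppresses:

    from typing import List, Union

    class Stream: ...            # stand-in for the synchronous Stream
    class AbstractStream: ...    # stand-in for the concurrent AbstractStream

    class AbstractSource:
        def streams(self) -> List[Stream]:
            return []

    class MigratingSource(AbstractSource):
        # Without the ignore, mypy reports the return type as incompatible with
        # the supertype: an override may only narrow its return type, and List
        # is invariant, so the wider Union is not accepted.
        def streams(self) -> List[Union[Stream, AbstractStream]]:  # type: ignore[override]
            return []

Suppressing the error rather than changing the base signature is consistent with the stated migration away from AbstractSource.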
This is then passed back as an argument when constructing the diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 985f2d104..c7e0a24cf 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -15,15 +15,13 @@ class SchemaLoaderCachingDecorator(SchemaLoader): def __init__(self, schema_loader: SchemaLoader): self._decorated = schema_loader - self._loaded_schema = None + self._loaded_schema: Optional[Mapping[str, Any]] = None def get_json_schema(self) -> Mapping[str, Any]: if self._loaded_schema is None: self._loaded_schema = self._decorated.get_json_schema() - if self._loaded_schema is None: - raise ValueError("Could not load schema") - return self._loaded_schema + return self._loaded_schema # type: ignore # at that point, we assume the schema will be populated class DeclarativePartitionFactory: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index fbbd2b613..ca227fd50 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -53,9 +53,7 @@ def cursor_field(self) -> Optional[str]: return self._cursor_field def get_json_schema(self) -> Mapping[str, Any]: - if isinstance(self._json_schema, Callable): - return self._json_schema() - return self._json_schema + return self._json_schema() if callable(self._json_schema) else self._json_schema def as_airbyte_stream(self) -> AirbyteStream: stream = AirbyteStream( From 0b454bb551958f28771f817413313fdc5e83b8c2 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 14:13:33 -0400 Subject: [PATCH 13/68] format --- .../sources/declarative/concurrent_declarative_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index ba4ba1fe2..69582c12b 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -178,7 +178,7 @@ def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> Airbyte ] ) - def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder """ The `streams` method is used as part of the AbstractSource in the following cases: * ConcurrentDeclarativeSource.check -> ManifestDeclarativeSource.check -> AbstractSource.check -> DeclarativeSource.check_connection -> CheckStream.check_connection -> streams From 13c17f4437a56db2eb1105f3d68b9707d1b13db8 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 15:13:35 -0400 Subject: [PATCH 14/68] remove unused line --- .../sources/declarative/parsers/model_to_component_factory.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 45b75b175..dcb84c8aa 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2095,7 +2095,6 @@ def create_declarative_stream( ), combined_slicers, ) - FinalStateCursor(stream_name, None, self._message_repository) return DefaultStream( partition_generator=partition_generator, name=stream_name, From fb75765d4dcd2893bd30c13ede714f732b2e884e Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 16:37:58 -0400 Subject: [PATCH 15/68] fix test --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 425ab8da8..35be7ce59 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2244,7 +2244,7 @@ def _build_concurrent_cursor( else: state_transformations = [] - if model.incremental_sync and stream_slicer: + if model.incremental_sync and stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter): return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing state_manager=self._connector_state_manager, model_type=DatetimeBasedCursorModel, From c07839529219211417205df7b010aa9d39c2137f Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 16:42:17 -0400 Subject: [PATCH 16/68] lint --- .../declarative/concurrent_declarative_source.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index aad88badb..be1d160b5 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -5,7 +5,18 @@ import logging from dataclasses import dataclass, field from queue import Queue -from typing import Any, ClassVar, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union +from typing import ( + Any, + ClassVar, + Generic, + Iterator, + List, + Mapping, + MutableMapping, + Optional, + Tuple, + Union, +) from airbyte_protocol_dataclasses.models import Level From decc557f4af3267cde542da2b070bc80c85f7808 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 16:44:06 -0400 Subject: [PATCH 17/68] format --- .../declarative/parsers/model_to_component_factory.py | 8 ++++++-- .../connector_builder/test_connector_builder_handler.py | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 35be7ce59..1cb3cdb7e 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2170,7 +2170,7 @@ 
def _build_incremental_cursor( ) if model.incremental_sync and ( - stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) + stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) ): if model.retriever.type == "AsyncRetriever": stream_name = model.name or "" @@ -2244,7 +2244,11 @@ def _build_concurrent_cursor( else: state_transformations = [] - if model.incremental_sync and stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter): + if ( + model.incremental_sync + and stream_slicer + and not isinstance(stream_slicer, SinglePartitionRouter) + ): return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing state_manager=self._connector_state_manager, model_type=DatetimeBasedCursorModel, diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index 4a68645a3..c036c12d3 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -785,7 +785,11 @@ def test_config_update() -> None: "refresh_token": "a refresh token", } source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=manifest, emit_connector_builder_messages=True + catalog=None, + config=config, + state=None, + source_config=manifest, + emit_connector_builder_messages=True, ) refresh_request_response = { From b8daf647915364453ba7c58c66fffb49092ad958 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 19:34:06 -0400 Subject: [PATCH 18/68] code review --- airbyte_cdk/sources/streams/concurrent/availability_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py index 3be77ff05..1068e6a92 100644 --- a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py +++ b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -13,7 +13,7 @@ def available(cls) -> "StreamAvailability": @classmethod def unavailable(cls, reason: str) -> "StreamAvailability": - return StreamAvailability(False, reason) + return cls(False, reason) def __init__(self, available: bool, reason: Optional[str] = None) -> None: self._available = available From 2bc4b307dd56575ad1014ad9bb17ac22f41e8895 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 08:11:48 -0400 Subject: [PATCH 19/68] code review --- unit_tests/sources/streams/concurrent/test_default_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 98255bfe5..12e2b34f4 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -313,7 +313,7 @@ def test_given_unknown_error_when_reading_record_when_get_availability_then_rais """ I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy """ - self._partition_generator.generate.side_effect = 
ValueError() - self._partition.read.return_value = [] + self._partition_generator.generate.return_value = [self._partition] + self._partition.read.side_effect = ValueError() with pytest.raises(ValueError): self._stream.check_availability() From d9d09f02d4b640def561af742ee2aacc745e3c52 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 09:48:09 -0400 Subject: [PATCH 20/68] incremental without partition router as DefaultStream --- .../parsers/model_to_component_factory.py | 62 +++++--- .../test_model_to_component_factory.py | 145 +++++++++--------- 2 files changed, 115 insertions(+), 92 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 67ec9f2ac..3a25ba5b0 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1940,22 +1940,11 @@ def create_declarative_stream( combined_slicers = self._merge_stream_slicers(model=model, config=config) primary_key = model.primary_key.__root__ if model.primary_key else None - stop_condition_on_cursor = ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_data_feed") - and model.incremental_sync.is_data_feed - ) - client_side_filtering_enabled = ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_client_side_incremental") - and model.incremental_sync.is_client_side_incremental + + stream_slicer = self._build_stream_slicer_from_partition_router( + model.retriever, config, stream_name=model.name ) - concurrent_cursor = None - if stop_condition_on_cursor or client_side_filtering_enabled: - stream_slicer = self._build_stream_slicer_from_partition_router( - model.retriever, config, stream_name=model.name - ) - concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) + concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -2030,9 +2019,9 @@ def create_declarative_stream( primary_key=primary_key, stream_slicer=combined_slicers, request_options_provider=request_options_provider, - stop_condition_cursor=concurrent_cursor, + stop_condition_cursor=concurrent_cursor if self._is_stop_condition_on_cursor(model) else None, client_side_incremental_sync={"cursor": concurrent_cursor} - if client_side_filtering_enabled + if self._is_client_side_filtering_enabled(model) else None, transformations=transformations, file_uploader=file_uploader, @@ -2066,17 +2055,30 @@ def create_declarative_stream( schema_loader = DefaultSchemaLoader(config=config, parameters=options) if ( - isinstance(combined_slicers, PartitionRouter) + (isinstance(combined_slicers, PartitionRouter) or isinstance(concurrent_cursor, ConcurrentCursor)) and not is_parent and not self._emit_connector_builder_messages ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter - # * Streams without partition router but with cursor + # * Streams without partition router but with cursor. 
This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition # * Streams with both partition router and cursor # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway + + stream_slicer = concurrent_cursor + if isinstance(retriever, AsyncRetriever): + # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method + # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a + # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when + # built because the async retriever has a specific partition router which relies on this stream slicer. + # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in + # AsyncJobPartitionRouter. + stream_slicer = retriever.stream_slicer + elif isinstance(combined_slicers, PartitionRouter): + stream_slicer = combined_slicers + stream_name = model.name or "" partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( @@ -2085,18 +2087,19 @@ def create_declarative_stream( retriever, self._message_repository, ), - combined_slicers, + stream_slicer, ) + cursor = concurrent_cursor if concurrent_cursor else FinalStateCursor(stream_name, None, self._message_repository) return DefaultStream( partition_generator=partition_generator, name=stream_name, json_schema=schema_loader.get_json_schema, primary_key=get_primary_key_from_stream(primary_key), - cursor_field=None, + cursor_field=cursor.cursor_field.cursor_field_key if hasattr(cursor, "cursor_field") else "", # FIXME we should have the cursor field has part of the interface of cursor, # FIXME we should have the cursor field has part of the interface of cursor logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation, - cursor=FinalStateCursor(stream_name, None, self._message_repository), + cursor=cursor, ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None @@ -2118,6 +2121,21 @@ def create_declarative_stream( parameters=model.parameters or {}, ) + def _is_stop_condition_on_cursor(self, model): + return ( + model.incremental_sync + and hasattr(model.incremental_sync, "is_data_feed") + and model.incremental_sync.is_data_feed + ) + + def _is_client_side_filtering_enabled(self, model): + client_side_filtering_enabled = ( + model.incremental_sync + and hasattr(model.incremental_sync, "is_client_side_incremental") + and model.incremental_sync.is_client_side_incremental + ) + return client_side_filtering_enabled + def _build_stream_slicer_from_partition_router( self, model: Union[ diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index aa8d0d781..8d9b1f808 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -350,101 +350,102 @@ def test_full_config_stream(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert stream.primary_key == "id" + assert isinstance(stream, DefaultStream) assert stream.name == "lists" - assert stream._stream_cursor_field.string == "created" + assert 
stream.cursor_field == "created" - assert isinstance(stream.schema_loader, JsonFileSchemaLoader) - assert stream.schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.json" + schema_loader = get_schema_loader(stream) + assert isinstance(schema_loader, JsonFileSchemaLoader) + assert schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.json" - assert len(stream.retriever.record_selector.transformations) == 1 - add_fields = stream.retriever.record_selector.transformations[0] + retriever = get_retriever(stream) + assert len(retriever.record_selector.transformations) == 1 + add_fields = retriever.record_selector.transformations[0] assert isinstance(add_fields, AddFields) assert add_fields.fields[0].path == ["extra"] assert add_fields.fields[0].value.string == "{{ response.to_add }}" - assert isinstance(stream.retriever, SimpleRetriever) - assert stream.retriever.primary_key == stream.primary_key - assert stream.retriever.name == stream.name + assert isinstance(retriever, SimpleRetriever) + assert retriever.primary_key == "id" + assert retriever.name == stream.name - assert isinstance(stream.retriever.record_selector, RecordSelector) + assert isinstance(retriever.record_selector, RecordSelector) - assert isinstance(stream.retriever.record_selector.extractor, DpathExtractor) - assert isinstance(stream.retriever.record_selector.extractor.decoder, JsonDecoder) + assert isinstance(retriever.record_selector.extractor, DpathExtractor) + assert isinstance(retriever.record_selector.extractor.decoder, JsonDecoder) assert [ - fp.eval(input_config) for fp in stream.retriever.record_selector.extractor._field_path + fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path ] == ["lists"] - assert isinstance(stream.retriever.record_selector.record_filter, RecordFilter) + assert isinstance(retriever.record_selector.record_filter, RecordFilter) assert ( - stream.retriever.record_selector.record_filter._filter_interpolator.condition - == "{{ record['id'] > stream_state['id'] }}" + retriever.record_selector.record_filter._filter_interpolator.condition + == "{{ record['id'] > stream_state['id'] }}" ) - assert isinstance(stream.retriever.paginator, DefaultPaginator) - assert isinstance(stream.retriever.paginator.decoder, PaginationDecoderDecorator) - assert stream.retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" + assert isinstance(retriever.paginator, DefaultPaginator) + assert isinstance(retriever.paginator.decoder, PaginationDecoderDecorator) + assert retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" assert ( - stream.retriever.paginator.page_size_option.inject_into - == RequestOptionType.request_parameter + retriever.paginator.page_size_option.inject_into + == RequestOptionType.request_parameter ) - assert isinstance(stream.retriever.paginator.page_token_option, RequestPath) - assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" - assert stream.retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" + assert isinstance(retriever.paginator.page_token_option, RequestPath) + assert retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" + assert retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" - assert isinstance(stream.retriever.paginator.pagination_strategy, CursorPaginationStrategy) + assert isinstance(retriever.paginator.pagination_strategy, CursorPaginationStrategy) assert isinstance( - 
stream.retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator + retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator ) assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.string - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.string + == "{{ response._metadata.next }}" ) assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.default - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.default + == "{{ response._metadata.next }}" ) - assert stream.retriever.paginator.pagination_strategy.page_size == 10 + assert retriever.paginator.pagination_strategy.page_size == 10 - assert isinstance(stream.retriever.requester, HttpRequester) - assert stream.retriever.requester.http_method == HttpMethod.GET - assert stream.retriever.requester.name == stream.name - assert stream.retriever.requester._path.string == "{{ next_page_token['next_page_url'] }}" - assert stream.retriever.requester._path.default == "{{ next_page_token['next_page_url'] }}" + assert isinstance(retriever.requester, HttpRequester) + assert retriever.requester.http_method == HttpMethod.GET + assert retriever.requester.name == stream.name + assert retriever.requester._path.string == "{{ next_page_token['next_page_url'] }}" + assert retriever.requester._path.default == "{{ next_page_token['next_page_url'] }}" - assert isinstance(stream.retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) + assert isinstance(retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) assert ( - stream.retriever.request_option_provider.start_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.start_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - stream.retriever.request_option_provider.start_time_option.field_name.eval( + retriever.request_option_provider.start_time_option.field_name.eval( config=input_config ) - == "after" + == "after" ) assert ( - stream.retriever.request_option_provider.end_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.end_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - stream.retriever.request_option_provider.end_time_option.field_name.eval( + retriever.request_option_provider.end_time_option.field_name.eval( config=input_config ) - == "before" + == "before" ) - assert stream.retriever.request_option_provider._partition_field_start.string == "start_time" - assert stream.retriever.request_option_provider._partition_field_end.string == "end_time" + assert retriever.request_option_provider._partition_field_start.string == "start_time" + assert retriever.request_option_provider._partition_field_end.string == "end_time" - assert isinstance(stream.retriever.requester.authenticator, BearerAuthenticator) - assert stream.retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" + assert isinstance(retriever.requester.authenticator, BearerAuthenticator) + assert retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" assert isinstance( - stream.retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider + retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider ) assert ( - stream.retriever.requester.request_options_provider.request_parameters.get("unit") == "day" + 
retriever.requester.request_options_provider.request_parameters.get("unit") == "day" ) checker = factory.create_component( @@ -1117,7 +1118,7 @@ def test_incremental_data_feed(): ) assert isinstance( - stream.retriever.paginator.pagination_strategy, StopConditionPaginationStrategyDecorator + get_retriever(stream).paginator.pagination_strategy, StopConditionPaginationStrategyDecorator ) @@ -1198,11 +1199,12 @@ def test_client_side_incremental(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) + retriever = get_retriever(stream) assert isinstance( - stream.retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator + retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator ) - assert stream.retriever.record_selector.transform_before_filtering == True + assert get_retriever(stream).record_selector.transform_before_filtering == True def test_client_side_incremental_with_partition_router(): @@ -2440,8 +2442,8 @@ def test_default_schema_loader(self): "cursor_granularity": "PT0.000001S", }, None, - DatetimeBasedCursor, - DeclarativeStream, + ConcurrentCursor, + DefaultStream, id="test_create_simple_retriever_with_incremental", ), pytest.param( @@ -4130,7 +4132,8 @@ def test_simple_retriever_with_query_properties(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - query_properties = stream.retriever.additional_query_properties + retriever = get_retriever(stream) + query_properties = retriever.additional_query_properties assert isinstance(query_properties, QueryProperties) assert query_properties.property_list == [ "first_name", @@ -4141,18 +4144,18 @@ def test_simple_retriever_with_query_properties(): ] assert query_properties.always_include_properties == ["id"] - property_chunking = stream.retriever.additional_query_properties.property_chunking + property_chunking = retriever.additional_query_properties.property_chunking assert isinstance(property_chunking, PropertyChunking) assert property_chunking.property_limit_type == PropertyLimitType.property_count assert property_chunking.property_limit == 3 merge_strategy = ( - stream.retriever.additional_query_properties.property_chunking.record_merge_strategy + retriever.additional_query_properties.property_chunking.record_merge_strategy ) assert isinstance(merge_strategy, GroupByKey) assert merge_strategy.key == ["id"] - request_options_provider = stream.retriever.requester.request_options_provider + request_options_provider = retriever.requester.request_options_provider assert isinstance(request_options_provider, InterpolatedRequestOptionsProvider) # For a better developer experience we allow QueryProperties to be defined on the requester.request_parameters, # but it actually is leveraged by the SimpleRetriever which is why it is not included in the RequestOptionsProvider @@ -4232,27 +4235,28 @@ def test_simple_retriever_with_request_parameters_properties_from_endpoint(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - query_properties = stream.retriever.additional_query_properties + retriever = get_retriever(stream) + query_properties = retriever.additional_query_properties assert isinstance(query_properties, QueryProperties) assert query_properties.always_include_properties is None - properties_from_endpoint = stream.retriever.additional_query_properties.property_list + properties_from_endpoint = retriever.additional_query_properties.property_list assert 
isinstance(properties_from_endpoint, PropertiesFromEndpoint) assert properties_from_endpoint.property_field_path == ["name"] properties_from_endpoint_retriever = ( - stream.retriever.additional_query_properties.property_list.retriever + retriever.additional_query_properties.property_list.retriever ) assert isinstance(properties_from_endpoint_retriever, SimpleRetriever) properties_from_endpoint_requester = ( - stream.retriever.additional_query_properties.property_list.retriever.requester + retriever.additional_query_properties.property_list.retriever.requester ) assert isinstance(properties_from_endpoint_requester, HttpRequester) assert properties_from_endpoint_requester.url_base == "https://api.hubapi.com" assert properties_from_endpoint_requester.path == "/properties/v2/dynamics/properties" - property_chunking = stream.retriever.additional_query_properties.property_chunking + property_chunking = retriever.additional_query_properties.property_chunking assert isinstance(property_chunking, PropertyChunking) assert property_chunking.property_limit_type == PropertyLimitType.property_count assert property_chunking.property_limit == 3 @@ -4320,22 +4324,23 @@ def test_simple_retriever_with_requester_properties_from_endpoint(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - query_properties = stream.retriever.additional_query_properties + retriever = get_retriever(stream) + query_properties = retriever.additional_query_properties assert isinstance(query_properties, QueryProperties) assert query_properties.always_include_properties is None assert query_properties.property_chunking is None - properties_from_endpoint = stream.retriever.additional_query_properties.property_list + properties_from_endpoint = retriever.additional_query_properties.property_list assert isinstance(properties_from_endpoint, PropertiesFromEndpoint) assert properties_from_endpoint.property_field_path == ["name"] properties_from_endpoint_retriever = ( - stream.retriever.additional_query_properties.property_list.retriever + retriever.additional_query_properties.property_list.retriever ) assert isinstance(properties_from_endpoint_retriever, SimpleRetriever) properties_from_endpoint_requester = ( - stream.retriever.additional_query_properties.property_list.retriever.requester + retriever.additional_query_properties.property_list.retriever.requester ) assert isinstance(properties_from_endpoint_requester, HttpRequester) assert properties_from_endpoint_requester.url_base == "https://api.hubapi.com" From 1af22644ad638f8408587f506289a0e7646e7089 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:15:39 -0400 Subject: [PATCH 21/68] refactor regarding async stuff --- .../parsers/model_to_component_factory.py | 54 +++++++++-------- .../test_model_to_component_factory.py | 58 ++++++++----------- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3a25ba5b0..0703a2e5c 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -94,16 +94,13 @@ ClientSideIncrementalRecordFilterDecorator, ) from airbyte_cdk.sources.declarative.incremental import ( - ChildPartitionResumableFullRefreshCursor, ConcurrentCursorFactory, ConcurrentPerPartitionCursor, CursorFactory, DatetimeBasedCursor, DeclarativeCursor, GlobalSubstreamCursor, 
- PerPartitionCursor, PerPartitionWithGlobalCursor, - ResumableFullRefreshCursor, ) from airbyte_cdk.sources.declarative.interpolation import InterpolatedString from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping @@ -446,10 +443,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ZipfileDecoder as ZipfileDecoderModel, ) -from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import ( - COMPONENTS_MODULE_NAME, - SDM_COMPONENTS_MODULE_NAME, -) from airbyte_cdk.sources.declarative.partition_routers import ( CartesianProductStreamSlicer, GroupingPartitionRouter, @@ -508,7 +501,7 @@ RequestOptionsProvider, ) from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath -from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod from airbyte_cdk.sources.declarative.resolvers import ( ComponentMappingDefinition, ConfigComponentsResolver, @@ -1941,10 +1934,10 @@ def create_declarative_stream( primary_key = model.primary_key.__root__ if model.primary_key else None - stream_slicer = self._build_stream_slicer_from_partition_router( + partition_router = self._build_stream_slicer_from_partition_router( model.retriever, config, stream_name=model.name ) - concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) + concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -2019,7 +2012,9 @@ def create_declarative_stream( primary_key=primary_key, stream_slicer=combined_slicers, request_options_provider=request_options_provider, - stop_condition_cursor=concurrent_cursor if self._is_stop_condition_on_cursor(model) else None, + stop_condition_cursor=concurrent_cursor + if self._is_stop_condition_on_cursor(model) + else None, client_side_incremental_sync={"cursor": concurrent_cursor} if self._is_client_side_filtering_enabled(model) else None, @@ -2055,7 +2050,10 @@ def create_declarative_stream( schema_loader = DefaultSchemaLoader(config=config, parameters=options) if ( - (isinstance(combined_slicers, PartitionRouter) or isinstance(concurrent_cursor, ConcurrentCursor)) + ( + isinstance(combined_slicers, PartitionRouter) + or isinstance(concurrent_cursor, ConcurrentCursor) + ) and not is_parent and not self._emit_connector_builder_messages ): @@ -2067,7 +2065,9 @@ def create_declarative_stream( # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway + stream_name = model.name or "" stream_slicer = concurrent_cursor + cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a @@ -2076,10 +2076,13 @@ def create_declarative_stream( # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in # AsyncJobPartitionRouter. 
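A small, hypothetical illustration of the comment above (plain Python, none of these classes are CDK components): a second cursor of the same type is not interchangeable with the instance already wired into the partition router, because state recorded through the router never reaches the copy, which is why the retriever's own stream slicer is reused here.

    class Cursor:
        def __init__(self) -> None:
            self.closed_slices = []

    class Router:
        def __init__(self, cursor: Cursor) -> None:
            self.cursor = cursor  # the only instance whose state the router sees

    wired = Cursor()
    router = Router(wired)
    duplicate = Cursor()  # same type, different identity

    router.cursor.closed_slices.append("slice-1")
    assert wired.closed_slices == ["slice-1"]
    assert duplicate.closed_slices == []  # state never reaches the separate instance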
stream_slicer = retriever.stream_slicer + if isinstance(combined_slicers, Cursor): + cursor = combined_slicers elif isinstance(combined_slicers, PartitionRouter): stream_slicer = combined_slicers + else: + cursor = concurrent_cursor - stream_name = model.name or "" partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( stream_name, @@ -2089,16 +2092,17 @@ def create_declarative_stream( ), stream_slicer, ) - cursor = concurrent_cursor if concurrent_cursor else FinalStateCursor(stream_name, None, self._message_repository) + return DefaultStream( partition_generator=partition_generator, name=stream_name, json_schema=schema_loader.get_json_schema, primary_key=get_primary_key_from_stream(primary_key), - cursor_field=cursor.cursor_field.cursor_field_key if hasattr(cursor, "cursor_field") else "", # FIXME we should have the cursor field has part of the interface of cursor, - # FIXME we should have the cursor field has part of the interface of cursor + cursor_field=cursor.cursor_field.cursor_field_key + if hasattr(cursor, "cursor_field") + else "", # FIXME we should have the cursor field has part of the interface of cursor, logger=logging.getLogger(f"airbyte.{stream_name}"), - # FIXME this is a breaking change compared to the old implementation, + # FIXME this is a breaking change compared to the old implementation which used the source name instead cursor=cursor, ) @@ -2121,18 +2125,18 @@ def create_declarative_stream( parameters=model.parameters or {}, ) - def _is_stop_condition_on_cursor(self, model): + def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: return ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_data_feed") - and model.incremental_sync.is_data_feed + model.incremental_sync + and hasattr(model.incremental_sync, "is_data_feed") + and model.incremental_sync.is_data_feed ) - def _is_client_side_filtering_enabled(self, model): + def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: client_side_filtering_enabled = ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_client_side_incremental") - and model.incremental_sync.is_client_side_incremental + model.incremental_sync + and hasattr(model.incremental_sync, "is_client_side_incremental") + and model.incremental_sync.is_client_side_incremental ) return client_side_filtering_enabled diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 8d9b1f808..b543354f7 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -373,38 +373,33 @@ def test_full_config_stream(): assert isinstance(retriever.record_selector.extractor, DpathExtractor) assert isinstance(retriever.record_selector.extractor.decoder, JsonDecoder) - assert [ - fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path - ] == ["lists"] + assert [fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path] == [ + "lists" + ] assert isinstance(retriever.record_selector.record_filter, RecordFilter) assert ( - retriever.record_selector.record_filter._filter_interpolator.condition - == "{{ record['id'] > stream_state['id'] }}" + retriever.record_selector.record_filter._filter_interpolator.condition + == "{{ record['id'] > stream_state['id'] }}" ) assert isinstance(retriever.paginator, DefaultPaginator) 
assert isinstance(retriever.paginator.decoder, PaginationDecoderDecorator) assert retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" - assert ( - retriever.paginator.page_size_option.inject_into - == RequestOptionType.request_parameter - ) + assert retriever.paginator.page_size_option.inject_into == RequestOptionType.request_parameter assert isinstance(retriever.paginator.page_token_option, RequestPath) assert retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" assert retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" assert isinstance(retriever.paginator.pagination_strategy, CursorPaginationStrategy) - assert isinstance( - retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator - ) + assert isinstance(retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator) assert ( - retriever.paginator.pagination_strategy._cursor_value.string - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.string + == "{{ response._metadata.next }}" ) assert ( - retriever.paginator.pagination_strategy._cursor_value.default - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.default + == "{{ response._metadata.next }}" ) assert retriever.paginator.pagination_strategy.page_size == 10 @@ -416,24 +411,20 @@ def test_full_config_stream(): assert isinstance(retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) assert ( - retriever.request_option_provider.start_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.start_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - retriever.request_option_provider.start_time_option.field_name.eval( - config=input_config - ) - == "after" + retriever.request_option_provider.start_time_option.field_name.eval(config=input_config) + == "after" ) assert ( - retriever.request_option_provider.end_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.end_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - retriever.request_option_provider.end_time_option.field_name.eval( - config=input_config - ) - == "before" + retriever.request_option_provider.end_time_option.field_name.eval(config=input_config) + == "before" ) assert retriever.request_option_provider._partition_field_start.string == "start_time" assert retriever.request_option_provider._partition_field_end.string == "end_time" @@ -444,9 +435,7 @@ def test_full_config_stream(): assert isinstance( retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider ) - assert ( - retriever.requester.request_options_provider.request_parameters.get("unit") == "day" - ) + assert retriever.requester.request_options_provider.request_parameters.get("unit") == "day" checker = factory.create_component( model_type=CheckStreamModel, component_definition=manifest["check"], config=input_config @@ -1118,7 +1107,8 @@ def test_incremental_data_feed(): ) assert isinstance( - get_retriever(stream).paginator.pagination_strategy, StopConditionPaginationStrategyDecorator + get_retriever(stream).paginator.pagination_strategy, + StopConditionPaginationStrategyDecorator, ) @@ -4149,9 +4139,7 @@ def test_simple_retriever_with_query_properties(): assert property_chunking.property_limit_type == PropertyLimitType.property_count assert property_chunking.property_limit == 3 - merge_strategy = ( - 
retriever.additional_query_properties.property_chunking.record_merge_strategy - ) + merge_strategy = retriever.additional_query_properties.property_chunking.record_merge_strategy assert isinstance(merge_strategy, GroupByKey) assert merge_strategy.key == ["id"] From 8c771bb032450bfda33ad694282c0d949df238bf Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:21:25 -0400 Subject: [PATCH 22/68] partially fix mypy --- .../declarative/parsers/model_to_component_factory.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 0703a2e5c..c23c39cab 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2066,8 +2066,8 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer = concurrent_cursor - cursor = FinalStateCursor(stream_name, None, self._message_repository) + stream_slicer: StreamSlicer = concurrent_cursor + cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a @@ -2126,19 +2126,18 @@ def create_declarative_stream( ) def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: - return ( + return bool( model.incremental_sync and hasattr(model.incremental_sync, "is_data_feed") and model.incremental_sync.is_data_feed ) def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: - client_side_filtering_enabled = ( + return bool( model.incremental_sync and hasattr(model.incremental_sync, "is_client_side_incremental") and model.incremental_sync.is_client_side_incremental ) - return client_side_filtering_enabled def _build_stream_slicer_from_partition_router( self, From fb40a6b1ee9a937349a500512f4226bdc26750b4 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:24:19 -0400 Subject: [PATCH 23/68] fix mypy --- .../sources/declarative/parsers/model_to_component_factory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index c23c39cab..1bb75f90c 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -610,6 +610,7 @@ ) from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream +from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer as ConcurrentStreamSlicer from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter, @@ -2066,7 +2067,7 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer: StreamSlicer = concurrent_cursor + stream_slicer: 
ConcurrentStreamSlicer = concurrent_cursor cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method From 91752839ecb93294c20952159e952d460d0a5383 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:25:45 -0400 Subject: [PATCH 24/68] format --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 1bb75f90c..910324067 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -610,7 +610,9 @@ ) from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream -from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer as ConcurrentStreamSlicer +from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import ( + StreamSlicer as ConcurrentStreamSlicer, +) from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter, From 1d84a49637e9fd18fe3ce3e06198aab1c40bcf51 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:35:42 -0400 Subject: [PATCH 25/68] mypy --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 910324067..cc3f698fd 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2069,7 +2069,7 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer: ConcurrentStreamSlicer = concurrent_cursor + stream_slicer: ConcurrentStreamSlicer = concurrent_cursor if concurrent_cursor else SinglePartitionRouter() cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. 
Hence, the method From 2cba5ffca735be862789e89dadbd2366268b2c0c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:45:52 -0400 Subject: [PATCH 26/68] fix --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index cc3f698fd..3eeb7f8ad 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2069,7 +2069,9 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer: ConcurrentStreamSlicer = concurrent_cursor if concurrent_cursor else SinglePartitionRouter() + stream_slicer: ConcurrentStreamSlicer = ( + concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={}) + ) cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method From 8181c833aca515fa8583c2aa9d5a560b9ad2e67e Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 14:33:38 -0400 Subject: [PATCH 27/68] fix condition where we might override FinalStateCursor with null --- .../parsers/model_to_component_factory.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3eeb7f8ad..5cf89814f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1929,19 +1929,8 @@ def create_datetime_based_cursor( def create_declarative_stream( self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any ) -> Union[DeclarativeStream, AbstractStream]: - # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field - # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the - # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in - # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. - combined_slicers = self._merge_stream_slicers(model=model, config=config) - primary_key = model.primary_key.__root__ if model.primary_key else None - partition_router = self._build_stream_slicer_from_partition_router( - model.retriever, config, stream_name=model.name - ) - concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) - if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -2008,6 +1997,15 @@ def create_declarative_stream( model=model.file_uploader, config=config ) + # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field + # components if they exist into a single CartesianProductStreamSlicer. 
This is then passed back as an argument when constructing the + # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in + # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. + combined_slicers = self._merge_stream_slicers(model=model, config=config) + partition_router = self._build_stream_slicer_from_partition_router( + model.retriever, config, stream_name=model.name + ) + concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) retriever = self._create_component_from_model( model=model.retriever, config=config, @@ -2085,7 +2083,7 @@ def create_declarative_stream( cursor = combined_slicers elif isinstance(combined_slicers, PartitionRouter): stream_slicer = combined_slicers - else: + elif concurrent_cursor: cursor = concurrent_cursor partition_generator = StreamSlicerPartitionGenerator( From 10796293ea515cd3ca6f5ce17059cbfdc0fcddea Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 6 Aug 2025 09:52:24 -0400 Subject: [PATCH 28/68] supports_file_transfer --- .../sources/declarative/parsers/model_to_component_factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 67ec9f2ac..ec8f94478 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2097,6 +2097,7 @@ def create_declarative_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation, cursor=FinalStateCursor(stream_name, None, self._message_repository), + supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None From 7f643e4be73abf8b4d0dca61beab8a5a59e68efa Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 6 Aug 2025 10:27:46 -0400 Subject: [PATCH 29/68] format --- .../sources/declarative/parsers/model_to_component_factory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index ec8f94478..f672e06cd 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2097,7 +2097,8 @@ def create_declarative_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation, cursor=FinalStateCursor(stream_name, None, self._message_repository), - supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), + supports_file_transfer=hasattr(model, "file_uploader") + and bool(model.file_uploader), ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None From 11e3a35603cc2dbbf944d7d2842ab3f637a4473c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:03:26 -0400 Subject: [PATCH 30/68] format --- .../sources/declarative/concurrent_declarative_source.py | 2 +- .../stream_slicers/declarative_partition_generator.py | 4 +++- .../connector_builder/test_connector_builder_handler.py | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index b688ea23c..9a651514b 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -466,7 +466,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( stream_name=declarative_stream.name, - schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish + schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish retriever=retriever, message_repository=self.message_repository, max_records_limit=self._limits.max_records if self._limits else None, diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 8afb80813..4a511fe70 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -86,7 +86,9 @@ def read(self) -> Iterable[Record]: global total_record_counter if total_record_counter >= self._max_records_limit: return - for stream_data in self._retriever.read_records(self._schema_loader.get_json_schema(), self._stream_slice): + for stream_data in self._retriever.read_records( + self._schema_loader.get_json_schema(), self._stream_slice + ): if self._max_records_limit: if total_record_counter >= self._max_records_limit: break diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index bc846d526..c036c12d3 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -785,7 +785,11 @@ def test_config_update() -> None: "refresh_token": "a refresh token", } source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=manifest, emit_connector_builder_messages=True, + catalog=None, + config=config, + state=None, + source_config=manifest, + emit_connector_builder_messages=True, ) refresh_request_response = { From cee6157d13e32a8394792929d432d14fc510ea5b Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:06:41 -0400 Subject: [PATCH 31/68] [WIP still tests failing] Remove DeclarativeStream instantiation --- .../concurrent_partition_cursor.py | 31 +- .../parsers/model_to_component_factory.py | 231 ++--- .../substream_partition_router.py | 115 +-- .../per_partition_request_option_provider.py | 79 ++ .../resolvers/http_components_resolver.py | 5 +- .../declarative/retrievers/retriever.py | 4 + airbyte_cdk/sources/message/repository.py | 15 + .../file/file_stream_manifest.yaml | 2 +- ...t_file_stream_with_filename_extractor.yaml | 2 +- .../test_concurrent_perpartitioncursor.py | 48 +- .../test_per_partition_cursor_integration.py | 201 ++-- .../test_model_to_component_factory.py | 145 +-- .../test_grouping_partition_router.py | 124 ++- .../test_substream_partition_router.py | 890 +++++++----------- .../test_http_components_resolver.py | 8 +- ..._based_concurrent_stream_source_builder.py | 7 - 16 files changed, 889 insertions(+), 1018 
deletions(-) create mode 100644 airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 2a7cfd1d3..285b81956 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -11,6 +11,7 @@ from datetime import timedelta from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional +from airbyte_cdk.models import AirbyteStateMessage, AirbyteStateBlob, AirbyteStreamState, AirbyteStateType, StreamDescriptor from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import ( Timer, @@ -128,6 +129,7 @@ def __init__( # FIXME this is a temporary field the time of the migration from declarative cursors to concurrent ones self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided + self._synced_some_data = False @property def cursor_field(self) -> CursorField: @@ -168,8 +170,8 @@ def close_partition(self, partition: Partition) -> None: with self._lock: self._semaphore_per_partition[partition_key].acquire() if not self._use_global_cursor: - self._cursor_per_partition[partition_key].close_partition(partition=partition) cursor = self._cursor_per_partition[partition_key] + cursor.close_partition(partition=partition) if ( partition_key in self._partitions_done_generating_stream_slices and self._semaphore_per_partition[partition_key]._value == 0 @@ -213,8 +215,10 @@ def ensure_at_least_one_state_emitted(self) -> None: if not any( semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items() ): - self._global_cursor = self._new_global_cursor - self._lookback_window = self._timer.finish() + if self._synced_some_data: + # we only update those if we actually synced some data + self._global_cursor = self._new_global_cursor + self._lookback_window = self._timer.finish() self._parent_state = self._partition_router.get_stream_state() self._emit_state_message(throttle=False) @@ -458,6 +462,7 @@ def observe(self, record: Record) -> None: except ValueError: return + self._synced_some_data = True record_cursor = self._connector_state_converter.output_format( self._connector_state_converter.parse_value(record_cursor_value) ) @@ -541,3 +546,23 @@ def _get_cursor(self, record: Record) -> ConcurrentCursor: def limit_reached(self) -> bool: return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT + + @staticmethod + def get_parent_state(stream_state: Optional[StreamState], parent_stream_name: str) -> Optional[AirbyteStateMessage]: + return AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(parent_stream_name, None), + stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]) + ) + ) if stream_state and "parent_state" in stream_state else None + + @staticmethod + def get_global_state(stream_state: Optional[StreamState], parent_stream_name: str) -> Optional[AirbyteStateMessage]: + return AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(parent_stream_name, None), + stream_state=AirbyteStateBlob(stream_state["state"]) + ) + ) if stream_state and "state" in stream_state else None 
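Note on the two static helpers added above: they convert a child stream's checkpointed state into an AirbyteStateMessage for a given parent stream, and a later hunk in this same patch uses them in `_create_message_repository_substream_wrapper` to seed a ConnectorStateManager for the parent. The following is a minimal sketch of the intended lookup order; the "posts" parent stream name, the cursor field, and the timestamps are illustrative assumptions, not values from this patch.

    from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
    from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor

    # Illustrative child state, shaped like the per-partition state fixtures used in the test changes later in this patch.
    child_state = {
        "use_global_cursor": False,
        "states": [
            {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-25T00:00:00Z"}},
        ],
        "state": {"updated_at": "2024-01-25T00:00:00Z"},
        "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}},
    }

    # Prefer the checkpointed parent state for the "posts" parent stream...
    parent_state = ConcurrentPerPartitionCursor.get_parent_state(child_state, "posts")
    if not parent_state:
        # ...and fall back to the child's global state (the migration case handled in the factory change below).
        parent_state = ConcurrentPerPartitionCursor.get_global_state(child_state, "posts")

    # Seed a state manager for the parent stream, as done in the substream wrapper below.
    state_manager = ConnectorStateManager([parent_state] if parent_state else [])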
diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a42dde19b..156ec7961 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -33,7 +33,7 @@ from airbyte_cdk.connector_builder.models import ( LogMessage as ConnectorBuilderLogMessage, ) -from airbyte_cdk.models import FailureType, Level +from airbyte_cdk.models import FailureType, Level, AirbyteStateMessage, AirbyteStreamState, AirbyteStateBlob, AirbyteStateType, StreamDescriptor from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker @@ -500,6 +500,8 @@ InterpolatedRequestOptionsProvider, RequestOptionsProvider, ) +from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import \ + PerPartitionRequestOptionsProvider from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod from airbyte_cdk.sources.declarative.resolvers import ( @@ -583,6 +585,7 @@ MessageRepository, NoopMessageRepository, ) +from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository from airbyte_cdk.sources.streams.call_rate import ( APIBudget, FixedWindowCallRatePolicy, @@ -630,6 +633,7 @@ SchemaNormalizationModel.None_: TransformConfig.NoTransform, SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization, } +_NO_STREAM_SLICING = SinglePartitionRouter(parameters={}) class ModelToComponentFactory: @@ -1274,6 +1278,9 @@ def create_concurrent_cursor_from_datetime_based_cursor( f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" ) + # TODO validate and explain why we need to do this... 
+ component_definition["$parameters"] = component_definition.get("parameters", {}) + parameters = component_definition.get("parameters", component_definition.get("$parameters", {})) datetime_based_cursor_model = model_type.parse_obj(component_definition) if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): @@ -1283,17 +1290,17 @@ def create_concurrent_cursor_from_datetime_based_cursor( interpolated_cursor_field = InterpolatedString.create( datetime_based_cursor_model.cursor_field, - parameters=datetime_based_cursor_model.parameters or {}, + parameters=parameters, ) cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) interpolated_partition_field_start = InterpolatedString.create( datetime_based_cursor_model.partition_field_start or "start_time", - parameters=datetime_based_cursor_model.parameters or {}, + parameters=parameters, ) interpolated_partition_field_end = InterpolatedString.create( datetime_based_cursor_model.partition_field_end or "end_time", - parameters=datetime_based_cursor_model.parameters or {}, + parameters=parameters, ) slice_boundary_fields = ( @@ -1313,7 +1320,7 @@ def create_concurrent_cursor_from_datetime_based_cursor( interpolated_lookback_window = ( InterpolatedString.create( datetime_based_cursor_model.lookback_window, - parameters=datetime_based_cursor_model.parameters or {}, + parameters=parameters, ) if datetime_based_cursor_model.lookback_window else None @@ -1399,7 +1406,7 @@ def create_concurrent_cursor_from_datetime_based_cursor( interpolated_step = ( InterpolatedString.create( datetime_based_cursor_model.step, - parameters=datetime_based_cursor_model.parameters or {}, + parameters=parameters, ) if datetime_based_cursor_model.step else None @@ -1416,7 +1423,7 @@ def create_concurrent_cursor_from_datetime_based_cursor( # object which we want to keep agnostic of being low-code target = InterpolatedString( string=datetime_based_cursor_model.clamping.target, - parameters=datetime_based_cursor_model.parameters or {}, + parameters=parameters, ) evaluated_target = target.eval(config=config) match evaluated_target: @@ -1587,7 +1594,7 @@ def create_concurrent_cursor_from_perpartition_cursor( interpolated_cursor_field = InterpolatedString.create( datetime_based_cursor_model.cursor_field, - parameters=datetime_based_cursor_model.parameters or {}, + parameters=component_definition.get("parameters", component_definition.get("$parameters", {})), # FIXME validate and explain why we need to do this ) cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) @@ -1618,7 +1625,7 @@ def create_concurrent_cursor_from_perpartition_cursor( stream_namespace=stream_namespace, config=config, message_repository=NoopMessageRepository(), - stream_state_migrations=stream_state_migrations, + # stream_state_migrations=stream_state_migrations, # FIXME is it expected to run migration on per partition state too? 
) ) @@ -1931,6 +1938,10 @@ def create_declarative_stream( ) -> Union[DeclarativeStream, AbstractStream]: primary_key = model.primary_key.__root__ if model.primary_key else None + partition_router = self._build_stream_slicer_from_partition_router( + model.retriever, config, stream_name=model.name + ) + concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -1949,7 +1960,7 @@ def create_declarative_stream( else None ) - request_options_provider = DatetimeBasedRequestOptionsProvider( + datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( start_time_option=start_time_option, end_time_option=end_time_option, partition_field_start=cursor_model.partition_field_end, @@ -1957,9 +1968,17 @@ def create_declarative_stream( config=config, parameters=model.parameters or {}, ) + request_options_provider = ( + datetime_request_options_provider + if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) + else PerPartitionRequestOptionsProvider(partition_router, datetime_request_options_provider) + ) elif model.incremental_sync and isinstance( model.incremental_sync, IncrementingCountCursorModel ): + if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): + raise ValueError("PerPartition does not support per partition states because switching to global state is time based") + cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore start_time_option = ( @@ -1997,22 +2016,16 @@ def create_declarative_stream( model=model.file_uploader, config=config ) - # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field - # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the - # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in - # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 
- combined_slicers = self._merge_stream_slicers(model=model, config=config) - partition_router = self._build_stream_slicer_from_partition_router( - model.retriever, config, stream_name=model.name + stream_slicer: ConcurrentStreamSlicer = ( + partition_router if isinstance(concurrent_cursor, FinalStateCursor) else concurrent_cursor ) - concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) retriever = self._create_component_from_model( model=model.retriever, config=config, name=model.name, primary_key=primary_key, - stream_slicer=combined_slicers, request_options_provider=request_options_provider, + stream_slicer=stream_slicer, stop_condition_cursor=concurrent_cursor if self._is_stop_condition_on_cursor(model) else None, @@ -2023,6 +2036,8 @@ def create_declarative_stream( file_uploader=file_uploader, incremental_sync=model.incremental_sync, ) + if isinstance(retriever, AsyncRetriever): + stream_slicer = retriever.stream_slicer schema_loader: Union[ CompositeSchemaLoader, @@ -2050,43 +2065,9 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - if ( - ( - isinstance(combined_slicers, PartitionRouter) - or isinstance(concurrent_cursor, ConcurrentCursor) - ) - and not is_parent - and not self._emit_connector_builder_messages - ): - # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the - # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: - # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter - # * Streams without partition router but with cursor. This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition - # * Streams with both partition router and cursor - # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet - # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway - - stream_name = model.name or "" - stream_slicer: ConcurrentStreamSlicer = ( - concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={}) - ) - cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) - if isinstance(retriever, AsyncRetriever): - # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method - # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a - # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when - # built because the async retriever has a specific partition router which relies on this stream slicer. - # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in - # AsyncJobPartitionRouter. 
- stream_slicer = retriever.stream_slicer - if isinstance(combined_slicers, Cursor): - cursor = combined_slicers - elif isinstance(combined_slicers, PartitionRouter): - stream_slicer = combined_slicers - elif concurrent_cursor: - cursor = concurrent_cursor - - partition_generator = StreamSlicerPartitionGenerator( + stream_name = model.name or "" + return DefaultStream( + partition_generator=StreamSlicerPartitionGenerator( DeclarativePartitionFactory( stream_name, schema_loader, @@ -2094,40 +2075,18 @@ def create_declarative_stream( self._message_repository, ), stream_slicer, - ) - - return DefaultStream( - partition_generator=partition_generator, - name=stream_name, - json_schema=schema_loader.get_json_schema, - primary_key=get_primary_key_from_stream(primary_key), - cursor_field=cursor.cursor_field.cursor_field_key - if hasattr(cursor, "cursor_field") - else "", # FIXME we should have the cursor field has part of the interface of cursor, - logger=logging.getLogger(f"airbyte.{stream_name}"), - # FIXME this is a breaking change compared to the old implementation which used the source name instead - cursor=cursor, - supports_file_transfer=hasattr(model, "file_uploader") - and bool(model.file_uploader), - ) - - cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None - if model.state_migrations: - state_transformations = [ - self._create_component_from_model(state_migration, config, declarative_stream=model) - for state_migration in model.state_migrations - ] - else: - state_transformations = [] - return DeclarativeStream( - name=model.name or "", - primary_key=primary_key, - retriever=retriever, - schema_loader=schema_loader, - stream_cursor_field=cursor_field or "", - state_migrations=state_transformations, - config=config, - parameters=model.parameters or {}, + ), + name=stream_name, + json_schema=schema_loader.get_json_schema, + primary_key=get_primary_key_from_stream(primary_key), + cursor_field=concurrent_cursor.cursor_field.cursor_field_key + if hasattr(concurrent_cursor, "cursor_field") + else "", # FIXME we should have the cursor field has part of the interface of cursor, + logger=logging.getLogger(f"airbyte.{stream_name}"), + # FIXME this is a breaking change compared to the old implementation which used the source name instead + cursor=concurrent_cursor, + supports_file_transfer=hasattr(model, "file_uploader") + and bool(model.file_uploader), ) def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: @@ -2253,9 +2212,10 @@ def _build_concurrent_cursor( model: DeclarativeStreamModel, stream_slicer: Optional[PartitionRouter], config: Config, - ) -> Optional[StreamSlicer]: + ) -> Cursor: + stream_name = model.name or "" stream_state = self._connector_state_manager.get_stream_state( - stream_name=model.name or "", namespace=None + stream_name=stream_name, namespace=None ) if model.state_migrations: @@ -2275,20 +2235,20 @@ def _build_concurrent_cursor( state_manager=self._connector_state_manager, model_type=DatetimeBasedCursorModel, component_definition=model.incremental_sync.__dict__, - stream_name=model.name or "", + stream_name=stream_name, stream_namespace=None, config=config or {}, stream_state=stream_state, stream_state_migrations=state_transformations, partition_router=stream_slicer, - attempt_to_create_cursor_if_not_provided=True, + attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 
) elif model.incremental_sync: if type(model.incremental_sync) == IncrementingCountCursorModel: return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing model_type=IncrementingCountCursorModel, component_definition=model.incremental_sync.__dict__, - stream_name=model.name or "", + stream_name=stream_name, stream_namespace=None, config=config or {}, stream_state_migrations=state_transformations, @@ -2297,7 +2257,7 @@ def _build_concurrent_cursor( return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing model_type=type(model.incremental_sync), component_definition=model.incremental_sync.__dict__, - stream_name=model.name or "", + stream_name=stream_name, stream_namespace=None, config=config or {}, stream_state_migrations=state_transformations, @@ -2307,7 +2267,7 @@ def _build_concurrent_cursor( raise ValueError( f"Incremental sync of type {type(model.incremental_sync)} is not supported" ) - return None + return FinalStateCursor(stream_name, None, self._message_repository) def _merge_stream_slicers( self, model: DeclarativeStreamModel, config: Config @@ -3242,7 +3202,6 @@ def create_simple_retriever( *, name: str, primary_key: Optional[Union[str, List[str], List[List[str]]]], - stream_slicer: Optional[StreamSlicer], request_options_provider: Optional[RequestOptionsProvider] = None, stop_condition_cursor: Optional[Cursor] = None, client_side_incremental_sync: Optional[Dict[str, Any]] = None, @@ -3350,31 +3309,9 @@ def _get_url() -> str: config=config, ) - # Define cursor only if per partition or common incremental support is needed - cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None - - if ( - not isinstance(stream_slicer, DatetimeBasedCursor) - or type(stream_slicer) is not DatetimeBasedCursor - ): - # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). - # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement - # their own RequestOptionsProvider. 
However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's - # request_options_provider - request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) - elif not request_options_provider: + if not request_options_provider: request_options_provider = DefaultRequestOptionsProvider(parameters={}) - stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) - if self._should_limit_slices_fetched(): - stream_slicer = cast( - StreamSlicer, - StreamSlicerTestReadDecorator( - wrapped_slicer=stream_slicer, - maximum_number_of_slices=self._limit_slices_fetched or 5, - ), - ) - paginator = ( self._create_component_from_model( model=model.paginator, @@ -3423,9 +3360,9 @@ def _get_url() -> str: primary_key=primary_key, requester=requester, record_selector=record_selector, - stream_slicer=stream_slicer, + stream_slicer=_NO_STREAM_SLICING, request_option_provider=request_options_provider, - cursor=cursor, + cursor=None, config=config, ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, parameters=model.parameters or {}, @@ -3437,9 +3374,9 @@ def _get_url() -> str: primary_key=primary_key, requester=requester, record_selector=record_selector, - stream_slicer=stream_slicer, + stream_slicer=_NO_STREAM_SLICING, request_option_provider=request_options_provider, - cursor=cursor, + cursor=None, config=config, ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, additional_query_properties=query_properties, @@ -3827,16 +3764,47 @@ def create_substream_partition_router( def _create_message_repository_substream_wrapper( self, model: ParentStreamConfigModel, config: Config, **kwargs: Any ) -> Any: + # getting the parent state + child_state = self._connector_state_manager.get_stream_state(kwargs["stream_name"], None) # FIXME adding `stream_name` as a parameter means it will be a breaking change. 
I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag + if model.incremental_dependency and child_state: + parent_stream_name = model.stream.name or "" + parent_state = ConcurrentPerPartitionCursor.get_parent_state(child_state, parent_stream_name) + + if not parent_state: + # there are two migration cases: state value from child stream or from global state + parent_state = ConcurrentPerPartitionCursor.get_global_state(child_state, parent_stream_name) + + if not parent_state and not isinstance(parent_state, dict): + cursor_field = InterpolatedString.create( + model.stream.incremental_sync.cursor_field, + parameters=model.stream.incremental_sync.parameters or {}, + ).eval(config) + cursor_values = child_state.values() + if cursor_values: + parent_state = AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=parent_stream_name, namespace=None), + stream_state=AirbyteStateBlob({cursor_field: list(cursor_values)[0]}), + ), + ) + connector_state_manager = ConnectorStateManager([parent_state] if parent_state else []) + else: + connector_state_manager = ConnectorStateManager([]) + substream_factory = ModelToComponentFactory( + connector_state_manager=connector_state_manager, limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, limit_slices_fetched=self._limit_slices_fetched, emit_connector_builder_messages=self._emit_connector_builder_messages, disable_retries=self._disable_retries, disable_cache=self._disable_cache, - message_repository=LogAppenderMessageRepositoryDecorator( - {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, - self._message_repository, - self._evaluate_log_level(self._emit_connector_builder_messages), + message_repository=StateFilteringMessageRepository( + LogAppenderMessageRepositoryDecorator( + {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, + self._message_repository, + self._evaluate_log_level(self._emit_connector_builder_messages), + ), ), ) @@ -3910,7 +3878,7 @@ def create_http_components_resolver( config=config, name=f"{stream_name if stream_name else '__http_components_resolver'}", primary_key=None, - stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), + stream_slicer=SinglePartitionRouter(parameters={}), transformations=[], ) @@ -3930,6 +3898,7 @@ def create_http_components_resolver( return HttpComponentsResolver( retriever=retriever, + stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), config=config, components_mapping=components_mapping, parameters=model.parameters or {}, @@ -4155,7 +4124,7 @@ def create_grouping_partition_router( self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any ) -> GroupingPartitionRouter: underlying_router = self._create_component_from_model( - model=model.underlying_partition_router, config=config + model=model.underlying_partition_router, config=config, **kwargs, ) if model.group_size < 1: raise ValueError(f"Group size must be greater than 0, got {model.group_size}") diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 000beeff9..d29a3c2d3 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ 
b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -7,24 +7,39 @@ import json import logging from dataclasses import InitVar, dataclass -from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union +from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union, TypeVar import dpath import requests from airbyte_cdk.models import AirbyteMessage -from airbyte_cdk.models import Type as MessageType from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.requesters.request_option import ( RequestOption, RequestOptionType, ) -from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState -from airbyte_cdk.utils import AirbyteTracedException +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState if TYPE_CHECKING: - from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream + from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream + + +def iterate_with_last_flag(generator: Iterable[Partition]) -> Iterable[tuple[Partition, bool]]: + + iterator = iter(generator) + + try: + current = next(iterator) + except StopIteration: + return # Return an empty iterator + + for next_item in iterator: + yield current, False + current = next_item + + yield current, True @dataclass @@ -40,7 +55,7 @@ class ParentStreamConfig: incremental_dependency (bool): Indicates if the parent stream should be read incrementally. """ - stream: "DeclarativeStream" # Parent streams must be DeclarativeStream because we can't know which part of the stream slice is a partition for regular Stream + stream: "AbstractStream" parent_key: Union[InterpolatedString, str] partition_field: Union[InterpolatedString, str] config: Config @@ -176,59 +191,51 @@ def stream_slices(self) -> Iterable[StreamSlice]: for field_path in parent_stream_config.extra_fields ] - # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does - # not support either substreams or RFR, but something that needs to be considered once we do - for parent_record in parent_stream.read_only_records(): - parent_partition = None - # Skip non-records (eg AirbyteLogMessage) - if isinstance(parent_record, AirbyteMessage): - self.logger.warning( - f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." - ) - if parent_record.type == MessageType.RECORD: - parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record - else: - continue - elif isinstance(parent_record, Record): + for partition, is_last_slice in iterate_with_last_flag(parent_stream.generate_partitions()): + for parent_record, is_last_record_in_slice in iterate_with_last_flag(partition.read()): + parent_stream.cursor.observe(parent_record) parent_partition = ( parent_record.associated_slice.partition if parent_record.associated_slice else {} ) - parent_record = parent_record.data - elif not isinstance(parent_record, Mapping): - # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. 
Anything else is invalid - raise AirbyteTracedException( - message=f"Parent stream returned records as invalid type {type(parent_record)}" - ) - try: - partition_value = dpath.get( - parent_record, # type: ignore [arg-type] - parent_field, + record_data = parent_record.data + + try: + partition_value = dpath.get( + record_data, # type: ignore [arg-type] + parent_field, + ) + except KeyError: + # FIXME a log here would go a long way for debugging + continue + + # Add extra fields + extracted_extra_fields = self._extract_extra_fields(record_data, extra_fields) + + if parent_stream_config.lazy_read_pointer: + extracted_extra_fields = { + "child_response": self._extract_child_response( + record_data, + parent_stream_config.lazy_read_pointer, # type: ignore[arg-type] # lazy_read_pointer type handeled in __post_init__ of parent_stream_config + ), + **extracted_extra_fields, + } + + if is_last_record_in_slice: + parent_stream.cursor.close_partition(partition) + + yield StreamSlice( + partition={ + partition_field: partition_value, + "parent_slice": parent_partition or {}, + }, + cursor_slice={}, + extra_fields=extracted_extra_fields, ) - except KeyError: - continue - - # Add extra fields - extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields) - - if parent_stream_config.lazy_read_pointer: - extracted_extra_fields = { - "child_response": self._extract_child_response( - parent_record, - parent_stream_config.lazy_read_pointer, # type: ignore[arg-type] # lazy_read_pointer type handeled in __post_init__ of parent_stream_config - ), - **extracted_extra_fields, - } - - yield StreamSlice( - partition={ - partition_field: partition_value, - "parent_slice": parent_partition or {}, - }, - cursor_slice={}, - extra_fields=extracted_extra_fields, - ) + + parent_stream.cursor.ensure_at_least_one_state_emitted() + yield from [] def _extract_child_response( self, parent_record: Mapping[str, Any] | AirbyteMessage, pointer: List[InterpolatedString] @@ -414,7 +421,7 @@ def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: parent_state = {} for parent_config in self.parent_stream_configs: if parent_config.incremental_dependency: - parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.state) + parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.cursor.state) return parent_state @property diff --git a/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py b/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py new file mode 100644 index 000000000..04827b7fe --- /dev/null +++ b/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py @@ -0,0 +1,79 @@ +from typing import Optional, Mapping, Any, Union + +from airbyte_cdk.sources.declarative.partition_routers import PartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_options import RequestOptionsProvider +from airbyte_cdk.sources.types import StreamSlice, StreamState + + +class PerPartitionRequestOptionsProvider(RequestOptionsProvider): + def __init__(self, partition_router: PartitionRouter, cursor_provider: RequestOptionsProvider): + self._partition_router = partition_router + self._cursor_provider = cursor_provider + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + 
return self._partition_router.get_request_params( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_provider.get_request_params( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_provider.get_request_headers( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_provider.get_request_body_data( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_provider.get_request_body_json( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 6e85fc578..11952b963 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -17,6 +17,7 @@ ) from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer from airbyte_cdk.sources.types import Config @@ -28,12 +29,14 @@ class HttpComponentsResolver(ComponentsResolver): Attributes: retriever (Retriever): The retriever used to fetch data from an API. + stream_slicer (StreamSlicer): Defines how the data is sliced. config (Config): Configuration object for the resolver. components_mapping (List[ComponentMappingDefinition]): List of mappings to resolve. parameters (InitVar[Mapping[str, Any]]): Additional parameters for interpolation. 
""" retriever: Retriever + stream_slicer: StreamSlicer config: Config components_mapping: List[ComponentMappingDefinition] parameters: InitVar[Mapping[str, Any]] @@ -88,7 +91,7 @@ def resolve_components( """ kwargs = {"stream_template_config": stream_template_config} - for stream_slice in self.retriever.stream_slices(): + for stream_slice in self.stream_slicer.stream_slices(): for components_values in self.retriever.read_records( records_schema={}, stream_slice=stream_slice ): diff --git a/airbyte_cdk/sources/declarative/retrievers/retriever.py b/airbyte_cdk/sources/declarative/retrievers/retriever.py index 155de5782..c1cadc330 100644 --- a/airbyte_cdk/sources/declarative/retrievers/retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/retriever.py @@ -4,6 +4,7 @@ from abc import abstractmethod from typing import Any, Iterable, Mapping, Optional +from typing_extensions import deprecated from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import StreamSlice from airbyte_cdk.sources.streams.core import StreamData @@ -30,11 +31,13 @@ def read_records( """ @abstractmethod + @deprecated("Stream slicing is being moved to the stream level.") def stream_slices(self) -> Iterable[Optional[StreamSlice]]: """Returns the stream slices""" @property @abstractmethod + @deprecated("State management is being moved to the stream level.") def state(self) -> StreamState: """State getter, should return state in form that can serialized to a string and send to the output as a STATE AirbyteMessage. @@ -50,5 +53,6 @@ def state(self) -> StreamState: @state.setter @abstractmethod + @deprecated("State management is being moved to the stream level.") def state(self, value: StreamState) -> None: """State setter, accept state serialized by state getter.""" diff --git a/airbyte_cdk/sources/message/repository.py b/airbyte_cdk/sources/message/repository.py index 2fc156e8c..d806e9ac2 100644 --- a/airbyte_cdk/sources/message/repository.py +++ b/airbyte_cdk/sources/message/repository.py @@ -95,6 +95,21 @@ def consume_queue(self) -> Iterable[AirbyteMessage]: yield self._message_queue.popleft() +class StateFilteringMessageRepository(MessageRepository): + def __init__(self, decorated: MessageRepository) -> None: + self._decorated = decorated + + def emit_message(self, message: AirbyteMessage) -> None: + if message.type != Type.STATE: + self._decorated.emit_message(message) + + def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: + self._decorated.log_message(level, message_provider) + + def consume_queue(self) -> Iterable[AirbyteMessage]: + yield from self._decorated.consume_queue() + + class LogAppenderMessageRepositoryDecorator(MessageRepository): def __init__( self, diff --git a/unit_tests/sources/declarative/file/file_stream_manifest.yaml b/unit_tests/sources/declarative/file/file_stream_manifest.yaml index 256a3816c..7cba173c3 100644 --- a/unit_tests/sources/declarative/file/file_stream_manifest.yaml +++ b/unit_tests/sources/declarative/file/file_stream_manifest.yaml @@ -63,7 +63,7 @@ definitions: - "%Y-%m-%dT%H:%M:%SZ" - "%Y-%m-%dT%H:%M:%S%z" datetime_format: "%s" - cursor_field: "{{ parameters.get('cursor_field', 'updated_at') }}" + cursor_field: "updated_at" start_datetime: datetime: "{{ timestamp(config.get('start_date')) | int if config.get('start_date') else day_delta(-730, '%s') }}" start_time_option: diff --git a/unit_tests/sources/declarative/file/test_file_stream_with_filename_extractor.yaml 
b/unit_tests/sources/declarative/file/test_file_stream_with_filename_extractor.yaml index 1124b9ec0..b3469a070 100644 --- a/unit_tests/sources/declarative/file/test_file_stream_with_filename_extractor.yaml +++ b/unit_tests/sources/declarative/file/test_file_stream_with_filename_extractor.yaml @@ -63,7 +63,7 @@ definitions: - "%Y-%m-%dT%H:%M:%SZ" - "%Y-%m-%dT%H:%M:%S%z" datetime_format: "%s" - cursor_field: "{{ parameters.get('cursor_field', 'updated_at') }}" + cursor_field: "updated_at" start_datetime: datetime: "{{ timestamp(config.get('start_date')) | int if config.get('start_date') else day_delta(-730, '%s') }}" start_time_option: diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 13d1194dd..08ee227e3 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -295,10 +295,6 @@ } STREAM_NAME = "post_comment_votes" -CONFIG = { - "start_date": "2024-01-01T00:00:01Z", - "credentials": {"email": "email", "api_token": "api_token"}, -} SUBSTREAM_MANIFEST_NO_DEPENDENCY = deepcopy(SUBSTREAM_MANIFEST) # Disable incremental_dependency @@ -451,6 +447,10 @@ def _run_read( ).isoformat() + "Z" PARTITION_SYNC_START_TIME = "2024-01-02T00:00:00Z" +CONFIG = { + "start_date": START_DATE, + "credentials": {"email": "email", "api_token": "api_token"}, +} @pytest.mark.parametrize( @@ -1180,6 +1180,11 @@ def run_incremental_parent_state_test( f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, ), + # FIXME this is an interesting case. The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too + ( + f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", + {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, + ), # Fetch the first page of comments for post 1 ( "https://api.example.com/community/posts/1/comments?per_page=100", @@ -1473,6 +1478,11 @@ def run_incremental_parent_state_test( ] }, ), + # FIXME this is an interesting case. The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too + ( + f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", + {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, + ), # Fetch the first page of comments for post 1 ( "https://api.example.com/community/posts/1/comments?per_page=100", @@ -1614,6 +1624,11 @@ def run_incremental_parent_state_test( ] }, ), + # FIXME this is an interesting case. 
The previous solution would not update the parent state until `ensure_at_least_one_state_emitted`, but the concurrent cursor does so just before, which is probably fine too + ( + f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", + {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, + ), # Fetch the first page of comments for post 1 ( "https://api.example.com/community/posts/1/comments?per_page=100", @@ -1718,7 +1733,7 @@ def run_incremental_parent_state_test( ], } }, - "lookback_window": 1, + "lookback_window": 86400, # FIXME this run only syncs one record without a cursor value, hence it might make sense not to update the lookback window "use_global_cursor": False, "states": [ { @@ -2112,10 +2127,11 @@ def test_incremental_parent_state_migration( "states": [ { "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + "cursor": {"updated_at": START_DATE}, # FIXME this happens because the concurrent framework gets the start date as the max between the state value and the start value. In this case, the start value is higher } ], - "state": {}, + "lookback_window": 0, # FIXME the concurrent framework sets the lookback window to 0 as opposed to the declarative framework, which would not define it + # FIXME the concurrent framework does not set the global state if there is none, as opposed to the declarative framework, which would set an empty global state "use_global_cursor": False, "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, } @@ -2312,7 +2328,7 @@ def test_incremental_parent_state_no_slices( }, # Expected state { - "lookback_window": 1, + "lookback_window": 0, # FIXME maybe I'm wrong, but I don't think it makes sense to add a lookback window when the previous state did not have one "use_global_cursor": False, "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, "states": [ @@ -2577,16 +2593,22 @@ def test_incremental_parent_state_no_records( }, # Expected state { - # The global state, lookback window and the parent state are the same because sync failed for comment 20 + # The global state and lookback window are the same because sync failed for comment 20. + # The parent state will be updated up to the child records that were successful, i.e. up to post 2. + # Note that we still have an entry for the partition with post 2, but it is populated with the start date. 
"parent_state": { "post_comments": { "states": [ { "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + }, + { + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": START_DATE}, + }, ], - "state": {}, + "lookback_window": 0, "use_global_cursor": False, "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, } @@ -3873,7 +3895,7 @@ def test_given_all_partitions_finished_when_close_partition_then_final_state_emi assert len(final_state["states"]) == 2 assert final_state["state"]["updated_at"] == "2024-01-02T00:00:00Z" assert final_state["parent_state"] == {"posts": {"updated_at": "2024-01-06T00:00:00Z"}} - assert final_state["lookback_window"] == 1 + assert final_state["lookback_window"] == 86400 assert cursor._message_repository.emit_message.call_count == 2 assert mock_cursor.stream_slices.call_count == 2 # Called once for each partition diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 46b726758..5031a8018 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -19,6 +19,8 @@ StreamDescriptor, SyncMode, ) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource +from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import ( PerPartitionCursor, StreamSlice, @@ -26,6 +28,7 @@ from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.types import Record +from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder CURSOR_FIELD = "cursor_field" SYNC_MODE = SyncMode.incremental @@ -84,6 +87,7 @@ def build(self): manifest = { "version": "0.34.2", "type": "DeclarativeSource", + "concurrency_level": {"type": "ConcurrencyLevel", "default_concurrency": 1}, "check": {"type": "CheckStream", "stream_names": ["Rates"]}, "definitions": { "AnotherStream": { @@ -166,7 +170,25 @@ def build(self): def test_given_state_for_only_some_partition_when_stream_slices_then_create_slices_using_state_or_start_from_start_datetime(): - source = ManifestDeclarativeSource( + source = ConcurrentDeclarativeSource( + state=[ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="Rates"), + stream_state=AirbyteStateBlob({ + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-02-01"}, + } + ] + }), + ), + ), + ], + config={}, + catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).build(), source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2"]) .with_incremental_sync( @@ -181,21 +203,10 @@ def test_given_state_for_only_some_partition_when_stream_slices_then_create_slic .build() ) stream_instance = source.streams({})[0] - stream_instance.state = { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-02-01"}, - } - ] - } - slices = stream_instance.stream_slices( - 
sync_mode=SYNC_MODE, - stream_state={}, - ) + partitions = stream_instance.generate_partitions() - assert list(slices) == [ + assert list(map(lambda partition: partition.to_slice(), partitions)) == [ {"partition_field": "1", "start_time": "2022-02-01", "end_time": "2022-02-28"}, {"partition_field": "2", "start_time": "2022-01-01", "end_time": "2022-01-31"}, {"partition_field": "2", "start_time": "2022-02-01", "end_time": "2022-02-28"}, @@ -203,7 +214,10 @@ def test_given_state_for_only_some_partition_when_stream_slices_then_create_slic def test_given_record_for_partition_when_read_then_update_state(): - source = ManifestDeclarativeSource( + source = ConcurrentDeclarativeSource( + state=[], + config={}, + catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).build(), source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2"]) .with_incremental_sync( @@ -218,7 +232,7 @@ def test_given_record_for_partition_when_read_then_update_state(): .build() ) stream_instance = source.streams({})[0] - list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + partition = next(iter(stream_instance.generate_partitions())) stream_slice = StreamSlice( partition={"partition_field": "1"}, @@ -228,20 +242,15 @@ def test_given_record_for_partition_when_read_then_update_state(): SimpleRetriever, "_read_pages", side_effect=[ - [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_slice)] + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, "Rates", stream_slice)] ], ): - list( - stream_instance.read_records( - sync_mode=SYNC_MODE, - stream_slice=stream_slice, - stream_state={"states": []}, - cursor_field=CURSOR_FIELD, - ) - ) + for record in partition.read(): + stream_instance.cursor.observe(record) + stream_instance.cursor.close_partition(partition) - assert stream_instance.state == { - "state": {}, + assert stream_instance.cursor.state == { + "lookback_window": 0, "use_global_cursor": False, "states": [ { @@ -253,7 +262,10 @@ def test_given_record_for_partition_when_read_then_update_state(): def test_substream_without_input_state(): - test_source = ManifestDeclarativeSource( + test_source = ConcurrentDeclarativeSource( + state=[], + config={}, + catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).with_stream(ConfiguredAirbyteStreamBuilder().with_name("AnotherStream")).build(), source_config=ManifestBuilder() .with_substream_partition_router("AnotherStream") .with_incremental_sync( @@ -278,7 +290,6 @@ def test_substream_without_input_state(): ) stream_instance = test_source.streams({})[1] - parent_stream_slice = StreamSlice( partition={}, cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"} ) @@ -288,12 +299,13 @@ def test_substream_without_input_state(): SimpleRetriever, "_read_pages", side_effect=[ - [Record({"id": "1", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], - [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], + [Record({"id": "1", CURSOR_FIELD: "2022-01-15"}, "AnotherStream", parent_stream_slice)], + [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, "AnotherStream", parent_stream_slice)], ], ): - slices = list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) - assert list(slices) == [ + partition = list(map(lambda partition: partition.to_slice(), stream_instance.generate_partitions())) + + assert partition == [ StreamSlice( partition={ "parent_id": "1", @@ -334,7 +346,10 @@ def 
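The rewritten tests above capture the migration pattern used throughout this file: the concurrent source takes state, config and catalog up front, slices come from the stream's partitions instead of stream_slices(), and state is advanced by feeding records to the cursor and closing the partition. A hedged sketch of both steps as reusable helpers (the CDK imports mirror the ones added in this hunk; source_config is whatever ManifestBuilder produces above):

from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource
from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder


def build_concurrent_stream(source_config, stream_name="Rates", state=None):
    # The concurrent source needs the configured catalog and the initial state
    # at construction time instead of assigning state onto the stream later.
    catalog = (
        CatalogBuilder()
        .with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name))
        .build()
    )
    source = ConcurrentDeclarativeSource(
        state=state or [], config={}, catalog=catalog, source_config=source_config
    )
    return source.streams({})[0]


def sync_first_partition(stream):
    # Read a partition, let the cursor observe every record, then close the
    # partition so its state is committed; cursor.state then has the
    # {"lookback_window": ..., "use_global_cursor": ..., "states": [...]} shape.
    partition = next(iter(stream.generate_partitions()))
    for record in partition.read():
        stream.cursor.observe(record)
    stream.cursor.close_partition(partition)
    return stream.cursor.state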
test_partition_limitation(caplog): We verify that the state only retains information for the two most recent partitions. """ stream_name = "Rates" - source = ManifestDeclarativeSource( + source = ConcurrentDeclarativeSource( + state=[], + config={}, + catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).build(), source_config=ManifestBuilder() .with_list_partition_router( stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] @@ -437,7 +452,7 @@ def test_partition_limitation(caplog): # Use caplog to capture logs with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): output = list(source.read(logger, {}, catalog, initial_state)) # Check if the warning was logged @@ -475,7 +490,37 @@ def test_perpartition_with_fallback(caplog): This test also checks that the appropriate warning logs are emitted when the partition limit is exceeded. """ stream_name = "Rates" - source = ManifestDeclarativeSource( + catalog = CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)).build() + initial_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=stream_name, namespace=None), + stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-01-01"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-02"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-01-03"}, + }, + ] + } + ), + ), + ) + ] + source = ConcurrentDeclarativeSource( + state=initial_state, + config={}, + catalog=catalog, source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2", "3", "4", "5", "6"]) .with_incremental_sync( @@ -568,49 +613,12 @@ def test_perpartition_with_fallback(caplog): ], ] - configured_stream = ConfiguredAirbyteStream( - stream=AirbyteStream( - name=stream_name, - json_schema={}, - supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], - ), - sync_mode=SyncMode.incremental, - destination_sync_mode=DestinationSyncMode.append, - ) - catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) - - initial_state = [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name=stream_name, namespace=None), - stream_state=AirbyteStateBlob( - { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-01-01"}, - }, - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-01-02"}, - }, - { - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-01-03"}, - }, - ] - } - ), - ), - ) - ] logger = MagicMock() # Use caplog to capture logs with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): output = list(source.read(logger, {}, catalog, initial_state)) # Check if the warnings were logged @@ -645,9 +653,15 @@ def test_per_partition_cursor_within_limit(caplog): 
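Both limit tests drive the same knob: once more partitions are tracked than DEFAULT_MAX_PARTITIONS_NUMBER allows, the oldest per-partition cursors are dropped and, in the fallback test, the global cursor takes over. A hedged sketch of how the cap is lowered for a test run, mirroring the patch.object calls used in these tests:

from unittest.mock import MagicMock, patch

from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor
from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever


def read_with_small_partition_cap(source, catalog, state, records_list, cap=2):
    # Shrink the per-partition state budget so eviction/fallback is exercised
    # with only a handful of partitions.
    logger = MagicMock()
    with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list):
        with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", cap):
            return list(source.read(logger, {}, catalog, state))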
This test also checks that no warning logs are emitted when the partition limit is not exceeded. """ - source = ManifestDeclarativeSource( + stream_name = "Rates" + catalog = CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)).build() + initial_state = {} + source = ConcurrentDeclarativeSource( + state=initial_state, + config={}, + catalog=catalog, source_config=ManifestBuilder() - .with_list_partition_router("Rates", "partition_field", ["1", "2", "3"]) + .with_list_partition_router(stream_name, "partition_field", ["1", "2", "3"]) .with_incremental_sync( "Rates", start_datetime="2022-01-01", @@ -661,75 +675,62 @@ def test_per_partition_cursor_within_limit(caplog): ) partition_slices = [ - StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 4) + StreamSlice(partition={"partition_field": str(i)}, cursor_slice=cursor_slice) for i in range(1, 4) for cursor_slice in [{"start_time": "2022-01-01", "end_time": "2022-01-31"}, {"start_time": "2022-02-01", "end_time": "2022-02-28"}, {"start_time": "2022-03-01", "end_time": "2022-03-31"}] ] records_list = [ [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_name, partition_slices[0] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, stream_name, partition_slices[1] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, stream_name, partition_slices[2] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, stream_name, partition_slices[3] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, stream_name, partition_slices[4] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, stream_name, partition_slices[5] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, partition_slices[2] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, stream_name, partition_slices[6] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, partition_slices[2] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, stream_name, partition_slices[7] ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, partition_slices[2] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, stream_name, partition_slices[8] ) ], ] - - configured_stream = ConfiguredAirbyteStream( - stream=AirbyteStream( - name="Rates", - json_schema={}, - supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], - ), - sync_mode=SyncMode.incremental, - destination_sync_mode=DestinationSyncMode.append, - ) - catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) - - initial_state = {} logger = MagicMock() # Use caplog to capture logs with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): 
+ with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): output = list(source.read(logger, {}, catalog, initial_state)) # Since the partition limit is not exceeded, we expect no warnings diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index b543354f7..4c81f959e 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -198,6 +198,8 @@ "start_time": "2024-01-01T00:00:00.000+00:00", "end_time": "2025-01-01T00:00:00.000+00:00", } +CONFIG_START_TIME = ab_datetime_parse(input_config["start_time"]) +CONFIG_END_TIME = ab_datetime_parse(input_config["end_time"]) def get_factory_with_parameters( @@ -715,13 +717,14 @@ def test_create_substream_partition_router(): model_type=SubstreamPartitionRouterModel, component_definition=partition_router_manifest, config=input_config, + stream_name="child_stream", ) assert isinstance(partition_router, SubstreamPartitionRouter) parent_stream_configs = partition_router.parent_stream_configs assert len(parent_stream_configs) == 2 - assert isinstance(parent_stream_configs[0].stream, DeclarativeStream) - assert isinstance(parent_stream_configs[1].stream, DeclarativeStream) + assert isinstance(parent_stream_configs[0].stream, DefaultStream) + assert isinstance(parent_stream_configs[1].stream, DefaultStream) assert partition_router.parent_stream_configs[0].parent_key.eval({}) == "id" assert partition_router.parent_stream_configs[0].partition_field.eval({}) == "repository_id" @@ -919,22 +922,21 @@ def test_stream_with_incremental_and_retriever_with_partition_router(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert isinstance(stream.retriever, SimpleRetriever) - assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) + assert isinstance(stream, DefaultStream) + retriever = get_retriever(stream) + assert isinstance(retriever, SimpleRetriever) + assert isinstance(stream.cursor, ConcurrentPerPartitionCursor) - datetime_stream_slicer = ( - stream.retriever.stream_slicer._per_partition_cursor._cursor_factory.create() + concurrent_cursor = ( + stream.cursor._cursor_factory.create({}, timedelta(0)) # FIXME should we be allowed to pass `None` instead of `{}` ) - assert isinstance(datetime_stream_slicer, DatetimeBasedCursor) - assert isinstance(datetime_stream_slicer._start_datetime, MinMaxDatetime) - assert datetime_stream_slicer._start_datetime.datetime.string == "{{ config['start_time'] }}" - assert isinstance(datetime_stream_slicer._end_datetime, MinMaxDatetime) - assert datetime_stream_slicer._end_datetime.datetime.string == "{{ config['end_time'] }}" - assert datetime_stream_slicer.step == "P10D" - assert datetime_stream_slicer.cursor_field.string == "created" + assert isinstance(concurrent_cursor, ConcurrentCursor) + assert concurrent_cursor._start == CONFIG_START_TIME + assert concurrent_cursor._end_provider() == CONFIG_END_TIME + assert concurrent_cursor._slice_range == timedelta(days=10) + assert concurrent_cursor.cursor_field.cursor_field_key == "created" - list_stream_slicer = stream.retriever.stream_slicer._partition_router + list_stream_slicer = stream.cursor._partition_router assert isinstance(list_stream_slicer, ListPartitionRouter) assert list_stream_slicer.values == ["airbyte", 
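The factory test now asserts against the concurrent cursor's internals instead of a DatetimeBasedCursor: the manifest's start/end datetimes surface as _start and _end_provider(), the P10D step becomes a timedelta slice range, and the cursor field is exposed through cursor_field.cursor_field_key. A hedged helper restating those checks (the underscored attributes are private and taken straight from the assertions above, so they may change):

from datetime import timedelta

from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse


def assert_cursor_matches_manifest(concurrent_cursor, config, step_days, cursor_field_key):
    assert concurrent_cursor._start == ab_datetime_parse(config["start_time"])
    assert concurrent_cursor._end_provider() == ab_datetime_parse(config["end_time"])
    assert concurrent_cursor._slice_range == timedelta(days=step_days)
    assert concurrent_cursor.cursor_field.cursor_field_key == cursor_field_key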
"airbyte-cloud"] assert list_stream_slicer._cursor_field.string == "a_key" @@ -1011,9 +1013,10 @@ def test_stream_with_incremental_and_async_retriever_with_partition_router(use_l config=connector_config, ) - assert isinstance(stream, DeclarativeStream) - assert isinstance(stream.retriever, AsyncRetriever) - stream_slicer = stream.retriever.stream_slicer.stream_slicer + assert isinstance(stream, DefaultStream) + retriever = get_retriever(stream) + assert isinstance(retriever, AsyncRetriever) + stream_slicer = retriever.stream_slicer.stream_slicer assert isinstance(stream_slicer, ConcurrentPerPartitionCursor) assert stream_slicer.state == stream_state import json @@ -1273,12 +1276,13 @@ def test_client_side_incremental_with_partition_router(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) + retriever = get_retriever(stream) assert isinstance( - stream.retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator + retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator ) - assert stream.retriever.record_selector.transform_before_filtering == True + assert retriever.record_selector.transform_before_filtering == True assert isinstance( - stream.retriever.record_selector.record_filter._cursor, + retriever.record_selector.record_filter._cursor, ConcurrentPerPartitionCursor, ) @@ -2462,8 +2466,8 @@ def test_default_schema_loader(self): "values": "{{config['repos']}}", "cursor_field": "a_key", }, - PerPartitionWithGlobalCursor, - DeclarativeStream, + ConcurrentPerPartitionCursor, + DefaultStream, id="test_create_simple_retriever_with_incremental_and_partition_router", ), pytest.param( @@ -2488,8 +2492,8 @@ def test_default_schema_loader(self): "cursor_field": "b_key", }, ], - PerPartitionWithGlobalCursor, - DeclarativeStream, + ConcurrentPerPartitionCursor, + DefaultStream, id="test_create_simple_retriever_with_partition_routers_multiple_components", ), pytest.param( @@ -2545,12 +2549,12 @@ def test_merge_incremental_and_partition_router( assert isinstance(stream_slicer, expected_router_type) if incremental and partition_router: - assert isinstance(retriever.stream_slicer, PerPartitionWithGlobalCursor) + assert isinstance(stream.cursor, ConcurrentPerPartitionCursor) if isinstance(partition_router, list) and len(partition_router) > 1: assert isinstance( - retriever.stream_slicer._partition_router, CartesianProductStreamSlicer + stream.cursor._partition_router, CartesianProductStreamSlicer ) - assert len(retriever.stream_slicer._partition_router.stream_slicers) == len( + assert len(stream.cursor._partition_router.stream_slicers) == len( partition_router ) @@ -2597,7 +2601,6 @@ def test_simple_retriever_emit_log_messages(): assert retriever.log_formatter(response) == connector_builder_factory._get_log_formatter( None, retriever.name )(response) - assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) def test_create_page_increment(): @@ -2948,10 +2951,6 @@ def test_use_request_options_provider_for_datetime_based_cursor(): assert retriever.primary_key == "id" assert retriever.name == "Test" - assert isinstance(retriever.cursor, DatetimeBasedCursor) - assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) - assert isinstance(retriever.stream_slicer.wrapped_slicer, DatetimeBasedCursor) - assert isinstance(retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) assert ( retriever.request_option_provider.start_time_option.inject_into @@ -2973,79 +2972,6 
@@ def test_use_request_options_provider_for_datetime_based_cursor(): assert retriever.request_option_provider._partition_field_end.string == "end_time" -def test_do_not_separate_request_options_provider_for_non_datetime_based_cursor(): - # This test validates that we're only using the dedicated RequestOptionsProvider for DatetimeBasedCursor and using the - # existing StreamSlicer for other types of cursors and partition routing. Once everything is migrated this test can be deleted - - config = { - "start_time": "2024-01-01T00:00:00.000000+0000", - } - - simple_retriever_model = { - "type": "SimpleRetriever", - "record_selector": { - "type": "RecordSelector", - "extractor": { - "type": "DpathExtractor", - "field_path": [], - }, - }, - "requester": { - "type": "HttpRequester", - "name": "list", - "url_base": "orange.com", - "path": "/v1/api", - }, - } - - datetime_based_cursor = DatetimeBasedCursor( - start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), - step="P5D", - cursor_field="updated_at", - datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", - cursor_granularity="PT1S", - is_compare_strictly=True, - config=config, - parameters={}, - ) - - list_partition_router = ListPartitionRouter( - cursor_field="id", - values=["four", "oh", "eight"], - config=config, - parameters={}, - ) - - per_partition_cursor = PerPartitionCursor( - cursor_factory=CursorFactory(lambda: datetime_based_cursor), - partition_router=list_partition_router, - ) - - connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) - retriever = connector_builder_factory.create_component( - model_type=SimpleRetrieverModel, - component_definition=simple_retriever_model, - config={}, - name="Test", - primary_key="id", - stream_slicer=per_partition_cursor, - request_options_provider=None, - transformations=[], - ) - - assert isinstance(retriever, SimpleRetriever) - assert retriever.primary_key == "id" - assert retriever.name == "Test" - - assert isinstance(retriever.cursor, PerPartitionCursor) - assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) - assert isinstance(retriever.stream_slicer.wrapped_slicer, PerPartitionCursor) - - assert isinstance(retriever.request_option_provider, PerPartitionCursor) - assert isinstance(retriever.request_option_provider._cursor_factory, CursorFactory) - assert retriever.request_option_provider._partition_router == list_partition_router - - def test_use_default_request_options_provider(): simple_retriever_model = { "type": "SimpleRetriever", @@ -3080,8 +3006,6 @@ def test_use_default_request_options_provider(): assert retriever.primary_key == "id" assert retriever.name == "Test" - assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) - assert isinstance(retriever.stream_slicer.wrapped_slicer, SinglePartitionRouter) assert isinstance(retriever.request_option_provider, DefaultRequestOptionsProvider) @@ -3946,6 +3870,7 @@ def test_create_grouping_partition_router_with_underlying_router(): model_type=GroupingPartitionRouterModel, component_definition=partition_router_manifest, config=input_config, + stream_name="child_stream", ) # Test the created partition router @@ -3956,7 +3881,7 @@ def test_create_grouping_partition_router_with_underlying_router(): # Test the underlying partition router parent_stream_configs = partition_router.underlying_partition_router.parent_stream_configs assert len(parent_stream_configs) == 1 - assert isinstance(parent_stream_configs[0].stream, DeclarativeStream) + assert 
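The recurring pattern in these factory tests: cursor-related assertions move from retriever.stream_slicer onto stream.cursor, while retriever-level assertions go through the get_retriever helper referenced above (a local test helper, not a CDK API). A hedged sketch of that split:

from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream


def assert_concurrent_stream_shape(stream, get_retriever):
    # Cursor-level expectations now live on the stream itself.
    assert isinstance(stream, DefaultStream)
    assert isinstance(stream.cursor, ConcurrentPerPartitionCursor)
    # Retriever-level expectations (record selector, request options, ...)
    # are checked on the retriever extracted from the stream.
    retriever = get_retriever(stream)
    assert retriever.record_selector is not None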
isinstance(parent_stream_configs[0].stream, DefaultStream) assert parent_stream_configs[0].parent_key.eval({}) == "id" assert parent_stream_configs[0].partition_field.eval({}) == "repository_id" @@ -4004,6 +3929,7 @@ def test_create_grouping_partition_router_invalid_group_size(): model_type=GroupingPartitionRouterModel, component_definition=partition_router_manifest, config=input_config, + stream_name="child_stream", ) @@ -4055,6 +3981,7 @@ def test_create_grouping_partition_router_substream_with_request_option(): model_type=GroupingPartitionRouterModel, component_definition=partition_router_manifest, config=input_config, + stream_name="child_stream", ) diff --git a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py index 9bea606e4..acd4421e6 100644 --- a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py @@ -14,11 +14,19 @@ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( ParentStreamConfig, ) -from airbyte_cdk.sources.types import StreamSlice +from airbyte_cdk.sources.types import StreamSlice, Record from unit_tests.sources.declarative.partition_routers.test_substream_partition_router import ( MockStream, parent_slices, ) # Reuse MockStream and parent_slices +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import \ + InMemoryPartition + +_EMPTY_SLICE = StreamSlice(partition={}, cursor_slice={}) + + +def _build_records_for_slice(records: List[Mapping[str, Any]], _slice: StreamSlice): + return [Record(record, "stream_name", _slice) for record in records] @pytest.fixture @@ -30,17 +38,26 @@ def mock_config(): def mock_underlying_router(mock_config): """Fixture for a simple underlying router with predefined slices and extra fields.""" parent_stream = MockStream( - slices=[{}], # Single empty slice, parent_partition will be {} - records=[ - {"board_id": 0, "name": "Board 0", "owner": "User0"}, - { - "board_id": 0, - "name": "Board 0 Duplicate", - "owner": "User0 Duplicate", - }, # Duplicate board_id - ] - + [{"board_id": i, "name": f"Board {i}", "owner": f"User{i}"} for i in range(1, 5)], - name="mock_parent", + [ + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice( + [ + {"board_id": 0, "name": "Board 0", "owner": "User0"}, + { + "board_id": 0, + "name": "Board 0 Duplicate", + "owner": "User0 Duplicate", + }, + ] # Duplicate board_id + + [{"board_id": i, "name": f"Board {i}", "owner": f"User{i}"} for i in range(1, 5)], + _EMPTY_SLICE + ) + ) + ], + "first_stream" ) return SubstreamPartitionRouter( parent_stream_configs=[ @@ -62,13 +79,36 @@ def mock_underlying_router(mock_config): def mock_underlying_router_with_parent_slices(mock_config): """Fixture with varied parent slices for testing non-empty parent_slice.""" parent_stream = MockStream( - slices=parent_slices, # [{"slice": "first"}, {"slice": "second"}, {"slice": "third"}] - records=[ - {"board_id": 1, "name": "Board 1", "owner": "User1", "slice": "first"}, - {"board_id": 2, "name": "Board 2", "owner": "User2", "slice": "second"}, - {"board_id": 3, "name": "Board 3", "owner": "User3", "slice": "third"}, + [ + InMemoryPartition( + "partition_1", + "first_stream", + parent_slices[0], + _build_records_for_slice( + [{"board_id": 1, "name": "Board 1", "owner": 
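The rewritten fixtures repeat the same boilerplate: wrap plain dicts into Record objects tied to a slice, put them into an InMemoryPartition, and hand the partitions to MockStream. A hedged convenience wrapper for the single-partition case (MockStream and InMemoryPartition are the test doubles used in these files, not CDK classes):

from airbyte_cdk.sources.types import Record, StreamSlice
from unit_tests.sources.declarative.partition_routers.test_substream_partition_router import MockStream
from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import (
    InMemoryPartition,
)

_EMPTY = StreamSlice(partition={}, cursor_slice={})


def single_partition_parent(raw_records, stream_name="first_stream", _slice=_EMPTY):
    # Same wrapping as _build_records_for_slice above, packaged into a
    # MockStream holding a single in-memory partition for that slice.
    records = [Record(record, stream_name, _slice) for record in raw_records]
    return MockStream(
        [InMemoryPartition("partition_name", stream_name, _slice, records)],
        stream_name,
    )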
"User1", "slice": "first"}], + parent_slices[0], + ), + ), + InMemoryPartition( + "partition_2", + "first_stream", + parent_slices[1], + _build_records_for_slice( + [{"board_id": 2, "name": "Board 2", "owner": "User2", "slice": "second"}], + parent_slices[1], + ), + ), + InMemoryPartition( + "partition_3", + "first_stream", + parent_slices[2], + _build_records_for_slice( + [{"board_id": 3, "name": "Board 3", "owner": "User3", "slice": "third"}], + parent_slices[2], + ), + ), ], - name="mock_parent", + "first_stream" ) return SubstreamPartitionRouter( parent_stream_configs=[ @@ -173,9 +213,15 @@ def test_stream_slices_grouping( def test_stream_slices_empty_underlying_router(mock_config): """Test behavior when the underlying router yields no slices.""" parent_stream = MockStream( - slices=[{}], - records=[], - name="mock_parent", + [ + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + [], + ) + ], + "first_stream" ) underlying_router = SubstreamPartitionRouter( parent_stream_configs=[ @@ -315,13 +361,22 @@ def test_set_initial_state_delegation(mock_config, mock_underlying_router): def test_stream_slices_extra_fields_varied(mock_config): """Test grouping with varied extra fields across partitions.""" parent_stream = MockStream( - slices=[{}], - records=[ - {"board_id": 1, "name": "Board 1", "owner": "User1"}, - {"board_id": 2, "name": "Board 2"}, # Missing owner - {"board_id": 3, "owner": "User3"}, # Missing name + [ + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice( + [ + {"board_id": 1, "name": "Board 1", "owner": "User1"}, + {"board_id": 2, "name": "Board 2"}, # Missing owner + {"board_id": 3, "owner": "User3"}, # Missing name + ], + _EMPTY_SLICE, + ), + ) ], - name="mock_parent", + "first_stream" ) underlying_router = SubstreamPartitionRouter( parent_stream_configs=[ @@ -362,9 +417,18 @@ def test_stream_slices_extra_fields_varied(mock_config): def test_grouping_with_complex_partitions_and_extra_fields(mock_config): """Test grouping with partitions containing multiple keys and extra fields.""" parent_stream = MockStream( - slices=[{}], - records=[{"board_id": i, "extra": f"extra_{i}", "name": f"Board {i}"} for i in range(3)], - name="mock_parent", + [ + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice( + [{"board_id": i, "extra": f"extra_{i}", "name": f"Board {i}"} for i in range(3)], + _EMPTY_SLICE, + ), + ) + ], + "first_stream" ) underlying_router = SubstreamPartitionRouter( parent_stream_configs=[ diff --git a/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py index 80c8f1e10..113679347 100644 --- a/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py @@ -3,23 +3,15 @@ # import logging -from functools import partial -from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union +from typing import Any, Iterable, List, Mapping, Optional +from unittest.mock import Mock import pytest as pytest +from airbyte_protocol_dataclasses.models import AirbyteStream -from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, SyncMode, Type -from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream -from airbyte_cdk.sources.declarative.incremental import ( - 
ChildPartitionResumableFullRefreshCursor, - ResumableFullRefreshCursor, -) -from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import ( - CursorFactory, - PerPartitionCursor, - StreamSlice, -) -from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor, ConcurrentCursorFactory +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, FinalStateCursor +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import StreamSlice from airbyte_cdk.sources.declarative.partition_routers import ( CartesianProductStreamSlicer, ListPartitionRouter, @@ -33,8 +25,15 @@ RequestOptionType, ) from airbyte_cdk.sources.streams.checkpoint import Cursor +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import \ + CustomFormatConcurrentStreamStateConverter from airbyte_cdk.sources.types import Record -from airbyte_cdk.utils import AirbyteTracedException +from airbyte_cdk.utils.datetime_helpers import ab_datetime_now, ab_datetime_parse +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import \ + InMemoryPartition parent_records = [{"id": 1, "data": "data1"}, {"id": 2, "data": "data2"}] more_records = [ @@ -49,173 +48,71 @@ data_second_parent_slice = [{"id": 2, "slice": "second", "data": "C"}] data_third_parent_slice = [] all_parent_data = data_first_parent_slice + data_second_parent_slice + data_third_parent_slice -parent_slices = [{"slice": "first"}, {"slice": "second"}, {"slice": "third"}] +parent_slices = [ + StreamSlice(partition={"slice": "first"}, cursor_slice={}), + StreamSlice(partition={"slice": "second"}, cursor_slice={}), + StreamSlice(partition={"slice": "third"}, cursor_slice={}), +] +parent_slices_with_cursor = [ + StreamSlice(partition={"slice": "first"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"}), + StreamSlice(partition={"slice": "second"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"}), + StreamSlice(partition={"slice": "third"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"}), +] second_parent_stream_slice = [StreamSlice(partition={"slice": "second_parent"}, cursor_slice={})] data_first_parent_slice_with_cursor = [ - {"id": 0, "slice": "first", "data": "A", "cursor": "first_cursor_0"}, - {"id": 1, "slice": "first", "data": "B", "cursor": "first_cursor_1"}, + {"id": 0, "slice": "first", "data": "A", "cursor": "2021-01-01"}, + {"id": 1, "slice": "first", "data": "B", "cursor": "2021-01-02"}, ] data_second_parent_slice_with_cursor = [ - {"id": 2, "slice": "second", "data": "C", "cursor": "second_cursor_2"} + {"id": 2, "slice": "second", "data": "C", "cursor": "2022-01-01"} ] all_parent_data_with_cursor = ( data_first_parent_slice_with_cursor + data_second_parent_slice_with_cursor ) +_EMPTY_SLICE = StreamSlice(partition={}, cursor_slice={}) +_ANY_STREAM = None + +def _build_records_for_slice(records: List[Mapping[str, Any]], _slice: StreamSlice): + return [Record(record, "stream_name", _slice) for record in records] -class MockStream(DeclarativeStream): - def __init__(self, slices, records, name, cursor_field="", cursor=None): - self.config = {} 
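Parent slices are now genuine StreamSlice objects instead of plain dicts: the partition keys and the cursor window live in separate fields, but a StreamSlice can still be indexed like a single mapping, which is what the tests below rely on when they read values such as mock_slices[0]["start_time"]. A small sketch:

from airbyte_cdk.sources.types import StreamSlice

s = StreamSlice(
    partition={"slice": "first"},
    cursor_slice={"start": "2021-01-01", "end": "2023-01-01"},
)
assert s["slice"] == "first"        # looked up in the partition
assert s["start"] == "2021-01-01"   # looked up in the cursor window
assert s.partition == {"slice": "first"}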
- self._slices = slices - self._records = records - self._stream_cursor_field = ( - InterpolatedString.create(cursor_field, parameters={}) - if isinstance(cursor_field, str) - else cursor_field - ) + +class MockStream(AbstractStream): + def __init__(self, partitions, name, cursor_field="", cursor=None): + self._partitions = partitions + self._stream_cursor_field = cursor_field self._name = name self._state = {"states": []} - self._cursor = cursor + self._cursor = cursor if cursor else FinalStateCursor(self._name, None, Mock()) + + def generate_partitions(self) -> Iterable[Partition]: + list(self._cursor.stream_slices()) + return self._partitions @property def name(self) -> str: return self._name @property - def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: - return "id" - - @property - def state(self) -> Mapping[str, Any]: - return self._state - - @state.setter - def state(self, value: Mapping[str, Any]) -> None: - self._state = value - - @property - def is_resumable(self) -> bool: - return bool(self._cursor) - - def get_cursor(self) -> Optional[Cursor]: - return self._cursor - - def stream_slices( - self, - *, - sync_mode: SyncMode, - cursor_field: List[str] = None, - stream_state: Mapping[str, Any] = None, - ) -> Iterable[Optional[StreamSlice]]: - for s in self._slices: - if isinstance(s, StreamSlice): - yield s - else: - yield StreamSlice(partition=s, cursor_slice={}) - - def read_records( - self, - sync_mode: SyncMode, - cursor_field: List[str] = None, - stream_slice: Mapping[str, Any] = None, - stream_state: Mapping[str, Any] = None, - ) -> Iterable[Mapping[str, Any]]: - # The parent stream's records should always be read as full refresh - assert sync_mode == SyncMode.full_refresh - - if not stream_slice: - result = self._records - else: - result = [ - Record(data=r, associated_slice=stream_slice, stream_name=self.name) - for r in self._records - if r["slice"] == stream_slice["slice"] - ] - - yield from result - - # Update the state only after reading the full slice - cursor_field = self._stream_cursor_field.eval(config=self.config) - if stream_slice and cursor_field and result: - self._state["states"].append( - {cursor_field: result[-1][cursor_field], "partition": stream_slice["slice"]} - ) + def cursor_field(self) -> Optional[str]: + return self._stream_cursor_field def get_json_schema(self) -> Mapping[str, Any]: return {} + def as_airbyte_stream(self) -> AirbyteStream: + raise NotImplementedError() -class MockIncrementalStream(MockStream): - def __init__(self, slices, records, name, cursor_field="", cursor=None, date_ranges=None): - super().__init__(slices, records, name, cursor_field, cursor) - if date_ranges is None: - date_ranges = [] - self._date_ranges = date_ranges - self._state = {} - - def read_records( - self, - sync_mode: SyncMode, - cursor_field: List[str] = None, - stream_slice: Mapping[str, Any] = None, - stream_state: Mapping[str, Any] = None, - ) -> Iterable[Mapping[str, Any]]: - results = [ - record - for record in self._records - if stream_slice["start_time"] <= record["updated_at"] <= stream_slice["end_time"] - ] - print(f"about to emit {results}") - yield from results - print(f"setting state to {stream_slice}") - self._state = stream_slice - - -class MockResumableFullRefreshStream(MockStream): - def __init__( - self, - slices, - name, - cursor_field="", - cursor=None, - record_pages: Optional[List[List[Mapping[str, Any]]]] = None, - ): - super().__init__(slices, [], name, cursor_field, cursor) - if record_pages: - self._record_pages = 
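When a test does not care about parent-state checkpointing, the rewritten MockStream falls back to a FinalStateCursor, which only emits a terminal state message. The same default spelled out on its own (the Mock stands in for a message repository; arguments are stream name, namespace, message repository, as in the constructor above):

from unittest.mock import Mock

from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor

noop_cursor = FinalStateCursor("first_stream", None, Mock())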
record_pages - else: - self._record_pages = [] - self._state: MutableMapping[str, Any] = {} - - def read_records( - self, - sync_mode: SyncMode, - cursor_field: List[str] = None, - stream_slice: Mapping[str, Any] = None, - stream_state: Mapping[str, Any] = None, - ) -> Iterable[Mapping[str, Any]]: - page_number = self.state.get("next_page_token") or 1 - yield from self._record_pages[page_number - 1] - - cursor = self.get_cursor() - if page_number < len(self._record_pages): - cursor.close_slice( - StreamSlice(cursor_slice={"next_page_token": page_number + 1}, partition={}) - ) - else: - cursor.close_slice( - StreamSlice(cursor_slice={"__ab_full_refresh_sync_complete": True}, partition={}) - ) + def log_stream_sync_configuration(self) -> None: + raise NotImplementedError() @property - def state(self) -> Mapping[str, Any]: - cursor = self.get_cursor() - return cursor.get_stream_state() if cursor else {} + def cursor(self) -> Cursor: + return self._cursor - @state.setter - def state(self, value: Mapping[str, Any]) -> None: - self._state = value + def check_availability(self) -> StreamAvailability: + raise NotImplementedError() @pytest.mark.parametrize( @@ -225,7 +122,7 @@ def state(self, value: Mapping[str, Any]) -> None: ( [ ParentStreamConfig( - stream=MockStream([{}], [], "first_stream"), + stream=MockStream([InMemoryPartition("partition_name", "first_stream", _EMPTY_SLICE, [])], "first_stream"), parent_key="id", partition_field="first_stream_id", parameters={}, @@ -237,7 +134,17 @@ def state(self, value: Mapping[str, Any]) -> None: ( [ ParentStreamConfig( - stream=MockStream([{}], parent_records, "first_stream"), + stream=MockStream( + [ + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice(parent_records, _EMPTY_SLICE) + ) + ], + "first_stream" + ), parent_key="id", partition_field="first_stream_id", parameters={}, @@ -252,7 +159,29 @@ def state(self, value: Mapping[str, Any]) -> None: ( [ ParentStreamConfig( - stream=MockStream(parent_slices, all_parent_data, "first_stream"), + stream=MockStream( + [ + InMemoryPartition( + "partition_1", + "first_stream", + parent_slices[0], + _build_records_for_slice(data_first_parent_slice, parent_slices[0]), + ), + InMemoryPartition( + "partition_2", + "first_stream", + parent_slices[1], + _build_records_for_slice(data_second_parent_slice, parent_slices[1]), + ), + InMemoryPartition( + "partition_3", + "first_stream", + parent_slices[2], + _build_records_for_slice(data_third_parent_slice, parent_slices[2]) + ), + ], + "first_stream" + ), parent_key="id", partition_field="first_stream_id", parameters={}, @@ -270,11 +199,26 @@ def state(self, value: Mapping[str, Any]) -> None: ParentStreamConfig( stream=MockStream( [ - StreamSlice(partition=p, cursor_slice={"start": 0, "end": 1}) - for p in parent_slices + InMemoryPartition( + "partition_1", + "first_stream", + parent_slices[0], + _build_records_for_slice(data_first_parent_slice, parent_slices[0]) + ), + InMemoryPartition( + "partition_2", + "first_stream", + parent_slices[1], + _build_records_for_slice(data_second_parent_slice, parent_slices[1]) + ), + InMemoryPartition( + "partition_3", + "first_stream", + parent_slices[2], + _build_records_for_slice(data_third_parent_slice, parent_slices[2]) + ), ], - all_parent_data, - "first_stream", + "first_stream" ), parent_key="id", partition_field="first_stream_id", @@ -292,9 +236,27 @@ def state(self, value: Mapping[str, Any]) -> None: [ ParentStreamConfig( stream=MockStream( - parent_slices, - 
data_first_parent_slice + data_second_parent_slice, - "first_stream", + [ + InMemoryPartition( + "partition_1", + "first_stream", + parent_slices[0], + _build_records_for_slice(data_first_parent_slice, parent_slices[0]) + ), + InMemoryPartition( + "partition_2", + "first_stream", + parent_slices[1], + _build_records_for_slice(data_second_parent_slice, parent_slices[1]) + ), + InMemoryPartition( + "partition_3", + "first_stream", + parent_slices[2], + [], + ), + ], + "first_stream" ), parent_key="id", partition_field="first_stream_id", @@ -302,7 +264,17 @@ def state(self, value: Mapping[str, Any]) -> None: config={}, ), ParentStreamConfig( - stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + stream=MockStream( + [ + InMemoryPartition( + "partition_1", + "first_stream", + second_parent_stream_slice[0], + _build_records_for_slice(more_records, second_parent_stream_slice[0]) + ), + ], + "first_stream" + ), parent_key="id", partition_field="second_stream_id", parameters={}, @@ -321,7 +293,15 @@ def state(self, value: Mapping[str, Any]) -> None: [ ParentStreamConfig( stream=MockStream( - [{}], [{"id": 0}, {"id": 1}, {"_id": 2}, {"id": 3}], "first_stream" + [ + InMemoryPartition( + "partition_1", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice([{"id": 0}, {"id": 1}, {"_id": 2}, {"id": 3}], _EMPTY_SLICE) + ), + ], + "first_stream" ), parent_key="id", partition_field="first_stream_id", @@ -339,9 +319,15 @@ def state(self, value: Mapping[str, Any]) -> None: [ ParentStreamConfig( stream=MockStream( - [{}], - [{"a": {"b": 0}}, {"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 3}}], - "first_stream", + [ + InMemoryPartition( + "partition_1", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice([{"a": {"b": 0}}, {"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 3}}], _EMPTY_SLICE) + ), + ], + "first_stream" ), parent_key="a/b", partition_field="first_stream_id", @@ -361,8 +347,8 @@ def state(self, value: Mapping[str, Any]) -> None: "test_single_parent_slices_no_records", "test_single_parent_slices_with_records", "test_with_parent_slices_and_records", - "test_multiple_parent_streams", "test_cursor_values_are_removed_from_parent_slices", + "test_multiple_parent_streams", "test_missed_parent_key", "test_dpath_extraction", ], @@ -383,25 +369,6 @@ def test_substream_partition_router(parent_stream_configs, expected_slices): assert slices == expected_slices -def test_substream_partition_router_invalid_parent_record_type(): - partition_router = SubstreamPartitionRouter( - parent_stream_configs=[ - ParentStreamConfig( - stream=MockStream([{}], [list()], "first_stream"), - parent_key="id", - partition_field="first_stream_id", - parameters={}, - config={}, - ) - ], - parameters={}, - config={}, - ) - - with pytest.raises(AirbyteTracedException): - _ = [s for s in partition_router.stream_slices()] - - @pytest.mark.parametrize( "initial_state, expected_parent_state", [ @@ -503,8 +470,12 @@ def test_set_initial_state(initial_state, expected_parent_state): and sets the appropriate parent stream state. 
""" parent_stream = MockStream( - slices=[{}], - records=[], + InMemoryPartition( + "partition_1", + "stream_name", + _EMPTY_SLICE, + [], + ), name="parent_stream_name1", cursor_field="parent_stream_cursor", ) @@ -639,11 +610,7 @@ def test_request_option( partition_router = SubstreamPartitionRouter( parent_stream_configs=[ ParentStreamConfig( - stream=MockStream( - parent_slices, - data_first_parent_slice + data_second_parent_slice, - "first_stream", - ), + stream=_ANY_STREAM, parent_key="id", partition_field="first_stream_id", parameters={}, @@ -651,7 +618,7 @@ def test_request_option( request_option=parent_stream_request_parameters[0], ), ParentStreamConfig( - stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + stream=_ANY_STREAM, parent_key="id", partition_field="second_stream_id", parameters={}, @@ -676,10 +643,57 @@ def test_request_option( ( ParentStreamConfig( stream=MockStream( - parent_slices, - all_parent_data_with_cursor, + [ + InMemoryPartition( + "partition_1", + "first_stream", + parent_slices_with_cursor[0], + _build_records_for_slice(data_first_parent_slice_with_cursor, parent_slices_with_cursor[0]) + ), + InMemoryPartition( + "partition_2", + "first_stream", + parent_slices_with_cursor[1], + _build_records_for_slice(data_second_parent_slice_with_cursor, parent_slices_with_cursor[1]) + ), + InMemoryPartition( + "partition_3", + "first_stream", + parent_slices_with_cursor[2], + _build_records_for_slice([], parent_slices_with_cursor[2]) + ), + ], "first_stream", - cursor_field="cursor", + cursor=ConcurrentPerPartitionCursor( + cursor_factory=ConcurrentCursorFactory( + lambda stream_state, runtime_lookback_window: ConcurrentCursor( + stream_name="first_stream", + stream_namespace=None, + stream_state=stream_state, + message_repository=Mock(), + connector_state_manager=Mock(), + connector_state_converter=CustomFormatConcurrentStreamStateConverter("%Y-%m-%d"), + cursor_field=CursorField("cursor"), + slice_boundary_fields=("start", "end"), + start=ab_datetime_parse("2021-01-01").to_datetime(), + end_provider=lambda: ab_datetime_parse("2023-01-01").to_datetime(), + lookback_window=runtime_lookback_window, + ), + ), + partition_router=ListPartitionRouter( + values=["first", "second", "third"], + cursor_field="slice", + config={}, + parameters={}, + ), + stream_name="first_stream", + stream_namespace=None, + stream_state={}, + message_repository=Mock(), + connector_state_manager=Mock(), + connector_state_converter=CustomFormatConcurrentStreamStateConverter("%Y-%m-%d"), + cursor_field=CursorField("cursor"), + ) ), parent_key="id", partition_field="first_stream_id", @@ -687,14 +701,14 @@ def test_request_option( config={}, incremental_dependency=True, ), - { - "first_stream": { - "states": [ - {"cursor": "first_cursor_1", "partition": "first"}, - {"cursor": "second_cursor_2", "partition": "second"}, - ] - } - }, + {"first_stream": {"lookback_window": 0, + "states": [{"cursor": {"cursor": "2021-01-02"}, + "partition": {"slice": "first"}}, + {"cursor": {"cursor": "2022-01-01"}, + "partition": {"slice": "second"}}, + {"cursor": {"cursor": "2021-01-01"}, + "partition": {"slice": "third"}}], + "use_global_cursor": False}}, ), ], ids=[ @@ -784,59 +798,6 @@ def test_request_params_interpolation_for_parent_stream( assert partition_router.get_request_params(stream_slice=stream_slice) == expected_request_params -def test_given_record_is_airbyte_message_when_stream_slices_then_use_record_data(): - parent_slice = {} - partition_router = SubstreamPartitionRouter( - 
parent_stream_configs=[ - ParentStreamConfig( - stream=MockStream( - [parent_slice], - [ - AirbyteMessage( - type=Type.RECORD, - record=AirbyteRecordMessage( - data={"id": "record value"}, emitted_at=0, stream="stream" - ), - ) - ], - "first_stream", - ), - parent_key="id", - partition_field="partition_field", - parameters={}, - config={}, - ) - ], - parameters={}, - config={}, - ) - - slices = list(partition_router.stream_slices()) - assert slices == [{"partition_field": "record value", "parent_slice": parent_slice}] - - -def test_given_record_is_record_object_when_stream_slices_then_use_record_data(): - parent_slice = {} - partition_router = SubstreamPartitionRouter( - parent_stream_configs=[ - ParentStreamConfig( - stream=MockStream( - [parent_slice], [Record({"id": "record value"}, {})], "first_stream" - ), - parent_key="id", - partition_field="partition_field", - parameters={}, - config={}, - ) - ], - parameters={}, - config={}, - ) - - slices = list(partition_router.stream_slices()) - assert slices == [{"partition_field": "record value", "parent_slice": parent_slice}] - - def test_substream_using_incremental_parent_stream(): mock_slices = [ StreamSlice( @@ -857,15 +818,30 @@ def test_substream_using_incremental_parent_stream(): partition_router = SubstreamPartitionRouter( parent_stream_configs=[ ParentStreamConfig( - stream=MockIncrementalStream( - slices=mock_slices, - records=[ - Record({"id": "may_record_0", "updated_at": "2024-05-15"}, mock_slices[0]), - Record({"id": "may_record_1", "updated_at": "2024-05-16"}, mock_slices[0]), - Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, mock_slices[1]), - Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, mock_slices[1]), + stream=MockStream( + [ + InMemoryPartition( + "partition_1", + "first_stream", + mock_slices[0], + [ + Record({"id": "may_record_0", "updated_at": "2024-05-15"}, "first_stream", mock_slices[0]), + Record({"id": "may_record_1", "updated_at": "2024-05-16"}, "first_stream", mock_slices[0]), + ] + ), + InMemoryPartition( + "partition_1", + "first_stream", + mock_slices[1], + [ + Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, "first_stream", + mock_slices[1]), + Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, "first_stream", + mock_slices[1]), + ] + ), ], - name="first_stream", + "first_stream" ), parent_key="id", partition_field="partition_field", @@ -904,25 +880,55 @@ def test_substream_checkpoints_after_each_parent_partition(): ] expected_parent_state = [ - {"first_stream": {}}, - {"first_stream": {}}, - {"first_stream": {"start_time": "2024-04-27", "end_time": "2024-05-27"}}, - {"first_stream": {"start_time": "2024-04-27", "end_time": "2024-05-27"}}, - {"first_stream": {"start_time": "2024-05-27", "end_time": "2024-06-27"}}, + {"first_stream": {"updated_at": mock_slices[0]["start_time"]}}, + {"first_stream": {"updated_at": "2024-05-16"}}, + {"first_stream": {"updated_at": "2024-05-16"}}, + {"first_stream": {"updated_at": "2024-06-16"}}, + {"first_stream": {"updated_at": "2024-06-16"}}, ] partition_router = SubstreamPartitionRouter( parent_stream_configs=[ ParentStreamConfig( - stream=MockIncrementalStream( - slices=mock_slices, - records=[ - Record({"id": "may_record_0", "updated_at": "2024-05-15"}, mock_slices[0]), - Record({"id": "may_record_1", "updated_at": "2024-05-16"}, mock_slices[0]), - Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, mock_slices[1]), - Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, mock_slices[1]), + stream=MockStream( + [ + 
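The incremental-dependency cases build the parent cursor by hand. A hedged helper factoring out the ConcurrentCursorFactory construction shown above (argument names mirror the test verbatim; the Mocks stand in for the message repository and connector state manager, and the %Y-%m-%d converter matches the date-only cursor values used in these fixtures):

from unittest.mock import Mock

from airbyte_cdk.sources.declarative.incremental import ConcurrentCursorFactory
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
)
from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse


def date_cursor_factory(stream_name, cursor_field, start, end):
    # The factory is called with the partition's saved state and a runtime
    # lookback window and must return a fresh ConcurrentCursor per partition.
    converter = CustomFormatConcurrentStreamStateConverter("%Y-%m-%d")
    return ConcurrentCursorFactory(
        lambda stream_state, runtime_lookback_window: ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            message_repository=Mock(),
            connector_state_manager=Mock(),
            connector_state_converter=converter,
            cursor_field=CursorField(cursor_field),
            slice_boundary_fields=("start", "end"),
            start=ab_datetime_parse(start).to_datetime(),
            end_provider=lambda: ab_datetime_parse(end).to_datetime(),
            lookback_window=runtime_lookback_window,
        )
    )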
InMemoryPartition( + "partition_1", + "first_stream", + mock_slices[0], + [ + Record({"id": "may_record_0", "updated_at": "2024-05-15"}, "first_stream", + mock_slices[0]), + Record({"id": "may_record_1", "updated_at": "2024-05-16"}, "first_stream", + mock_slices[0]), + ] + ), + InMemoryPartition( + "partition_1", + "first_stream", + mock_slices[1], + [ + Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, "first_stream", + mock_slices[1]), + Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, "first_stream", + mock_slices[1]), + ] + ), ], - name="first_stream", + "first_stream", + "updated_at", + ConcurrentCursor( + stream_name="first_stream", + stream_namespace=None, + stream_state={}, + message_repository=Mock(), + connector_state_manager=Mock(), + connector_state_converter=CustomFormatConcurrentStreamStateConverter("%Y-%m-%d"), + cursor_field=CursorField("updated_at"), + slice_boundary_fields=("start_time", "end_time"), + start=ab_datetime_parse(mock_slices[0]["start_time"]).to_datetime(), + end_provider=lambda: ab_datetime_parse("2023-01-01").to_datetime(), + ), ), incremental_dependency=True, parent_key="id", @@ -940,273 +946,7 @@ def test_substream_checkpoints_after_each_parent_partition(): assert actual_slice == expected_slices[expected_counter] assert partition_router.get_stream_state() == expected_parent_state[expected_counter] expected_counter += 1 - assert partition_router.get_stream_state() == expected_parent_state[expected_counter] - - -@pytest.mark.parametrize( - "use_incremental_dependency", - [ - pytest.param(False, id="test_resumable_full_refresh_stream_without_parent_checkpoint"), - pytest.param( - True, - id="test_resumable_full_refresh_stream_with_use_incremental_dependency_for_parent_checkpoint", - ), - ], -) -def test_substream_using_resumable_full_refresh_parent_stream(use_incremental_dependency): - mock_slices = [ - StreamSlice(cursor_slice={}, partition={}), - StreamSlice(cursor_slice={"next_page_token": 2}, partition={}), - StreamSlice(cursor_slice={"next_page_token": 3}, partition={}), - ] - - expected_slices = [ - {"partition_field": "makoto_yuki", "parent_slice": {}}, - {"partition_field": "yukari_takeba", "parent_slice": {}}, - {"partition_field": "mitsuru_kirijo", "parent_slice": {}}, - {"partition_field": "akihiko_sanada", "parent_slice": {}}, - {"partition_field": "junpei_iori", "parent_slice": {}}, - {"partition_field": "fuuka_yamagishi", "parent_slice": {}}, - ] - - expected_parent_state = [ - {"persona_3_characters": {}}, - {"persona_3_characters": {}}, - {"persona_3_characters": {"next_page_token": 2}}, - {"persona_3_characters": {"next_page_token": 2}}, - {"persona_3_characters": {"next_page_token": 3}}, - {"persona_3_characters": {"next_page_token": 3}}, - {"persona_3_characters": {"__ab_full_refresh_sync_complete": True}}, - ] - - partition_router = SubstreamPartitionRouter( - parent_stream_configs=[ - ParentStreamConfig( - stream=MockResumableFullRefreshStream( - slices=[StreamSlice(partition={}, cursor_slice={})], - cursor=ResumableFullRefreshCursor(parameters={}), - record_pages=[ - [ - Record( - data={"id": "makoto_yuki"}, - associated_slice=mock_slices[0], - stream_name="test_stream", - ), - Record( - data={"id": "yukari_takeba"}, - associated_slice=mock_slices[0], - stream_name="test_stream", - ), - ], - [ - Record( - data={"id": "mitsuru_kirijo"}, - associated_slice=mock_slices[1], - stream_name="test_stream", - ), - Record( - data={"id": "akihiko_sanada"}, - associated_slice=mock_slices[1], - stream_name="test_stream", - ), - ], 
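With incremental_dependency=True the parent's state now advances as each parent partition is exhausted, so it can be asserted slice by slice. A compact restatement of the loop used in the checkpointing test above:

def assert_checkpoints_per_parent_partition(partition_router, expected_slices, expected_parent_state):
    for index, actual_slice in enumerate(partition_router.stream_slices()):
        assert actual_slice == expected_slices[index]
        # The parent cursor is closed as soon as its partition is drained,
        # so get_stream_state() already reflects that partition's records.
        assert partition_router.get_stream_state() == expected_parent_state[index]
    assert partition_router.get_stream_state() == expected_parent_state[-1]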
- [ - Record( - data={"id": "junpei_iori"}, - associated_slice=mock_slices[2], - stream_name="test_stream", - ), - Record( - data={"id": "fuuka_yamagishi"}, - associated_slice=mock_slices[2], - stream_name="test_stream", - ), - ], - ], - name="persona_3_characters", - ), - incremental_dependency=use_incremental_dependency, - parent_key="id", - partition_field="partition_field", - parameters={}, - config={}, - ) - ], - parameters={}, - config={}, - ) - - expected_counter = 0 - for actual_slice in partition_router.stream_slices(): - assert actual_slice == expected_slices[expected_counter] - if use_incremental_dependency: - assert partition_router.get_stream_state() == expected_parent_state[expected_counter] - expected_counter += 1 - if use_incremental_dependency: - assert partition_router.get_stream_state() == expected_parent_state[expected_counter] - - -@pytest.mark.parametrize( - "use_incremental_dependency", - [ - pytest.param( - False, id="test_substream_resumable_full_refresh_stream_without_parent_checkpoint" - ), - pytest.param( - True, - id="test_substream_resumable_full_refresh_stream_with_use_incremental_dependency_for_parent_checkpoint", - ), - ], -) -def test_substream_using_resumable_full_refresh_parent_stream_slices(use_incremental_dependency): - mock_parent_slices = [ - StreamSlice(cursor_slice={}, partition={}), - StreamSlice(cursor_slice={"next_page_token": 2}, partition={}), - StreamSlice(cursor_slice={"next_page_token": 3}, partition={}), - ] - - expected_parent_slices = [ - {"partition_field": "makoto_yuki", "parent_slice": {}}, - {"partition_field": "yukari_takeba", "parent_slice": {}}, - {"partition_field": "mitsuru_kirijo", "parent_slice": {}}, - {"partition_field": "akihiko_sanada", "parent_slice": {}}, - {"partition_field": "junpei_iori", "parent_slice": {}}, - {"partition_field": "fuuka_yamagishi", "parent_slice": {}}, - ] - - expected_parent_state = [ - {"persona_3_characters": {}}, - {"persona_3_characters": {}}, - {"persona_3_characters": {"next_page_token": 2}}, - {"persona_3_characters": {"next_page_token": 2}}, - {"persona_3_characters": {"next_page_token": 3}}, - {"persona_3_characters": {"next_page_token": 3}}, - {"persona_3_characters": {"__ab_full_refresh_sync_complete": True}}, - ] - - expected_substream_state = { - "states": [ - { - "partition": {"parent_slice": {}, "partition_field": "makoto_yuki"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - }, - { - "partition": {"parent_slice": {}, "partition_field": "yukari_takeba"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - }, - { - "partition": {"parent_slice": {}, "partition_field": "mitsuru_kirijo"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - }, - { - "partition": {"parent_slice": {}, "partition_field": "akihiko_sanada"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - }, - { - "partition": {"parent_slice": {}, "partition_field": "junpei_iori"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - }, - { - "partition": {"parent_slice": {}, "partition_field": "fuuka_yamagishi"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - }, - ], - "parent_state": {"persona_3_characters": {"__ab_full_refresh_sync_complete": True}}, - } - - partition_router = SubstreamPartitionRouter( - parent_stream_configs=[ - ParentStreamConfig( - stream=MockResumableFullRefreshStream( - slices=[StreamSlice(partition={}, cursor_slice={})], - cursor=ResumableFullRefreshCursor(parameters={}), - record_pages=[ - [ - Record( - data={"id": "makoto_yuki"}, - 
associated_slice=mock_parent_slices[0], - stream_name="test_stream", - ), - Record( - data={"id": "yukari_takeba"}, - associated_slice=mock_parent_slices[0], - stream_name="test_stream", - ), - ], - [ - Record( - data={"id": "mitsuru_kirijo"}, - associated_slice=mock_parent_slices[1], - stream_name="test_stream", - ), - Record( - data={"id": "akihiko_sanada"}, - associated_slice=mock_parent_slices[1], - stream_name="test_stream", - ), - ], - [ - Record( - data={"id": "junpei_iori"}, - associated_slice=mock_parent_slices[2], - stream_name="test_stream", - ), - Record( - data={"id": "fuuka_yamagishi"}, - associated_slice=mock_parent_slices[2], - stream_name="test_stream", - ), - ], - ], - name="persona_3_characters", - ), - incremental_dependency=use_incremental_dependency, - parent_key="id", - partition_field="partition_field", - parameters={}, - config={}, - ) - ], - parameters={}, - config={}, - ) - - substream_cursor_slicer = PerPartitionCursor( - cursor_factory=CursorFactory( - create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) - ), - partition_router=partition_router, - ) - - expected_counter = 0 - for actual_slice in substream_cursor_slicer.stream_slices(): - # close the substream slice - substream_cursor_slicer.close_slice(actual_slice) - # check the slice has been processed - assert actual_slice == expected_parent_slices[expected_counter] - # check for parent state - if use_incremental_dependency: - assert ( - substream_cursor_slicer._partition_router.get_stream_state() - == expected_parent_state[expected_counter] - ) - expected_counter += 1 - if use_incremental_dependency: - assert ( - substream_cursor_slicer._partition_router.get_stream_state() - == expected_parent_state[expected_counter] - ) - - # validate final state for closed substream slices - final_state = substream_cursor_slicer.get_stream_state() - if not use_incremental_dependency: - assert final_state["states"] == expected_substream_state["states"], ( - "State for substreams is not valid!" - ) - else: - assert final_state == expected_substream_state, ( - "State for substreams with incremental dependency is not valid!" 
- ) + assert partition_router.get_stream_state() == expected_parent_state[-1] @pytest.mark.parametrize( @@ -1216,20 +956,29 @@ def test_substream_using_resumable_full_refresh_parent_stream_slices(use_increme [ ParentStreamConfig( stream=MockStream( - [{}], [ - { - "id": 1, - "field_1": "value_1", - "field_2": {"nested_field": "nested_value_1"}, - }, - { - "id": 2, - "field_1": "value_2", - "field_2": {"nested_field": "nested_value_2"}, - }, + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice( + [ + { + "id": 1, + "field_1": "value_1", + "field_2": {"nested_field": "nested_value_1"}, + }, + { + "id": 2, + "field_1": "value_2", + "field_2": {"nested_field": "nested_value_2"}, + }, + ], + _EMPTY_SLICE + ) + ) ], - "first_stream", + "first_stream" ), parent_key="id", partition_field="first_stream_id", @@ -1246,10 +995,19 @@ def test_substream_using_resumable_full_refresh_parent_stream_slices(use_increme ( [ ParentStreamConfig( - stream=MockStream( - [{}], - [{"id": 1, "field_1": "value_1"}, {"id": 2, "field_1": "value_2"}], - "first_stream", +stream=MockStream( + [ + InMemoryPartition( + "partition_name", + "first_stream", + _EMPTY_SLICE, + _build_records_for_slice( + [{"id": 1, "field_1": "value_1"}, {"id": 2, "field_1": "value_2"}], + _EMPTY_SLICE + ) + ) + ], + "first_stream" ), parent_key="id", partition_field="first_stream_id", diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py index 437822181..991539e1e 100644 --- a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -392,11 +392,13 @@ def test_http_components_resolver( ): mock_retriever = MagicMock() mock_retriever.read_records.return_value = retriever_data - mock_retriever.stream_slices.return_value = [{}] + stream_slicer = MagicMock() + stream_slicer.stream_slices.return_value = [{}] config = {} resolver = HttpComponentsResolver( retriever=mock_retriever, + stream_slicer=stream_slicer, config=config, components_mapping=components_mapping, parameters={}, @@ -457,11 +459,13 @@ def test_http_components_resolver_with_stream_slices( ): mock_retriever = MagicMock() mock_retriever.read_records.return_value = retriever_data - mock_retriever.stream_slices.return_value = [{"parent_id": 1}, {"parent_id": 2}] + stream_slicer = MagicMock() + stream_slicer.stream_slices.return_value = [{"parent_id": 1}, {"parent_id": 2}] config = {} resolver = HttpComponentsResolver( retriever=mock_retriever, + stream_slicer=stream_slicer, config=config, components_mapping=components_mapping, parameters={}, diff --git a/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py b/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py index b0dcd272c..48816d39b 100644 --- a/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +++ b/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py @@ -125,7 +125,6 @@ def __init__(self, name, stream_name, _slice, records): self._stream_name = stream_name self._slice = _slice self._records = records - self._is_closed = False def read(self) -> Iterable[Record]: for record_or_exception in self._records: @@ -145,12 +144,6 @@ def __hash__(self) -> int: else: return hash(self._name) - def close(self) -> None: 
- self._is_closed = True - - def is_closed(self) -> bool: - return self._is_closed - class ConcurrentSourceBuilder(SourceBuilder[ConcurrentCdkSource]): def __init__(self): From ebb4b288af8b7659140775461e10e52a98e41497 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:22:51 -0400 Subject: [PATCH 32/68] more fixes for DefaultStream in Connector Builder --- .../connector_builder/test_reader/reader.py | 2 +- .../parsers/model_to_component_factory.py | 9 ++++++-- .../test_connector_builder_handler.py | 22 ++++++++++++++----- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py index 5c16798a2..e3d43f825 100644 --- a/airbyte_cdk/connector_builder/test_reader/reader.py +++ b/airbyte_cdk/connector_builder/test_reader/reader.py @@ -120,7 +120,7 @@ def run_test_read( deprecation_warnings: List[LogMessage] = source.deprecation_warnings() schema_inferrer = SchemaInferrer( - self._pk_to_nested_and_composite_field(stream.primary_key) if stream else None, + self._pk_to_nested_and_composite_field(stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key) if stream else None, self._cursor_field_to_nested_and_composite_field(stream.cursor_field) if stream else None, diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 58c8e654e..ebafcddb8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2072,7 +2072,6 @@ def create_declarative_stream( if ( isinstance(combined_slicers, PartitionRouter) and not is_parent - and not self._emit_connector_builder_messages ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. 
The plan is the following: @@ -2089,7 +2088,13 @@ def create_declarative_stream( retriever, self._message_repository, ), - combined_slicers, + stream_slicer=cast( + StreamSlicer, + StreamSlicerTestReadDecorator( + wrapped_slicer=combined_slicers, + maximum_number_of_slices=self._limit_slices_fetched or 5, + ), + ), ) return DefaultStream( partition_generator=partition_generator, diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index c036c12d3..4ebdd565a 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -7,7 +7,7 @@ import json import logging import os -from typing import List, Literal +from typing import List, Literal, Union from unittest import mock from unittest.mock import MagicMock, patch @@ -17,7 +17,6 @@ from airbyte_cdk import connector_builder from airbyte_cdk.connector_builder.connector_builder_handler import ( - TestLimits, create_source, get_limits, resolve_manifest, @@ -60,6 +59,7 @@ from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets, update_secrets from unit_tests.connector_builder.utils import create_configured_catalog @@ -440,6 +440,14 @@ } +def get_retriever(stream: Union[DeclarativeStream, DefaultStream]): + return ( + stream.retriever + if isinstance(stream, DeclarativeStream) + else stream._stream_partition_generator._partition_factory._retriever + ) + + @pytest.fixture def valid_resolve_manifest_config_file(tmp_path): config_file = tmp_path / "config.json" @@ -1130,8 +1138,9 @@ def test_read_source(mock_http_stream): streams = source.streams(config) for s in streams: - assert isinstance(s.retriever, SimpleRetriever) - assert isinstance(s.retriever.stream_slicer, StreamSlicerTestReadDecorator) + retriever = get_retriever(s) + assert isinstance(retriever, SimpleRetriever) + assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) @patch.object( @@ -1177,8 +1186,9 @@ def test_read_source_single_page_single_slice(mock_http_stream): streams = source.streams(config) for s in streams: - assert isinstance(s.retriever, SimpleRetriever) - assert isinstance(s.retriever.stream_slicer, StreamSlicerTestReadDecorator) + retriever = get_retriever(s) + assert isinstance(retriever, SimpleRetriever) + assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) @pytest.mark.parametrize( From 6fef39b817818166748cf70f12df3fef41d40e1b Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:27:27 -0400 Subject: [PATCH 33/68] mypy and format --- airbyte_cdk/connector_builder/test_reader/reader.py | 6 +++++- .../declarative/parsers/model_to_component_factory.py | 5 +---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py index e3d43f825..3ff920208 100644 --- a/airbyte_cdk/connector_builder/test_reader/reader.py +++ b/airbyte_cdk/connector_builder/test_reader/reader.py @@ -120,7 +120,11 @@ def run_test_read( deprecation_warnings: 
List[LogMessage] = source.deprecation_warnings() schema_inferrer = SchemaInferrer( - self._pk_to_nested_and_composite_field(stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key) if stream else None, + self._pk_to_nested_and_composite_field( + stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key + ) + if stream + else None, # type: ignore # We are accessing the private property here as the primary key is not exposed. We should either expose it or use `as_airbyte_stream` to retrieve it as this is the "official" way where it is exposed in the Airbyte protocol self._cursor_field_to_nested_and_composite_field(stream.cursor_field) if stream else None, diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index ebafcddb8..9dac39011 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2069,10 +2069,7 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - if ( - isinstance(combined_slicers, PartitionRouter) - and not is_parent - ): + if isinstance(combined_slicers, PartitionRouter) and not is_parent: # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter From e31fed969a56b2bfdb81510f235feb8ceef4ffb7 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:36:39 -0400 Subject: [PATCH 34/68] format broke mypy --- airbyte_cdk/connector_builder/test_reader/reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py index 3ff920208..e7399f3f6 100644 --- a/airbyte_cdk/connector_builder/test_reader/reader.py +++ b/airbyte_cdk/connector_builder/test_reader/reader.py @@ -121,10 +121,10 @@ def run_test_read( schema_inferrer = SchemaInferrer( self._pk_to_nested_and_composite_field( - stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key + stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key # type: ignore # We are accessing the private property here as the primary key is not exposed. We should either expose it or use `as_airbyte_stream` to retrieve it as this is the "official" way where it is exposed in the Airbyte protocol ) if stream - else None, # type: ignore # We are accessing the private property here as the primary key is not exposed. 
We should either expose it or use `as_airbyte_stream` to retrieve it as this is the "official" way where it is exposed in the Airbyte protocol + else None, self._cursor_field_to_nested_and_composite_field(stream.cursor_field) if stream else None, From 23c9712172b8910e32f831cfc66a9821c2c6836f Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 15:02:14 -0400 Subject: [PATCH 35/68] fix connector builder tests and format --- .../concurrent_partition_cursor.py | 48 ++-- .../parsers/model_to_component_factory.py | 67 ++++-- .../substream_partition_router.py | 29 ++- .../test_connector_builder_handler.py | 8 +- .../test_concurrent_perpartitioncursor.py | 16 +- .../test_per_partition_cursor_integration.py | 121 +++++++--- .../test_model_to_component_factory.py | 12 +- .../test_grouping_partition_router.py | 29 ++- .../test_substream_partition_router.py | 218 ++++++++++++------ 9 files changed, 379 insertions(+), 169 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 285b81956..4925b5138 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -11,7 +11,13 @@ from datetime import timedelta from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional -from airbyte_cdk.models import AirbyteStateMessage, AirbyteStateBlob, AirbyteStreamState, AirbyteStateType, StreamDescriptor +from airbyte_cdk.models import ( + AirbyteStateMessage, + AirbyteStateBlob, + AirbyteStreamState, + AirbyteStateType, + StreamDescriptor, +) from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import ( Timer, @@ -548,21 +554,33 @@ def limit_reached(self) -> bool: return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT @staticmethod - def get_parent_state(stream_state: Optional[StreamState], parent_stream_name: str) -> Optional[AirbyteStateMessage]: - return AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(parent_stream_name, None), - stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]) + def get_parent_state( + stream_state: Optional[StreamState], parent_stream_name: str + ) -> Optional[AirbyteStateMessage]: + return ( + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(parent_stream_name, None), + stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]), + ), ) - ) if stream_state and "parent_state" in stream_state else None + if stream_state and "parent_state" in stream_state + else None + ) @staticmethod - def get_global_state(stream_state: Optional[StreamState], parent_stream_name: str) -> Optional[AirbyteStateMessage]: - return AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(parent_stream_name, None), - stream_state=AirbyteStateBlob(stream_state["state"]) + def get_global_state( + stream_state: Optional[StreamState], parent_stream_name: str + ) -> Optional[AirbyteStateMessage]: + return ( + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(parent_stream_name, None), + 
stream_state=AirbyteStateBlob(stream_state["state"]), + ), ) - ) if stream_state and "state" in stream_state else None + if stream_state and "state" in stream_state + else None + ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index bfc1fd3d1..1f47a4283 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -33,7 +33,15 @@ from airbyte_cdk.connector_builder.models import ( LogMessage as ConnectorBuilderLogMessage, ) -from airbyte_cdk.models import FailureType, Level, AirbyteStateMessage, AirbyteStreamState, AirbyteStateBlob, AirbyteStateType, StreamDescriptor +from airbyte_cdk.models import ( + FailureType, + Level, + AirbyteStateMessage, + AirbyteStreamState, + AirbyteStateBlob, + AirbyteStateType, + StreamDescriptor, +) from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker @@ -500,8 +508,9 @@ InterpolatedRequestOptionsProvider, RequestOptionsProvider, ) -from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import \ - PerPartitionRequestOptionsProvider +from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import ( + PerPartitionRequestOptionsProvider, +) from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod from airbyte_cdk.sources.declarative.resolvers import ( @@ -1282,7 +1291,9 @@ def create_concurrent_cursor_from_datetime_based_cursor( # TODO validate and explain why we need to do this... 
component_definition["$parameters"] = component_definition.get("parameters", {}) - parameters = component_definition.get("parameters", component_definition.get("$parameters", {})) + parameters = component_definition.get( + "parameters", component_definition.get("$parameters", {}) + ) datetime_based_cursor_model = model_type.parse_obj(component_definition) if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): @@ -1596,7 +1607,9 @@ def create_concurrent_cursor_from_perpartition_cursor( interpolated_cursor_field = InterpolatedString.create( datetime_based_cursor_model.cursor_field, - parameters=component_definition.get("parameters", component_definition.get("$parameters", {})), # FIXME validate and explain why we need to do this + parameters=component_definition.get( + "parameters", component_definition.get("$parameters", {}) + ), # FIXME validate and explain why we need to do this ) cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) @@ -1973,13 +1986,17 @@ def create_declarative_stream( request_options_provider = ( datetime_request_options_provider if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) - else PerPartitionRequestOptionsProvider(partition_router, datetime_request_options_provider) + else PerPartitionRequestOptionsProvider( + partition_router, datetime_request_options_provider + ) ) elif model.incremental_sync and isinstance( model.incremental_sync, IncrementingCountCursorModel ): if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): - raise ValueError("PerPartition does not support per partition states because switching to global state is time based") + raise ValueError( + "PerPartition does not support per partition states because switching to global state is time based" + ) cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore @@ -2019,7 +2036,9 @@ def create_declarative_stream( ) stream_slicer: ConcurrentStreamSlicer = ( - partition_router if isinstance(concurrent_cursor, FinalStateCursor) else concurrent_cursor + partition_router + if isinstance(concurrent_cursor, FinalStateCursor) + else concurrent_cursor ) retriever = self._create_component_from_model( model=model.retriever, @@ -2088,8 +2107,7 @@ def create_declarative_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation which used the source name instead cursor=concurrent_cursor, - supports_file_transfer=hasattr(model, "file_uploader") - and bool(model.file_uploader), + supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), ) def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: @@ -3768,14 +3786,20 @@ def _create_message_repository_substream_wrapper( self, model: ParentStreamConfigModel, config: Config, **kwargs: Any ) -> Any: # getting the parent state - child_state = self._connector_state_manager.get_stream_state(kwargs["stream_name"], None) # FIXME adding `stream_name` as a parameter means it will be a breaking change. I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag + child_state = self._connector_state_manager.get_stream_state( + kwargs["stream_name"], None + ) # FIXME adding `stream_name` as a parameter means it will be a breaking change. 
I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag if model.incremental_dependency and child_state: parent_stream_name = model.stream.name or "" - parent_state = ConcurrentPerPartitionCursor.get_parent_state(child_state, parent_stream_name) + parent_state = ConcurrentPerPartitionCursor.get_parent_state( + child_state, parent_stream_name + ) if not parent_state: # there are two migration cases: state value from child stream or from global state - parent_state = ConcurrentPerPartitionCursor.get_global_state(child_state, parent_stream_name) + parent_state = ConcurrentPerPartitionCursor.get_global_state( + child_state, parent_stream_name + ) if not parent_state and not isinstance(parent_state, dict): cursor_field = InterpolatedString.create( @@ -3787,8 +3811,12 @@ def _create_message_repository_substream_wrapper( parent_state = AirbyteStateMessage( type=AirbyteStateType.STREAM, stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name=parent_stream_name, namespace=None), - stream_state=AirbyteStateBlob({cursor_field: list(cursor_values)[0]}), + stream_descriptor=StreamDescriptor( + name=parent_stream_name, namespace=None + ), + stream_state=AirbyteStateBlob( + {cursor_field: list(cursor_values)[0]} + ), ), ) connector_state_manager = ConnectorStateManager([parent_state] if parent_state else []) @@ -3804,7 +3832,10 @@ def _create_message_repository_substream_wrapper( disable_cache=self._disable_cache, message_repository=StateFilteringMessageRepository( LogAppenderMessageRepositoryDecorator( - {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, + { + "airbyte_cdk": {"stream": {"is_substream": True}}, + "http": {"is_auxiliary": True}, + }, self._message_repository, self._evaluate_log_level(self._emit_connector_builder_messages), ), @@ -4127,7 +4158,9 @@ def create_grouping_partition_router( self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any ) -> GroupingPartitionRouter: underlying_router = self._create_component_from_model( - model=model.underlying_partition_router, config=config, **kwargs, + model=model.underlying_partition_router, + config=config, + **kwargs, ) if model.group_size < 1: raise ValueError(f"Group size must be greater than 0, got {model.group_size}") diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index d29a3c2d3..68a1d8200 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -7,7 +7,17 @@ import json import logging from dataclasses import InitVar, dataclass -from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Iterable, + List, + Mapping, + MutableMapping, + Optional, + Union, + TypeVar, +) import dpath import requests @@ -27,7 +37,6 @@ def iterate_with_last_flag(generator: Iterable[Partition]) -> Iterable[tuple[Partition, bool]]: - iterator = iter(generator) try: @@ -191,8 +200,12 @@ def stream_slices(self) -> Iterable[StreamSlice]: for field_path in parent_stream_config.extra_fields ] - for partition, is_last_slice in iterate_with_last_flag(parent_stream.generate_partitions()): - for parent_record, is_last_record_in_slice in iterate_with_last_flag(partition.read()): + for partition, 
is_last_slice in iterate_with_last_flag( + parent_stream.generate_partitions() + ): + for parent_record, is_last_record_in_slice in iterate_with_last_flag( + partition.read() + ): parent_stream.cursor.observe(parent_record) parent_partition = ( parent_record.associated_slice.partition @@ -211,7 +224,9 @@ def stream_slices(self) -> Iterable[StreamSlice]: continue # Add extra fields - extracted_extra_fields = self._extract_extra_fields(record_data, extra_fields) + extracted_extra_fields = self._extract_extra_fields( + record_data, extra_fields + ) if parent_stream_config.lazy_read_pointer: extracted_extra_fields = { @@ -421,7 +436,9 @@ def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: parent_state = {} for parent_config in self.parent_stream_configs: if parent_config.incremental_dependency: - parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.cursor.state) + parent_state[parent_config.stream.name] = copy.deepcopy( + parent_config.stream.cursor.state + ) return parent_state @property diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index 4ebdd565a..b117453fb 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -1140,7 +1140,9 @@ def test_read_source(mock_http_stream): for s in streams: retriever = get_retriever(s) assert isinstance(retriever, SimpleRetriever) - assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) + assert isinstance( + s._stream_partition_generator._stream_slicer, StreamSlicerTestReadDecorator + ) @patch.object( @@ -1188,7 +1190,9 @@ def test_read_source_single_page_single_slice(mock_http_stream): for s in streams: retriever = get_retriever(s) assert isinstance(retriever, SimpleRetriever) - assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) + assert isinstance( + s._stream_partition_generator._stream_slicer, StreamSlicerTestReadDecorator + ) @pytest.mark.parametrize( diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 0f6470069..26cc36e70 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -1185,8 +1185,8 @@ def run_incremental_parent_state_test( ), # FIXME this is an interesting case. The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too ( - f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", - {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, + f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", + {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, ), # Fetch the first page of comments for post 1 ( @@ -1483,8 +1483,8 @@ def run_incremental_parent_state_test( ), # FIXME this is an interesting case. 
The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too ( - f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", - {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, + f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", + {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, ), # Fetch the first page of comments for post 1 ( @@ -1629,8 +1629,8 @@ def run_incremental_parent_state_test( ), # FIXME this is an interesting case. The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too ( - f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", - {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, + f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", + {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, ), # Fetch the first page of comments for post 1 ( @@ -2130,7 +2130,9 @@ def test_incremental_parent_state_migration( "states": [ { "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": START_DATE}, # FIXME this happens because the concurrent framework gets the start date as the max between the state value and the start value. In this case, the start value is higher + "cursor": { + "updated_at": START_DATE + }, # FIXME this happens because the concurrent framework gets the start date as the max between the state value and the start value. In this case, the start value is higher } ], "lookback_window": 0, # FIXME the concurrent framework sets the lookback window to 0 as opposed to the declarative framework which would set not define it diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 5031a8018..b5fd09ec7 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -19,7 +19,9 @@ StreamDescriptor, SyncMode, ) -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import ( PerPartitionCursor, @@ -172,23 +174,27 @@ def build(self): def test_given_state_for_only_some_partition_when_stream_slices_then_create_slices_using_state_or_start_from_start_datetime(): source = ConcurrentDeclarativeSource( state=[ - AirbyteStateMessage( + AirbyteStateMessage( type=AirbyteStateType.STREAM, stream=AirbyteStreamState( stream_descriptor=StreamDescriptor(name="Rates"), - stream_state=AirbyteStateBlob({ - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-02-01"}, - } - ] - }), + stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-02-01"}, + } + ] + } + ), ), ), ], config={}, - catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).build(), + catalog=CatalogBuilder() + 
.with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) + .build(), source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2"]) .with_incremental_sync( @@ -200,7 +206,7 @@ def test_given_state_for_only_some_partition_when_stream_slices_then_create_slic step="P1M", cursor_granularity="P1D", ) - .build() + .build(), ) stream_instance = source.streams({})[0] @@ -217,7 +223,9 @@ def test_given_record_for_partition_when_read_then_update_state(): source = ConcurrentDeclarativeSource( state=[], config={}, - catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).build(), + catalog=CatalogBuilder() + .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) + .build(), source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2"]) .with_incremental_sync( @@ -229,7 +237,7 @@ def test_given_record_for_partition_when_read_then_update_state(): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), ) stream_instance = source.streams({})[0] partition = next(iter(stream_instance.generate_partitions())) @@ -242,7 +250,13 @@ def test_given_record_for_partition_when_read_then_update_state(): SimpleRetriever, "_read_pages", side_effect=[ - [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, "Rates", stream_slice)] + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, + "Rates", + stream_slice, + ) + ] ], ): for record in partition.read(): @@ -265,7 +279,10 @@ def test_substream_without_input_state(): test_source = ConcurrentDeclarativeSource( state=[], config={}, - catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).with_stream(ConfiguredAirbyteStreamBuilder().with_name("AnotherStream")).build(), + catalog=CatalogBuilder() + .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) + .with_stream(ConfiguredAirbyteStreamBuilder().with_name("AnotherStream")) + .build(), source_config=ManifestBuilder() .with_substream_partition_router("AnotherStream") .with_incremental_sync( @@ -286,7 +303,7 @@ def test_substream_without_input_state(): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), ) stream_instance = test_source.streams({})[1] @@ -303,7 +320,9 @@ def test_substream_without_input_state(): [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, "AnotherStream", parent_stream_slice)], ], ): - partition = list(map(lambda partition: partition.to_slice(), stream_instance.generate_partitions())) + partition = list( + map(lambda partition: partition.to_slice(), stream_instance.generate_partitions()) + ) assert partition == [ StreamSlice( @@ -349,7 +368,9 @@ def test_partition_limitation(caplog): source = ConcurrentDeclarativeSource( state=[], config={}, - catalog=CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")).build(), + catalog=CatalogBuilder() + .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) + .build(), source_config=ManifestBuilder() .with_list_partition_router( stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] @@ -363,7 +384,7 @@ def test_partition_limitation(caplog): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), ) partition_slices = [ @@ -490,7 +511,11 @@ def test_perpartition_with_fallback(caplog): This test also checks that the appropriate warning logs are emitted when the partition limit is exceeded. 
""" stream_name = "Rates" - catalog = CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)).build() + catalog = ( + CatalogBuilder() + .with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)) + .build() + ) initial_state = [ AirbyteStateMessage( type=AirbyteStateType.STREAM, @@ -532,7 +557,7 @@ def test_perpartition_with_fallback(caplog): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), ) partition_slices = [ @@ -654,7 +679,11 @@ def test_per_partition_cursor_within_limit(caplog): This test also checks that no warning logs are emitted when the partition limit is not exceeded. """ stream_name = "Rates" - catalog = CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)).build() + catalog = ( + CatalogBuilder() + .with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)) + .build() + ) initial_state = {} source = ConcurrentDeclarativeSource( state=initial_state, @@ -671,57 +700,81 @@ def test_per_partition_cursor_within_limit(caplog): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), ) partition_slices = [ - StreamSlice(partition={"partition_field": str(i)}, cursor_slice=cursor_slice) for i in range(1, 4) for cursor_slice in [{"start_time": "2022-01-01", "end_time": "2022-01-31"}, {"start_time": "2022-02-01", "end_time": "2022-02-28"}, {"start_time": "2022-03-01", "end_time": "2022-03-31"}] + StreamSlice(partition={"partition_field": str(i)}, cursor_slice=cursor_slice) + for i in range(1, 4) + for cursor_slice in [ + {"start_time": "2022-01-01", "end_time": "2022-01-31"}, + {"start_time": "2022-02-01", "end_time": "2022-02-28"}, + {"start_time": "2022-03-01", "end_time": "2022-03-31"}, + ] ] records_list = [ [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_name, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, + stream_name, + partition_slices[0], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, stream_name, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, + stream_name, + partition_slices[1], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, stream_name, partition_slices[2] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, + stream_name, + partition_slices[2], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, stream_name, partition_slices[3] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, + stream_name, + partition_slices[3], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, stream_name, partition_slices[4] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, + stream_name, + partition_slices[4], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, stream_name, partition_slices[5] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, + stream_name, + partition_slices[5], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, stream_name, partition_slices[6] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, + stream_name, + partition_slices[6], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, stream_name, partition_slices[7] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, + stream_name, + partition_slices[7], ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: 
"2022-03-29"}, stream_name, partition_slices[8] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, + stream_name, + partition_slices[8], ) ], ] diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 4c81f959e..c51383e8f 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -928,7 +928,9 @@ def test_stream_with_incremental_and_retriever_with_partition_router(): assert isinstance(stream.cursor, ConcurrentPerPartitionCursor) concurrent_cursor = ( - stream.cursor._cursor_factory.create({}, timedelta(0)) # FIXME should we be allowed to pass `None` instead of `{}` + stream.cursor._cursor_factory.create( + {}, timedelta(0) + ) # FIXME should we be allowed to pass `None` instead of `{}` ) assert isinstance(concurrent_cursor, ConcurrentCursor) assert concurrent_cursor._start == CONFIG_START_TIME @@ -2551,12 +2553,8 @@ def test_merge_incremental_and_partition_router( if incremental and partition_router: assert isinstance(stream.cursor, ConcurrentPerPartitionCursor) if isinstance(partition_router, list) and len(partition_router) > 1: - assert isinstance( - stream.cursor._partition_router, CartesianProductStreamSlicer - ) - assert len(stream.cursor._partition_router.stream_slicers) == len( - partition_router - ) + assert isinstance(stream.cursor._partition_router, CartesianProductStreamSlicer) + assert len(stream.cursor._partition_router.stream_slicers) == len(partition_router) def test_simple_retriever_emit_log_messages(): diff --git a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py index acd4421e6..3f18439d2 100644 --- a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py @@ -19,8 +19,9 @@ MockStream, parent_slices, ) # Reuse MockStream and parent_slices -from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import \ - InMemoryPartition +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import ( + InMemoryPartition, +) _EMPTY_SLICE = StreamSlice(partition={}, cursor_slice={}) @@ -52,12 +53,15 @@ def mock_underlying_router(mock_config): "owner": "User0 Duplicate", }, ] # Duplicate board_id - + [{"board_id": i, "name": f"Board {i}", "owner": f"User{i}"} for i in range(1, 5)], - _EMPTY_SLICE - ) + + [ + {"board_id": i, "name": f"Board {i}", "owner": f"User{i}"} + for i in range(1, 5) + ], + _EMPTY_SLICE, + ), ) ], - "first_stream" + "first_stream", ) return SubstreamPartitionRouter( parent_stream_configs=[ @@ -108,7 +112,7 @@ def mock_underlying_router_with_parent_slices(mock_config): ), ), ], - "first_stream" + "first_stream", ) return SubstreamPartitionRouter( parent_stream_configs=[ @@ -221,7 +225,7 @@ def test_stream_slices_empty_underlying_router(mock_config): [], ) ], - "first_stream" + "first_stream", ) underlying_router = SubstreamPartitionRouter( parent_stream_configs=[ @@ -376,7 +380,7 @@ def test_stream_slices_extra_fields_varied(mock_config): ), ) ], - "first_stream" + "first_stream", ) underlying_router = SubstreamPartitionRouter( parent_stream_configs=[ @@ -423,12 +427,15 @@ def 
test_grouping_with_complex_partitions_and_extra_fields(mock_config): "first_stream", _EMPTY_SLICE, _build_records_for_slice( - [{"board_id": i, "extra": f"extra_{i}", "name": f"Board {i}"} for i in range(3)], + [ + {"board_id": i, "extra": f"extra_{i}", "name": f"Board {i}"} + for i in range(3) + ], _EMPTY_SLICE, ), ) ], - "first_stream" + "first_stream", ) underlying_router = SubstreamPartitionRouter( parent_stream_configs=[ diff --git a/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py index 113679347..2d0d7fa4a 100644 --- a/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py @@ -9,8 +9,15 @@ import pytest as pytest from airbyte_protocol_dataclasses.models import AirbyteStream -from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor, ConcurrentCursorFactory -from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, FinalStateCursor +from airbyte_cdk.sources.declarative.incremental import ( + ConcurrentPerPartitionCursor, + ConcurrentCursorFactory, +) +from airbyte_cdk.sources.streams.concurrent.cursor import ( + ConcurrentCursor, + CursorField, + FinalStateCursor, +) from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import StreamSlice from airbyte_cdk.sources.declarative.partition_routers import ( CartesianProductStreamSlicer, @@ -28,12 +35,14 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition -from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import \ - CustomFormatConcurrentStreamStateConverter +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + CustomFormatConcurrentStreamStateConverter, +) from airbyte_cdk.sources.types import Record from airbyte_cdk.utils.datetime_helpers import ab_datetime_now, ab_datetime_parse -from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import \ - InMemoryPartition +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import ( + InMemoryPartition, +) parent_records = [{"id": 1, "data": "data1"}, {"id": 2, "data": "data2"}] more_records = [ @@ -54,9 +63,15 @@ StreamSlice(partition={"slice": "third"}, cursor_slice={}), ] parent_slices_with_cursor = [ - StreamSlice(partition={"slice": "first"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"}), - StreamSlice(partition={"slice": "second"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"}), - StreamSlice(partition={"slice": "third"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"}), + StreamSlice( + partition={"slice": "first"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"} + ), + StreamSlice( + partition={"slice": "second"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"} + ), + StreamSlice( + partition={"slice": "third"}, cursor_slice={"start": "2021-01-01", "end": "2023-01-01"} + ), ] second_parent_stream_slice = [StreamSlice(partition={"slice": "second_parent"}, cursor_slice={})] @@ -122,7 +137,10 @@ def check_availability(self) -> StreamAvailability: 
( [ ParentStreamConfig( - stream=MockStream([InMemoryPartition("partition_name", "first_stream", _EMPTY_SLICE, [])], "first_stream"), + stream=MockStream( + [InMemoryPartition("partition_name", "first_stream", _EMPTY_SLICE, [])], + "first_stream", + ), parent_key="id", partition_field="first_stream_id", parameters={}, @@ -140,10 +158,10 @@ def check_availability(self) -> StreamAvailability: "partition_name", "first_stream", _EMPTY_SLICE, - _build_records_for_slice(parent_records, _EMPTY_SLICE) + _build_records_for_slice(parent_records, _EMPTY_SLICE), ) ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", @@ -171,16 +189,18 @@ def check_availability(self) -> StreamAvailability: "partition_2", "first_stream", parent_slices[1], - _build_records_for_slice(data_second_parent_slice, parent_slices[1]), + _build_records_for_slice( + data_second_parent_slice, parent_slices[1] + ), ), InMemoryPartition( "partition_3", "first_stream", parent_slices[2], - _build_records_for_slice(data_third_parent_slice, parent_slices[2]) + _build_records_for_slice(data_third_parent_slice, parent_slices[2]), ), ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", @@ -203,22 +223,24 @@ def check_availability(self) -> StreamAvailability: "partition_1", "first_stream", parent_slices[0], - _build_records_for_slice(data_first_parent_slice, parent_slices[0]) + _build_records_for_slice(data_first_parent_slice, parent_slices[0]), ), InMemoryPartition( "partition_2", "first_stream", parent_slices[1], - _build_records_for_slice(data_second_parent_slice, parent_slices[1]) + _build_records_for_slice( + data_second_parent_slice, parent_slices[1] + ), ), InMemoryPartition( "partition_3", "first_stream", parent_slices[2], - _build_records_for_slice(data_third_parent_slice, parent_slices[2]) + _build_records_for_slice(data_third_parent_slice, parent_slices[2]), ), ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", @@ -241,13 +263,15 @@ def check_availability(self) -> StreamAvailability: "partition_1", "first_stream", parent_slices[0], - _build_records_for_slice(data_first_parent_slice, parent_slices[0]) + _build_records_for_slice(data_first_parent_slice, parent_slices[0]), ), InMemoryPartition( "partition_2", "first_stream", parent_slices[1], - _build_records_for_slice(data_second_parent_slice, parent_slices[1]) + _build_records_for_slice( + data_second_parent_slice, parent_slices[1] + ), ), InMemoryPartition( "partition_3", @@ -256,7 +280,7 @@ def check_availability(self) -> StreamAvailability: [], ), ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", @@ -270,10 +294,12 @@ def check_availability(self) -> StreamAvailability: "partition_1", "first_stream", second_parent_stream_slice[0], - _build_records_for_slice(more_records, second_parent_stream_slice[0]) + _build_records_for_slice( + more_records, second_parent_stream_slice[0] + ), ), ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="second_stream_id", @@ -298,10 +324,12 @@ def check_availability(self) -> StreamAvailability: "partition_1", "first_stream", _EMPTY_SLICE, - _build_records_for_slice([{"id": 0}, {"id": 1}, {"_id": 2}, {"id": 3}], _EMPTY_SLICE) + _build_records_for_slice( + [{"id": 0}, {"id": 1}, {"_id": 2}, {"id": 3}], _EMPTY_SLICE + ), ), ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", @@ -324,10 +352,18 @@ def check_availability(self) -> 
StreamAvailability: "partition_1", "first_stream", _EMPTY_SLICE, - _build_records_for_slice([{"a": {"b": 0}}, {"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 3}}], _EMPTY_SLICE) + _build_records_for_slice( + [ + {"a": {"b": 0}}, + {"a": {"b": 1}}, + {"a": {"c": 2}}, + {"a": {"b": 3}}, + ], + _EMPTY_SLICE, + ), ), ], - "first_stream" + "first_stream", ), parent_key="a/b", partition_field="first_stream_id", @@ -648,19 +684,23 @@ def test_request_option( "partition_1", "first_stream", parent_slices_with_cursor[0], - _build_records_for_slice(data_first_parent_slice_with_cursor, parent_slices_with_cursor[0]) + _build_records_for_slice( + data_first_parent_slice_with_cursor, parent_slices_with_cursor[0] + ), ), InMemoryPartition( "partition_2", "first_stream", parent_slices_with_cursor[1], - _build_records_for_slice(data_second_parent_slice_with_cursor, parent_slices_with_cursor[1]) + _build_records_for_slice( + data_second_parent_slice_with_cursor, parent_slices_with_cursor[1] + ), ), InMemoryPartition( "partition_3", "first_stream", parent_slices_with_cursor[2], - _build_records_for_slice([], parent_slices_with_cursor[2]) + _build_records_for_slice([], parent_slices_with_cursor[2]), ), ], "first_stream", @@ -672,7 +712,9 @@ def test_request_option( stream_state=stream_state, message_repository=Mock(), connector_state_manager=Mock(), - connector_state_converter=CustomFormatConcurrentStreamStateConverter("%Y-%m-%d"), + connector_state_converter=CustomFormatConcurrentStreamStateConverter( + "%Y-%m-%d" + ), cursor_field=CursorField("cursor"), slice_boundary_fields=("start", "end"), start=ab_datetime_parse("2021-01-01").to_datetime(), @@ -691,9 +733,11 @@ def test_request_option( stream_state={}, message_repository=Mock(), connector_state_manager=Mock(), - connector_state_converter=CustomFormatConcurrentStreamStateConverter("%Y-%m-%d"), + connector_state_converter=CustomFormatConcurrentStreamStateConverter( + "%Y-%m-%d" + ), cursor_field=CursorField("cursor"), - ) + ), ), parent_key="id", partition_field="first_stream_id", @@ -701,14 +745,17 @@ def test_request_option( config={}, incremental_dependency=True, ), - {"first_stream": {"lookback_window": 0, - "states": [{"cursor": {"cursor": "2021-01-02"}, - "partition": {"slice": "first"}}, - {"cursor": {"cursor": "2022-01-01"}, - "partition": {"slice": "second"}}, - {"cursor": {"cursor": "2021-01-01"}, - "partition": {"slice": "third"}}], - "use_global_cursor": False}}, + { + "first_stream": { + "lookback_window": 0, + "states": [ + {"cursor": {"cursor": "2021-01-02"}, "partition": {"slice": "first"}}, + {"cursor": {"cursor": "2022-01-01"}, "partition": {"slice": "second"}}, + {"cursor": {"cursor": "2021-01-01"}, "partition": {"slice": "third"}}, + ], + "use_global_cursor": False, + } + }, ), ], ids=[ @@ -825,23 +872,37 @@ def test_substream_using_incremental_parent_stream(): "first_stream", mock_slices[0], [ - Record({"id": "may_record_0", "updated_at": "2024-05-15"}, "first_stream", mock_slices[0]), - Record({"id": "may_record_1", "updated_at": "2024-05-16"}, "first_stream", mock_slices[0]), - ] + Record( + {"id": "may_record_0", "updated_at": "2024-05-15"}, + "first_stream", + mock_slices[0], + ), + Record( + {"id": "may_record_1", "updated_at": "2024-05-16"}, + "first_stream", + mock_slices[0], + ), + ], ), InMemoryPartition( "partition_1", "first_stream", mock_slices[1], [ - Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, "first_stream", - mock_slices[1]), - Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, "first_stream", - 
mock_slices[1]), - ] + Record( + {"id": "jun_record_0", "updated_at": "2024-06-15"}, + "first_stream", + mock_slices[1], + ), + Record( + {"id": "jun_record_1", "updated_at": "2024-06-16"}, + "first_stream", + mock_slices[1], + ), + ], ), ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="partition_field", @@ -897,22 +958,34 @@ def test_substream_checkpoints_after_each_parent_partition(): "first_stream", mock_slices[0], [ - Record({"id": "may_record_0", "updated_at": "2024-05-15"}, "first_stream", - mock_slices[0]), - Record({"id": "may_record_1", "updated_at": "2024-05-16"}, "first_stream", - mock_slices[0]), - ] + Record( + {"id": "may_record_0", "updated_at": "2024-05-15"}, + "first_stream", + mock_slices[0], + ), + Record( + {"id": "may_record_1", "updated_at": "2024-05-16"}, + "first_stream", + mock_slices[0], + ), + ], ), InMemoryPartition( "partition_1", "first_stream", mock_slices[1], [ - Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, "first_stream", - mock_slices[1]), - Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, "first_stream", - mock_slices[1]), - ] + Record( + {"id": "jun_record_0", "updated_at": "2024-06-15"}, + "first_stream", + mock_slices[1], + ), + Record( + {"id": "jun_record_1", "updated_at": "2024-06-16"}, + "first_stream", + mock_slices[1], + ), + ], ), ], "first_stream", @@ -923,7 +996,9 @@ def test_substream_checkpoints_after_each_parent_partition(): stream_state={}, message_repository=Mock(), connector_state_manager=Mock(), - connector_state_converter=CustomFormatConcurrentStreamStateConverter("%Y-%m-%d"), + connector_state_converter=CustomFormatConcurrentStreamStateConverter( + "%Y-%m-%d" + ), cursor_field=CursorField("updated_at"), slice_boundary_fields=("start_time", "end_time"), start=ab_datetime_parse(mock_slices[0]["start_time"]).to_datetime(), @@ -974,11 +1049,11 @@ def test_substream_checkpoints_after_each_parent_partition(): "field_2": {"nested_field": "nested_value_2"}, }, ], - _EMPTY_SLICE - ) + _EMPTY_SLICE, + ), ) ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", @@ -995,19 +1070,22 @@ def test_substream_checkpoints_after_each_parent_partition(): ( [ ParentStreamConfig( -stream=MockStream( + stream=MockStream( [ InMemoryPartition( "partition_name", "first_stream", _EMPTY_SLICE, _build_records_for_slice( - [{"id": 1, "field_1": "value_1"}, {"id": 2, "field_1": "value_2"}], - _EMPTY_SLICE - ) + [ + {"id": 1, "field_1": "value_1"}, + {"id": 2, "field_1": "value_2"}, + ], + _EMPTY_SLICE, + ), ) ], - "first_stream" + "first_stream", ), parent_key="id", partition_field="first_stream_id", From f5d1b716e24a7231724d9aaf8cc2e02a7032f700 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 21 Aug 2025 11:27:19 -0400 Subject: [PATCH 36/68] Merging main into branch --- .../connector_builder_handler.py | 68 +++++----- airbyte_cdk/connector_builder/main.py | 6 +- .../connector_builder/test_reader/helpers.py | 26 +--- .../test_reader/message_grouper.py | 2 +- airbyte_cdk/entrypoint.py | 4 +- airbyte_cdk/logger.py | 24 +++- .../concurrent_read_processor.py | 35 +++-- .../concurrent_source/concurrent_source.py | 47 +++---- .../concurrent_declarative_source.py | 76 +---------- .../parsers/model_to_component_factory.py | 7 +- .../declarative_partition_generator.py | 55 +++----- .../stream_slicer_test_read_decorator.py | 4 +- .../sources/message/concurrent_repository.py | 43 ------- .../streams/concurrent/partition_reader.py | 51 +------- .../streams/concurrent/partitions/types.py 
| 8 +- airbyte_cdk/sources/utils/slice_logger.py | 4 - .../test_connector_builder_handler.py | 65 +++++----- .../connector_builder/test_message_grouper.py | 120 ++++++++++++++++++ .../test_concurrent_perpartitioncursor.py | 14 -- .../retrievers/test_simple_retriever.py | 9 +- .../schema/test_dynamic_schema_loader.py | 9 +- .../test_declarative_partition_generator.py | 88 ++----------- .../scenarios/stream_facade_builder.py | 5 +- .../test_concurrent_read_processor.py | 60 ++++++++- .../concurrent/test_partition_reader.py | 51 ++------ 25 files changed, 359 insertions(+), 522 deletions(-) delete mode 100644 airbyte_cdk/sources/message/concurrent_repository.py diff --git a/airbyte_cdk/connector_builder/connector_builder_handler.py b/airbyte_cdk/connector_builder/connector_builder_handler.py index a7d2163a9..513546737 100644 --- a/airbyte_cdk/connector_builder/connector_builder_handler.py +++ b/airbyte_cdk/connector_builder/connector_builder_handler.py @@ -3,8 +3,8 @@ # -from dataclasses import asdict -from typing import Any, Dict, List, Mapping, Optional +from dataclasses import asdict, dataclass, field +from typing import Any, ClassVar, Dict, List, Mapping from airbyte_cdk.connector_builder.test_reader import TestReader from airbyte_cdk.models import ( @@ -15,32 +15,45 @@ Type, ) from airbyte_cdk.models import Type as MessageType -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( - ConcurrentDeclarativeSource, - TestLimits, -) from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( + ModelToComponentFactory, +) from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets from airbyte_cdk.utils.datetime_helpers import ab_datetime_now from airbyte_cdk.utils.traced_exception import AirbyteTracedException +DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE = 5 +DEFAULT_MAXIMUM_NUMBER_OF_SLICES = 5 +DEFAULT_MAXIMUM_RECORDS = 100 +DEFAULT_MAXIMUM_STREAMS = 100 + MAX_PAGES_PER_SLICE_KEY = "max_pages_per_slice" MAX_SLICES_KEY = "max_slices" MAX_RECORDS_KEY = "max_records" MAX_STREAMS_KEY = "max_streams" +@dataclass +class TestLimits: + __test__: ClassVar[bool] = False # Tell Pytest this is not a Pytest class, despite its name + + max_records: int = field(default=DEFAULT_MAXIMUM_RECORDS) + max_pages_per_slice: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE) + max_slices: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES) + max_streams: int = field(default=DEFAULT_MAXIMUM_STREAMS) + + def get_limits(config: Mapping[str, Any]) -> TestLimits: command_config = config.get("__test_read_config", {}) - return TestLimits( - max_records=command_config.get(MAX_RECORDS_KEY, TestLimits.DEFAULT_MAX_RECORDS), - max_pages_per_slice=command_config.get( - MAX_PAGES_PER_SLICE_KEY, TestLimits.DEFAULT_MAX_PAGES_PER_SLICE - ), - max_slices=command_config.get(MAX_SLICES_KEY, TestLimits.DEFAULT_MAX_SLICES), - max_streams=command_config.get(MAX_STREAMS_KEY, TestLimits.DEFAULT_MAX_STREAMS), + max_pages_per_slice = ( + command_config.get(MAX_PAGES_PER_SLICE_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE ) + max_slices = command_config.get(MAX_SLICES_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_SLICES + max_records = command_config.get(MAX_RECORDS_KEY) or DEFAULT_MAXIMUM_RECORDS + max_streams = command_config.get(MAX_STREAMS_KEY) or DEFAULT_MAXIMUM_STREAMS + return TestLimits(max_records, 
max_pages_per_slice, max_slices, max_streams) def should_migrate_manifest(config: Mapping[str, Any]) -> bool: @@ -62,30 +75,21 @@ def should_normalize_manifest(config: Mapping[str, Any]) -> bool: return config.get("__should_normalize", False) -def create_source( - config: Mapping[str, Any], - limits: TestLimits, - catalog: Optional[ConfiguredAirbyteCatalog], - state: Optional[List[AirbyteStateMessage]], -) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]: +def create_source(config: Mapping[str, Any], limits: TestLimits) -> ManifestDeclarativeSource: manifest = config["__injected_declarative_manifest"] - - # We enforce a concurrency level of 1 so that the stream is processed on a single thread - # to retain ordering for the grouping of the builder message responses. - if "concurrency_level" in manifest: - manifest["concurrency_level"]["default_concurrency"] = 1 - else: - manifest["concurrency_level"] = {"type": "ConcurrencyLevel", "default_concurrency": 1} - - return ConcurrentDeclarativeSource( - catalog=catalog, + return ManifestDeclarativeSource( config=config, - state=state, - source_config=manifest, emit_connector_builder_messages=True, + source_config=manifest, migrate_manifest=should_migrate_manifest(config), normalize_manifest=should_normalize_manifest(config), - limits=limits, + component_factory=ModelToComponentFactory( + emit_connector_builder_messages=True, + limit_pages_fetched_per_slice=limits.max_pages_per_slice, + limit_slices_fetched=limits.max_slices, + disable_retries=True, + disable_cache=True, + ), ) diff --git a/airbyte_cdk/connector_builder/main.py b/airbyte_cdk/connector_builder/main.py index 22be81c82..80cf4afa9 100644 --- a/airbyte_cdk/connector_builder/main.py +++ b/airbyte_cdk/connector_builder/main.py @@ -91,12 +91,12 @@ def handle_connector_builder_request( def handle_request(args: List[str]) -> str: command, config, catalog, state = get_config_and_catalog_from_args(args) limits = get_limits(config) - source = create_source(config=config, limits=limits, catalog=catalog, state=state) - return orjson.dumps( # type: ignore[no-any-return] # Serializer.dump() always returns AirbyteMessage + source = create_source(config, limits) + return orjson.dumps( AirbyteMessageSerializer.dump( handle_connector_builder_request(source, command, config, catalog, state, limits) ) - ).decode() + ).decode() # type: ignore[no-any-return] # Serializer.dump() always returns AirbyteMessage if __name__ == "__main__": diff --git a/airbyte_cdk/connector_builder/test_reader/helpers.py b/airbyte_cdk/connector_builder/test_reader/helpers.py index 3cc634ccb..9154610cc 100644 --- a/airbyte_cdk/connector_builder/test_reader/helpers.py +++ b/airbyte_cdk/connector_builder/test_reader/helpers.py @@ -5,7 +5,7 @@ import json from copy import deepcopy from json import JSONDecodeError -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, Dict, List, Mapping, Optional from airbyte_cdk.connector_builder.models import ( AuxiliaryRequest, @@ -17,8 +17,6 @@ from airbyte_cdk.models import ( AirbyteLogMessage, AirbyteMessage, - AirbyteStateBlob, - AirbyteStateMessage, OrchestratorType, TraceType, ) @@ -468,7 +466,7 @@ def handle_current_slice( return StreamReadSlices( pages=current_slice_pages, slice_descriptor=current_slice_descriptor, - state=[convert_state_blob_to_mapping(latest_state_message)] if latest_state_message else [], + state=[latest_state_message] if latest_state_message else [], auxiliary_requests=auxiliary_requests if auxiliary_requests else 
[], ) @@ -720,23 +718,3 @@ def get_auxiliary_request_type(stream: dict, http: dict) -> str: # type: ignore Determines the type of the auxiliary request based on the stream and HTTP properties. """ return "PARENT_STREAM" if stream.get("is_substream", False) else str(http.get("type", None)) - - -def convert_state_blob_to_mapping( - state_message: Union[AirbyteStateMessage, Dict[str, Any]], -) -> Dict[str, Any]: - """ - The AirbyteStreamState stores state as an AirbyteStateBlob which deceivingly is not - a dictionary, but rather a list of kwargs fields. This in turn causes it to not be - properly turned into a dictionary when translating this back into response output - by the connector_builder_handler using asdict() - """ - - if isinstance(state_message, AirbyteStateMessage) and state_message.stream: - state_value = state_message.stream.stream_state - if isinstance(state_value, AirbyteStateBlob): - state_value_mapping = {k: v for k, v in state_value.__dict__.items()} - state_message.stream.stream_state = state_value_mapping # type: ignore # we intentionally set this as a Dict so that StreamReadSlices is translated properly in the resulting HTTP response - return state_message # type: ignore # See above, but when this is an AirbyteStateMessage we must convert AirbyteStateBlob to a Dict - else: - return state_message # type: ignore # This is guaranteed to be a Dict since we check isinstance AirbyteStateMessage above diff --git a/airbyte_cdk/connector_builder/test_reader/message_grouper.py b/airbyte_cdk/connector_builder/test_reader/message_grouper.py index 999b54b72..33b594451 100644 --- a/airbyte_cdk/connector_builder/test_reader/message_grouper.py +++ b/airbyte_cdk/connector_builder/test_reader/message_grouper.py @@ -95,7 +95,7 @@ def get_message_groups( latest_state_message: Optional[Dict[str, Any]] = None slice_auxiliary_requests: List[AuxiliaryRequest] = [] - while message := next(messages, None): + while records_count < limit and (message := next(messages, None)): json_message = airbyte_message_to_json(message) if is_page_http_request_for_different_stream(json_message, stream_name): diff --git a/airbyte_cdk/entrypoint.py b/airbyte_cdk/entrypoint.py index 76a1be32e..54c207487 100644 --- a/airbyte_cdk/entrypoint.py +++ b/airbyte_cdk/entrypoint.py @@ -22,7 +22,7 @@ from airbyte_cdk.connector import TConfig from airbyte_cdk.exception_handler import init_uncaught_exception_handler -from airbyte_cdk.logger import PRINT_BUFFER, init_logger +from airbyte_cdk.logger import PRINT_BUFFER, init_logger, is_platform_debug_log_enabled from airbyte_cdk.models import ( AirbyteConnectionStatus, AirbyteMessage, @@ -158,7 +158,7 @@ def run(self, parsed_args: argparse.Namespace) -> Iterable[str]: if not cmd: raise Exception("No command passed") - if hasattr(parsed_args, "debug") and parsed_args.debug: + if (hasattr(parsed_args, "debug") and parsed_args.debug) or is_platform_debug_log_enabled(): self.logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG) self.logger.debug("Debug logs enabled") diff --git a/airbyte_cdk/logger.py b/airbyte_cdk/logger.py index 13c3b4676..4223bda55 100644 --- a/airbyte_cdk/logger.py +++ b/airbyte_cdk/logger.py @@ -1,10 +1,10 @@ # # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
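Going back to the get_limits() helper reworked in the connector_builder_handler.py hunk above, a short usage sketch may help show how the "__test_read_config" block of a config maps onto TestLimits. This assumes an airbyte_cdk install that contains this helper; the config values below are made up for illustration.

# Usage sketch for get_limits() from the connector_builder_handler.py hunk above.
# Assumes an airbyte_cdk install that contains this helper; the
# "__test_read_config" values are made up for illustration.
from airbyte_cdk.connector_builder.connector_builder_handler import get_limits

config = {"__test_read_config": {"max_records": 50, "max_slices": 2}}
limits = get_limits(config)

# Keys that are missing fall back to the DEFAULT_MAXIMUM_* constants (100/5/5/100).
print(limits.max_records, limits.max_pages_per_slice, limits.max_slices, limits.max_streams)
# expected: 50 5 2 100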
# - import json import logging import logging.config +import os from typing import Any, Callable, Mapping, Optional, Tuple import orjson @@ -40,6 +40,10 @@ } +def is_platform_debug_log_enabled() -> bool: + return os.environ.get("LOG_LEVEL", "info").lower() == "debug" + + def init_logger(name: Optional[str] = None) -> logging.Logger: """Initial set up of logger""" logger = logging.getLogger(name) @@ -73,8 +77,22 @@ def format(self, record: logging.LogRecord) -> str: airbyte_level = self.level_mapping.get(record.levelno, "INFO") if airbyte_level == Level.DEBUG: extras = self.extract_extra_args_from_record(record) - debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras} - return filter_secrets(json.dumps(debug_dict)) + if is_platform_debug_log_enabled(): + # We have a different behavior between debug logs enabled through `--debug` argument and debug logs + # enabled through environment variable. The reason is that for platform logs, we need to have these + # printed as AirbyteMessage which is not the case with the current previous implementation. + # Why not migrate both to AirbyteMessages then? AirbyteMessages do not support having structured logs. + # which means that the DX would be degraded compared to the current solution (devs will need to identify + # the `log.message` field and figure out where in this field is the response while the current solution + # have a specific field that is structured for extras. + message = f"{filter_secrets(record.getMessage())} ///\nExtra logs: {filter_secrets(json.dumps(extras))}" + log_message = AirbyteMessage( + type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message) + ) + return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode() + else: + debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras} + return filter_secrets(json.dumps(debug_dict)) else: message = super().format(record) message = filter_secrets(message) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 33731e74c..09bd921e1 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -95,14 +95,11 @@ def on_partition(self, partition: Partition) -> None: """ stream_name = partition.stream_name() self._streams_to_running_partitions[stream_name].add(partition) - cursor = self._stream_name_to_instance[stream_name].cursor if self._slice_logger.should_log_slice_message(self._logger): self._message_repository.emit_message( self._slice_logger.create_slice_log_message(partition.to_slice()) ) - self._thread_pool_manager.submit( - self._partition_reader.process_partition, partition, cursor - ) + self._thread_pool_manager.submit(self._partition_reader.process_partition, partition) def on_partition_complete_sentinel( self, sentinel: PartitionCompleteSentinel @@ -115,16 +112,26 @@ def on_partition_complete_sentinel( """ partition = sentinel.partition - partitions_running = self._streams_to_running_partitions[partition.stream_name()] - if partition in partitions_running: - partitions_running.remove(partition) - # If all partitions were generated and this was the last one, the stream is done - if ( - partition.stream_name() not in self._streams_currently_generating_partitions - and len(partitions_running) == 0 - ): - yield from self._on_stream_is_done(partition.stream_name()) - yield from self._message_repository.consume_queue() + 
try: + if sentinel.is_successful: + stream = self._stream_name_to_instance[partition.stream_name()] + stream.cursor.close_partition(partition) + except Exception as exception: + self._flag_exception(partition.stream_name(), exception) + yield AirbyteTracedException.from_exception( + exception, stream_descriptor=StreamDescriptor(name=partition.stream_name()) + ).as_sanitized_airbyte_message() + finally: + partitions_running = self._streams_to_running_partitions[partition.stream_name()] + if partition in partitions_running: + partitions_running.remove(partition) + # If all partitions were generated and this was the last one, the stream is done + if ( + partition.stream_name() not in self._streams_currently_generating_partitions + and len(partitions_running) == 0 + ): + yield from self._on_stream_is_done(partition.stream_name()) + yield from self._message_repository.consume_queue() def on_record(self, record: Record) -> Iterable[AirbyteMessage]: """ diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py index 9ccfc1088..ffdee2dc1 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py @@ -4,7 +4,7 @@ import concurrent import logging from queue import Queue -from typing import Iterable, Iterator, List, Optional +from typing import Iterable, Iterator, List from airbyte_cdk.models import AirbyteMessage from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor @@ -16,7 +16,7 @@ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer -from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionLogger, PartitionReader +from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.types import ( PartitionCompleteSentinel, @@ -43,7 +43,6 @@ def create( logger: logging.Logger, slice_logger: SliceLogger, message_repository: MessageRepository, - queue: Optional[Queue[QueueItem]] = None, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, ) -> "ConcurrentSource": is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1 @@ -60,13 +59,12 @@ def create( logger, ) return ConcurrentSource( - threadpool=threadpool, - logger=logger, - slice_logger=slice_logger, - queue=queue, - message_repository=message_repository, - initial_number_partitions_to_generate=initial_number_of_partitions_to_generate, - timeout_seconds=timeout_seconds, + threadpool, + logger, + slice_logger, + message_repository, + initial_number_of_partitions_to_generate, + timeout_seconds, ) def __init__( @@ -74,7 +72,6 @@ def __init__( threadpool: ThreadPoolManager, logger: logging.Logger, slice_logger: SliceLogger = DebugSliceLogger(), - queue: Optional[Queue[QueueItem]] = None, message_repository: MessageRepository = InMemoryMessageRepository(), initial_number_partitions_to_generate: int = 1, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, @@ -94,28 +91,25 @@ def __init__( self._initial_number_partitions_to_generate = initial_number_partitions_to_generate self._timeout_seconds = timeout_seconds - # We set a maxsize to for the main thread to process 
record items when the queue size grows. This assumes that there are less
- # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
- # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
- # information and might even need to be configurable depending on the source
- self._queue = queue or Queue(maxsize=10_000)
-
 def read(
 self,
 streams: List[AbstractStream],
 ) -> Iterator[AirbyteMessage]:
 self._logger.info("Starting syncing")
+
+ # We set a maxsize for the main thread to process record items when the queue size grows. This assumes that there are fewer
+ # threads generating partitions than the max number of workers. If that weren't the case, we could have threads only generating
+ # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+ # information and might even need to be configurable depending on the source
+ queue: Queue[QueueItem] = Queue(maxsize=10_000)
 concurrent_stream_processor = ConcurrentReadProcessor(
 streams,
- PartitionEnqueuer(self._queue, self._threadpool),
+ PartitionEnqueuer(queue, self._threadpool),
 self._threadpool,
 self._logger,
 self._slice_logger,
 self._message_repository,
- PartitionReader(
- self._queue,
- PartitionLogger(self._slice_logger, self._logger, self._message_repository),
- ),
+ PartitionReader(queue),
 )

 # Enqueue initial partition generation tasks
@@ -123,7 +117,7 @@ def read(
 # Read from the queue until all partitions were generated and read
 yield from self._consume_from_queue(
- self._queue,
+ queue,
 concurrent_stream_processor,
 )
 self._threadpool.check_for_errors_and_shutdown()
@@ -147,10 +141,7 @@ def _consume_from_queue(
 airbyte_message_or_record_or_exception,
 concurrent_stream_processor,
 )
- # In the event that a partition raises an exception, anything remaining in
- # the queue will be missed because is_done() can raise an exception and exit
- # out of this loop before remaining items are consumed
- if queue.empty() and concurrent_stream_processor.is_done():
+ if concurrent_stream_processor.is_done() and queue.empty():
 # all partitions were generated and processed.
we're done here break @@ -170,7 +161,5 @@ def _handle_item( yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item) elif isinstance(queue_item, Record): yield from concurrent_stream_processor.on_record(queue_item) - elif isinstance(queue_item, AirbyteMessage): - yield queue_item else: raise ValueError(f"Unknown queue item type: {type(queue_item)}") diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 9a651514b..720934a11 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -3,22 +3,7 @@ # import logging -from dataclasses import dataclass, field -from queue import Queue -from typing import ( - Any, - ClassVar, - Generic, - Iterator, - List, - Mapping, - MutableMapping, - Optional, - Tuple, - Union, -) - -from airbyte_protocol_dataclasses.models import Level +from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union from airbyte_cdk.models import ( AirbyteCatalog, @@ -58,8 +43,6 @@ StreamSlicerPartitionGenerator, ) from airbyte_cdk.sources.declarative.types import ConnectionDefinition -from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository -from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository from airbyte_cdk.sources.source import TState from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -67,22 +50,6 @@ from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream -from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem - - -@dataclass -class TestLimits: - __test__: ClassVar[bool] = False # Tell Pytest this is not a Pytest class, despite its name - - DEFAULT_MAX_PAGES_PER_SLICE: ClassVar[int] = 5 - DEFAULT_MAX_SLICES: ClassVar[int] = 5 - DEFAULT_MAX_RECORDS: ClassVar[int] = 100 - DEFAULT_MAX_STREAMS: ClassVar[int] = 100 - - max_records: int = field(default=DEFAULT_MAX_RECORDS) - max_pages_per_slice: int = field(default=DEFAULT_MAX_PAGES_PER_SLICE) - max_slices: int = field(default=DEFAULT_MAX_SLICES) - max_streams: int = field(default=DEFAULT_MAX_STREAMS) class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]): @@ -98,9 +65,7 @@ def __init__( source_config: ConnectionDefinition, debug: bool = False, emit_connector_builder_messages: bool = False, - migrate_manifest: bool = False, - normalize_manifest: bool = False, - limits: Optional[TestLimits] = None, + component_factory: Optional[ModelToComponentFactory] = None, config_path: Optional[str] = None, **kwargs: Any, ) -> None: @@ -108,39 +73,21 @@ def __init__( # no longer needs to store the original incoming state. But maybe there's an edge case? self._connector_state_manager = ConnectorStateManager(state=state) # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later - # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less - # threads generating partitions that than are max number of workers. 
If it weren't the case, we could have threads only generating - # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more - # information and might even need to be configurable depending on the source - queue: Queue[QueueItem] = Queue(maxsize=10_000) - message_repository = InMemoryMessageRepository( - Level.DEBUG if emit_connector_builder_messages else Level.INFO - ) - # To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic # cursors. We do this by no longer automatically instantiating RFR cursors when converting # the declarative models into runtime components. Concurrent sources will continue to checkpoint # incremental streams running in full refresh. - component_factory = ModelToComponentFactory( + component_factory = component_factory or ModelToComponentFactory( emit_connector_builder_messages=emit_connector_builder_messages, - message_repository=ConcurrentMessageRepository(queue, message_repository), connector_state_manager=self._connector_state_manager, max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"), - limit_pages_fetched_per_slice=limits.max_pages_per_slice if limits else None, - limit_slices_fetched=limits.max_slices if limits else None, - disable_retries=True if limits else False, - disable_cache=True if limits else False, ) - self._limits = limits - super().__init__( source_config=source_config, config=config, debug=debug, emit_connector_builder_messages=emit_connector_builder_messages, - migrate_manifest=migrate_manifest, - normalize_manifest=normalize_manifest, component_factory=component_factory, config_path=config_path, ) @@ -170,7 +117,6 @@ def __init__( initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate, logger=self.logger, slice_logger=self._slice_logger, - queue=queue, message_repository=self.message_repository, ) @@ -334,14 +280,8 @@ def _group_streams( schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish retriever=retriever, message_repository=self.message_repository, - max_records_limit=self._limits.max_records - if self._limits - else None, ), stream_slicer=declarative_stream.retriever.stream_slicer, - slice_limit=self._limits.max_slices - if self._limits - else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. 
But for consistency and depending how we build create_default_stream, this may be needed later ) else: if ( @@ -371,12 +311,8 @@ def _group_streams( schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish retriever=retriever, message_repository=self.message_repository, - max_records_limit=self._limits.max_records - if self._limits - else None, ), stream_slicer=cursor, - slice_limit=self._limits.max_slices if self._limits else None, ) concurrent_streams.append( @@ -405,12 +341,8 @@ def _group_streams( schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish retriever=declarative_stream.retriever, message_repository=self.message_repository, - max_records_limit=self._limits.max_records if self._limits else None, ), declarative_stream.retriever.stream_slicer, - slice_limit=self._limits.max_slices - if self._limits - else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later ) final_state_cursor = FinalStateCursor( @@ -469,10 +401,8 @@ def _group_streams( schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish retriever=retriever, message_repository=self.message_repository, - max_records_limit=self._limits.max_records if self._limits else None, ), perpartition_cursor, - slice_limit=self._limits.max_slices if self._limits else None, ) concurrent_streams.append( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 1f47a4283..1bfc0079a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -642,10 +642,6 @@ } _NO_STREAM_SLICING = SinglePartitionRouter(parameters={}) -# Ideally this should use the value defined in ConcurrentDeclarativeSource, but -# this would be a circular import -MAX_SLICES = 5 - class ModelToComponentFactory: EPOCH_DATETIME_FORMAT = "%s" @@ -2096,7 +2092,6 @@ def create_declarative_stream( self._message_repository, ), stream_slicer, - self._limit_slices_fetched or 5 if self._should_limit_slices_fetched() else None, ), name=stream_name, json_schema=schema_loader.get_json_schema, @@ -3912,7 +3907,7 @@ def create_http_components_resolver( config=config, name=f"{stream_name if stream_name else '__http_components_resolver'}", primary_key=None, - stream_slicer=SinglePartitionRouter(parameters={}), + stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), transformations=[], ) diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 98a472427..466608910 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -1,12 +1,9 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
-from typing import Any, Iterable, Mapping, Optional, cast +from typing import Any, Iterable, Mapping, Optional from airbyte_cdk.sources.declarative.retrievers import Retriever from airbyte_cdk.sources.declarative.schema import SchemaLoader -from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer_test_read_decorator import ( - StreamSlicerTestReadDecorator, -) from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator @@ -14,10 +11,17 @@ from airbyte_cdk.sources.types import Record, StreamSlice from airbyte_cdk.utils.slice_hasher import SliceHasher -# For Connector Builder test read operations, we track the total number of records -# read for the stream at the global level so that we can stop reading early if we -# exceed the record limit -total_record_counter = 0 + +class SchemaLoaderCachingDecorator(SchemaLoader): + def __init__(self, schema_loader: SchemaLoader): + self._decorated = schema_loader + self._loaded_schema: Optional[Mapping[str, Any]] = None + + def get_json_schema(self) -> Mapping[str, Any]: + if self._loaded_schema is None: + self._loaded_schema = self._decorated.get_json_schema() + + return self._loaded_schema # type: ignore # at that point, we assume the schema will be populated class SchemaLoaderCachingDecorator(SchemaLoader): @@ -39,7 +43,6 @@ def __init__( schema_loader: SchemaLoader, retriever: Retriever, message_repository: MessageRepository, - max_records_limit: Optional[int] = None, ) -> None: """ The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not @@ -50,7 +53,6 @@ def __init__( self._schema_loader = SchemaLoaderCachingDecorator(schema_loader) self._retriever = retriever self._message_repository = message_repository - self._max_records_limit = max_records_limit def create(self, stream_slice: StreamSlice) -> Partition: return DeclarativePartition( @@ -58,7 +60,6 @@ def create(self, stream_slice: StreamSlice) -> Partition: schema_loader=self._schema_loader, retriever=self._retriever, message_repository=self._message_repository, - max_records_limit=self._max_records_limit, stream_slice=stream_slice, ) @@ -70,29 +71,19 @@ def __init__( schema_loader: SchemaLoader, retriever: Retriever, message_repository: MessageRepository, - max_records_limit: Optional[int], stream_slice: StreamSlice, ): self._stream_name = stream_name self._schema_loader = schema_loader self._retriever = retriever self._message_repository = message_repository - self._max_records_limit = max_records_limit self._stream_slice = stream_slice self._hash = SliceHasher.hash(self._stream_name, self._stream_slice) def read(self) -> Iterable[Record]: - if self._max_records_limit: - global total_record_counter - if total_record_counter >= self._max_records_limit: - return for stream_data in self._retriever.read_records( self._schema_loader.get_json_schema(), self._stream_slice ): - if self._max_records_limit: - if total_record_counter >= self._max_records_limit: - break - if isinstance(stream_data, Mapping): record = ( stream_data @@ -107,9 +98,6 @@ def read(self) -> Iterable[Record]: else: self._message_repository.emit_message(stream_data) - if self._max_records_limit: - total_record_counter += 1 - def to_slice(self) -> Optional[Mapping[str, Any]]: return self._stream_slice @@ -122,23 +110,10 @@ def __hash__(self) -> int: class 
StreamSlicerPartitionGenerator(PartitionGenerator): def __init__( - self, - partition_factory: DeclarativePartitionFactory, - stream_slicer: StreamSlicer, - slice_limit: Optional[int] = None, + self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer ) -> None: self._partition_factory = partition_factory - - if slice_limit: - self._stream_slicer = cast( - StreamSlicer, - StreamSlicerTestReadDecorator( - wrapped_slicer=stream_slicer, - maximum_number_of_slices=slice_limit, - ), - ) - else: - self._stream_slicer = stream_slicer + self._stream_slicer = stream_slicer def generate(self) -> Iterable[Partition]: for stream_slice in self._stream_slicer.stream_slices(): diff --git a/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py b/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py index d261c27e8..323c89196 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py @@ -4,10 +4,10 @@ from dataclasses import dataclass from itertools import islice -from typing import Any, Iterable +from typing import Any, Iterable, Mapping, Optional, Union from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer -from airbyte_cdk.sources.types import StreamSlice +from airbyte_cdk.sources.types import StreamSlice, StreamState @dataclass diff --git a/airbyte_cdk/sources/message/concurrent_repository.py b/airbyte_cdk/sources/message/concurrent_repository.py deleted file mode 100644 index 947ee4c46..000000000 --- a/airbyte_cdk/sources/message/concurrent_repository.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. - -from queue import Queue -from typing import Callable, Iterable - -from airbyte_cdk.models import AirbyteMessage, Level -from airbyte_cdk.sources.message.repository import LogMessage, MessageRepository -from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem - - -class ConcurrentMessageRepository(MessageRepository): - """ - Message repository that immediately loads messages onto the queue processed on the - main thread. This ensures that messages are processed in the correct order they are - received. The InMemoryMessageRepository implementation does not have guaranteed - ordering since whether to process the main thread vs. partitions is non-deterministic - and there can be a lag between reading the main-thread and consuming messages on the - MessageRepository. - - This is particularly important for the connector builder which relies on grouping - of messages to organize request/response, pages, and partitions. 
- """ - - def __init__(self, queue: Queue[QueueItem], message_repository: MessageRepository): - self._queue = queue - self._decorated_message_repository = message_repository - - def emit_message(self, message: AirbyteMessage) -> None: - self._decorated_message_repository.emit_message(message) - for message in self._decorated_message_repository.consume_queue(): - self._queue.put(message) - - def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: - self._decorated_message_repository.log_message(level, message_provider) - for message in self._decorated_message_repository.consume_queue(): - self._queue.put(message) - - def consume_queue(self) -> Iterable[AirbyteMessage]: - """ - This method shouldn't need to be called because as part of emit_message() we are already - loading messages onto the queue processed on the main thread. - """ - yield from [] diff --git a/airbyte_cdk/sources/streams/concurrent/partition_reader.py b/airbyte_cdk/sources/streams/concurrent/partition_reader.py index 0edc5056a..3d23fd9cf 100644 --- a/airbyte_cdk/sources/streams/concurrent/partition_reader.py +++ b/airbyte_cdk/sources/streams/concurrent/partition_reader.py @@ -1,45 +1,14 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. - -import logging +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# from queue import Queue -from typing import Optional from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException -from airbyte_cdk.sources.message.repository import MessageRepository -from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.types import ( PartitionCompleteSentinel, QueueItem, ) -from airbyte_cdk.sources.utils.slice_logger import SliceLogger - - -# Since moving all the connector builder workflow to the concurrent CDK which required correct ordering -# of grouping log messages onto the main write thread using the ConcurrentMessageRepository, this -# separate flow and class that was used to log slices onto this partition's message_repository -# should just be replaced by emitting messages directly onto the repository instead of an intermediary. -class PartitionLogger: - """ - Helper class that provides a mechanism for passing a log message onto the current - partitions message repository - """ - - def __init__( - self, - slice_logger: SliceLogger, - logger: logging.Logger, - message_repository: MessageRepository, - ): - self._slice_logger = slice_logger - self._logger = logger - self._message_repository = message_repository - - def log(self, partition: Partition) -> None: - if self._slice_logger.should_log_slice_message(self._logger): - self._message_repository.emit_message( - self._slice_logger.create_slice_log_message(partition.to_slice()) - ) class PartitionReader: @@ -49,18 +18,13 @@ class PartitionReader: _IS_SUCCESSFUL = True - def __init__( - self, - queue: Queue[QueueItem], - partition_logger: Optional[PartitionLogger] = None, - ) -> None: + def __init__(self, queue: Queue[QueueItem]) -> None: """ :param queue: The queue to put the records in. """ self._queue = queue - self._partition_logger = partition_logger - def process_partition(self, partition: Partition, cursor: Cursor) -> None: + def process_partition(self, partition: Partition) -> None: """ Process a partition and put the records in the output queue. 
When all the partitions are added to the queue, a sentinel is added to the queue to indicate that all the partitions have been generated. @@ -73,13 +37,8 @@ def process_partition(self, partition: Partition, cursor: Cursor) -> None: :return: None """ try: - if self._partition_logger: - self._partition_logger.log(partition) - for record in partition.read(): self._queue.put(record) - cursor.observe(record) - cursor.close_partition(partition) self._queue.put(PartitionCompleteSentinel(partition, self._IS_SUCCESSFUL)) except Exception as e: self._queue.put(StreamThreadException(e, partition.stream_name())) diff --git a/airbyte_cdk/sources/streams/concurrent/partitions/types.py b/airbyte_cdk/sources/streams/concurrent/partitions/types.py index 3ae63c242..77644c6b9 100644 --- a/airbyte_cdk/sources/streams/concurrent/partitions/types.py +++ b/airbyte_cdk/sources/streams/concurrent/partitions/types.py @@ -4,7 +4,6 @@ from typing import Any, Union -from airbyte_cdk.models import AirbyteMessage from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import ( PartitionGenerationCompletedSentinel, ) @@ -35,10 +34,5 @@ def __eq__(self, other: Any) -> bool: Typedef representing the items that can be added to the ThreadBasedConcurrentStream """ QueueItem = Union[ - Record, - Partition, - PartitionCompleteSentinel, - PartitionGenerationCompletedSentinel, - Exception, - AirbyteMessage, + Record, Partition, PartitionCompleteSentinel, PartitionGenerationCompletedSentinel, Exception ] diff --git a/airbyte_cdk/sources/utils/slice_logger.py b/airbyte_cdk/sources/utils/slice_logger.py index 4b29f3e0d..ee802a7a6 100644 --- a/airbyte_cdk/sources/utils/slice_logger.py +++ b/airbyte_cdk/sources/utils/slice_logger.py @@ -11,10 +11,6 @@ from airbyte_cdk.models import Type as MessageType -# Once everything runs on the concurrent CDK and we've cleaned up the legacy flows, we should try to remove -# this class and write messages directly to the message_repository instead of through the logger because for -# cases like the connector builder where ordering of messages is important, using the logger can cause -# messages to be grouped out of order. Alas work for a different day. class SliceLogger(ABC): """ SliceLogger is an interface that allows us to log slices of data in a uniform way. 
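Before the unit-test diffs below, here is a minimal, self-contained sketch of the worker/main-thread hand-off that the PartitionReader changes above revert to: a reader thread drains one partition onto a shared queue and then enqueues a completion sentinel, while the main thread consumes the queue and only closes the partition once it sees that sentinel. FakePartition, FakeCursor and CompleteSentinel are illustrative stand-ins, not the airbyte_cdk classes.

# Minimal sketch of the queue-based hand-off described above (illustrative only).
import threading
from queue import Queue
from typing import Any, Iterable, List


class FakePartition:
    def __init__(self, name: str, records: List[Any]) -> None:
        self._name = name
        self._records = records

    def stream_name(self) -> str:
        return self._name

    def read(self) -> Iterable[Any]:
        yield from self._records


class FakeCursor:
    def close_partition(self, partition: FakePartition) -> None:
        print(f"closing partition of stream {partition.stream_name()}")


class CompleteSentinel:
    def __init__(self, partition: FakePartition) -> None:
        self.partition = partition


def process_partition(partition: FakePartition, queue: "Queue[Any]") -> None:
    # Worker side: records first, then the sentinel; exceptions are forwarded
    # to the main thread instead of being raised on the worker.
    try:
        for record in partition.read():
            queue.put(record)
        queue.put(CompleteSentinel(partition))
    except Exception as exception:
        queue.put(exception)


def main() -> None:
    queue: "Queue[Any]" = Queue(maxsize=10_000)
    cursor = FakeCursor()
    partition = FakePartition("first_stream", [{"id": 1}, {"id": 2}])

    worker = threading.Thread(target=process_partition, args=(partition, queue))
    worker.start()

    while True:
        item = queue.get()
        if isinstance(item, Exception):
            raise item
        if isinstance(item, CompleteSentinel):
            cursor.close_partition(item.partition)  # state is finalized on the main thread
            break
        print("record:", item)

    worker.join()


if __name__ == "__main__":
    main()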
diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index b117453fb..4136506c0 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -17,6 +17,10 @@ from airbyte_cdk import connector_builder from airbyte_cdk.connector_builder.connector_builder_handler import ( + DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE, + DEFAULT_MAXIMUM_NUMBER_OF_SLICES, + DEFAULT_MAXIMUM_RECORDS, + TestLimits, create_source, get_limits, resolve_manifest, @@ -54,9 +58,9 @@ from airbyte_cdk.models import Type as MessageType from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( ConcurrentDeclarativeSource, - TestLimits, ) from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream @@ -538,9 +542,7 @@ def test_resolve_manifest(valid_resolve_manifest_config_file): config = copy.deepcopy(RESOLVE_MANIFEST_CONFIG) command = "resolve_manifest" config["__command"] = command - source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=MANIFEST - ) + source = ManifestDeclarativeSource(source_config=MANIFEST) limits = TestLimits() resolved_manifest = handle_connector_builder_request( source, command, config, create_configured_catalog("dummy_stream"), _A_STATE, limits @@ -689,21 +691,19 @@ def test_resolve_manifest(valid_resolve_manifest_config_file): def test_resolve_manifest_error_returns_error_response(): - class MockConcurrentDeclarativeSource: + class MockManifestDeclarativeSource: @property def resolved_manifest(self): raise ValueError - source = MockConcurrentDeclarativeSource() + source = MockManifestDeclarativeSource() response = resolve_manifest(source) assert "Error resolving manifest" in response.trace.error.message def test_read(): config = TEST_READ_CONFIG - source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=MANIFEST - ) + source = ManifestDeclarativeSource(source_config=MANIFEST) real_record = AirbyteRecordMessage( data={"id": "1234", "key": "value"}, emitted_at=1, stream=_stream_name @@ -835,7 +835,7 @@ def cursor_field(self): def name(self): return _stream_name - class MockConcurrentDeclarativeSource: + class MockManifestDeclarativeSource: def streams(self, config): return [MockDeclarativeStream()] @@ -857,7 +857,7 @@ def check_config_against_spec(self) -> Literal[False]: stack_trace = "a stack trace" mock_from_exception.return_value = stack_trace - source = MockConcurrentDeclarativeSource() + source = MockManifestDeclarativeSource() limits = TestLimits() response = read_stream( source, @@ -899,22 +899,19 @@ def test_handle_429_response(): config = TEST_READ_CONFIG limits = TestLimits() - catalog = ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG) - source = create_source(config=config, limits=limits, catalog=catalog, state=None) + source = create_source(config, limits) with patch("requests.Session.send", return_value=response) as mock_send: response = handle_connector_builder_request( source, "test_read", config, - catalog, + 
ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG), _A_PER_PARTITION_STATE, limits, ) - # The test read will attempt a read for 5 partitions, and attempt 1 request - # each time that will not be retried - assert mock_send.call_count == 5 + mock_send.assert_called_once() @pytest.mark.parametrize( @@ -966,7 +963,7 @@ def test_invalid_config_command(invalid_config_file, dummy_catalog): @pytest.fixture def manifest_declarative_source(): - return mock.Mock(spec=ConcurrentDeclarativeSource, autospec=True) + return mock.Mock(spec=ManifestDeclarativeSource, autospec=True) def create_mock_retriever(name, url_base, path): @@ -991,16 +988,16 @@ def create_mock_declarative_stream(http_stream): ( "test_no_test_read_config", {}, - TestLimits.DEFAULT_MAX_RECORDS, - TestLimits.DEFAULT_MAX_SLICES, - TestLimits.DEFAULT_MAX_PAGES_PER_SLICE, + DEFAULT_MAXIMUM_RECORDS, + DEFAULT_MAXIMUM_NUMBER_OF_SLICES, + DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE, ), ( "test_no_values_set", {"__test_read_config": {}}, - TestLimits.DEFAULT_MAX_RECORDS, - TestLimits.DEFAULT_MAX_SLICES, - TestLimits.DEFAULT_MAX_PAGES_PER_SLICE, + DEFAULT_MAXIMUM_RECORDS, + DEFAULT_MAXIMUM_NUMBER_OF_SLICES, + DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE, ), ( "test_values_are_set", @@ -1028,9 +1025,9 @@ def test_create_source(): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config=config, limits=limits, catalog=None, state=None) + source = create_source(config, limits) - assert isinstance(source, ConcurrentDeclarativeSource) + assert isinstance(source, ManifestDeclarativeSource) assert source._constructor._limit_pages_fetched_per_slice == limits.max_pages_per_slice assert source._constructor._limit_slices_fetched == limits.max_slices assert source._constructor._disable_cache @@ -1122,7 +1119,7 @@ def test_read_source(mock_http_stream): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config=config, limits=limits, catalog=catalog, state=None) + source = create_source(config, limits) output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data slices = output_data["slices"] @@ -1173,7 +1170,7 @@ def test_read_source_single_page_single_slice(mock_http_stream): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config=config, limits=limits, catalog=catalog, state=None) + source = create_source(config, limits) output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data slices = output_data["slices"] @@ -1263,7 +1260,7 @@ def test_handle_read_external_requests(deployment_mode, url_base, expected_error test_manifest["streams"][0]["$parameters"]["url_base"] = url_base config = {"__injected_declarative_manifest": test_manifest} - source = create_source(config=config, limits=limits, catalog=catalog, state=None) + source = create_source(config, limits) with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): output_data = read_stream( @@ -1293,13 +1290,13 @@ def test_handle_read_external_requests(deployment_mode, url_base, expected_error pytest.param( "CLOUD", "https://10.0.27.27/tokens/bearer", - "StreamThreadException", + "AirbyteTracedException", id="test_cloud_read_with_private_endpoint", ), pytest.param( "CLOUD", "http://unsecured.protocol/tokens/bearer", - "StreamThreadException", + "InvalidSchema", id="test_cloud_read_with_unsecured_endpoint", ), pytest.param( @@ -1359,7 +1356,7 @@ def test_handle_read_external_oauth_request(deployment_mode, token_url, 
expected ) config = {"__injected_declarative_manifest": test_manifest} - source = create_source(config=config, limits=limits, catalog=catalog, state=None) + source = create_source(config, limits) with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): output_data = read_stream( @@ -1416,9 +1413,7 @@ def test_read_stream_exception_with_secrets(): def test_full_resolve_manifest(valid_resolve_manifest_config_file): config = copy.deepcopy(RESOLVE_DYNAMIC_STREAM_MANIFEST_CONFIG) command = config["__command"] - source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=DYNAMIC_STREAM_MANIFEST - ) + source = ManifestDeclarativeSource(source_config=DYNAMIC_STREAM_MANIFEST) limits = TestLimits(max_streams=2) with HttpMocker() as http_mocker: http_mocker.get( diff --git a/unit_tests/connector_builder/test_message_grouper.py b/unit_tests/connector_builder/test_message_grouper.py index e79ee117c..6c4f11526 100644 --- a/unit_tests/connector_builder/test_message_grouper.py +++ b/unit_tests/connector_builder/test_message_grouper.py @@ -307,6 +307,126 @@ def test_get_grouped_messages_with_logs(mock_entrypoint_read: Mock) -> None: assert actual_log == expected_logs[i] +@pytest.mark.parametrize( + "request_record_limit, max_record_limit, should_fail", + [ + pytest.param(1, 3, False, id="test_create_request_with_record_limit"), + pytest.param(3, 1, True, id="test_create_request_record_limit_exceeds_max"), + ], +) +@patch("airbyte_cdk.connector_builder.test_reader.reader.AirbyteEntrypoint.read") +def test_get_grouped_messages_record_limit( + mock_entrypoint_read: Mock, request_record_limit: int, max_record_limit: int, should_fail: bool +) -> None: + stream_name = "hashiras" + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = { + "status_code": 200, + "headers": {"field": "value"}, + "body": {"content": '{"name": "field"}'}, + } + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url, stream_name), + record_message(stream_name, {"name": "Shinobu Kocho"}), + record_message(stream_name, {"name": "Muichiro Tokito"}), + request_response_log_message(request, response, url, stream_name), + record_message(stream_name, {"name": "Mitsuri Kanroji"}), + ] + ), + ) + n_records = 2 + record_limit = min(request_record_limit, max_record_limit) + + api = TestReader(MAX_PAGES_PER_SLICE, MAX_SLICES, max_record_limit=max_record_limit) + # this is the call we expect to raise an exception + if should_fail: + with pytest.raises(ValueError): + api.run_test_read( + mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog(stream_name), + stream_name=stream_name, + state=_NO_STATE, + record_limit=request_record_limit, + ) + else: + actual_response: StreamRead = api.run_test_read( + mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog(stream_name), + stream_name=stream_name, + state=_NO_STATE, + record_limit=request_record_limit, + ) + single_slice = actual_response.slices[0] + total_records = 0 + for i, actual_page in enumerate(single_slice.pages): + total_records += len(actual_page.records) + assert total_records == min([record_limit, n_records]) + + assert (total_records >= max_record_limit) == actual_response.test_read_limit_reached + + +@pytest.mark.parametrize( + "max_record_limit", + [ + 
pytest.param(2, id="test_create_request_no_record_limit"), + pytest.param(1, id="test_create_request_no_record_limit_n_records_exceed_max"), + ], +) +@patch("airbyte_cdk.connector_builder.test_reader.reader.AirbyteEntrypoint.read") +def test_get_grouped_messages_default_record_limit( + mock_entrypoint_read: Mock, max_record_limit: int +) -> None: + stream_name = "hashiras" + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = { + "status_code": 200, + "headers": {"field": "value"}, + "body": {"content": '{"name": "field"}'}, + } + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url, stream_name), + record_message(stream_name, {"name": "Shinobu Kocho"}), + record_message(stream_name, {"name": "Muichiro Tokito"}), + request_response_log_message(request, response, url, stream_name), + record_message(stream_name, {"name": "Mitsuri Kanroji"}), + ] + ), + ) + n_records = 2 + + api = TestReader(MAX_PAGES_PER_SLICE, MAX_SLICES, max_record_limit=max_record_limit) + actual_response: StreamRead = api.run_test_read( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog(stream_name), + stream_name=stream_name, + state=_NO_STATE, + ) + single_slice = actual_response.slices[0] + total_records = 0 + for i, actual_page in enumerate(single_slice.pages): + total_records += len(actual_page.records) + assert total_records == min([max_record_limit, n_records]) + + @patch("airbyte_cdk.connector_builder.test_reader.reader.AirbyteEntrypoint.read") def test_get_grouped_messages_limit_0(mock_entrypoint_read: Mock) -> None: stream_name = "hashiras" diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 26cc36e70..987592033 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -3646,7 +3646,6 @@ def test_given_no_partitions_processed_when_close_partition_then_no_state_update schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=slice, ) ) @@ -3731,7 +3730,6 @@ def test_given_unfinished_first_parent_partition_no_parent_state_update(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=slice, ) ) @@ -3826,7 +3824,6 @@ def test_given_unfinished_last_parent_partition_with_partial_parent_state_update schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=slice, ) ) @@ -3916,7 +3913,6 @@ def test_given_all_partitions_finished_when_close_partition_then_final_state_emi schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=slice, ) ) @@ -3990,7 +3986,6 @@ def test_given_partition_limit_exceeded_when_close_partition_then_switch_to_glob schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=slice, ) ) @@ -4075,7 +4070,6 @@ def test_semaphore_cleanup(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - 
max_records_limit=None, stream_slice=s, ) ) @@ -4195,7 +4189,6 @@ def test_duplicate_partition_after_closing_partition_cursor_deleted(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=first_1, ) ) @@ -4207,7 +4200,6 @@ def test_duplicate_partition_after_closing_partition_cursor_deleted(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=two, ) ) @@ -4219,7 +4211,6 @@ def test_duplicate_partition_after_closing_partition_cursor_deleted(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=second_1, ) ) @@ -4280,7 +4271,6 @@ def test_duplicate_partition_after_closing_partition_cursor_exists(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=first_1, ) ) @@ -4292,7 +4282,6 @@ def test_duplicate_partition_after_closing_partition_cursor_exists(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=two, ) ) @@ -4305,7 +4294,6 @@ def test_duplicate_partition_after_closing_partition_cursor_exists(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=second_1, ) ) @@ -4363,7 +4351,6 @@ def test_duplicate_partition_while_processing(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=generated[1], ) ) @@ -4374,7 +4361,6 @@ def test_duplicate_partition_while_processing(): schema_loader=_EMPTY_SCHEMA_LOADER, retriever=MagicMock(), message_repository=MagicMock(), - max_records_limit=None, stream_slice=generated[0], ) ) diff --git a/unit_tests/sources/declarative/retrievers/test_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_simple_retriever.py index d39e84e4d..44f307a32 100644 --- a/unit_tests/sources/declarative/retrievers/test_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_simple_retriever.py @@ -11,14 +11,7 @@ import requests from airbyte_cdk import YamlDeclarativeSource -from airbyte_cdk.models import ( - AirbyteLogMessage, - AirbyteMessage, - AirbyteRecordMessage, - Level, - SyncMode, - Type, -) +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type from airbyte_cdk.sources.declarative.auth.declarative_authenticator import NoAuth from airbyte_cdk.sources.declarative.decoders import JsonDecoder from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordSelector diff --git a/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py b/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py index 20147465f..97f89879c 100644 --- a/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py +++ b/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py @@ -10,7 +10,9 @@ from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( ConcurrentDeclarativeSource, - TestLimits, +) +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( + ModelToComponentFactory, ) from airbyte_cdk.sources.declarative.schema import DynamicSchemaLoader, SchemaTypeIdentifier from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse @@ -351,13 +353,14 @@ def 
test_dynamic_schema_loader_with_type_conditions(): }, }, } - source = ConcurrentDeclarativeSource( source_config=_MANIFEST_WITH_TYPE_CONDITIONS, config=_CONFIG, catalog=None, state=None, - limits=TestLimits(), # Avoid caching on the HttpClient which could result in caching the requests/responses of other tests + component_factory=ModelToComponentFactory( + disable_cache=True + ), # Avoid caching on the HttpClient which could result in caching the requests/responses of other tests ) with HttpMocker() as http_mocker: http_mocker.get( diff --git a/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py b/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py index 358b2c5e6..b09c708ad 100644 --- a/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py +++ b/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py @@ -4,18 +4,18 @@ from unittest import TestCase from unittest.mock import Mock -# This allows for the global total_record_counter to be reset between tests -import airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator as declarative_partition_generator from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type from airbyte_cdk.sources.declarative.retrievers import Retriever from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, +) from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.core import StreamData -from airbyte_cdk.sources.types import Record, StreamSlice +from airbyte_cdk.sources.types import StreamSlice _STREAM_NAME = "a_stream_name" -_JSON_SCHEMA = InlineSchemaLoader(schema={"type": "object", "properties": {}}, parameters={}) -_SCHEMA_LOADER = InlineSchemaLoader(_JSON_SCHEMA, {}) +_SCHEMA_LOADER = InlineSchemaLoader({"type": "object", "properties": {}}, {}) _A_STREAM_SLICE = StreamSlice( partition={"partition_key": "partition_value"}, cursor_slice={"cursor_key": "cursor_value"} ) @@ -33,7 +33,7 @@ class StreamSlicerPartitionGeneratorTest(TestCase): def test_given_multiple_slices_partition_generator_uses_the_same_retriever(self) -> None: retriever = self._mock_retriever([]) message_repository = Mock(spec=MessageRepository) - partition_factory = declarative_partition_generator.DeclarativePartitionFactory( + partition_factory = DeclarativePartitionFactory( _STREAM_NAME, _SCHEMA_LOADER, retriever, @@ -48,7 +48,7 @@ def test_given_multiple_slices_partition_generator_uses_the_same_retriever(self) def test_given_a_mapping_when_read_then_yield_record(self) -> None: retriever = self._mock_retriever([_A_RECORD]) message_repository = Mock(spec=MessageRepository) - partition_factory = declarative_partition_generator.DeclarativePartitionFactory( + partition_factory = DeclarativePartitionFactory( _STREAM_NAME, _SCHEMA_LOADER, retriever, @@ -66,7 +66,7 @@ def test_given_a_mapping_when_read_then_yield_record(self) -> None: def test_given_not_a_record_when_read_then_send_to_message_repository(self) -> None: retriever = self._mock_retriever([_AIRBYTE_LOG_MESSAGE]) message_repository = Mock(spec=MessageRepository) - partition_factory = declarative_partition_generator.DeclarativePartitionFactory( + partition_factory = DeclarativePartitionFactory( _STREAM_NAME, _SCHEMA_LOADER, retriever, @@ -77,78 +77,6 @@ def 
test_given_not_a_record_when_read_then_send_to_message_repository(self) -> N message_repository.emit_message.assert_called_once_with(_AIRBYTE_LOG_MESSAGE) - def test_max_records_reached_stops_reading(self) -> None: - declarative_partition_generator.total_record_counter = 0 - - expected_records = [ - Record(data={"id": 1, "name": "Max"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Oscar"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Charles"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Alex"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Yuki"}, stream_name="stream_name"), - ] - - mock_records = expected_records + [ - Record(data={"id": 1, "name": "Lewis"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Lando"}, stream_name="stream_name"), - ] - - retriever = self._mock_retriever(mock_records) - message_repository = Mock(spec=MessageRepository) - partition_factory = declarative_partition_generator.DeclarativePartitionFactory( - _STREAM_NAME, - _JSON_SCHEMA, - retriever, - message_repository, - max_records_limit=5, - ) - - partition = partition_factory.create(_A_STREAM_SLICE) - - actual_records = list(partition.read()) - - assert len(actual_records) == 5 - assert actual_records == expected_records - - def test_max_records_reached_on_previous_partition(self) -> None: - declarative_partition_generator.total_record_counter = 0 - - expected_records = [ - Record(data={"id": 1, "name": "Max"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Oscar"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Charles"}, stream_name="stream_name"), - ] - - mock_records = expected_records + [ - Record(data={"id": 1, "name": "Alex"}, stream_name="stream_name"), - Record(data={"id": 1, "name": "Yuki"}, stream_name="stream_name"), - ] - - retriever = self._mock_retriever(mock_records) - message_repository = Mock(spec=MessageRepository) - partition_factory = declarative_partition_generator.DeclarativePartitionFactory( - _STREAM_NAME, - _JSON_SCHEMA, - retriever, - message_repository, - max_records_limit=3, - ) - - partition = partition_factory.create(_A_STREAM_SLICE) - - first_partition_records = list(partition.read()) - - assert len(first_partition_records) == 3 - assert first_partition_records == expected_records - - second_partition_records = list(partition.read()) - assert len(second_partition_records) == 0 - - # The DeclarativePartition exits out of the read before attempting to read_records() if - # the max_records_limit has already been reached. 
So we only expect to see read_records() - # called for the first partition read and not the second - retriever.read_records.assert_called_once() - @staticmethod def _mock_retriever(read_return_value: List[StreamData]) -> Mock: retriever = Mock(spec=Retriever) diff --git a/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py b/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py index 75b52f6b2..50695ba1e 100644 --- a/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +++ b/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py @@ -50,10 +50,7 @@ def __init__( self._message_repository = InMemoryMessageRepository() threadpool_manager = ThreadPoolManager(threadpool, streams[0].logger) concurrent_source = ConcurrentSource( - threadpool=threadpool_manager, - logger=streams[0].logger, - slice_logger=NeverLogSliceLogger(), - message_repository=self._message_repository, + threadpool_manager, streams[0].logger, NeverLogSliceLogger(), self._message_repository ) super().__init__(concurrent_source) self._streams = streams diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index a681f75eb..d6ea64583 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -176,12 +176,10 @@ def test_handle_partition(self): self._partition_reader, ) - expected_cursor = handler._stream_name_to_instance[_ANOTHER_STREAM_NAME].cursor - handler.on_partition(self._a_closed_partition) self._thread_pool_manager.submit.assert_called_with( - self._partition_reader.process_partition, self._a_closed_partition, expected_cursor + self._partition_reader.process_partition, self._a_closed_partition ) assert ( self._a_closed_partition in handler._streams_to_running_partitions[_ANOTHER_STREAM_NAME] @@ -203,12 +201,10 @@ def test_handle_partition_emits_log_message_if_it_should_be_logged(self): self._partition_reader, ) - expected_cursor = handler._stream_name_to_instance[_STREAM_NAME].cursor - handler.on_partition(self._an_open_partition) self._thread_pool_manager.submit.assert_called_with( - self._partition_reader.process_partition, self._an_open_partition, expected_cursor + self._partition_reader.process_partition, self._an_open_partition ) self._message_repository.emit_message.assert_called_with(self._log_message) @@ -257,6 +253,8 @@ def test_handle_on_partition_complete_sentinel_with_messages_from_repository(sel ] assert messages == expected_messages + self._stream.cursor.close_partition.assert_called_once() + @freezegun.freeze_time("2020-01-01T00:00:00") def test_handle_on_partition_complete_sentinel_yields_status_message_if_the_stream_is_done( self, @@ -304,6 +302,55 @@ def test_handle_on_partition_complete_sentinel_yields_status_message_if_the_stre ) ] assert messages == expected_messages + self._another_stream.cursor.close_partition.assert_called_once() + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_given_exception_on_partition_complete_sentinel_then_yield_error_trace_message_and_stream_is_incomplete( + self, + ) -> None: + self._a_closed_partition.stream_name.return_value = self._stream.name + self._stream.cursor.close_partition.side_effect = ValueError + + handler = ConcurrentReadProcessor( + [self._stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + 
self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + handler.on_partition(self._a_closed_partition) + list( + handler.on_partition_generation_completed( + PartitionGenerationCompletedSentinel(self._stream) + ) + ) + messages = list( + handler.on_partition_complete_sentinel( + PartitionCompleteSentinel(self._a_closed_partition) + ) + ) + + expected_status_message = AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor( + name=self._stream.name, + ), + status=AirbyteStreamStatus.INCOMPLETE, + ), + emitted_at=1577836800000.0, + ), + ) + assert list(map(lambda message: message.trace.type, messages)) == [ + TraceType.ERROR, + TraceType.STREAM_STATUS, + ] + assert messages[1] == expected_status_message @freezegun.freeze_time("2020-01-01T00:00:00") def test_handle_on_partition_complete_sentinel_yields_no_status_message_if_the_stream_is_not_done( @@ -332,6 +379,7 @@ def test_handle_on_partition_complete_sentinel_yields_no_status_message_if_the_s expected_messages = [] assert messages == expected_messages + self._stream.cursor.close_partition.assert_called_once() @freezegun.freeze_time("2020-01-01T00:00:00") def test_on_record_no_status_message_no_repository_messge(self): diff --git a/unit_tests/sources/streams/concurrent/test_partition_reader.py b/unit_tests/sources/streams/concurrent/test_partition_reader.py index a41750772..1910e034d 100644 --- a/unit_tests/sources/streams/concurrent/test_partition_reader.py +++ b/unit_tests/sources/streams/concurrent/test_partition_reader.py @@ -1,5 +1,6 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. - +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# import unittest from queue import Queue from typing import Callable, Iterable, List @@ -7,9 +8,7 @@ import pytest -from airbyte_cdk import InMemoryMessageRepository from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException -from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.types import ( @@ -27,15 +26,10 @@ class PartitionReaderTest(unittest.TestCase): def setUp(self) -> None: self._queue: Queue[QueueItem] = Queue() - self._partition_reader = PartitionReader(self._queue, None) + self._partition_reader = PartitionReader(self._queue) def test_given_no_records_when_process_partition_then_only_emit_sentinel(self): - cursor = FinalStateCursor( - stream_name="test", - stream_namespace=None, - message_repository=InMemoryMessageRepository(), - ) - self._partition_reader.process_partition(self._a_partition([]), cursor) + self._partition_reader.process_partition(self._a_partition([])) while queue_item := self._queue.get(): if not isinstance(queue_item, PartitionCompleteSentinel): @@ -46,24 +40,19 @@ def test_given_read_partition_successful_when_process_partition_then_queue_recor self, ): partition = self._a_partition(_RECORDS) - cursor = Mock() - self._partition_reader.process_partition(partition, cursor) + self._partition_reader.process_partition(partition) queue_content = self._consume_queue() assert queue_content == _RECORDS + [PartitionCompleteSentinel(partition)] - cursor.observe.assert_called() - cursor.close_partition.assert_called_once() - - def test_given_exception_from_read_when_process_partition_then_queue_records_and_exception_and_sentinel( + def test_given_exception_when_process_partition_then_queue_records_and_exception_and_sentinel( self, ): partition = Mock() - cursor = Mock() exception = ValueError() partition.read.side_effect = self._read_with_exception(_RECORDS, exception) - self._partition_reader.process_partition(partition, cursor) + self._partition_reader.process_partition(partition) queue_content = self._consume_queue() @@ -72,23 +61,6 @@ def test_given_exception_from_read_when_process_partition_then_queue_records_and PartitionCompleteSentinel(partition), ] - def test_given_exception_from_close_slice_when_process_partition_then_queue_records_and_exception_and_sentinel( - self, - ): - partition = self._a_partition(_RECORDS) - cursor = Mock() - exception = ValueError() - cursor.close_partition.side_effect = self._close_partition_with_exception(exception) - self._partition_reader.process_partition(partition, cursor) - - queue_content = self._consume_queue() - - # 4 total messages in queue. 
2 records, 1 thread exception, 1 partition sentinel value - assert len(queue_content) == 4 - assert queue_content[:2] == _RECORDS - assert isinstance(queue_content[2], StreamThreadException) - assert queue_content[3] == PartitionCompleteSentinel(partition) - def _a_partition(self, records: List[Record]) -> Partition: partition = Mock(spec=Partition) partition.read.return_value = iter(records) @@ -104,13 +76,6 @@ def mocked_function() -> Iterable[Record]: return mocked_function - @staticmethod - def _close_partition_with_exception(exception: Exception) -> Callable[[Partition], None]: - def mocked_function(partition: Partition) -> None: - raise exception - - return mocked_function - def _consume_queue(self): queue_content = [] while queue_item := self._queue.get(): From 6b35969e8386ac14622d8b0d265f31b7ba9be59a Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 25 Aug 2025 09:22:20 -0400 Subject: [PATCH 37/68] fix failed merge --- .../sources/declarative/parsers/model_to_component_factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 70da57feb..1bfc0079a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2117,6 +2117,7 @@ def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bo model.incremental_sync and hasattr(model.incremental_sync, "is_client_side_incremental") and model.incremental_sync.is_client_side_incremental + ) def _build_stream_slicer_from_partition_router( self, From fbfcfd26c5f85508eee61697e486e6bdb0a406d5 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 26 Aug 2025 10:28:47 -0400 Subject: [PATCH 38/68] adding comments based on code review --- .../parsers/model_to_component_factory.py | 73 +- .../substream_partition_router.py | 7 + airbyte_cdk/sources/message/repository.py | 5 + .../test_per_partition_cursor_integration.py | 817 ------------------ 4 files changed, 54 insertions(+), 848 deletions(-) delete mode 100644 unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 1bfc0079a..2ca62ff84 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3784,6 +3784,46 @@ def _create_message_repository_substream_wrapper( child_state = self._connector_state_manager.get_stream_state( kwargs["stream_name"], None ) # FIXME adding `stream_name` as a parameter means it will be a breaking change. 
I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag + connector_state_manager = self._instantiate_parent_stream_state_manager(child_state, config, model) + + substream_factory = ModelToComponentFactory( + connector_state_manager=connector_state_manager, + limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, + limit_slices_fetched=self._limit_slices_fetched, + emit_connector_builder_messages=self._emit_connector_builder_messages, + disable_retries=self._disable_retries, + disable_cache=self._disable_cache, + message_repository=StateFilteringMessageRepository( + LogAppenderMessageRepositoryDecorator( + { + "airbyte_cdk": {"stream": {"is_substream": True}}, + "http": {"is_auxiliary": True}, + }, + self._message_repository, + self._evaluate_log_level(self._emit_connector_builder_messages), + ), + ), + ) + + # This flag will be used exclusively for StateDelegatingStream when a parent stream is created + has_parent_state = bool( + self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) + if model.incremental_dependency + else False + ) + return substream_factory._create_component_from_model( + model=model, config=config, has_parent_state=has_parent_state, **kwargs + ) + + def _instantiate_parent_stream_state_manager(self, child_state, config, model): + """ + With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the + `set_initial_state` flow that existed for the declarative cursors. This state is taken from + self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account + for the MessageRepository being different). So we need to pass a ConnectorStateManager to the + ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if + incremental_dependency is set. 
+ """ if model.incremental_dependency and child_state: parent_stream_name = model.stream.name or "" parent_state = ConcurrentPerPartitionCursor.get_parent_state( @@ -3814,38 +3854,9 @@ def _create_message_repository_substream_wrapper( ), ), ) - connector_state_manager = ConnectorStateManager([parent_state] if parent_state else []) - else: - connector_state_manager = ConnectorStateManager([]) - - substream_factory = ModelToComponentFactory( - connector_state_manager=connector_state_manager, - limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, - limit_slices_fetched=self._limit_slices_fetched, - emit_connector_builder_messages=self._emit_connector_builder_messages, - disable_retries=self._disable_retries, - disable_cache=self._disable_cache, - message_repository=StateFilteringMessageRepository( - LogAppenderMessageRepositoryDecorator( - { - "airbyte_cdk": {"stream": {"is_substream": True}}, - "http": {"is_auxiliary": True}, - }, - self._message_repository, - self._evaluate_log_level(self._emit_connector_builder_messages), - ), - ), - ) + return ConnectorStateManager([parent_state] if parent_state else []) - # This flag will be used exclusively for StateDelegatingStream when a parent stream is created - has_parent_state = bool( - self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) - if model.incremental_dependency - else False - ) - return substream_factory._create_component_from_model( - model=model, config=config, has_parent_state=has_parent_state, **kwargs - ) + return ConnectorStateManager([]) @staticmethod def create_wait_time_from_header( diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 68a1d8200..f74ab5817 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -206,6 +206,13 @@ def stream_slices(self) -> Iterable[StreamSlice]: for parent_record, is_last_record_in_slice in iterate_with_last_flag( partition.read() ): + # In the previous CDK implementation, state management was done internally by the stream. + # However, this could cause issues when doing availability check for example as the availability + # check would progress the state so state management was moved outside of the read method. + # Hence, we need to call the cursor here. + # Note that we call observe and close_partition before emitting the associated record as the + # ConcurrentPerPartitionCursor will associate a record with the state of the stream after the + # record was consumed. parent_stream.cursor.observe(parent_record) parent_partition = ( parent_record.associated_slice.partition diff --git a/airbyte_cdk/sources/message/repository.py b/airbyte_cdk/sources/message/repository.py index d806e9ac2..39e7fc2ff 100644 --- a/airbyte_cdk/sources/message/repository.py +++ b/airbyte_cdk/sources/message/repository.py @@ -96,6 +96,11 @@ def consume_queue(self) -> Iterable[AirbyteMessage]: class StateFilteringMessageRepository(MessageRepository): + """ + This message repository is used when creating parent streams for SubstreamPartitionRouter. As the child stream + manages the state for both the child and the parents, we want to prevent parents from emitting state messages. 
+ """ + def __init__(self, decorated: MessageRepository) -> None: self._decorated = decorated diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py deleted file mode 100644 index b5fd09ec7..000000000 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ /dev/null @@ -1,817 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# - -import logging -from unittest.mock import MagicMock, patch - -import orjson - -from airbyte_cdk.models import ( - AirbyteStateBlob, - AirbyteStateMessage, - AirbyteStateType, - AirbyteStream, - AirbyteStreamState, - ConfiguredAirbyteCatalog, - ConfiguredAirbyteStream, - DestinationSyncMode, - StreamDescriptor, - SyncMode, -) -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( - ConcurrentDeclarativeSource, -) -from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor -from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import ( - PerPartitionCursor, - StreamSlice, -) -from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource -from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever -from airbyte_cdk.sources.types import Record -from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder - -CURSOR_FIELD = "cursor_field" -SYNC_MODE = SyncMode.incremental - - -class ManifestBuilder: - def __init__(self): - self._incremental_sync = {} - self._partition_router = {} - self._substream_partition_router = {} - - def with_list_partition_router(self, stream_name, cursor_field, partitions): - self._partition_router[stream_name] = { - "type": "ListPartitionRouter", - "cursor_field": cursor_field, - "values": partitions, - } - return self - - def with_substream_partition_router(self, stream_name): - self._substream_partition_router[stream_name] = { - "type": "SubstreamPartitionRouter", - "parent_stream_configs": [ - { - "type": "ParentStreamConfig", - "stream": "#/definitions/Rates", - "parent_key": "id", - "partition_field": "parent_id", - } - ], - } - return self - - def with_incremental_sync( - self, - stream_name, - start_datetime, - end_datetime, - datetime_format, - cursor_field, - step, - cursor_granularity, - ): - self._incremental_sync[stream_name] = { - "type": "DatetimeBasedCursor", - "start_datetime": start_datetime, - "end_datetime": end_datetime, - "datetime_format": datetime_format, - "cursor_field": cursor_field, - "step": step, - "cursor_granularity": cursor_granularity, - } - return self - - def build(self): - manifest = { - "version": "0.34.2", - "type": "DeclarativeSource", - "concurrency_level": {"type": "ConcurrencyLevel", "default_concurrency": 1}, - "check": {"type": "CheckStream", "stream_names": ["Rates"]}, - "definitions": { - "AnotherStream": { - "type": "DeclarativeStream", - "name": "AnotherStream", - "primary_key": [], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": {"id": {"type": "string"}}, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.apilayer.com", - "path": "/exchangerates_data/latest", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", 
"field_path": []}, - }, - }, - }, - "Rates": { - "type": "DeclarativeStream", - "name": "Rates", - "primary_key": [], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": {}, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.apilayer.com", - "path": "/exchangerates_data/latest", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - }, - }, - }, - "streams": [{"$ref": "#/definitions/Rates"}, {"$ref": "#/definitions/AnotherStream"}], - "spec": { - "connection_specification": { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "required": [], - "properties": {}, - "additionalProperties": True, - }, - "documentation_url": "https://example.org", - "type": "Spec", - }, - } - for stream_name, incremental_sync_definition in self._incremental_sync.items(): - manifest["definitions"][stream_name]["incremental_sync"] = incremental_sync_definition - for stream_name, partition_router_definition in self._partition_router.items(): - manifest["definitions"][stream_name]["retriever"]["partition_router"] = ( - partition_router_definition - ) - for stream_name, partition_router_definition in self._substream_partition_router.items(): - manifest["definitions"][stream_name]["retriever"]["partition_router"] = ( - partition_router_definition - ) - return manifest - - -def test_given_state_for_only_some_partition_when_stream_slices_then_create_slices_using_state_or_start_from_start_datetime(): - source = ConcurrentDeclarativeSource( - state=[ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name="Rates"), - stream_state=AirbyteStateBlob( - { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-02-01"}, - } - ] - } - ), - ), - ), - ], - config={}, - catalog=CatalogBuilder() - .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) - .build(), - source_config=ManifestBuilder() - .with_list_partition_router("Rates", "partition_field", ["1", "2"]) - .with_incremental_sync( - "Rates", - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - ) - stream_instance = source.streams({})[0] - - partitions = stream_instance.generate_partitions() - - assert list(map(lambda partition: partition.to_slice(), partitions)) == [ - {"partition_field": "1", "start_time": "2022-02-01", "end_time": "2022-02-28"}, - {"partition_field": "2", "start_time": "2022-01-01", "end_time": "2022-01-31"}, - {"partition_field": "2", "start_time": "2022-02-01", "end_time": "2022-02-28"}, - ] - - -def test_given_record_for_partition_when_read_then_update_state(): - source = ConcurrentDeclarativeSource( - state=[], - config={}, - catalog=CatalogBuilder() - .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) - .build(), - source_config=ManifestBuilder() - .with_list_partition_router("Rates", "partition_field", ["1", "2"]) - .with_incremental_sync( - "Rates", - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - ) - stream_instance = source.streams({})[0] - partition = 
next(iter(stream_instance.generate_partitions())) - - stream_slice = StreamSlice( - partition={"partition_field": "1"}, - cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, - ) - with patch.object( - SimpleRetriever, - "_read_pages", - side_effect=[ - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, - "Rates", - stream_slice, - ) - ] - ], - ): - for record in partition.read(): - stream_instance.cursor.observe(record) - stream_instance.cursor.close_partition(partition) - - assert stream_instance.cursor.state == { - "lookback_window": 0, - "use_global_cursor": False, - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-01-15"}, - } - ], - } - - -def test_substream_without_input_state(): - test_source = ConcurrentDeclarativeSource( - state=[], - config={}, - catalog=CatalogBuilder() - .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) - .with_stream(ConfiguredAirbyteStreamBuilder().with_name("AnotherStream")) - .build(), - source_config=ManifestBuilder() - .with_substream_partition_router("AnotherStream") - .with_incremental_sync( - "Rates", - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .with_incremental_sync( - "AnotherStream", - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - ) - - stream_instance = test_source.streams({})[1] - parent_stream_slice = StreamSlice( - partition={}, cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"} - ) - - # This mocks the resulting records of the Rates stream which acts as the parent stream of the SubstreamPartitionRouter being tested - with patch.object( - SimpleRetriever, - "_read_pages", - side_effect=[ - [Record({"id": "1", CURSOR_FIELD: "2022-01-15"}, "AnotherStream", parent_stream_slice)], - [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, "AnotherStream", parent_stream_slice)], - ], - ): - partition = list( - map(lambda partition: partition.to_slice(), stream_instance.generate_partitions()) - ) - - assert partition == [ - StreamSlice( - partition={ - "parent_id": "1", - "parent_slice": {}, - }, - cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, - ), - StreamSlice( - partition={ - "parent_id": "1", - "parent_slice": {}, - }, - cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, - ), - StreamSlice( - partition={ - "parent_id": "2", - "parent_slice": {}, - }, - cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, - ), - StreamSlice( - partition={ - "parent_id": "2", - "parent_slice": {}, - }, - cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, - ), - ] - - -def test_partition_limitation(caplog): - """ - Test that when the number of partitions exceeds the maximum allowed limit in PerPartitionCursor, - the oldest partitions are dropped, and the state is updated accordingly. - - In this test, we set the maximum number of partitions to 2 and provide 3 partitions. - We verify that the state only retains information for the two most recent partitions. 
- """ - stream_name = "Rates" - source = ConcurrentDeclarativeSource( - state=[], - config={}, - catalog=CatalogBuilder() - .with_stream(ConfiguredAirbyteStreamBuilder().with_name("Rates")) - .build(), - source_config=ManifestBuilder() - .with_list_partition_router( - stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] - ) - .with_incremental_sync( - stream_name=stream_name, - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - ) - - partition_slices = [ - StreamSlice(partition={"partition_field": "1"}, cursor_slice={}), - StreamSlice(partition={"partition_field": "2"}, cursor_slice={}), - StreamSlice(partition={"partition_field": "3"}, cursor_slice={}), - ] - - records_list = [ - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ), - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ), - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - associated_slice=partition_slices[1], - stream_name=stream_name, - ) - ], - [], - [], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, - associated_slice=partition_slices[2], - stream_name=stream_name, - ) - ], - ] - - configured_stream = ConfiguredAirbyteStream( - stream=AirbyteStream( - name="Rates", - json_schema={}, - supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], - ), - sync_mode=SyncMode.incremental, - destination_sync_mode=DestinationSyncMode.append, - ) - catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) - - initial_state = [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), - stream_state=AirbyteStateBlob( - { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-01-01"}, - }, - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-01-02"}, - }, - { - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-01-03"}, - }, - ] - } - ), - ), - ) - ] - logger = MagicMock() - - # Use caplog to capture logs - with caplog.at_level(logging.WARNING, logger="airbyte"): - with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): - output = list(source.read(logger, {}, catalog, initial_state)) - - # Check if the warning was logged - logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] - warning_message = 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.' 
- assert warning_message in logged_messages - - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == { - "lookback_window": 1, - "state": {"cursor_field": "2022-02-17"}, - "use_global_cursor": False, - "states": [ - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-01-16"}, - }, - { - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-02-17"}, - }, - ], - } - - -def test_perpartition_with_fallback(caplog): - """ - Test that when the number of partitions exceeds the limit in PerPartitionCursor, - the cursor falls back to using the global cursor for state management. - - This test also checks that the appropriate warning logs are emitted when the partition limit is exceeded. - """ - stream_name = "Rates" - catalog = ( - CatalogBuilder() - .with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)) - .build() - ) - initial_state = [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name=stream_name, namespace=None), - stream_state=AirbyteStateBlob( - { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-01-01"}, - }, - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-01-02"}, - }, - { - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-01-03"}, - }, - ] - } - ), - ), - ) - ] - source = ConcurrentDeclarativeSource( - state=initial_state, - config={}, - catalog=catalog, - source_config=ManifestBuilder() - .with_list_partition_router("Rates", "partition_field", ["1", "2", "3", "4", "5", "6"]) - .with_incremental_sync( - stream_name=stream_name, - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - ) - - partition_slices = [ - StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 7) - ] - - records_list = [ - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ), - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ), - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - associated_slice=partition_slices[1], - stream_name=stream_name, - ) - ], - [], - [], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, - associated_slice=partition_slices[2], - stream_name=stream_name, - ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, - associated_slice=partition_slices[3], - stream_name=stream_name, - ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, - associated_slice=partition_slices[3], - stream_name=stream_name, - ) - ], - [], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, - associated_slice=partition_slices[4], - stream_name=stream_name, - ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-13"}, - associated_slice=partition_slices[3], - stream_name=stream_name, 
- ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, - associated_slice=partition_slices[3], - stream_name=stream_name, - ) - ], - ] - - logger = MagicMock() - - # Use caplog to capture logs - with caplog.at_level(logging.WARNING, logger="airbyte"): - with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): - output = list(source.read(logger, {}, catalog, initial_state)) - - # Check if the warnings were logged - expected_warning_messages = [ - 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.', - 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"2"}. Over limit: 2.', - 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"3"}. Over limit: 3.', - ] - - logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] - - for expected_message in expected_warning_messages: - assert expected_message in logged_messages - - # Proceed with existing assertions - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == { - "use_global_cursor": True, - "state": {"cursor_field": "2022-02-19"}, - "lookback_window": 1, - } - - -def test_per_partition_cursor_within_limit(caplog): - """ - Test that the PerPartitionCursor correctly updates the state for each partition - when the number of partitions is within the allowed limit. - - This test also checks that no warning logs are emitted when the partition limit is not exceeded. 
- """ - stream_name = "Rates" - catalog = ( - CatalogBuilder() - .with_stream(ConfiguredAirbyteStreamBuilder().with_name(stream_name)) - .build() - ) - initial_state = {} - source = ConcurrentDeclarativeSource( - state=initial_state, - config={}, - catalog=catalog, - source_config=ManifestBuilder() - .with_list_partition_router(stream_name, "partition_field", ["1", "2", "3"]) - .with_incremental_sync( - "Rates", - start_datetime="2022-01-01", - end_datetime="2022-03-31", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - ) - - partition_slices = [ - StreamSlice(partition={"partition_field": str(i)}, cursor_slice=cursor_slice) - for i in range(1, 4) - for cursor_slice in [ - {"start_time": "2022-01-01", "end_time": "2022-01-31"}, - {"start_time": "2022-02-01", "end_time": "2022-02-28"}, - {"start_time": "2022-03-01", "end_time": "2022-03-31"}, - ] - ] - - records_list = [ - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, - stream_name, - partition_slices[0], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, - stream_name, - partition_slices[1], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, - stream_name, - partition_slices[2], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - stream_name, - partition_slices[3], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, - stream_name, - partition_slices[4], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, - stream_name, - partition_slices[5], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, - stream_name, - partition_slices[6], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, - stream_name, - partition_slices[7], - ) - ], - [ - Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, - stream_name, - partition_slices[8], - ) - ], - ] - logger = MagicMock() - - # Use caplog to capture logs - with caplog.at_level(logging.WARNING, logger="airbyte"): - with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): - output = list(source.read(logger, {}, catalog, initial_state)) - - # Since the partition limit is not exceeded, we expect no warnings - logged_warnings = [record.message for record in caplog.records if record.levelname == "WARNING"] - assert len(logged_warnings) == 0 - - # Proceed with existing assertions - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == { - "lookback_window": 1, - "state": {"cursor_field": "2022-03-29"}, - "use_global_cursor": False, - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-03-25"}, - }, - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-03-28"}, - }, - { - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-03-29"}, - }, - ], - } From 8c6bd428e31943a0d203be1891f05ade7b3e7312 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 26 Aug 2025 11:02:49 -0400 Subject: [PATCH 39/68] format --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 2ca62ff84..a023c82cb 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3784,7 +3784,9 @@ def _create_message_repository_substream_wrapper( child_state = self._connector_state_manager.get_stream_state( kwargs["stream_name"], None ) # FIXME adding `stream_name` as a parameter means it will be a breaking change. I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag - connector_state_manager = self._instantiate_parent_stream_state_manager(child_state, config, model) + connector_state_manager = self._instantiate_parent_stream_state_manager( + child_state, config, model + ) substream_factory = ModelToComponentFactory( connector_state_manager=connector_state_manager, From 35db9d7a2b80715b655f15c2d66e94fce6573aaf Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 26 Aug 2025 11:02:59 -0400 Subject: [PATCH 40/68] add test for per_partition_request_option_provider --- ...t_per_partition_request_option_provider.py | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 unit_tests/sources/declarative/requesters/request_options/test_per_partition_request_option_provider.py diff --git a/unit_tests/sources/declarative/requesters/request_options/test_per_partition_request_option_provider.py b/unit_tests/sources/declarative/requesters/request_options/test_per_partition_request_option_provider.py new file mode 100644 index 000000000..598aa92ac --- /dev/null +++ b/unit_tests/sources/declarative/requesters/request_options/test_per_partition_request_option_provider.py @@ -0,0 +1,230 @@ +from unittest import TestCase +from unittest.mock import Mock + +from airbyte_cdk.sources.declarative.partition_routers import PartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_options import RequestOptionsProvider +from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import ( + PerPartitionRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.types import StreamSlice + +_STREAM_STATE = {"state_key": "state_value"} +_STREAM_SLICE = StreamSlice(partition={"slice_key": "slice_value"}, cursor_slice={}) +_NEXT_PAGE_TOKEN = {"page_token_key": "page_token_value"} + + +class TestPerPartitionRequestOptionsProvider(TestCase): + def setUp(self): + self._partition_router = Mock(spec=PartitionRouter) + self._cursor_provider = Mock(spec=RequestOptionsProvider) + self._option_provider = PerPartitionRequestOptionsProvider( + self._partition_router, self._cursor_provider + ) + + def test_given_partition_router_value_when_get_request_params_then_return_partition_router_params( + self, + ) -> None: + self._partition_router.get_request_params.return_value = {"key": "value"} + self._cursor_provider.get_request_params.return_value = dict() + + result = self._option_provider.get_request_params( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_cursor_provider_value_when_get_request_params_then_return_partition_router_params( + self, + ) -> None: + self._partition_router.get_request_params.return_value = dict() + self._cursor_provider.get_request_params.return_value = {"key": "value"} + + result = 
self._option_provider.get_request_params( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_both_provide_value_when_get_request_params_then_overwrite_from_cursor( + self, + ) -> None: + self._partition_router.get_request_params.return_value = { + "key_duplicate": "value_partition", + "key_partition": "value_partition", + } + self._cursor_provider.get_request_params.return_value = { + "key_duplicate": "value_cursor", + "key_cursor": "value_cursor", + } + + result = self._option_provider.get_request_params( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == { + "key_duplicate": "value_cursor", + "key_partition": "value_partition", + "key_cursor": "value_cursor", + } + + def test_given_partition_router_value_when_get_request_headers_then_return_partition_router_headers( + self, + ) -> None: + self._partition_router.get_request_headers.return_value = {"key": "value"} + self._cursor_provider.get_request_headers.return_value = dict() + + result = self._option_provider.get_request_headers( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_cursor_provider_value_when_get_request_headers_then_return_cursor_provider_headers( + self, + ) -> None: + self._partition_router.get_request_headers.return_value = dict() + self._cursor_provider.get_request_headers.return_value = {"key": "value"} + + result = self._option_provider.get_request_headers( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_both_provide_value_when_get_request_headers_then_overwrite_from_cursor( + self, + ) -> None: + self._partition_router.get_request_headers.return_value = { + "key_duplicate": "value_partition", + "key_partition": "value_partition", + } + self._cursor_provider.get_request_headers.return_value = { + "key_duplicate": "value_cursor", + "key_cursor": "value_cursor", + } + + result = self._option_provider.get_request_headers( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == { + "key_duplicate": "value_cursor", + "key_partition": "value_partition", + "key_cursor": "value_cursor", + } + + def test_given_partition_router_value_when_get_request_body_data_then_return_partition_router_body_data( + self, + ) -> None: + self._partition_router.get_request_body_data.return_value = {"key": "value"} + self._cursor_provider.get_request_body_data.return_value = dict() + + result = self._option_provider.get_request_body_data( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_cursor_provider_value_when_get_request_body_data_then_return_cursor_provider_body_data( + self, + ) -> None: + self._partition_router.get_request_body_data.return_value = dict() + self._cursor_provider.get_request_body_data.return_value = {"key": "value"} + + result = self._option_provider.get_request_body_data( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_both_provide_value_when_get_request_body_data_then_overwrite_from_cursor( + self, + ) -> None: + 
self._partition_router.get_request_body_data.return_value = { + "key_duplicate": "value_partition", + "key_partition": "value_partition", + } + self._cursor_provider.get_request_body_data.return_value = { + "key_duplicate": "value_cursor", + "key_cursor": "value_cursor", + } + + result = self._option_provider.get_request_body_data( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == { + "key_duplicate": "value_cursor", + "key_partition": "value_partition", + "key_cursor": "value_cursor", + } + + def test_given_partition_router_value_when_get_request_body_json_then_return_partition_router_body_json( + self, + ) -> None: + self._partition_router.get_request_body_json.return_value = {"key": "value"} + self._cursor_provider.get_request_body_json.return_value = dict() + + result = self._option_provider.get_request_body_json( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_cursor_provider_value_when_get_request_body_json_then_return_cursor_provider_body_json( + self, + ) -> None: + self._partition_router.get_request_body_json.return_value = dict() + self._cursor_provider.get_request_body_json.return_value = {"key": "value"} + + result = self._option_provider.get_request_body_json( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == {"key": "value"} + + def test_given_both_provide_value_when_get_request_body_json_then_overwrite_from_cursor( + self, + ) -> None: + self._partition_router.get_request_body_json.return_value = { + "key_duplicate": "value_partition", + "key_partition": "value_partition", + } + self._cursor_provider.get_request_body_json.return_value = { + "key_duplicate": "value_cursor", + "key_cursor": "value_cursor", + } + + result = self._option_provider.get_request_body_json( + stream_state=_STREAM_STATE, + stream_slice=_STREAM_SLICE, + next_page_token=_NEXT_PAGE_TOKEN, + ) + + assert result == { + "key_duplicate": "value_cursor", + "key_partition": "value_partition", + "key_cursor": "value_cursor", + } From f74aa7ee29477cb07892dd570661fa80e8a1a524 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 26 Aug 2025 11:31:47 -0400 Subject: [PATCH 41/68] add comment to explain parameter not being propagated sometimes for concurrent cursors --- .../parsers/model_to_component_factory.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a023c82cb..9c74ef240 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1285,8 +1285,10 @@ def create_concurrent_cursor_from_datetime_based_cursor( f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" ) - # TODO validate and explain why we need to do this... - component_definition["$parameters"] = component_definition.get("parameters", {}) + # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. 
This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: + # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` + # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` + # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. parameters = component_definition.get( "parameters", component_definition.get("$parameters", {}) ) @@ -1603,9 +1605,13 @@ def create_concurrent_cursor_from_perpartition_cursor( interpolated_cursor_field = InterpolatedString.create( datetime_based_cursor_model.cursor_field, + # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: + # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` + # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` + # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. parameters=component_definition.get( "parameters", component_definition.get("$parameters", {}) - ), # FIXME validate and explain why we need to do this + ), ) cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) From 76492e9c8552e9acc6ecab83ff9c463f9e928052 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 26 Aug 2025 14:27:34 -0400 Subject: [PATCH 42/68] remove legacy tests that can't be maintained --- .../test_parent_state_stream.py | 2165 ----------------- 1 file changed, 2165 deletions(-) delete mode 100644 unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py diff --git a/unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py b/unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py deleted file mode 100644 index 4fbbd7355..000000000 --- a/unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py +++ /dev/null @@ -1,2165 +0,0 @@ -# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
- -import copy -from typing import Any, List, Mapping, MutableMapping, Optional, Union -from unittest.mock import MagicMock - -import orjson -import pytest -import requests_mock - -from airbyte_cdk.models import ( - AirbyteMessage, - AirbyteStateBlob, - AirbyteStateMessage, - AirbyteStateType, - AirbyteStream, - AirbyteStreamState, - ConfiguredAirbyteCatalog, - ConfiguredAirbyteStream, - DestinationSyncMode, - StreamDescriptor, - SyncMode, -) -from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource - -SUBSTREAM_MANIFEST: MutableMapping[str, Any] = { - "version": "0.51.42", - "type": "DeclarativeSource", - "check": {"type": "CheckStream", "stream_names": ["post_comment_votes"]}, - "definitions": { - "basic_authenticator": { - "type": "BasicHttpAuthenticator", - "username": "{{ config['credentials']['email'] + '/token' }}", - "password": "{{ config['credentials']['api_token'] }}", - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": { - "type": "DpathExtractor", - "field_path": ["{{ parameters.get('data_path') or parameters['name'] }}"], - }, - "schema_normalization": "Default", - }, - "paginator": { - "type": "DefaultPaginator", - "page_size_option": { - "type": "RequestOption", - "field_name": "per_page", - "inject_into": "request_parameter", - }, - "pagination_strategy": { - "type": "CursorPagination", - "page_size": 100, - "cursor_value": "{{ response.get('next_page', {}) }}", - "stop_condition": "{{ not response.get('next_page', {}) }}", - }, - "page_token_option": {"type": "RequestPath"}, - }, - }, - "cursor_incremental_sync": { - "type": "DatetimeBasedCursor", - "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], - "datetime_format": "%Y-%m-%dT%H:%M:%SZ", - "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", - "start_datetime": {"datetime": "{{ config.get('start_date')}}"}, - "start_time_option": { - "inject_into": "request_parameter", - "field_name": "start_time", - "type": "RequestOption", - }, - }, - "posts_stream": { - "type": "DeclarativeStream", - "name": "posts", - "primary_key": ["id"], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": { - "id": {"type": "integer"}, - "updated_at": {"type": "string", "format": "date-time"}, - "title": {"type": "string"}, - "content": {"type": "string"}, - }, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "path": "/community/posts", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": "#/definitions/retriever/record_selector", - "paginator": "#/definitions/retriever/paginator", - }, - "incremental_sync": "#/definitions/cursor_incremental_sync", - "$parameters": { - "name": "posts", - "path": "community/posts", - "data_path": "posts", - "cursor_field": "updated_at", - "primary_key": "id", - }, - }, - "post_comments_stream": { - "type": "DeclarativeStream", - "name": "post_comments", - "primary_key": ["id"], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": { - "id": {"type": "integer"}, - "updated_at": {"type": 
"string", "format": "date-time"}, - "post_id": {"type": "integer"}, - "comment": {"type": "string"}, - }, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "path": "/community/posts/{{ stream_slice.id }}/comments", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": ["comments"]}, - "record_filter": { - "condition": "{{ record['updated_at'] >= stream_interval.get('start_date', config.get('start_date')) }}" - }, - }, - "paginator": "#/definitions/retriever/paginator", - "partition_router": { - "type": "SubstreamPartitionRouter", - "parent_stream_configs": [ - { - "stream": "#/definitions/posts_stream", - "parent_key": "id", - "partition_field": "id", - "incremental_dependency": True, - } - ], - }, - }, - "incremental_sync": { - "type": "DatetimeBasedCursor", - "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], - "datetime_format": "%Y-%m-%dT%H:%M:%SZ", - "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", - "start_datetime": {"datetime": "{{ config.get('start_date') }}"}, - }, - "$parameters": { - "name": "post_comments", - "path": "community/posts/{{ stream_slice.id }}/comments", - "data_path": "comments", - "cursor_field": "updated_at", - "primary_key": "id", - }, - }, - "post_comment_votes_stream": { - "type": "DeclarativeStream", - "name": "post_comment_votes", - "primary_key": ["id"], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": { - "id": {"type": "integer"}, - "created_at": {"type": "string", "format": "date-time"}, - "comment_id": {"type": "integer"}, - "vote": {"type": "number"}, - }, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "path": "/community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": "#/definitions/retriever/record_selector", - "paginator": "#/definitions/retriever/paginator", - "partition_router": { - "type": "SubstreamPartitionRouter", - "parent_stream_configs": [ - { - "stream": "#/definitions/post_comments_stream", - "parent_key": "id", - "partition_field": "id", - "incremental_dependency": True, - } - ], - }, - }, - "incremental_sync": "#/definitions/cursor_incremental_sync", - "$parameters": { - "name": "post_comment_votes", - "path": "community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", - "data_path": "votes", - "cursor_field": "created_at", - "primary_key": "id", - }, - }, - }, - "streams": [ - {"$ref": "#/definitions/posts_stream"}, - {"$ref": "#/definitions/post_comments_stream"}, - {"$ref": "#/definitions/post_comment_votes_stream"}, - ], -} - - -def _run_read( - manifest: Mapping[str, Any], - config: Mapping[str, Any], - stream_name: str, - state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, -) -> List[AirbyteMessage]: - source = ManifestDeclarativeSource(source_config=manifest) - catalog = ConfiguredAirbyteCatalog( - streams=[ - ConfiguredAirbyteStream( - stream=AirbyteStream( - name=stream_name, - json_schema={}, - supported_sync_modes=[SyncMode.full_refresh, 
SyncMode.incremental], - ), - sync_mode=SyncMode.incremental, - destination_sync_mode=DestinationSyncMode.append, - ) - ] - ) - logger = MagicMock() - return list(source.read(logger, config, catalog, state)) - - -def run_incremental_parent_state_test( - manifest, mock_requests, expected_records, initial_state, expected_states -): - """ - Run an incremental parent state test for the specified stream. - - This function performs the following steps: - 1. Mocks the API requests as defined in mock_requests. - 2. Executes the read operation using the provided manifest and config. - 3. Asserts that the output records match the expected records. - 4. Collects intermediate states and records, performing additional reads as necessary. - 5. Compares the cumulative records from each state against the expected records. - 6. Asserts that the final state matches one of the expected states for each run. - - Args: - manifest (dict): The manifest configuration for the stream. - mock_requests (list): A list of tuples containing URL and response data for mocking API requests. - expected_records (list): The expected records to compare against the output. - initial_state (list): The initial state to start the read operation. - expected_states (list): A list of expected final states after the read operation. - """ - _stream_name = "post_comment_votes" - config = { - "start_date": "2024-01-01T00:00:01Z", - "credentials": {"email": "email", "api_token": "api_token"}, - } - - with requests_mock.Mocker() as m: - for url, response in mock_requests: - m.get(url, json=response) - - # Run the initial read - output = _run_read(manifest, config, _stream_name, initial_state) - output_data = [message.record.data for message in output if message.record] - - # Assert that output_data equals expected_records - assert output_data == expected_records - - # Collect the intermediate states and records produced before each state - cumulative_records = [] - intermediate_states = [] - final_states = [] # To store the final state after each read - - # Store the final state after the initial read - final_state_initial = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - final_states.append(final_state_initial[-1]) - - for message in output: - if message.type.value == "RECORD": - record_data = message.record.data - cumulative_records.append(record_data) - elif message.type.value == "STATE": - # Record the state and the records produced before this state - state = message.state - records_before_state = cumulative_records.copy() - intermediate_states.append((state, records_before_state)) - - # For each intermediate state, perform another read starting from that state - for state, records_before_state in intermediate_states[:-1]: - output_intermediate = _run_read(manifest, config, _stream_name, [state]) - records_from_state = [ - message.record.data for message in output_intermediate if message.record - ] - - # Combine records produced before the state with records from the new read - cumulative_records_state = records_before_state + records_from_state - - # Duplicates may occur because the state matches the cursor of the last record, causing it to be re-emitted in the next sync. 
- cumulative_records_state_deduped = list( - {orjson.dumps(record): record for record in cumulative_records_state}.values() - ) - - # Compare the cumulative records with the expected records - expected_records_set = list( - {orjson.dumps(record): record for record in expected_records}.values() - ) - assert sorted( - cumulative_records_state_deduped, key=lambda x: orjson.dumps(x) - ) == sorted(expected_records_set, key=lambda x: orjson.dumps(x)), ( - f"Records mismatch with intermediate state {state}. Expected {expected_records}, got {cumulative_records_state_deduped}" - ) - - # Store the final state after each intermediate read - final_state_intermediate = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output_intermediate - if message.state - ] - final_states.append(final_state_intermediate[-1]) - - # Assert that the final state matches the expected state for all runs - for i, final_state in enumerate(final_states): - assert final_state in expected_states, ( - f"Final state mismatch at run {i + 1}. Expected {expected_states}, got {final_state}" - ) - - -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", - [ - ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", - { - "posts": [ - {"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, - {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - { - "votes": [ - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - 
"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", - { - "votes": [ - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"} - ] - }, - ), - # Requests with intermediate states - # Fetch votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-15T00:00:00Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - }, - ), - # Fetch votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-13T00:00:00Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ], - }, - ), - # Fetch votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-15T00:00:00Z", - { - "votes": [], - }, - ), - # Fetch votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-12T00:00:00Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-12T00:00:15Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - ], - # Expected records - [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, - ], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - 
), - stream_state=AirbyteStateBlob( - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": { - "posts": {"updated_at": "2024-01-05T00:00:00Z"} - }, - } - }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-02T00:00:00Z"}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-03T00:00:00Z"}, - }, - ], - } - ), - ), - ) - ], - # Expected state - { - "use_global_cursor": False, - "state": {"created_at": "2024-01-15T00:00:00Z"}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-09T00:00:00Z"}, - }, - ], - } - }, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:15Z"}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-10T00:00:00Z"}, - }, - ], - }, - ), - ], -) -def test_incremental_parent_state( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state -): - additional_expected_state = copy.deepcopy(expected_state) - # State for empty partition (comment 12), when the global cursor is used for intermediate states - empty_state = { - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - } - additional_expected_state["states"].append(empty_state) - run_incremental_parent_state_test( - manifest, - mock_requests, - expected_records, - initial_state, - [expected_state, additional_expected_state], - ) - - -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", - [ - ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "posts": [ - {"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, - {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-02T00:00:00Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-02T00:00:00Z&page=2", - {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 1 - ( - 
"https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-02T00:00:00Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"} - ] - }, - ), - ], - # Expected records - [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, - {"id": 102, "comment_id": 11, "created_at": 
"2024-01-13T00:00:00Z"}, - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, - ], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - ), - stream_state=AirbyteStateBlob({"created_at": "2024-01-02T00:00:00Z"}), - ), - ) - ], - # Expected state - { - "use_global_cursor": False, - "state": {"created_at": "2024-01-15T00:00:00Z"}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-09T00:00:00Z"}, - }, - ], - } - }, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:00Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:15Z"}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-10T00:00:00Z"}, - }, - ], - }, - ), - ], -) -def test_incremental_parent_state_migration( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state -): - """ - Test incremental partition router with parent state migration - """ - _stream_name = "post_comment_votes" - config = { - "start_date": "2024-01-01T00:00:01Z", - "credentials": {"email": "email", "api_token": "api_token"}, - } - - with requests_mock.Mocker() as m: - for url, response in mock_requests: - m.get(url, json=response) - - output = _run_read(manifest, config, _stream_name, initial_state) - output_data = [message.record.data for message in output if message.record] - - assert output_data == expected_records - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == expected_state - - -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", - [ - ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", - { - "posts": [], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - {"posts": []}, - ), - # Fetch the first page of comments for 
post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": []}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": []}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": []}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", - {"votes": []}, - ), - ], - # Expected records - [], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - ), - stream_state=AirbyteStateBlob( - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": { - "posts": {"updated_at": "2024-01-05T00:00:00Z"} - }, - } - }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-02T00:00:00Z"}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-03T00:00:00Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:00Z"}, - "lookback_window": 1, - } - ), - ), - ) - ], - # Expected state - { - "lookback_window": 1, - "use_global_cursor": False, - "state": {"created_at": "2024-01-03T00:00:00Z"}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "parent_state": {"posts": {"updated_at": 
"2024-01-05T00:00:00Z"}}, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - } - }, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-03T00:00:00Z"}, - }, - ], - }, - ), - ], -) -def test_incremental_parent_state_no_slices( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state -): - """ - Test incremental partition router with no parent records - """ - _stream_name = "post_comment_votes" - config = { - "start_date": "2024-01-01T00:00:01Z", - "credentials": {"email": "email", "api_token": "api_token"}, - } - - with requests_mock.Mocker() as m: - for url, response in mock_requests: - m.get(url, json=response) - - output = _run_read(manifest, config, _stream_name, initial_state) - output_data = [message.record.data for message in output if message.record] - - assert output_data == expected_records - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == expected_state - - -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", - [ - ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", - { - "posts": [ - {"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, - {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - 
"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", - {"votes": []}, - ), - ], - # Expected records - [], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - ), - stream_state=AirbyteStateBlob( - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": { - "posts": {"updated_at": "2024-01-05T00:00:00Z"} - }, - } - }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-02T00:00:00Z"}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-03T00:00:00Z"}, - }, - ], - "use_global_cursor": True, - "state": {"created_at": "2024-01-03T00:00:00Z"}, - "lookback_window": 0, - } - ), - ), - ) - ], - # Expected state - { - "lookback_window": 1, - "use_global_cursor": True, - "state": {"created_at": "2024-01-03T00:00:00Z"}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-09T00:00:00Z"}, - }, - ], - } - }, - }, - ), - ], -) -def test_incremental_parent_state_no_records( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state -): - """ - Test incremental partition router with no child records - """ - _stream_name = "post_comment_votes" - config = { - "start_date": "2024-01-01T00:00:01Z", - "credentials": {"email": "email", "api_token": "api_token"}, - } - - with requests_mock.Mocker() as m: - for url, response in mock_requests: - m.get(url, json=response) - - output = 
_run_read(manifest, config, _stream_name, initial_state) - output_data = [message.record.data for message in output if message.record] - - assert output_data == expected_records - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == expected_state - - -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", - [ - ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z", - { - "posts": [ - {"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, - {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z&page=2", - {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", - { - "votes": [ - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - 
"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-01T00:00:01Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", - { - "votes": [ - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"} - ] - }, - ), - ], - # Expected records - [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, - ], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - ), - stream_state=AirbyteStateBlob( - { - # This should not happen since parent state is disabled, but I've added this to validate that and - # incoming parent_state is ignored when the parent stream's incremental_dependency is disabled - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": { - "posts": {"updated_at": "2024-01-05T00:00:00Z"} - }, - } - }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-02T00:00:00Z"}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": "2024-01-03T00:00:00Z"}, - }, - ], - } - ), - ), - ) - ], - # Expected state - { - "use_global_cursor": False, - "state": {"created_at": "2024-01-15T00:00:00Z"}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:15Z"}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-10T00:00:00Z"}, - }, - ], - }, - ), - ], -) -def test_incremental_parent_state_no_incremental_dependency( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state -): - """ - This is a pretty complicated test that syncs a low-code connector stream with three levels of substreams - - posts: (ids: 1, 2, 3) - - post comments: 
(parent post 1 with ids: 9, 10, 11, 12; parent post 2 with ids: 20, 21; parent post 3 with id: 30) - - post comment votes: (parent comment 10 with ids: 100, 101; parent comment 11 with id: 102; - parent comment 20 with id: 200; parent comment 21 with id: 201, parent comment 30 with id: 300) - - By setting incremental_dependency to false, parent streams will not use the incoming state and will not update state. - The post_comment_votes substream is incremental and will emit state messages We verify this by ensuring that mocked - parent stream requests use the incoming config as query parameters and the substream state messages does not - contain parent stream state. - """ - - _stream_name = "post_comment_votes" - config = { - "start_date": "2024-01-01T00:00:01Z", - "credentials": {"email": "email", "api_token": "api_token"}, - } - - # Disable incremental_dependency - manifest["definitions"]["post_comments_stream"]["retriever"]["partition_router"][ - "parent_stream_configs" - ][0]["incremental_dependency"] = False - manifest["definitions"]["post_comment_votes_stream"]["retriever"]["partition_router"][ - "parent_stream_configs" - ][0]["incremental_dependency"] = False - - with requests_mock.Mocker() as m: - for url, response in mock_requests: - m.get(url, json=response) - - output = _run_read(manifest, config, _stream_name, initial_state) - output_data = [message.record.data for message in output if message.record] - - assert output_data == expected_records - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == expected_state - - -SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR: MutableMapping[str, Any] = { - "version": "0.51.42", - "type": "DeclarativeSource", - "check": {"type": "CheckStream", "stream_names": ["post_comment_votes"]}, - "definitions": { - "basic_authenticator": { - "type": "BasicHttpAuthenticator", - "username": "{{ config['credentials']['email'] + '/token' }}", - "password": "{{ config['credentials']['api_token'] }}", - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": { - "type": "DpathExtractor", - "field_path": ["{{ parameters.get('data_path') or parameters['name'] }}"], - }, - "schema_normalization": "Default", - }, - "paginator": { - "type": "DefaultPaginator", - "page_size_option": { - "type": "RequestOption", - "field_name": "per_page", - "inject_into": "request_parameter", - }, - "pagination_strategy": { - "type": "CursorPagination", - "page_size": 100, - "cursor_value": "{{ response.get('next_page', {}) }}", - "stop_condition": "{{ not response.get('next_page', {}) }}", - }, - "page_token_option": {"type": "RequestPath"}, - }, - }, - "cursor_incremental_sync": { - "type": "DatetimeBasedCursor", - "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], - "datetime_format": "%Y-%m-%dT%H:%M:%SZ", - "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", - "start_datetime": {"datetime": "{{ config.get('start_date')}}"}, - "start_time_option": { - "inject_into": "request_parameter", - "field_name": "start_time", - "type": "RequestOption", - }, - }, - "posts_stream": { - "type": "DeclarativeStream", - "name": "posts", - "primary_key": ["id"], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - 
"$schema": "http://json-schema.org/schema#", - "properties": { - "id": {"type": "integer"}, - "updated_at": {"type": "string", "format": "date-time"}, - "title": {"type": "string"}, - "content": {"type": "string"}, - }, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "path": "/community/posts", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": "#/definitions/retriever/record_selector", - "paginator": "#/definitions/retriever/paginator", - }, - "incremental_sync": "#/definitions/cursor_incremental_sync", - "$parameters": { - "name": "posts", - "path": "community/posts", - "data_path": "posts", - "cursor_field": "updated_at", - "primary_key": "id", - }, - }, - "post_comments_stream": { - "type": "DeclarativeStream", - "name": "post_comments", - "primary_key": ["id"], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": { - "id": {"type": "integer"}, - "updated_at": {"type": "string", "format": "date-time"}, - "post_id": {"type": "integer"}, - "comment": {"type": "string"}, - }, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "path": "/community/posts/{{ stream_slice.id }}/comments", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": ["comments"]}, - "record_filter": { - "condition": "{{ record['updated_at'] >= stream_interval.get('start_date', config.get('start_date')) }}" - }, - }, - "paginator": "#/definitions/retriever/paginator", - "partition_router": { - "type": "SubstreamPartitionRouter", - "parent_stream_configs": [ - { - "stream": "#/definitions/posts_stream", - "parent_key": "id", - "partition_field": "id", - "incremental_dependency": True, - } - ], - }, - }, - "incremental_sync": { - "type": "DatetimeBasedCursor", - "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], - "datetime_format": "%Y-%m-%dT%H:%M:%SZ", - "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", - "start_datetime": {"datetime": "{{ config.get('start_date') }}"}, - }, - "$parameters": { - "name": "post_comments", - "path": "community/posts/{{ stream_slice.id }}/comments", - "data_path": "comments", - "cursor_field": "updated_at", - "primary_key": "id", - }, - }, - "post_comment_votes_stream": { - "type": "DeclarativeStream", - "name": "post_comment_votes", - "primary_key": ["id"], - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": { - "$schema": "http://json-schema.org/schema#", - "properties": { - "id": {"type": "integer"}, - "created_at": {"type": "string", "format": "date-time"}, - "comment_id": {"type": "integer"}, - "vote": {"type": "number"}, - }, - "type": "object", - }, - }, - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://api.example.com", - "path": "/community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", - "http_method": "GET", - "authenticator": "#/definitions/basic_authenticator", - }, - "record_selector": "#/definitions/retriever/record_selector", - "paginator": "#/definitions/retriever/paginator", - "partition_router": { - "type": "SubstreamPartitionRouter", 
- "parent_stream_configs": [ - { - "stream": "#/definitions/post_comments_stream", - "parent_key": "id", - "partition_field": "id", - "incremental_dependency": True, - } - ], - }, - }, - "incremental_sync": { - "type": "DatetimeBasedCursor", - "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], - "datetime_format": "%Y-%m-%dT%H:%M:%SZ", - "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", - "start_datetime": {"datetime": "{{ config.get('start_date')}}"}, - "start_time_option": { - "inject_into": "request_parameter", - "field_name": "start_time", - "type": "RequestOption", - }, - "global_substream_cursor": True, - }, - "$parameters": { - "name": "post_comment_votes", - "path": "community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", - "data_path": "votes", - "cursor_field": "created_at", - "primary_key": "id", - }, - }, - }, - "streams": [ - {"$ref": "#/definitions/posts_stream"}, - {"$ref": "#/definitions/post_comments_stream"}, - {"$ref": "#/definitions/post_comment_votes_stream"}, - ], -} -SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR_NO_DEPENDENCY = copy.deepcopy( - SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR -) -SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR_NO_DEPENDENCY["definitions"]["post_comment_votes_stream"][ - "retriever" -]["partition_router"]["parent_stream_configs"][0]["incremental_dependency"] = False - - -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", - [ - ( - "test_global_substream_cursor", - SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", - { - "posts": [ - {"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, - {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", - {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:01Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:01Z", - { - "votes": [ - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"} - ] - }, - ), - # Fetch the first page 
of votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", - { - "votes": [ - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"} - ] - }, - ), - # Requests with intermediate states - # Fetch votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-14T23:59:59Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - }, - ), - # Fetch votes for comment 11 of post 1 - ( - "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-14T23:59:59Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ], - }, - ), - # Fetch votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-14T23:59:59Z", - { - "votes": [], - }, - ), - # Fetch votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-14T23:59:59Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-14T23:59:59Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - ], - # Expected records - [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, - {"id": 201, 
"comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, - ], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - ), - stream_state=AirbyteStateBlob( - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": { - "posts": {"updated_at": "2024-01-05T00:00:00Z"} - }, - } - }, - "state": {"created_at": "2024-01-04T02:03:04Z"}, - "lookback_window": 93784, - } - ), - ), - ) - ], - # Expected state - { - "state": {"created_at": "2024-01-15T00:00:00Z"}, - "lookback_window": 1, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-09T00:00:00Z"}, - }, - ], - } - }, - }, - ), - ( - "test_global_substream_cursor_no_dependency", - SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR_NO_DEPENDENCY, - [ - # Fetch the first page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z", - { - "posts": [ - {"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, - {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z&page=2", - }, - ), - # Fetch the second page of posts - ( - "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z&page=2", - {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:00Z", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - 
"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"} - ] - }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-03T00:00:00Z", - { - "votes": [ - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"} - ] - }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", - { - "votes": [ - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"} - ] - }, - ), - ], - # Expected records - [ - {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, - {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, - {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, - {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, - {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, - {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, - ], - # Initial state - [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor( - name="post_comment_votes", namespace=None - ), - stream_state=AirbyteStateBlob( - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": { - "posts": {"updated_at": "2024-01-05T00:00:00Z"} - }, - } - }, - "state": {"created_at": "2024-01-04T02:03:04Z"}, - "lookback_window": 93784, - } - ), - ), - ) - ], - # Expected state - {"state": {"created_at": "2024-01-15T00:00:00Z"}, "lookback_window": 1}, - ), - ], -) -def test_incremental_global_parent_state( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state -): - run_incremental_parent_state_test( - manifest, mock_requests, expected_records, initial_state, [expected_state] - ) From 2135f188350abb964d178bdfce60de86c1655654 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 26 Aug 2025 15:06:59 -0400 Subject: [PATCH 43/68] re-add test --- .../test_per_partition_cursor_integration.py | 763 
++++++++++++++++++ 1 file changed, 763 insertions(+) create mode 100644 unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py new file mode 100644 index 000000000..46b726758 --- /dev/null +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -0,0 +1,763 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from unittest.mock import MagicMock, patch + +import orjson + +from airbyte_cdk.models import ( + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + StreamDescriptor, + SyncMode, +) +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import ( + PerPartitionCursor, + StreamSlice, +) +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.types import Record + +CURSOR_FIELD = "cursor_field" +SYNC_MODE = SyncMode.incremental + + +class ManifestBuilder: + def __init__(self): + self._incremental_sync = {} + self._partition_router = {} + self._substream_partition_router = {} + + def with_list_partition_router(self, stream_name, cursor_field, partitions): + self._partition_router[stream_name] = { + "type": "ListPartitionRouter", + "cursor_field": cursor_field, + "values": partitions, + } + return self + + def with_substream_partition_router(self, stream_name): + self._substream_partition_router[stream_name] = { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "type": "ParentStreamConfig", + "stream": "#/definitions/Rates", + "parent_key": "id", + "partition_field": "parent_id", + } + ], + } + return self + + def with_incremental_sync( + self, + stream_name, + start_datetime, + end_datetime, + datetime_format, + cursor_field, + step, + cursor_granularity, + ): + self._incremental_sync[stream_name] = { + "type": "DatetimeBasedCursor", + "start_datetime": start_datetime, + "end_datetime": end_datetime, + "datetime_format": datetime_format, + "cursor_field": cursor_field, + "step": step, + "cursor_granularity": cursor_granularity, + } + return self + + def build(self): + manifest = { + "version": "0.34.2", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "definitions": { + "AnotherStream": { + "type": "DeclarativeStream", + "name": "AnotherStream", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {"id": {"type": "string"}}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + }, + "Rates": { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {}, + "type": "object", + }, + }, + "retriever": { + "type": 
"SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + }, + }, + "streams": [{"$ref": "#/definitions/Rates"}, {"$ref": "#/definitions/AnotherStream"}], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [], + "properties": {}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, + } + for stream_name, incremental_sync_definition in self._incremental_sync.items(): + manifest["definitions"][stream_name]["incremental_sync"] = incremental_sync_definition + for stream_name, partition_router_definition in self._partition_router.items(): + manifest["definitions"][stream_name]["retriever"]["partition_router"] = ( + partition_router_definition + ) + for stream_name, partition_router_definition in self._substream_partition_router.items(): + manifest["definitions"][stream_name]["retriever"]["partition_router"] = ( + partition_router_definition + ) + return manifest + + +def test_given_state_for_only_some_partition_when_stream_slices_then_create_slices_using_state_or_start_from_start_datetime(): + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + stream_instance = source.streams({})[0] + stream_instance.state = { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-02-01"}, + } + ] + } + + slices = stream_instance.stream_slices( + sync_mode=SYNC_MODE, + stream_state={}, + ) + + assert list(slices) == [ + {"partition_field": "1", "start_time": "2022-02-01", "end_time": "2022-02-28"}, + {"partition_field": "2", "start_time": "2022-01-01", "end_time": "2022-01-31"}, + {"partition_field": "2", "start_time": "2022-02-01", "end_time": "2022-02-28"}, + ] + + +def test_given_record_for_partition_when_read_then_update_state(): + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + stream_instance = source.streams({})[0] + list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + + stream_slice = StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ) + with patch.object( + SimpleRetriever, + "_read_pages", + side_effect=[ + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_slice)] + ], + ): + list( + stream_instance.read_records( + sync_mode=SYNC_MODE, + stream_slice=stream_slice, + stream_state={"states": []}, + cursor_field=CURSOR_FIELD, + ) + ) + + assert stream_instance.state == { + "state": {}, + "use_global_cursor": False, + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-01-15"}, + } + ], + } + + +def test_substream_without_input_state(): + 
test_source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_substream_partition_router("AnotherStream") + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .with_incremental_sync( + "AnotherStream", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + stream_instance = test_source.streams({})[1] + + parent_stream_slice = StreamSlice( + partition={}, cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"} + ) + + # This mocks the resulting records of the Rates stream which acts as the parent stream of the SubstreamPartitionRouter being tested + with patch.object( + SimpleRetriever, + "_read_pages", + side_effect=[ + [Record({"id": "1", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], + [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], + ], + ): + slices = list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + assert list(slices) == [ + StreamSlice( + partition={ + "parent_id": "1", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ), + StreamSlice( + partition={ + "parent_id": "1", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + StreamSlice( + partition={ + "parent_id": "2", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ), + StreamSlice( + partition={ + "parent_id": "2", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + ] + + +def test_partition_limitation(caplog): + """ + Test that when the number of partitions exceeds the maximum allowed limit in PerPartitionCursor, + the oldest partitions are dropped, and the state is updated accordingly. + + In this test, we set the maximum number of partitions to 2 and provide 3 partitions. + We verify that the state only retains information for the two most recent partitions. 
+ """ + stream_name = "Rates" + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router( + stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] + ) + .with_incremental_sync( + stream_name=stream_name, + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + partition_slices = [ + StreamSlice(partition={"partition_field": "1"}, cursor_slice={}), + StreamSlice(partition={"partition_field": "2"}, cursor_slice={}), + StreamSlice(partition={"partition_field": "3"}, cursor_slice={}), + ] + + records_list = [ + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, + associated_slice=partition_slices[0], + stream_name=stream_name, + ), + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, + associated_slice=partition_slices[0], + stream_name=stream_name, + ), + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, + associated_slice=partition_slices[0], + stream_name=stream_name, + ) + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, + associated_slice=partition_slices[1], + stream_name=stream_name, + ) + ], + [], + [], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, + associated_slice=partition_slices[2], + stream_name=stream_name, + ) + ], + ] + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream( + name="Rates", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + ), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + initial_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-01-01"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-02"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-01-03"}, + }, + ] + } + ), + ), + ) + ] + logger = MagicMock() + + # Use caplog to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + output = list(source.read(logger, {}, catalog, initial_state)) + + # Check if the warning was logged + logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] + warning_message = 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.' 
+ assert warning_message in logged_messages + + final_state = [ + orjson.loads(orjson.dumps(message.state.stream.stream_state)) + for message in output + if message.state + ] + assert final_state[-1] == { + "lookback_window": 1, + "state": {"cursor_field": "2022-02-17"}, + "use_global_cursor": False, + "states": [ + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-16"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-02-17"}, + }, + ], + } + + +def test_perpartition_with_fallback(caplog): + """ + Test that when the number of partitions exceeds the limit in PerPartitionCursor, + the cursor falls back to using the global cursor for state management. + + This test also checks that the appropriate warning logs are emitted when the partition limit is exceeded. + """ + stream_name = "Rates" + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3", "4", "5", "6"]) + .with_incremental_sync( + stream_name=stream_name, + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + partition_slices = [ + StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 7) + ] + + records_list = [ + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, + associated_slice=partition_slices[0], + stream_name=stream_name, + ), + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, + associated_slice=partition_slices[0], + stream_name=stream_name, + ), + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, + associated_slice=partition_slices[0], + stream_name=stream_name, + ) + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, + associated_slice=partition_slices[1], + stream_name=stream_name, + ) + ], + [], + [], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, + associated_slice=partition_slices[2], + stream_name=stream_name, + ) + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, + associated_slice=partition_slices[3], + stream_name=stream_name, + ) + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, + associated_slice=partition_slices[3], + stream_name=stream_name, + ) + ], + [], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, + associated_slice=partition_slices[4], + stream_name=stream_name, + ) + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-13"}, + associated_slice=partition_slices[3], + stream_name=stream_name, + ) + ], + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, + associated_slice=partition_slices[3], + stream_name=stream_name, + ) + ], + ] + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream( + name=stream_name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + ), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + initial_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=stream_name, namespace=None), + 
stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-01-01"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-02"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-01-03"}, + }, + ] + } + ), + ), + ) + ] + logger = MagicMock() + + # Use caplog to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + output = list(source.read(logger, {}, catalog, initial_state)) + + # Check if the warnings were logged + expected_warning_messages = [ + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.', + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"2"}. Over limit: 2.', + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"3"}. Over limit: 3.', + ] + + logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] + + for expected_message in expected_warning_messages: + assert expected_message in logged_messages + + # Proceed with existing assertions + final_state = [ + orjson.loads(orjson.dumps(message.state.stream.stream_state)) + for message in output + if message.state + ] + assert final_state[-1] == { + "use_global_cursor": True, + "state": {"cursor_field": "2022-02-19"}, + "lookback_window": 1, + } + + +def test_per_partition_cursor_within_limit(caplog): + """ + Test that the PerPartitionCursor correctly updates the state for each partition + when the number of partitions is within the allowed limit. + + This test also checks that no warning logs are emitted when the partition limit is not exceeded. 
+ """ + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-03-31", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + partition_slices = [ + StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 4) + ] + + records_list = [ + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, partition_slices[0] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, partition_slices[0] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, partition_slices[0] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[1] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, partition_slices[1] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, partition_slices[1] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, partition_slices[2] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, partition_slices[2] + ) + ], + [ + Record( + {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, partition_slices[2] + ) + ], + ] + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream( + name="Rates", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + ), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + initial_state = {} + logger = MagicMock() + + # Use caplog to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): + output = list(source.read(logger, {}, catalog, initial_state)) + + # Since the partition limit is not exceeded, we expect no warnings + logged_warnings = [record.message for record in caplog.records if record.levelname == "WARNING"] + assert len(logged_warnings) == 0 + + # Proceed with existing assertions + final_state = [ + orjson.loads(orjson.dumps(message.state.stream.stream_state)) + for message in output + if message.state + ] + assert final_state[-1] == { + "lookback_window": 1, + "state": {"cursor_field": "2022-03-29"}, + "use_global_cursor": False, + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-03-25"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-03-28"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-03-29"}, + }, + ], + } From 485502e96f9475c54dca8e44c3ddebe89623c024 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Tue, 26 Aug 2025 15:24:46 -0700 Subject: [PATCH 44/68] add slice_limit parameter to StreamSlicerPartitionGenerator so connector builder slice limits are enforced --- .../sources/declarative/parsers/model_to_component_factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 10e0cb026..2a61a7f80 100644 --- 
a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2110,6 +2110,7 @@ def create_declarative_stream( self._message_repository, ), stream_slicer, + slice_limit=self._limit_slices_fetched, ), name=stream_name, json_schema=schema_loader.get_json_schema, From 1bd76e8db96a382b05c5eaa837f89c4a2975818b Mon Sep 17 00:00:00 2001 From: brianjlai Date: Tue, 26 Aug 2025 17:26:24 -0700 Subject: [PATCH 45/68] fix 4 of 6 test_per_partition_cursor_integration.py tests --- .../test_parent_state_stream.py | 10 +- .../test_per_partition_cursor_integration.py | 373 +++++++++++++----- 2 files changed, 270 insertions(+), 113 deletions(-) rename unit_tests/{legacy => }/sources/declarative/incremental/test_per_partition_cursor_integration.py (74%) diff --git a/unit_tests/legacy/sources/declarative/partition_routers/test_parent_state_stream.py b/unit_tests/legacy/sources/declarative/partition_routers/test_parent_state_stream.py index c3cab5500..6f37043cb 100644 --- a/unit_tests/legacy/sources/declarative/partition_routers/test_parent_state_stream.py +++ b/unit_tests/legacy/sources/declarative/partition_routers/test_parent_state_stream.py @@ -8,9 +8,6 @@ import pytest import requests_mock -from airbyte_cdk.legacy.sources.declarative.manifest_declarative_source import ( - ManifestDeclarativeSource, -) from airbyte_cdk.models import ( AirbyteMessage, AirbyteStateBlob, @@ -24,6 +21,9 @@ StreamDescriptor, SyncMode, ) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) SUBSTREAM_MANIFEST: MutableMapping[str, Any] = { "version": "0.51.42", @@ -242,7 +242,6 @@ def _run_read( stream_name: str, state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, ) -> List[AirbyteMessage]: - source = ManifestDeclarativeSource(source_config=manifest) catalog = ConfiguredAirbyteCatalog( streams=[ ConfiguredAirbyteStream( @@ -256,6 +255,9 @@ def _run_read( ) ] ) + source = ConcurrentDeclarativeSource( + source_config=manifest, config=config, catalog=catalog, state=state + ) logger = MagicMock() return list(source.read(logger, config, catalog, state)) diff --git a/unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py similarity index 74% rename from unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor_integration.py rename to unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index f3f8dfc72..68104bec4 100644 --- a/unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -7,12 +7,6 @@ import orjson -from airbyte_cdk.legacy.sources.declarative.incremental.per_partition_cursor import ( - PerPartitionCursor, -) -from airbyte_cdk.legacy.sources.declarative.manifest_declarative_source import ( - ManifestDeclarativeSource, -) from airbyte_cdk.models import ( AirbyteStateBlob, AirbyteStateMessage, @@ -25,6 +19,10 @@ StreamDescriptor, SyncMode, ) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) +from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.types import 
Record, StreamSlice @@ -167,7 +165,26 @@ def build(self): def test_given_state_for_only_some_partition_when_stream_slices_then_create_slices_using_state_or_start_from_start_datetime(): - source = ManifestDeclarativeSource( + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="Rates", namespace=None), + stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-02-01"}, + } + ] + } + ), + ), + ) + ] + + source = ConcurrentDeclarativeSource( source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2"]) .with_incremental_sync( @@ -179,32 +196,45 @@ def test_given_state_for_only_some_partition_when_stream_slices_then_create_slic step="P1M", cursor_granularity="P1D", ) - .build() + .build(), + config={}, + catalog=None, + state=state, ) stream_instance = source.streams({})[0] - stream_instance.state = { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-02-01"}, - } - ] - } - slices = stream_instance.stream_slices( - sync_mode=SYNC_MODE, - stream_state={}, - ) + partitions = stream_instance.generate_partitions() + slices = [partition.to_slice() for partition in partitions] - assert list(slices) == [ - {"partition_field": "1", "start_time": "2022-02-01", "end_time": "2022-02-28"}, - {"partition_field": "2", "start_time": "2022-01-01", "end_time": "2022-01-31"}, - {"partition_field": "2", "start_time": "2022-02-01", "end_time": "2022-02-28"}, + assert slices == [ + StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ), + StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), ] -def test_given_record_for_partition_when_read_then_update_state(): - source = ManifestDeclarativeSource( +def test_given_record_for_partition_when_read_then_update_state(caplog): + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream( + name="Rates", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + ), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + source = ConcurrentDeclarativeSource( source_config=ManifestBuilder() .with_list_partition_router("Rates", "partition_field", ["1", "2"]) .with_incremental_sync( @@ -216,45 +246,102 @@ def test_given_record_for_partition_when_read_then_update_state(): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), + config={}, + catalog=catalog, + state=None, ) + logger = MagicMock() + stream_instance = source.streams({})[0] - list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + # list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) - stream_slice = StreamSlice( - partition={"partition_field": "1"}, - cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, - ) - with patch.object( - SimpleRetriever, - "_read_pages", - side_effect=[ - [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_slice)] - ], - ): - list( - stream_instance.read_records( - sync_mode=SYNC_MODE, - stream_slice=stream_slice, - stream_state={"states": []}, - cursor_field=CURSOR_FIELD, + stream_slice = [ 
+ StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ), + StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + ] + + # with patch.object( + # SimpleRetriever, + # "_read_pages", + # side_effect=[ + # [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_slice)] + # ], + # ): + # list( + # stream_instance.read( + # sync_mode=SYNC_MODE, + # stream_slice=stream_slice, + # stream_state={"states": []}, + # cursor_field=CURSOR_FIELD, + # ) + # ) + + records = [ + [ + Record( + data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, + stream_name="Rates", + associated_slice=stream_slice[0], ) - ) + ], + [], + [], + [], + # [Record(data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, stream_name="Rates", associated_slice=stream_slice[1])], + # [Record(data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_name="Rates", associated_slice=stream_slice[1])], + # [Record(data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, stream_name="Rates", associated_slice=stream_slice[1])], + ] - assert stream_instance.state == { - "state": {}, + # Use caplog to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records): + output = list(source.read(logger, {}, catalog, None)) + + # Since the partition limit is not exceeded, we expect no warnings + logged_warnings = [record.message for record in caplog.records if record.levelname == "WARNING"] + assert len(logged_warnings) == 0 + + # Proceed with existing assertions + final_state = [ + orjson.loads(orjson.dumps(message.state.stream.stream_state)) + for message in output + if message.state + ] + + assert final_state[-1] == { + "lookback_window": 1, + "state": {"cursor_field": "2022-01-15"}, "use_global_cursor": False, "states": [ { "partition": {"partition_field": "1"}, "cursor": {CURSOR_FIELD: "2022-01-15"}, - } + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-01"}, + }, ], } def test_substream_without_input_state(): - test_source = ManifestDeclarativeSource( + test_source = ConcurrentDeclarativeSource( source_config=ManifestBuilder() .with_substream_partition_router("AnotherStream") .with_incremental_sync( @@ -275,7 +362,10 @@ def test_substream_without_input_state(): step="P1M", cursor_granularity="P1D", ) - .build() + .build(), + config={}, + catalog=None, + state=None, ) stream_instance = test_source.streams({})[1] @@ -293,7 +383,7 @@ def test_substream_without_input_state(): [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], ], ): - slices = list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + slices = [partition.to_slice() for partition in stream_instance.generate_partitions()] assert list(slices) == [ StreamSlice( partition={ @@ -335,22 +425,6 @@ def test_partition_limitation(caplog): We verify that the state only retains information for the two most recent partitions. 
""" stream_name = "Rates" - source = ManifestDeclarativeSource( - source_config=ManifestBuilder() - .with_list_partition_router( - stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] - ) - .with_incremental_sync( - stream_name=stream_name, - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build() - ) partition_slices = [ StreamSlice(partition={"partition_field": "1"}, cursor_slice={}), @@ -435,10 +509,30 @@ def test_partition_limitation(caplog): ] logger = MagicMock() + source = ConcurrentDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router( + stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] + ) + .with_incremental_sync( + stream_name=stream_name, + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build(), + config={}, + catalog=catalog, + state=initial_state, + ) + # Use caplog to capture logs with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): output = list(source.read(logger, {}, catalog, initial_state)) # Check if the warning was logged @@ -476,20 +570,6 @@ def test_perpartition_with_fallback(caplog): This test also checks that the appropriate warning logs are emitted when the partition limit is exceeded. """ stream_name = "Rates" - source = ManifestDeclarativeSource( - source_config=ManifestBuilder() - .with_list_partition_router("Rates", "partition_field", ["1", "2", "3", "4", "5", "6"]) - .with_incremental_sync( - stream_name=stream_name, - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build() - ) partition_slices = [ StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 7) @@ -608,10 +688,28 @@ def test_perpartition_with_fallback(caplog): ] logger = MagicMock() + source = ConcurrentDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3", "4", "5", "6"]) + .with_incremental_sync( + stream_name=stream_name, + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build(), + config={}, + catalog=catalog, + state=initial_state, + ) + # Use caplog to capture logs with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): output = list(source.read(logger, {}, catalog, initial_state)) # Check if the warnings were logged @@ -646,69 +744,108 @@ def test_per_partition_cursor_within_limit(caplog): This test also checks that no warning logs are emitted when the partition limit is not exceeded. 
""" - source = ManifestDeclarativeSource( - source_config=ManifestBuilder() - .with_list_partition_router("Rates", "partition_field", ["1", "2", "3"]) - .with_incremental_sync( - "Rates", - start_datetime="2022-01-01", - end_datetime="2022-03-31", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build() - ) - partition_slices = [ StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 4) ] + slice_range_2022_01_01_partition_1 = StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ) + slice_range_2022_02_01_partition_1 = StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ) + slice_range_2022_03_01_partition_1 = StreamSlice( + partition={"partition_field": "1"}, + cursor_slice={"start_time": "2022-03-01", "end_time": "2022-03-31"}, + ) + slice_range_2022_01_01_partition_2 = StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ) + slice_range_2022_02_01_partition_2 = StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ) + slice_range_2022_03_01_partition_2 = StreamSlice( + partition={"partition_field": "2"}, + cursor_slice={"start_time": "2022-03-01", "end_time": "2022-03-31"}, + ) + slice_range_2022_01_01_partition_3 = StreamSlice( + partition={"partition_field": "3"}, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ) + slice_range_2022_02_01_partition_3 = StreamSlice( + partition={"partition_field": "3"}, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ) + slice_range_2022_03_01_partition_3 = StreamSlice( + partition={"partition_field": "3"}, + cursor_slice={"start_time": "2022-03-01", "end_time": "2022-03-31"}, + ) records_list = [ [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, + stream_name="Rates", + associated_slice=slice_range_2022_01_01_partition_1, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, + stream_name="Rates", + associated_slice=slice_range_2022_02_01_partition_1, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, partition_slices[0] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, + stream_name="Rates", + associated_slice=slice_range_2022_03_01_partition_1, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, + stream_name="Rates", + associated_slice=slice_range_2022_01_01_partition_2, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, + stream_name="Rates", + associated_slice=slice_range_2022_02_01_partition_2, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, partition_slices[1] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, + stream_name="Rates", + associated_slice=slice_range_2022_03_01_partition_2, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, partition_slices[2] + 
{"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, + stream_name="Rates", + associated_slice=slice_range_2022_01_01_partition_3, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, partition_slices[2] + {"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, + stream_name="Rates", + associated_slice=slice_range_2022_02_01_partition_3, ) ], [ Record( - {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, partition_slices[2] + {"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, + stream_name="Rates", + associated_slice=slice_range_2022_03_01_partition_3, ) ], ] @@ -727,10 +864,28 @@ def test_per_partition_cursor_within_limit(caplog): initial_state = {} logger = MagicMock() + source = ConcurrentDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-03-31", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build(), + config={}, + catalog=catalog, + state=initial_state, + ) + # Use caplog to capture logs with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): + with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): output = list(source.read(logger, {}, catalog, initial_state)) # Since the partition limit is not exceeded, we expect no warnings From 9158b676ff2bb99c58f89e6556eb7519876c2ffa Mon Sep 17 00:00:00 2001 From: brianjlai Date: Tue, 26 Aug 2025 18:21:04 -0700 Subject: [PATCH 46/68] fix 2 more tests --- .../declarative/test_manifest_declarative_source.py | 7 ++++--- .../declarative/test_concurrent_declarative_source.py | 8 +++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/unit_tests/legacy/sources/declarative/test_manifest_declarative_source.py b/unit_tests/legacy/sources/declarative/test_manifest_declarative_source.py index ad6735201..d6e8c22fa 100644 --- a/unit_tests/legacy/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/legacy/sources/declarative/test_manifest_declarative_source.py @@ -2195,10 +2195,11 @@ def test_only_parent_streams_use_cache(): assert not stream_1_retriever.requester.use_cache # Parent stream created for substream - assert stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name == "applications" - assert stream_1_retriever.stream_slicer.parent_stream_configs[ + stream_slicer = streams[1]._stream_partition_generator._stream_slicer + assert stream_slicer.parent_stream_configs[0].stream.name == "applications" + assert stream_slicer.parent_stream_configs[ 0 - ].stream.retriever.requester.use_cache + ].stream._stream_partition_generator._partition_factory._retriever.requester.use_cache # Main stream without caching assert streams[2].name == "jobs" diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 9f96ee50f..77a17d7fb 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -4042,14 +4042,12 @@ def test_only_parent_streams_use_cache(): # Parent stream created for substream assert ( - 
stream_1._stream_partition_generator._partition_factory._retriever.stream_slicer.parent_stream_configs[ - 0 - ].stream.name + stream_1._stream_partition_generator._stream_slicer.parent_stream_configs[0].stream.name == "applications" ) - assert stream_1._stream_partition_generator._partition_factory._retriever.stream_slicer.parent_stream_configs[ + assert stream_1._stream_partition_generator._stream_slicer.parent_stream_configs[ 0 - ].stream.retriever.requester.use_cache + ].stream._stream_partition_generator._partition_factory._retriever.requester.use_cache # Main stream without caching stream_2 = streams[2] From bb424b881ce7beb0a68e86db828e89298554f46a Mon Sep 17 00:00:00 2001 From: brianjlai Date: Tue, 26 Aug 2025 19:55:30 -0700 Subject: [PATCH 47/68] whatever --- .../test_per_partition_cursor_integration.py | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 68104bec4..61339b5fd 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -710,19 +710,34 @@ def test_perpartition_with_fallback(caplog): with caplog.at_level(logging.WARNING, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): - output = list(source.read(logger, {}, catalog, initial_state)) + with patch.object(ConcurrentPerPartitionCursor, "SWITCH_TO_GLOBAL_LIMIT", 1): + output = list(source.read(logger, {}, catalog, initial_state)) # Check if the warnings were logged - expected_warning_messages = [ - 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.', - 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"2"}. Over limit: 2.', - 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"3"}. Over limit: 3.', - ] - logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] + warning_message = ( + "The maximum number of partitions has been reached. 
Dropping the oldest partition:" + ) + expected_warning_over_limit_messages = [ + "Over limit: 1", + "Over limit: 2", + "Over limit: 3", + ] - for expected_message in expected_warning_messages: - assert expected_message in logged_messages + for logged_message in logged_messages: + assert warning_message in logged_message + + for expected_warning_over_limit_message in expected_warning_over_limit_messages: + assert ( + len( + [ + logged_message + for logged_message in logged_messages + if expected_warning_over_limit_message in logged_message + ] + ) + > 0 + ) # Proceed with existing assertions final_state = [ From 6c8771c5b68d8091f2d82400700da638e4689a16 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Tue, 26 Aug 2025 21:03:15 -0700 Subject: [PATCH 48/68] clean tests --- .../test_per_partition_cursor_integration.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 61339b5fd..6a28edcc1 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -253,9 +253,6 @@ def test_given_record_for_partition_when_read_then_update_state(caplog): ) logger = MagicMock() - stream_instance = source.streams({})[0] - # list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) - stream_slice = [ StreamSlice( partition={"partition_field": "1"}, @@ -275,22 +272,6 @@ def test_given_record_for_partition_when_read_then_update_state(caplog): ), ] - # with patch.object( - # SimpleRetriever, - # "_read_pages", - # side_effect=[ - # [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_slice)] - # ], - # ): - # list( - # stream_instance.read( - # sync_mode=SYNC_MODE, - # stream_slice=stream_slice, - # stream_state={"states": []}, - # cursor_field=CURSOR_FIELD, - # ) - # ) - records = [ [ Record( @@ -302,9 +283,6 @@ def test_given_record_for_partition_when_read_then_update_state(caplog): [], [], [], - # [Record(data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, stream_name="Rates", associated_slice=stream_slice[1])], - # [Record(data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_name="Rates", associated_slice=stream_slice[1])], - # [Record(data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, stream_name="Rates", associated_slice=stream_slice[1])], ] # Use caplog to capture logs From f9eb050b15699b7e24935672809f2847d2dd6682 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 27 Aug 2025 09:42:58 -0400 Subject: [PATCH 49/68] fix remaining tests in test_per_partition_cursor_integration --- .../concurrent_partition_cursor.py | 2 +- .../test_per_partition_cursor_integration.py | 68 ++++--------------- 2 files changed, 16 insertions(+), 54 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 4925b5138..6a982475f 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -55,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor): Manages state per partition when a stream has many partitions, preventing data loss or duplication. 
Attributes: - DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). + DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). If not, we could have partitions that have been generated and submitted to the ThreadPool but got deleted from the ConcurrentPerPartitionCursor and when closing them, it will generate KeyError. - **Partition Limitation Logic** Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached. diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 6a28edcc1..07ddc7666 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -394,13 +394,13 @@ def test_substream_without_input_state(): ] -def test_partition_limitation(caplog): +def test_switch_to_global_limit(caplog): """ - Test that when the number of partitions exceeds the maximum allowed limit in PerPartitionCursor, - the oldest partitions are dropped, and the state is updated accordingly. + Test that when the number of partitions exceeds the limit to switch to global state. - In this test, we set the maximum number of partitions to 2 and provide 3 partitions. - We verify that the state only retains information for the two most recent partitions. + In this test, we set the maximum number of partitions to 1 (not 2 because we evaluate this before generating a + partition and the limit is not inclusive) and provide 3 partitions. + We verify that the state switch to global. """ stream_name = "Rates" @@ -508,15 +508,15 @@ def test_partition_limitation(caplog): ) # Use caplog to capture logs - with caplog.at_level(logging.WARNING, logger="airbyte"): + with caplog.at_level(logging.INFO, logger="airbyte"): with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + with patch.object(ConcurrentPerPartitionCursor, "SWITCH_TO_GLOBAL_LIMIT", 1): output = list(source.read(logger, {}, catalog, initial_state)) # Check if the warning was logged - logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] - warning_message = 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.' 
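# --- Editor's note: illustrative sketch, not part of the patch series ---------------
# The behaviour this revised test exercises: once the number of distinct partitions
# exceeds ConcurrentPerPartitionCursor.SWITCH_TO_GLOBAL_LIMIT, per-partition state is
# abandoned and a single global cursor is emitted ("use_global_cursor": True). The
# class below is a simplified, hypothetical model of that decision, not the
# airbyte_cdk implementation (lookback_window handling is omitted); ISO 8601 strings
# are compared lexicographically, which preserves chronological order.
import logging

sketch_logger = logging.getLogger("airbyte")


class SimplifiedPartitionCursor:
    SWITCH_TO_GLOBAL_LIMIT = 10_000

    def __init__(self) -> None:
        self._per_partition_state: dict = {}
        self._global_cursor: str = ""
        self._use_global_cursor = False

    def observe(self, partition_key: str, cursor_value: str) -> None:
        if (
            not self._use_global_cursor
            and partition_key not in self._per_partition_state
            and len(self._per_partition_state) >= self.SWITCH_TO_GLOBAL_LIMIT
        ):
            sketch_logger.info(
                "Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of %s; switching to a global cursor",
                self.SWITCH_TO_GLOBAL_LIMIT,
            )
            self._use_global_cursor = True
        if not self._use_global_cursor:
            previous = self._per_partition_state.get(partition_key, cursor_value)
            self._per_partition_state[partition_key] = max(previous, cursor_value)
        self._global_cursor = max(self._global_cursor, cursor_value)

    def state(self) -> dict:
        stream_state = {
            "use_global_cursor": self._use_global_cursor,
            "state": {"cursor_field": self._global_cursor},
        }
        if not self._use_global_cursor:
            stream_state["states"] = [
                {"partition": {"partition_field": key}, "cursor": {"cursor_field": value}}
                for key, value in self._per_partition_state.items()
            ]
        return stream_state


def demo_switch_to_global() -> dict:
    # Mirroring the patched limit in test_switch_to_global_limit: a limit of 1 means the
    # second distinct partition triggers the fallback to the global cursor.
    SimplifiedPartitionCursor.SWITCH_TO_GLOBAL_LIMIT = 1
    cursor = SimplifiedPartitionCursor()
    cursor.observe("1", "2022-01-16")
    cursor.observe("2", "2022-02-17")
    return cursor.state()  # {"use_global_cursor": True, "state": {"cursor_field": "2022-02-17"}}
# -------------------------------------------------------------------------------------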
- assert warning_message in logged_messages + logged_messages = [record.message for record in caplog.records if record.levelname == "INFO"] + warning_message = "Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of" + assert any(map(lambda message: warning_message in message, logged_messages)) final_state = [ orjson.loads(orjson.dumps(message.state.stream.stream_state)) @@ -526,17 +526,7 @@ def test_partition_limitation(caplog): assert final_state[-1] == { "lookback_window": 1, "state": {"cursor_field": "2022-02-17"}, - "use_global_cursor": False, - "states": [ - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-01-16"}, - }, - { - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-02-17"}, - }, - ], + "use_global_cursor": True, } @@ -684,38 +674,10 @@ def test_perpartition_with_fallback(caplog): state=initial_state, ) - # Use caplog to capture logs - with caplog.at_level(logging.WARNING, logger="airbyte"): - with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): - with patch.object(ConcurrentPerPartitionCursor, "SWITCH_TO_GLOBAL_LIMIT", 1): - output = list(source.read(logger, {}, catalog, initial_state)) - - # Check if the warnings were logged - logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] - warning_message = ( - "The maximum number of partitions has been reached. Dropping the oldest partition:" - ) - expected_warning_over_limit_messages = [ - "Over limit: 1", - "Over limit: 2", - "Over limit: 3", - ] - - for logged_message in logged_messages: - assert warning_message in logged_message - - for expected_warning_over_limit_message in expected_warning_over_limit_messages: - assert ( - len( - [ - logged_message - for logged_message in logged_messages - if expected_warning_over_limit_message in logged_message - ] - ) - > 0 - ) + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(ConcurrentPerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + with patch.object(ConcurrentPerPartitionCursor, "SWITCH_TO_GLOBAL_LIMIT", 1): + output = list(source.read(logger, {}, catalog, initial_state)) # Proceed with existing assertions final_state = [ From cf8b084a0912e324f3f0abf30ef623e3b0f7bca0 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 27 Aug 2025 10:26:29 -0400 Subject: [PATCH 50/68] coderabbitai code review --- .../concurrent_partition_cursor.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 6a982475f..25cd0916f 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -557,16 +557,19 @@ def limit_reached(self) -> bool: def get_parent_state( stream_state: Optional[StreamState], parent_stream_name: str ) -> Optional[AirbyteStateMessage]: - return ( - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(parent_stream_name, None), - stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]), - ), - ) - if stream_state and "parent_state" in stream_state - else None + if "parent_state" not in stream_state: + logger.warning(f"Trying to get_parent_state for stream 
`{parent_stream_name}` when there are not parent state in the state") + return None + elif parent_stream_name not in stream_state["parent_state"]: + logger.info(f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}") + return None + + return AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(parent_stream_name, None), + stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]), + ), ) @staticmethod From 52864420d477f6eaf8b9e4cba6b11c29d08d5c82 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 27 Aug 2025 11:28:19 -0400 Subject: [PATCH 51/68] add tests, format and lint --- .../concurrent_partition_cursor.py | 12 +- .../substream_partition_router.py | 2 +- .../per_partition_request_option_provider.py | 2 +- .../test_per_partition_cursor_integration.py | 113 +++++++++++++++++- .../test_grouping_partition_router.py | 2 +- 5 files changed, 121 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 25cd0916f..514bb946b 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -12,10 +12,10 @@ from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional from airbyte_cdk.models import ( - AirbyteStateMessage, AirbyteStateBlob, - AirbyteStreamState, + AirbyteStateMessage, AirbyteStateType, + AirbyteStreamState, StreamDescriptor, ) from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager @@ -558,10 +558,14 @@ def get_parent_state( stream_state: Optional[StreamState], parent_stream_name: str ) -> Optional[AirbyteStateMessage]: if "parent_state" not in stream_state: - logger.warning(f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state") + logger.warning( + f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state" + ) return None elif parent_stream_name not in stream_state["parent_state"]: - logger.info(f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}") + logger.info( + f"Could not find parent state for stream `{parent_stream_name}`. 
On parents available are {list(stream_state['parent_state'].keys())}" + ) return None return AirbyteStateMessage( diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index f74ab5817..bfc97ad3b 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -15,8 +15,8 @@ Mapping, MutableMapping, Optional, - Union, TypeVar, + Union, ) import dpath diff --git a/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py b/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py index 04827b7fe..de8cbe12c 100644 --- a/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +++ b/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py @@ -1,4 +1,4 @@ -from typing import Optional, Mapping, Any, Union +from typing import Any, Mapping, Optional, Union from airbyte_cdk.sources.declarative.partition_routers import PartitionRouter from airbyte_cdk.sources.declarative.requesters.request_options import RequestOptionsProvider diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 07ddc7666..5a3f28b02 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -3,11 +3,15 @@ # import logging +from typing import Iterator, List, Tuple from unittest.mock import MagicMock, patch import orjson +import requests_mock from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, @@ -18,6 +22,7 @@ DestinationSyncMode, StreamDescriptor, SyncMode, + Type, ) from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( ConcurrentDeclarativeSource, @@ -25,6 +30,7 @@ from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.types import Record, StreamSlice +from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder CURSOR_FIELD = "cursor_field" SYNC_MODE = SyncMode.incremental @@ -35,6 +41,7 @@ def __init__(self): self._incremental_sync = {} self._partition_router = {} self._substream_partition_router = {} + self._concurrency_default = None def with_list_partition_router(self, stream_name, cursor_field, partitions): self._partition_router[stream_name] = { @@ -44,7 +51,7 @@ def with_list_partition_router(self, stream_name, cursor_field, partitions): } return self - def with_substream_partition_router(self, stream_name): + def with_substream_partition_router(self, stream_name, incremental_dependency=False): self._substream_partition_router[stream_name] = { "type": "SubstreamPartitionRouter", "parent_stream_configs": [ @@ -53,6 +60,7 @@ def with_substream_partition_router(self, stream_name): "stream": "#/definitions/Rates", "parent_key": "id", "partition_field": "parent_id", + "incremental_dependency": incremental_dependency, } ], } @@ -76,9 +84,23 @@ def with_incremental_sync( "cursor_field": 
cursor_field, "step": step, "cursor_granularity": cursor_granularity, + "start_time_option": { + "type": "RequestOption", + "field_name": "from", + "inject_into": "request_parameter", + }, + "end_time_option": { + "type": "RequestOption", + "field_name": "to", + "inject_into": "request_parameter", + }, } return self + def with_concurrency(self, default: int) -> "ManifestBuilder": + self._concurrency_default = default + return self + def build(self): manifest = { "version": "0.34.2", @@ -102,7 +124,7 @@ def build(self): "requester": { "type": "HttpRequester", "url_base": "https://api.apilayer.com", - "path": "/exchangerates_data/latest", + "path": "/exchangerates_data/parent/{{ stream_partition['parent_id'] }}/child/latest", "http_method": "GET", }, "record_selector": { @@ -128,7 +150,7 @@ def build(self): "requester": { "type": "HttpRequester", "url_base": "https://api.apilayer.com", - "path": "/exchangerates_data/latest", + "path": "/exchangerates_data/parent/latest", "http_method": "GET", }, "record_selector": { @@ -161,6 +183,12 @@ def build(self): manifest["definitions"][stream_name]["retriever"]["partition_router"] = ( partition_router_definition ) + + if self._concurrency_default: + manifest["concurrency_level"] = { + "type": "ConcurrencyLevel", + "default_concurrency": self._concurrency_default, + } return manifest @@ -872,3 +900,82 @@ def test_per_partition_cursor_within_limit(caplog): }, ], } + + +def test_parent_stream_is_updated_after_parent_record_fully_consumed(): + source = ConcurrentDeclarativeSource( + source_config=ManifestBuilder() + .with_substream_partition_router("AnotherStream", incremental_dependency=True) + .with_incremental_sync( + "AnotherStream", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1Y", + cursor_granularity="P1D", + ) + .with_concurrency(1) # so that we know partition 1 gets processed before 2 + .build(), + config={}, + catalog=None, + state=None, + ) + + with requests_mock.Mocker() as m: + # Request for parent stream + m.get( + "https://api.apilayer.com/exchangerates_data/parent/latest?from=2022-01-01&to=2022-02-28", + json=[{"id": "1"}], + ) + + # Requests for child stream + record_from_first_cursor_interval = {"id": "child_1.1"} + m.get( + "https://api.apilayer.com/exchangerates_data/parent/1/child/latest?from=2022-01-01&to=2022-01-31", + json=[record_from_first_cursor_interval], + ) + record_from_second_cursor_interval = {"id": "child_1.2"} + m.get( + "https://api.apilayer.com/exchangerates_data/parent/1/child/latest?from=2022-02-01&to=2022-02-28", + json=[record_from_second_cursor_interval], + ) + + message_iterator = source.read( + MagicMock(), + {}, + CatalogBuilder() + .with_stream(ConfiguredAirbyteStreamBuilder().with_name("AnotherStream")) + .build(), + None, + ) + + records, state = get_records_until_state_message(message_iterator) + + assert len(records) == 1 and records[0].data == record_from_first_cursor_interval + assert "parent_state" not in state.stream.stream_state.__dict__ + + records, state = get_records_until_state_message(message_iterator) + assert "parent_state" in state.stream.stream_state.__dict__ + + +def get_records_until_state_message( + message_iterator: Iterator[AirbyteMessage], +) -> Tuple[List[AirbyteRecordMessage], AirbyteStateMessage]: + 
records = [] + for message in message_iterator: + if message.type == Type.RECORD: + records.append(message.record) + elif message.type == Type.STATE: + return records, message.state + + raise ValueError("No state message encountered") diff --git a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py index 3f18439d2..a7704d9c4 100644 --- a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py @@ -14,7 +14,7 @@ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( ParentStreamConfig, ) -from airbyte_cdk.sources.types import StreamSlice, Record +from airbyte_cdk.sources.types import Record, StreamSlice from unit_tests.sources.declarative.partition_routers.test_substream_partition_router import ( MockStream, parent_slices, From eb635b1e77a929b4c47f99680406789a54e0675f Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 27 Aug 2025 15:59:06 -0400 Subject: [PATCH 52/68] fix linting but break test_per_partition_cursor.py for now --- .../concurrent_partition_cursor.py | 6 +- .../parsers/model_to_component_factory.py | 32 +++-- .../cartesian_product_stream_slicer.py | 6 - .../grouping_partition_router.py | 5 - .../list_partition_router.py | 6 - .../partition_routers/partition_router.py | 24 +--- .../single_partition_router.py | 6 - .../substream_partition_router.py | 59 +------- .../per_partition_request_option_provider.py | 32 +++-- .../test_grouping_partition_router.py | 83 ----------- .../test_substream_partition_router.py | 135 ------------------ 11 files changed, 52 insertions(+), 342 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 514bb946b..80843b518 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -432,9 +432,6 @@ def _set_initial_state(self, stream_state: StreamState) -> None: if stream_state.get("parent_state"): self._parent_state = stream_state["parent_state"] - # Set parent state for partition routers based on parent streams - self._partition_router.set_initial_state(stream_state) - def _set_global_state(self, stream_state: Mapping[str, Any]) -> None: """ Initializes the global cursor state from the provided stream state. 
@@ -557,6 +554,9 @@ def limit_reached(self) -> bool: def get_parent_state( stream_state: Optional[StreamState], parent_stream_name: str ) -> Optional[AirbyteStateMessage]: + if not stream_state: + return stream_state + if "parent_state" not in stream_state: logger.warning( f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state" diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 5e3aa06cc..64a92cbeb 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -98,6 +98,7 @@ RecordSelector, ResponseToFileExtractor, ) +from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) @@ -106,7 +107,6 @@ ConcurrentPerPartitionCursor, CursorFactory, DatetimeBasedCursor, - DeclarativeCursor, GlobalSubstreamCursor, PerPartitionWithGlobalCursor, ) @@ -512,7 +512,7 @@ PerPartitionRequestOptionsProvider, ) from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath -from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester from airbyte_cdk.sources.declarative.resolvers import ( ComponentMappingDefinition, ConfigComponentsResolver, @@ -529,6 +529,7 @@ from airbyte_cdk.sources.declarative.retrievers.file_uploader import ( ConnectorBuilderFileUploader, DefaultFileUploader, + FileUploader, LocalFileSystemFileWriter, NoopFileWriter, ) @@ -553,6 +554,7 @@ ) from airbyte_cdk.sources.declarative.transformations import ( AddFields, + RecordTransformation, RemoveFields, ) from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition @@ -3255,7 +3257,7 @@ def create_simple_retriever( log_formatter: Optional[Callable[[Response], Any]] = None, **kwargs: Any, ) -> SimpleRetriever: - def _get_url() -> str: + def _get_url(req: Requester) -> str: """ Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. This is needed because the URL is not set until the requester is created. 
@@ -3264,12 +3266,12 @@ def _get_url() -> str: _url: str = ( model.requester.url if hasattr(model.requester, "url") and model.requester.url is not None - else requester.get_url() + else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) ) _url_base: str = ( model.requester.url_base if hasattr(model.requester, "url_base") and model.requester.url_base is not None - else requester.get_url_base() + else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) ) return _url or _url_base @@ -3355,7 +3357,7 @@ def _get_url() -> str: self._create_component_from_model( model=model.paginator, config=config, - url_base=_get_url(), + url_base=_get_url(requester), extractor_model=model.record_selector.extractor, decoder=decoder, cursor_used_for_stop_condition=stop_condition_cursor or None, @@ -3538,12 +3540,14 @@ def create_async_retriever( transformations: List[RecordTransformation], **kwargs: Any, ) -> AsyncRetriever: - def _get_download_retriever() -> SimpleRetriever: + def _get_download_retriever( + requester: Requester, extractor: RecordExtractor, _decoder: Decoder + ) -> SimpleRetriever: # We create a record selector for the download retriever # with no schema normalization and no transformations, neither record filter # as all this occurs in the record_selector of the AsyncRetriever record_selector = RecordSelector( - extractor=download_extractor, + extractor=extractor, name=name, record_filter=None, transformations=[], @@ -3554,7 +3558,7 @@ def _get_download_retriever() -> SimpleRetriever: paginator = ( self._create_component_from_model( model=model.download_paginator, - decoder=decoder, + decoder=_decoder, config=config, url_base="", ) @@ -3563,7 +3567,7 @@ def _get_download_retriever() -> SimpleRetriever: ) return SimpleRetriever( - requester=download_requester, + requester=requester, record_selector=record_selector, primary_key=None, name=name, @@ -3657,7 +3661,9 @@ def _get_job_timeout() -> datetime.timedelta: config=config, name=job_download_components_name, ) - download_retriever = _get_download_retriever() + download_retriever = _get_download_retriever( + download_requester, download_extractor, download_decoder + ) abort_requester = ( self._create_component_from_model( model=model.abort_requester, @@ -3840,7 +3846,9 @@ def _create_message_repository_substream_wrapper( model=model, config=config, has_parent_state=has_parent_state, **kwargs ) - def _instantiate_parent_stream_state_manager(self, child_state, config, model): + def _instantiate_parent_stream_state_manager( + self, child_state: MutableMapping[str, Any], config: Config, model: ParentStreamConfigModel + ): """ With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the `set_initial_state` flow that existed for the declarative cursors. 
This state is taken from diff --git a/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py b/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py index 8718004bf..fb85701bd 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +++ b/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py @@ -159,12 +159,6 @@ def stream_slices(self) -> Iterable[StreamSlice]: cursor_slice = {} yield StreamSlice(partition=partition, cursor_slice=cursor_slice) - def set_initial_state(self, stream_state: StreamState) -> None: - """ - Parent stream states are not supported for cartesian product stream slicer - """ - pass - def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: """ Parent stream states are not supported for cartesian product stream slicer diff --git a/airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py index a08acbbea..f4b18f5e2 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py @@ -140,11 +140,6 @@ def get_request_body_json( ) -> Mapping[str, Any]: return {} - def set_initial_state(self, stream_state: StreamState) -> None: - """Delegate state initialization to the underlying partition router.""" - self.underlying_partition_router.set_initial_state(stream_state) - self._state = self.underlying_partition_router.get_stream_state() - def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: """Delegate state retrieval to the underlying partition router.""" return self._state diff --git a/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py index 6049cefe2..a5ed25357 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py @@ -108,12 +108,6 @@ def _get_request_option( else: return {} - def set_initial_state(self, stream_state: StreamState) -> None: - """ - ListPartitionRouter doesn't have parent streams - """ - pass - def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: """ ListPartitionRouter doesn't have parent streams diff --git a/airbyte_cdk/sources/declarative/partition_routers/partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/partition_router.py index 3a9bc3abf..a8c6ba824 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/partition_router.py @@ -15,31 +15,9 @@ class PartitionRouter(StreamSlicer): """ Base class for partition routers. Methods: - set_parent_state(stream_state): Set the state of the parent streams. - get_parent_state(): Get the state of the parent streams. + get_stream_state(): Get the state of the parent streams. """ - @abstractmethod - def set_initial_state(self, stream_state: StreamState) -> None: - """ - Set the state of the parent streams. - - This method should only be implemented if the slicer is based on some parent stream and needs to read this stream - incrementally using the state. - - Args: - stream_state (StreamState): The state of the streams to be set. 
The expected format is a dictionary that includes - 'parent_state' which is a dictionary of parent state names to their corresponding state. - Example: - { - "parent_state": { - "parent_stream_name_1": { ... }, - "parent_stream_name_2": { ... }, - ... - } - } - """ - @abstractmethod def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: """ diff --git a/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py index 32e6a353d..e056edd2f 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py @@ -50,12 +50,6 @@ def get_request_body_json( def stream_slices(self) -> Iterable[StreamSlice]: yield StreamSlice(partition={}, cursor_slice={}) - def set_initial_state(self, stream_state: StreamState) -> None: - """ - SinglePartitionRouter doesn't have parent streams - """ - pass - def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: """ SinglePartitionRouter doesn't have parent streams diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index bfc97ad3b..84b1a8dcd 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -36,7 +36,10 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream -def iterate_with_last_flag(generator: Iterable[Partition]) -> Iterable[tuple[Partition, bool]]: +T = TypeVar("T") + + +def iterate_with_last_flag(generator: Iterable[T]) -> Iterable[tuple[T, bool]]: iterator = iter(generator) try: @@ -307,60 +310,6 @@ def _extract_extra_fields( extracted_extra_fields[".".join(extra_field_path)] = extra_field_value return extracted_extra_fields - def set_initial_state(self, stream_state: StreamState) -> None: - """ - Set the state of the parent streams. - - If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format. - This migration applies only to parent streams with incremental dependencies. - - Args: - stream_state (StreamState): The state of the streams to be set. 
- - Example of state format: - { - "parent_state": { - "parent_stream_name1": { - "last_updated": "2023-05-27T00:00:00Z" - }, - "parent_stream_name2": { - "last_updated": "2023-05-27T00:00:00Z" - } - } - } - - Example of migrating to parent state format: - - Initial state: - { - "updated_at": "2023-05-27T00:00:00Z" - } - - After migration: - { - "updated_at": "2023-05-27T00:00:00Z", - "parent_state": { - "parent_stream_name": { - "parent_stream_cursor": "2023-05-27T00:00:00Z" - } - } - } - """ - if not stream_state: - return - - parent_state = stream_state.get("parent_state", {}) - - # Set state for each parent stream with an incremental dependency - for parent_config in self.parent_stream_configs: - if ( - not parent_state.get(parent_config.stream.name, {}) - and parent_config.incremental_dependency - ): - # Migrate child state to parent state format - parent_state = self._migrate_child_state_to_parent_state(stream_state) - - if parent_config.incremental_dependency: - parent_config.stream.state = parent_state.get(parent_config.stream.name, {}) - def _migrate_child_state_to_parent_state(self, stream_state: StreamState) -> StreamState: """ Migrate the child or global stream state into the parent stream's state format. diff --git a/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py b/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py index de8cbe12c..423bd6f14 100644 --- a/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +++ b/airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py @@ -19,11 +19,15 @@ def get_request_params( ) -> Mapping[str, Any]: return self._partition_router.get_request_params( # type: ignore # this always returns a mapping stream_state=stream_state, - stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}) + if stream_slice + else StreamSlice(partition={}, cursor_slice={}), next_page_token=next_page_token, ) | self._cursor_provider.get_request_params( stream_state=stream_state, - stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice) + if stream_slice + else StreamSlice(partition={}, cursor_slice={}), next_page_token=next_page_token, ) @@ -36,11 +40,15 @@ def get_request_headers( ) -> Mapping[str, Any]: return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping stream_state=stream_state, - stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}) + if stream_slice + else stream_slice, next_page_token=next_page_token, ) | self._cursor_provider.get_request_headers( stream_state=stream_state, - stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice) + if stream_slice + else stream_slice, next_page_token=next_page_token, ) @@ -53,11 +61,15 @@ def get_request_body_data( ) -> Union[Mapping[str, Any], str]: return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping stream_state=stream_state, - stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + 
stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}) + if stream_slice + else stream_slice, next_page_token=next_page_token, ) | self._cursor_provider.get_request_body_data( stream_state=stream_state, - stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice) + if stream_slice + else stream_slice, next_page_token=next_page_token, ) @@ -70,10 +82,14 @@ def get_request_body_json( ) -> Mapping[str, Any]: return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping stream_state=stream_state, - stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}) + if stream_slice + else stream_slice, next_page_token=next_page_token, ) | self._cursor_provider.get_request_body_json( stream_state=stream_state, - stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice) + if stream_slice + else stream_slice, next_page_token=next_page_token, ) diff --git a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py index a7704d9c4..f02cc243d 100644 --- a/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_grouping_partition_router.py @@ -348,20 +348,6 @@ def __next__(self): ) -def test_set_initial_state_delegation(mock_config, mock_underlying_router): - """Test that set_initial_state delegates to the underlying router.""" - router = GroupingPartitionRouter( - group_size=2, - underlying_partition_router=mock_underlying_router, - config=mock_config, - ) - mock_state = {"some_key": "some_value"} - mock_underlying_router.set_initial_state = MagicMock() - - router.set_initial_state(mock_state) - mock_underlying_router.set_initial_state.assert_called_once_with(mock_state) - - def test_stream_slices_extra_fields_varied(mock_config): """Test grouping with varied extra fields across partitions.""" parent_stream = MockStream( @@ -514,72 +500,3 @@ def test_get_request_params_default(mock_config, mock_underlying_router): ) ) assert params == {} - - -def test_stream_slices_resume_from_state(mock_config, mock_underlying_router): - """Test that stream_slices resumes correctly from a previous state.""" - - # Simulate underlying router state handling - class MockPartitionRouter: - def __init__(self): - self.slices = [ - StreamSlice( - partition={"board_ids": i}, - cursor_slice={}, - extra_fields={"name": f"Board {i}", "owner": f"User{i}"}, - ) - for i in range(5) - ] - self.state = {"last_board_id": 0} # Initial state - - def set_initial_state(self, state): - self.state = state - - def get_stream_state(self): - return self.state - - def stream_slices(self): - last_board_id = self.state.get("last_board_id", -1) - for slice in self.slices: - board_id = slice.partition["board_ids"] - if board_id <= last_board_id: - continue - self.state = {"last_board_id": board_id} - yield slice - - underlying_router = MockPartitionRouter() - router = GroupingPartitionRouter( - group_size=2, - underlying_partition_router=underlying_router, - config=mock_config, - deduplicate=True, - ) - - # First sync: process first two slices - router.set_initial_state({"last_board_id": 0}) - slices_iter = 
router.stream_slices() - first_batch = next(slices_iter) - assert first_batch == StreamSlice( - partition={"board_ids": [1, 2]}, - cursor_slice={}, - extra_fields={"name": ["Board 1", "Board 2"], "owner": ["User1", "User2"]}, - ) - state_after_first = router.get_stream_state() - assert state_after_first == {"last_board_id": 2}, "State should reflect last processed board_id" - - # Simulate a new sync resuming from the previous state - new_router = GroupingPartitionRouter( - group_size=2, - underlying_partition_router=MockPartitionRouter(), - config=mock_config, - deduplicate=True, - ) - new_router.set_initial_state(state_after_first) - resumed_slices = list(new_router.stream_slices()) - assert resumed_slices == [ - StreamSlice( - partition={"board_ids": [3, 4]}, - cursor_slice={}, - extra_fields={"name": ["Board 3", "Board 4"], "owner": ["User3", "User4"]}, - ) - ], "Should resume from board_id 3" diff --git a/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py b/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py index 532835abc..9b4069221 100644 --- a/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +++ b/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py @@ -404,141 +404,6 @@ def test_substream_partition_router(parent_stream_configs, expected_slices): assert slices == expected_slices -@pytest.mark.parametrize( - "initial_state, expected_parent_state", - [ - # Case 1: Empty initial state, no parent state expected - ({}, {}), - # Case 2: Initial state with no `parent_state`, migrate `updated_at` to `parent_stream_cursor` - ( - {"updated_at": "2023-05-27T00:00:00Z"}, - {"parent_stream_cursor": "2023-05-27T00:00:00Z"}, - ), - # Case 3: Initial state with global `state`, no migration expected - ( - {"state": {"updated": "2023-05-27T00:00:00Z"}}, - {"parent_stream_cursor": "2023-05-27T00:00:00Z"}, - ), - # Case 4: Initial state with per-partition `states`, no migration expected - ( - { - "states": [ - { - "partition": { - "issue_id": "10012", - "parent_slice": { - "parent_slice": {}, - "project_id": "10000", - }, - }, - "cursor": {"updated": "2021-01-01T00:00:00+0000"}, - }, - { - "partition": { - "issue_id": "10019", - "parent_slice": { - "parent_slice": {}, - "project_id": "10000", - }, - }, - "cursor": {"updated": "2021-01-01T00:00:00+0000"}, - }, - { - "partition": { - "issue_id": "10000", - "parent_slice": { - "parent_slice": {}, - "project_id": "10000", - }, - }, - "cursor": {"updated": "2021-01-01T00:00:00+0000"}, - }, - ] - }, - {}, - ), - # Case 5: Initial state with `parent_state`, existing parent state persists - ( - { - "parent_state": { - "parent_stream_name1": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}, - }, - }, - {"parent_stream_cursor": "2023-05-27T00:00:00Z"}, - ), - # Case 6: Declarative global cursor state, no migration expected - ( - { - "looback_window": 1, - "use_global_cursor": True, - "state": {"updated": "2023-05-27T00:00:00Z"}, - }, - {"parent_stream_cursor": "2023-05-27T00:00:00Z"}, - ), - # Case 7: Migrate child state to parent state but child state is empty - ( - { - "state": {}, - "states": [], - "parent_state": {"posts": {}}, - "lookback_window": 1, - "use_global_cursor": False, - }, - {}, - ), - ], - ids=[ - "empty_initial_state", - "initial_state_no_parent_legacy_state", - "initial_state_no_parent_global_state", - "initial_state_no_parent_per_partition_state", - "initial_state_with_parent_state", - 
"initial_state_no_parent_global_state_declarative", - "initial_state_no_parent_and_no_child", - ], -) -def test_set_initial_state(initial_state, expected_parent_state): - """ - Test the `set_initial_state` method of SubstreamPartitionRouter. - - This test verifies that the method correctly handles different initial state formats - and sets the appropriate parent stream state. - """ - parent_stream = MockStream( - InMemoryPartition( - "partition_1", - "stream_name", - _EMPTY_SLICE, - [], - ), - name="parent_stream_name1", - cursor_field="parent_stream_cursor", - ) - parent_stream.state = {} - parent_stream_config = ParentStreamConfig( - stream=parent_stream, - parent_key="id", - partition_field="parent_stream_id", - parameters={}, - config={}, - incremental_dependency=True, - ) - - partition_router = SubstreamPartitionRouter( - parent_stream_configs=[parent_stream_config], - parameters={}, - config={}, - ) - - partition_router.set_initial_state(initial_state) - - # Assert the state of the parent stream - assert parent_stream.state == expected_parent_state, ( - f"Unexpected parent state. Initial state: {initial_state}, " - f"Expected: {expected_parent_state}, Got: {parent_stream.state}" - ) - - @pytest.mark.parametrize( "parent_stream_request_parameters, expected_req_params, expected_headers, expected_body_json, expected_body_data", [ From 2b849f7fdf53da5ce60ca246543f838db8c5efa8 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 27 Aug 2025 16:15:13 -0400 Subject: [PATCH 53/68] fix test_per_partition_cursor.py --- .../incremental/per_partition_cursor.py | 6 ++- .../incremental/test_per_partition_cursor.py | 41 ------------------- 2 files changed, 4 insertions(+), 43 deletions(-) diff --git a/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py b/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py index 57c7fd21b..54060bd94 100644 --- a/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +++ b/airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py @@ -146,8 +146,10 @@ def set_initial_state(self, stream_state: StreamState) -> None: if "state" in stream_state: self._state_to_migrate_from = stream_state["state"] - # Set parent state for partition routers based on parent streams - self._partition_router.set_initial_state(stream_state) + # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the + # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called. + # We are still keeping this line as a comment to be explicit about the past behavior. 
+ # self._partition_router.set_initial_state(stream_state) def observe(self, stream_slice: StreamSlice, record: Record) -> None: self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe( diff --git a/unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor.py b/unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor.py index 9cbc1ad1c..4e6bf38fb 100644 --- a/unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor.py +++ b/unit_tests/legacy/sources/declarative/incremental/test_per_partition_cursor.py @@ -508,47 +508,6 @@ def test_get_request_body_json( cursor.get_request_body_json(stream_slice=stream_slice) -def test_parent_state_is_set_for_per_partition_cursor( - mocked_cursor_factory, mocked_partition_router -): - # Define the parent state to be used in the test - parent_state = {"parent_cursor": "parent_state_value"} - - # Mock the partition router to return a stream slice - partition = StreamSlice( - partition={"partition_field_1": "a value", "partition_field_2": "another value"}, - cursor_slice={}, - ) - mocked_partition_router.stream_slices.return_value = [partition] - - # Mock the cursor factory to create cursors with specific states - mocked_cursor_factory.create.side_effect = [ - MockedCursorBuilder() - .with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]) - .with_stream_state(CURSOR_STATE) - .build(), - ] - - # Mock the get_parent_state method to return the parent state - mocked_partition_router.get_stream_state.return_value = parent_state - - # Initialize the PerPartitionCursor with the mocked cursor factory and partition router - cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) - - # Set the initial state, including the parent state - initial_state = { - "states": [{"partition": partition.partition, "cursor": CURSOR_STATE}], - "parent_state": parent_state, - } - cursor.set_initial_state(initial_state) - - # Verify that the parent state has been set correctly - assert cursor.get_stream_state()["parent_state"] == parent_state - - # Verify that set_parent_state was called on the partition router with the initial state - mocked_partition_router.set_initial_state.assert_called_once_with(initial_state) - - def test_get_stream_state_includes_parent_state(mocked_cursor_factory, mocked_partition_router): # Define the parent state to be used in the test parent_state = {"parent_cursor": "parent_state_value"} From ace8739c6b9b815a7935575ecdab090d8925677c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 28 Aug 2025 13:55:49 -0400 Subject: [PATCH 54/68] remove unused code and improve tests --- .../parsers/model_to_component_factory.py | 8 ------ .../test_per_partition_cursor_integration.py | 25 ++++++++++--------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 64a92cbeb..331f80337 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2098,14 +2098,6 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - # FIXME to be removed once we migrate everything to DefaultStream - # todo: blai This was originally added back in https://github.com/airbytehq/airbyte-python-cdk/pull/723. 
- # It does seem like this could be removed now that we only manage DefaultStream but noting to confirm in the PR - if isinstance(retriever, SimpleRetriever): - # We zero it out here, but since this is a cursor reference, the state is still properly - # instantiated for the other components that reference it - retriever.cursor = None - stream_name = model.name or "" return DefaultStream( partition_generator=StreamSlicerPartitionGenerator( diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 5a3f28b02..5eceec0e6 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -912,7 +912,7 @@ def test_parent_stream_is_updated_after_parent_record_fully_consumed(): end_datetime="2022-02-28", datetime_format="%Y-%m-%d", cursor_field=CURSOR_FIELD, - step="P1M", + step="P1Y", cursor_granularity="P1D", ) .with_incremental_sync( @@ -935,19 +935,20 @@ def test_parent_stream_is_updated_after_parent_record_fully_consumed(): # Request for parent stream m.get( "https://api.apilayer.com/exchangerates_data/parent/latest?from=2022-01-01&to=2022-02-28", - json=[{"id": "1"}], + json=[{"id": "1", CURSOR_FIELD: "2022-02-01"}, {"id": "2", CURSOR_FIELD: "2022-02-10"}], ) # Requests for child stream - record_from_first_cursor_interval = {"id": "child_1.1"} + record_from_first_partition = {"id": "child_1.1"} m.get( - "https://api.apilayer.com/exchangerates_data/parent/1/child/latest?from=2022-01-01&to=2022-01-31", - json=[record_from_first_cursor_interval], + "https://api.apilayer.com/exchangerates_data/parent/1/child/latest?from=2022-01-01&to=2022-02-28", + json=[record_from_first_partition], ) - record_from_second_cursor_interval = {"id": "child_1.2"} + + record_from_second_partition = {"id": "child_1.2"} m.get( - "https://api.apilayer.com/exchangerates_data/parent/1/child/latest?from=2022-02-01&to=2022-02-28", - json=[record_from_second_cursor_interval], + "https://api.apilayer.com/exchangerates_data/parent/2/child/latest?from=2022-01-01&to=2022-02-28", + json=[record_from_second_partition], ) message_iterator = source.read( @@ -960,12 +961,12 @@ def test_parent_stream_is_updated_after_parent_record_fully_consumed(): ) records, state = get_records_until_state_message(message_iterator) - - assert len(records) == 1 and records[0].data == record_from_first_cursor_interval - assert "parent_state" not in state.stream.stream_state.__dict__ + assert len(records) == 1 and records[0].data == record_from_first_partition + assert state.stream.stream_state.__dict__["parent_state"] == {"Rates": {"cursor_field": "2022-01-01"}} # state cursor value == start_datetime records, state = get_records_until_state_message(message_iterator) - assert "parent_state" in state.stream.stream_state.__dict__ + assert len(records) == 1 and records[0].data == record_from_second_partition + assert state.stream.stream_state.__dict__["parent_state"] == {"Rates": {"cursor_field": "2022-02-10"}} # state cursor value == most_recent_cursor_value def get_records_until_state_message( From 0f99c9e13f868e0ee9fbc9b945e77c7873c7aa5c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 28 Aug 2025 14:14:45 -0400 Subject: [PATCH 55/68] emit updated parent before last record, not after --- .../declarative/partition_routers/substream_partition_router.py | 2 ++ 
.../incremental/test_concurrent_perpartitioncursor.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 84b1a8dcd..69ca8395e 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -249,6 +249,8 @@ def stream_slices(self) -> Iterable[StreamSlice]: if is_last_record_in_slice: parent_stream.cursor.close_partition(partition) + if is_last_slice: + parent_stream.cursor.ensure_at_least_one_state_emitted() yield StreamSlice( partition={ diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 26cc36e70..b972cbfed 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -1119,6 +1119,8 @@ def run_incremental_parent_state_test( # Assert that the number of intermediate states is as expected assert len(intermediate_states) - 1 == num_intermediate_states + # Assert that ensure_at_least_one_state_emitted is called before yielding the last record from the last slice + assert intermediate_states[-1][0].stream.stream_state.__dict__["parent_state"] == intermediate_states[-2][0].stream.stream_state.__dict__["parent_state"] # For each intermediate state, perform another read starting from that state for state, records_before_state in intermediate_states[:-1]: From efd1040c6743ebd232b7077dc1768e175d11dcb0 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 28 Aug 2025 16:48:12 -0400 Subject: [PATCH 56/68] mypy --- .../concurrent_partition_cursor.py | 2 +- .../incremental/global_substream_cursor.py | 6 ++- .../parsers/model_to_component_factory.py | 37 +++++++++++-------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 80843b518..41ee09e0b 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -555,7 +555,7 @@ def get_parent_state( stream_state: Optional[StreamState], parent_stream_name: str ) -> Optional[AirbyteStateMessage]: if not stream_state: - return stream_state + return None if "parent_state" not in stream_state: logger.warning( diff --git a/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py b/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py index 602ea051b..21733f94d 100644 --- a/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py @@ -192,8 +192,10 @@ def set_initial_state(self, stream_state: StreamState) -> None: # Example: {"global_state_format_key": "global_state_format_value"} self._stream_cursor.set_initial_state(stream_state) - # Set parent state for partition routers based on parent streams - self._partition_router.set_initial_state(stream_state) + # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the + # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and 
this does not need to be called. + # We are still keeping this line as a comment to be explicit about the past behavior. + # self._partition_router.set_initial_state(stream_state) def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None: """ diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 331f80337..311795474 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3480,14 +3480,17 @@ def create_state_delegating_stream( f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." ) - stream_model = ( + stream_model = self._get_state_delegating_stream_model(has_parent_state, model) + + return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description + + def _get_state_delegating_stream_model(self, has_parent_state, model): + return ( model.incremental_stream if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state else model.full_refresh_stream ) - return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description - def _create_async_job_status_mapping( self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any ) -> Mapping[str, AsyncJobStatus]: @@ -3805,8 +3808,15 @@ def _create_message_repository_substream_wrapper( child_state = self._connector_state_manager.get_stream_state( kwargs["stream_name"], None ) # FIXME adding `stream_name` as a parameter means it will be a breaking change. I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag + + # This flag will be used exclusively for StateDelegatingStream when a parent stream is created + has_parent_state = bool( + self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) + if model.incremental_dependency + else False + ) connector_state_manager = self._instantiate_parent_stream_state_manager( - child_state, config, model + child_state, config, model, has_parent_state ) substream_factory = ModelToComponentFactory( @@ -3828,19 +3838,13 @@ def _create_message_repository_substream_wrapper( ), ) - # This flag will be used exclusively for StateDelegatingStream when a parent stream is created - has_parent_state = bool( - self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) - if model.incremental_dependency - else False - ) return substream_factory._create_component_from_model( model=model, config=config, has_parent_state=has_parent_state, **kwargs ) def _instantiate_parent_stream_state_manager( - self, child_state: MutableMapping[str, Any], config: Config, model: ParentStreamConfigModel - ): + self, child_state: MutableMapping[str, Any], config: Config, model: ParentStreamConfigModel, has_parent_state: bool + ) -> ConnectorStateManager: """ With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the `set_initial_state` flow that existed for the declarative cursors. 
This state is taken from @@ -3862,12 +3866,13 @@ def _instantiate_parent_stream_state_manager( ) if not parent_state and not isinstance(parent_state, dict): - cursor_field = InterpolatedString.create( - model.stream.incremental_sync.cursor_field, - parameters=model.stream.incremental_sync.parameters or {}, - ).eval(config) cursor_values = child_state.values() if cursor_values: + incremental_sync_model = model.stream.incremental_sync if isinstance(model.stream, DeclarativeStreamModel) else self._get_state_delegating_stream_model(has_parent_state, model.stream) + cursor_field = InterpolatedString.create( + incremental_sync_model.cursor_field, + parameters=incremental_sync_model.parameters or {}, + ).eval(config) parent_state = AirbyteStateMessage( type=AirbyteStateType.STREAM, stream=AirbyteStreamState( From a04aead823a95a1103abc6b10ce0d85d7bafda6c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 28 Aug 2025 17:09:16 -0400 Subject: [PATCH 57/68] add typing --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 311795474..808c883d2 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3484,7 +3484,7 @@ def create_state_delegating_stream( return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description - def _get_state_delegating_stream_model(self, has_parent_state, model): + def _get_state_delegating_stream_model(self, has_parent_state: bool, model: StateDelegatingStreamModel): return ( model.incremental_stream if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state From c5b483761de034b78302da6d47406e3f4d0977de Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 28 Aug 2025 18:12:15 -0400 Subject: [PATCH 58/68] mypy --- .../parsers/model_to_component_factory.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index f50a72f6d..da73f8bce 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3480,13 +3480,15 @@ def create_state_delegating_stream( f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
) - stream_model = self._get_state_delegating_stream_model(has_parent_state, model) + stream_model = self._get_state_delegating_stream_model( + False if has_parent_state is None else has_parent_state, model + ) return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description def _get_state_delegating_stream_model( self, has_parent_state: bool, model: StateDelegatingStreamModel - ): + ) -> DeclarativeStreamModel: return ( model.incremental_stream if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state @@ -3874,12 +3876,16 @@ def _instantiate_parent_stream_state_manager( if not parent_state and not isinstance(parent_state, dict): cursor_values = child_state.values() if cursor_values: - incremental_sync_model = ( - model.stream.incremental_sync + incremental_sync_model: Union[ + DatetimeBasedCursorModel, + IncrementingCountCursorModel, + CustomIncrementalSyncModel, + ] = ( + model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream if isinstance(model.stream, DeclarativeStreamModel) else self._get_state_delegating_stream_model( has_parent_state, model.stream - ) + ).incremental_sync ) cursor_field = InterpolatedString.create( incremental_sync_model.cursor_field, From bb96293233be2ce155d22c755ddd68ad2ad66441 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Fri, 29 Aug 2025 07:43:29 -0400 Subject: [PATCH 59/68] remove comments that documented the new behavior --- .../incremental/test_concurrent_perpartitioncursor.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 293db11ed..966d48fd9 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -1486,7 +1486,6 @@ def run_incremental_parent_state_test( ] }, ), - # FIXME this is an interesting case. The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too ( f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, @@ -1632,7 +1631,6 @@ def run_incremental_parent_state_test( ] }, ), - # FIXME this is an interesting case. 
The previous solution would not update the parent state until `ensure_at_least_one_state_emitted` but the concurrent cursor does just before which is probably fine too ( f"https://api.example.com/community/posts?per_page=100&start_time={POST_1_UPDATED_AT}", {"posts": [{"id": 1, "updated_at": POST_1_UPDATED_AT}]}, @@ -1741,7 +1739,7 @@ def run_incremental_parent_state_test( ], } }, - "lookback_window": 86400, # FIXME this run only sync one record without cursor value hence why it might make sense not to update the lookback window + "lookback_window": 86400, "use_global_cursor": False, "states": [ { @@ -2137,11 +2135,10 @@ def test_incremental_parent_state_migration( "partition": {"id": 1, "parent_slice": {}}, "cursor": { "updated_at": START_DATE - }, # FIXME this happens because the concurrent framework gets the start date as the max between the state value and the start value. In this case, the start value is higher + }, } ], - "lookback_window": 0, # FIXME the concurrent framework sets the lookback window to 0 as opposed to the declarative framework which would set not define it - # FIXME the concurrent framework does not set the global state if there are none as opposed to the declarative framework which would set an empty global state + "lookback_window": 0, "use_global_cursor": False, "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, } @@ -2338,7 +2335,7 @@ def test_incremental_parent_state_no_slices( }, # Expected state { - "lookback_window": 0, # FIXME maybe I'm wrong but I don't think it makes sense to have a lookback window being added from the state of "not having a lookback window" before + "lookback_window": 0, "use_global_cursor": False, "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, "states": [ From 639a7347dc055979a83e27ab14fe49008cfd4eaf Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Fri, 29 Aug 2025 08:11:25 -0400 Subject: [PATCH 60/68] a bit more cleanup --- .../declarative/concurrent_declarative_source.py | 4 ++-- .../parsers/model_to_component_factory.py | 13 ++++++------- .../partition_routers/substream_partition_router.py | 1 - 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 16ff94abf..d4e69df78 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -703,7 +703,7 @@ def _group_streams( stream_slicer=declarative_stream.retriever.stream_slicer, slice_limit=self._limits.max_slices if self._limits - else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later + else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later ) else: if ( @@ -772,7 +772,7 @@ def _group_streams( declarative_stream.retriever.stream_slicer, slice_limit=self._limits.max_slices if self._limits - else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. 
But for consistency and depending how we build create_default_stream, this may be needed later + else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later ) final_state_cursor = FinalStateCursor( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index da73f8bce..25b105951 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -715,7 +715,7 @@ def _init_mappings(self) -> None: CustomValidationStrategyModel: self.create_custom_component, CustomConfigTransformationModel: self.create_custom_component, DatetimeBasedCursorModel: self.create_datetime_based_cursor, - DeclarativeStreamModel: self.create_declarative_stream, + DeclarativeStreamModel: self.create_default_stream, DefaultErrorHandlerModel: self.create_default_error_handler, DefaultPaginatorModel: self.create_default_paginator, DpathExtractorModel: self.create_dpath_extractor, @@ -1960,7 +1960,7 @@ def create_datetime_based_cursor( parameters=model.parameters or {}, ) - def create_declarative_stream( + def create_default_stream( self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any ) -> Union[DeclarativeStream, AbstractStream]: primary_key = model.primary_key.__root__ if model.primary_key else None @@ -1970,7 +1970,7 @@ def create_declarative_stream( ) concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): - cursor_model = model.incremental_sync + cursor_model: DatetimeBasedCursorModel = model.incremental_sync end_time_option = ( self._create_component_from_model( @@ -1990,7 +1990,7 @@ def create_declarative_stream( datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( start_time_option=start_time_option, end_time_option=end_time_option, - partition_field_start=cursor_model.partition_field_end, + partition_field_start=cursor_model.partition_field_start, partition_field_end=cursor_model.partition_field_end, config=config, parameters=model.parameters or {}, @@ -2117,7 +2117,6 @@ def create_declarative_stream( if hasattr(concurrent_cursor, "cursor_field") else "", # FIXME we should have the cursor field has part of the interface of cursor, logger=logging.getLogger(f"airbyte.{stream_name}"), - # FIXME this is a breaking change compared to the old implementation which used the source name instead cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), ) @@ -3484,7 +3483,7 @@ def create_state_delegating_stream( False if has_parent_state is None else has_parent_state, model ) - return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description + return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel def _get_state_delegating_stream_model( self, has_parent_state: bool, model: StateDelegatingStreamModel @@ -3811,7 +3810,7 @@ def _create_message_repository_substream_wrapper( # getting the parent state 
child_state = self._connector_state_manager.get_stream_state( kwargs["stream_name"], None - ) # FIXME adding `stream_name` as a parameter means it will be a breaking change. I assume this is mostly called internally so I don't think we need to bother that much about this but still raising the flag + ) # This flag will be used exclusively for StateDelegatingStream when a parent stream is created has_parent_state = bool( diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 69ca8395e..e41a0d9a1 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -261,7 +261,6 @@ def stream_slices(self) -> Iterable[StreamSlice]: extra_fields=extracted_extra_fields, ) - parent_stream.cursor.ensure_at_least_one_state_emitted() yield from [] def _extract_child_response( From 1b4b7561668f80619d61f599a7c0b6e3c96c94c0 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Fri, 29 Aug 2025 09:49:03 -0400 Subject: [PATCH 61/68] more clean up --- .../parsers/model_to_component_factory.py | 110 ------------------ 1 file changed, 110 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 25b105951..cafeb41d0 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2167,78 +2167,6 @@ def _build_stream_slicer_from_partition_router( ) return SinglePartitionRouter(parameters={}) - def _build_incremental_cursor( - self, - model: DeclarativeStreamModel, - stream_slicer: Optional[PartitionRouter], - config: Config, - ) -> Optional[StreamSlicer]: - state_transformations = ( - [ - self._create_component_from_model(state_migration, config, declarative_stream=model) - for state_migration in model.state_migrations - ] - if model.state_migrations - else [] - ) - - if model.incremental_sync and ( - stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) - ): - if model.retriever.type == "AsyncRetriever": - stream_name = model.name or "" - stream_namespace = None - stream_state = self._connector_state_manager.get_stream_state( - stream_name, stream_namespace - ) - - return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing - state_manager=self._connector_state_manager, - model_type=DatetimeBasedCursorModel, - component_definition=model.incremental_sync.__dict__, - stream_name=stream_name, - stream_namespace=stream_namespace, - config=config or {}, - stream_state=stream_state, - stream_state_migrations=state_transformations, - partition_router=stream_slicer, - ) - - incremental_sync_model = model.incremental_sync - cursor_component = self._create_component_from_model( - model=incremental_sync_model, config=config - ) - is_global_cursor = ( - hasattr(incremental_sync_model, "global_substream_cursor") - and incremental_sync_model.global_substream_cursor - ) - - if is_global_cursor: - return GlobalSubstreamCursor( - stream_cursor=cursor_component, partition_router=stream_slicer - ) - return PerPartitionWithGlobalCursor( - cursor_factory=CursorFactory( - lambda: self._create_component_from_model( - model=incremental_sync_model, config=config - ), - ), - partition_router=stream_slicer, - stream_cursor=cursor_component, - ) - elif model.incremental_sync: - if model.retriever.type == "AsyncRetriever": - return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing - model_type=DatetimeBasedCursorModel, - component_definition=model.incremental_sync.__dict__, - stream_name=model.name or "", - stream_namespace=None, - config=config or {}, - stream_state_migrations=state_transformations, - ) - return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync - return None - def _build_concurrent_cursor( self, model: DeclarativeStreamModel, @@ -2301,44 +2229,6 @@ def _build_concurrent_cursor( ) return FinalStateCursor(stream_name, None, self._message_repository) - def _merge_stream_slicers( - self, model: DeclarativeStreamModel, config: Config - ) -> Optional[StreamSlicer]: - retriever_model = model.retriever - - stream_slicer = self._build_stream_slicer_from_partition_router( - retriever_model, config, stream_name=model.name - ) - - if retriever_model.type == "AsyncRetriever": - is_not_datetime_cursor = ( - model.incremental_sync.type != "DatetimeBasedCursor" - if model.incremental_sync - else None - ) - is_partition_router = ( - bool(retriever_model.partition_router) if model.incremental_sync else None - ) - - if is_not_datetime_cursor: - # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the - # support or unordered slices (for example, when we trigger reports for January and February, the report - # in February can be completed first). Once we have support for custom concurrent cursor or have a new - # implementation available in the CDK, we can enable more cursors here. - raise ValueError( - "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 
- ) - - if is_partition_router and not stream_slicer: - # Note that this development is also done in parallel to the per partition development which once merged - # we could support here by calling create_concurrent_cursor_from_perpartition_cursor - raise ValueError("Per partition state is not supported yet for AsyncRetriever.") - - if model.incremental_sync: - return self._build_incremental_cursor(model, stream_slicer, config) - - return stream_slicer - def create_default_error_handler( self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any ) -> DefaultErrorHandler: From c0046371680cfc4a06576e0fdc235da139a9b341 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Fri, 29 Aug 2025 13:39:19 -0400 Subject: [PATCH 62/68] remove unecessary test --- .../test_per_partition_cursor_integration.py | 136 ------------------ 1 file changed, 136 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py index 935b92996..2f2b6b2bd 100644 --- a/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +++ b/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -422,142 +422,6 @@ def test_substream_without_input_state(): ] -def test_switch_to_global_limit(caplog): - """ - Test that when the number of partitions exceeds the limit to switch to global state. - - In this test, we set the maximum number of partitions to 1 (not 2 because we evaluate this before generating a - partition and the limit is not inclusive) and provide 3 partitions. - We verify that the state switch to global. - """ - stream_name = "Rates" - - partition_slices = [ - StreamSlice(partition={"partition_field": "1"}, cursor_slice={}), - StreamSlice(partition={"partition_field": "2"}, cursor_slice={}), - StreamSlice(partition={"partition_field": "3"}, cursor_slice={}), - ] - - records_list = [ - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ), - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ), - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, - associated_slice=partition_slices[0], - stream_name=stream_name, - ) - ], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, - associated_slice=partition_slices[1], - stream_name=stream_name, - ) - ], - [], - [], - [ - Record( - data={"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, - associated_slice=partition_slices[2], - stream_name=stream_name, - ) - ], - ] - - configured_stream = ConfiguredAirbyteStream( - stream=AirbyteStream( - name="Rates", - json_schema={}, - supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], - ), - sync_mode=SyncMode.incremental, - destination_sync_mode=DestinationSyncMode.append, - ) - catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) - - initial_state = [ - AirbyteStateMessage( - type=AirbyteStateType.STREAM, - stream=AirbyteStreamState( - stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), - stream_state=AirbyteStateBlob( - { - "states": [ - { - "partition": {"partition_field": "1"}, - "cursor": {CURSOR_FIELD: "2022-01-01"}, - }, - { - "partition": {"partition_field": "2"}, - "cursor": {CURSOR_FIELD: "2022-01-02"}, - }, - 
{ - "partition": {"partition_field": "3"}, - "cursor": {CURSOR_FIELD: "2022-01-03"}, - }, - ] - } - ), - ), - ) - ] - logger = MagicMock() - - source = ConcurrentDeclarativeSource( - source_config=ManifestBuilder() - .with_list_partition_router( - stream_name=stream_name, cursor_field="partition_field", partitions=["1", "2", "3"] - ) - .with_incremental_sync( - stream_name=stream_name, - start_datetime="2022-01-01", - end_datetime="2022-02-28", - datetime_format="%Y-%m-%d", - cursor_field=CURSOR_FIELD, - step="P1M", - cursor_granularity="P1D", - ) - .build(), - config={}, - catalog=catalog, - state=initial_state, - ) - - # Use caplog to capture logs - with caplog.at_level(logging.INFO, logger="airbyte"): - with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): - with patch.object(ConcurrentPerPartitionCursor, "SWITCH_TO_GLOBAL_LIMIT", 1): - output = list(source.read(logger, {}, catalog, initial_state)) - - # Check if the warning was logged - logged_messages = [record.message for record in caplog.records if record.levelname == "INFO"] - warning_message = "Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of" - assert any(map(lambda message: warning_message in message, logged_messages)) - - final_state = [ - orjson.loads(orjson.dumps(message.state.stream.stream_state)) - for message in output - if message.state - ] - assert final_state[-1] == { - "lookback_window": 1, - "state": {"cursor_field": "2022-02-17"}, - "use_global_cursor": True, - } - - def test_perpartition_with_fallback(caplog): """ Test that when the number of partitions exceeds the limit in PerPartitionCursor, From 25ca5b8ae688c5a361cb7834f7a4de00b4d90927 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 2 Sep 2025 10:18:42 -0400 Subject: [PATCH 63/68] allow for specific parameters to be passed to custom components --- .../parsers/model_to_component_factory.py | 23 ++++---- .../test_model_to_component_factory.py | 58 ++++++++++++++++++- .../declarative/parsers/testing_components.py | 6 ++ 3 files changed, 75 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index cafeb41d0..008d4a583 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -752,7 +752,7 @@ def _init_mappings(self) -> None: OAuthAuthenticatorModel: self.create_oauth_authenticator, OffsetIncrementModel: self.create_offset_increment, PageIncrementModel: self.create_page_increment, - ParentStreamConfigModel: self.create_parent_stream_config, + ParentStreamConfigModel: self._create_message_repository_substream_wrapper, PredicateValidatorModel: self.create_predicate_validator, PropertiesFromEndpointModel: self.create_properties_from_endpoint, PropertyChunkingModel: self.create_property_chunking, @@ -1748,7 +1748,7 @@ def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> if self._is_component(model_value): model_args[model_field] = self._create_nested_component( - model, model_field, model_value, config + model, model_field, model_value, config, **kwargs, ) elif isinstance(model_value, list): vals = [] @@ -1760,7 +1760,7 @@ def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> if derived_type: v["type"] = derived_type if self._is_component(v): - vals.append(self._create_nested_component(model, model_field, v, config)) + vals.append(self._create_nested_component(model, 
model_field, v, config, **kwargs,)) else: vals.append(v) model_args[model_field] = vals @@ -1850,7 +1850,7 @@ def _extract_missing_parameters(error: TypeError) -> List[str]: return [] def _create_nested_component( - self, model: Any, model_field: str, model_value: Any, config: Config + self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any ) -> Any: type_name = model_value.get("type", None) if not type_name: @@ -1875,8 +1875,11 @@ def _create_nested_component( for kwarg in constructor_kwargs if kwarg in model_parameters } + matching_kwargs = { + kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs + } return self._create_component_from_model( - model=parsed_model, config=config, **matching_parameters + model=parsed_model, config=config, **(matching_parameters | matching_kwargs) ) except TypeError as error: missing_parameters = self._extract_missing_parameters(error) @@ -2871,7 +2874,7 @@ def create_page_increment( ) def create_parent_stream_config( - self, model: ParentStreamConfigModel, config: Config, **kwargs: Any + self, model: ParentStreamConfigModel, config: Config, stream_name: str, **kwargs: Any ) -> ParentStreamConfig: declarative_stream = self._create_component_from_model( model.stream, @@ -3695,11 +3698,11 @@ def create_substream_partition_router( ) def _create_message_repository_substream_wrapper( - self, model: ParentStreamConfigModel, config: Config, **kwargs: Any + self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any ) -> Any: # getting the parent state child_state = self._connector_state_manager.get_stream_state( - kwargs["stream_name"], None + stream_name, None ) # This flag will be used exclusively for StateDelegatingStream when a parent stream is created @@ -3731,8 +3734,8 @@ def _create_message_repository_substream_wrapper( ), ) - return substream_factory._create_component_from_model( - model=model, config=config, has_parent_state=has_parent_state, **kwargs + return substream_factory.create_parent_stream_config( + model=model, config=config, stream_name=stream_name, **kwargs ) def _instantiate_parent_stream_state_manager( diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index c51383e8f..f359459e0 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -163,6 +163,7 @@ from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource +from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository from airbyte_cdk.sources.streams.call_rate import MovingWindowCallRatePolicy from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, @@ -944,6 +945,58 @@ def test_stream_with_incremental_and_retriever_with_partition_router(): assert list_stream_slicer._cursor_field.string == "a_key" +def test_stream_with_custom_retriever_and_transformations(): + content = """ +a_stream: + type: DeclarativeStream + primary_key: id + schema_loader: + type: InlineSchemaLoader + schema: + $schema: "http://json-schema.org/draft-07/schema" + type: object + properties: + id: + type: string + retriever: + type: CustomRetriever + class_name: 
unit_tests.sources.declarative.parsers.testing_components.TestingCustomRetriever + name: "{{ parameters['name'] }}" + decoder: + type: JsonDecoder + requester: + type: HttpRequester + name: "{{ parameters['name'] }}" + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: ["records"] + transformations: + - type: AddFields + fields: + - path: ["extra"] + value: "{{ response.to_add }}" + $parameters: + name: a_stream +""" + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["a_stream"], {} + ) + + stream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config + ) + + assert isinstance(stream, DefaultStream) + assert get_retriever(stream).record_selector.transformations + + @pytest.mark.parametrize( "use_legacy_state", [ @@ -2053,11 +2106,12 @@ def test_custom_components_do_not_contain_extra_fields(): } custom_substream_partition_router = factory.create_component( - CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config + CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config, stream_name="child_stream_name", ) assert isinstance(custom_substream_partition_router, TestingCustomSubstreamPartitionRouter) assert len(custom_substream_partition_router.parent_stream_configs) == 1 + assert isinstance(custom_substream_partition_router.parent_stream_configs[0].stream.cursor._message_repository, StateFilteringMessageRepository) assert custom_substream_partition_router.parent_stream_configs[0].parent_key.eval({}) == "id" assert ( custom_substream_partition_router.parent_stream_configs[0].partition_field.eval({}) @@ -2120,7 +2174,7 @@ def test_parse_custom_component_fields_if_subcomponent(): } custom_substream_partition_router = factory.create_component( - CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config + CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config, stream_name="child_stream_name" ) assert isinstance(custom_substream_partition_router, TestingCustomSubstreamPartitionRouter) assert custom_substream_partition_router.custom_field == "here" diff --git a/unit_tests/sources/declarative/parsers/testing_components.py b/unit_tests/sources/declarative/parsers/testing_components.py index ab9ae5346..0b9a68e6b 100644 --- a/unit_tests/sources/declarative/parsers/testing_components.py +++ b/unit_tests/sources/declarative/parsers/testing_components.py @@ -13,6 +13,7 @@ DefaultPaginator, PaginationStrategy, ) +from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever @dataclass @@ -43,3 +44,8 @@ class TestingCustomSubstreamPartitionRouter(SubstreamPartitionRouter): custom_field: str custom_pagination_strategy: PaginationStrategy + + +@dataclass +class TestingCustomRetriever(SimpleRetriever): + pass From e5ecf418868598b46f1fdde2026704a8d9d9133f Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 2 Sep 2025 11:10:24 -0400 Subject: [PATCH 64/68] fix internal _get_url --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 
008d4a583..bbdc09fc3 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3155,7 +3155,7 @@ def _get_url(req: Requester) -> str: _url_base: str = ( model.requester.url_base if hasattr(model.requester, "url_base") and model.requester.url_base is not None - else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) + else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) ) return _url or _url_base From f2f136319a28de5c808714a75182f992ef0e42cd Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 2 Sep 2025 13:26:39 -0400 Subject: [PATCH 65/68] fix case where request option provider is stream slicer --- .../declarative/parsers/model_to_component_factory.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index bbdc09fc3..6dea21c57 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2062,6 +2062,7 @@ def create_default_stream( primary_key=primary_key, request_options_provider=request_options_provider, stream_slicer=stream_slicer, + partition_router=partition_router, stop_condition_cursor=concurrent_cursor if self._is_stop_condition_on_cursor(model) else None, @@ -2524,7 +2525,7 @@ def create_dynamic_schema_loader( config=config, name=name, primary_key=None, - stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), + partition_router=self._build_stream_slicer_from_partition_router(model.retriever, config), transformations=[], use_cache=True, log_formatter=( @@ -3139,6 +3140,7 @@ def create_simple_retriever( ] = None, use_cache: Optional[bool] = None, log_formatter: Optional[Callable[[Response], Any]] = None, + partition_router: Optional[PartitionRouter] = None, **kwargs: Any, ) -> SimpleRetriever: def _get_url(req: Requester) -> str: @@ -3236,6 +3238,8 @@ def _get_url(req: Requester) -> str: if not request_options_provider: request_options_provider = DefaultRequestOptionsProvider(parameters={}) + if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(partition_router, PartitionRouter): + request_options_provider = partition_router paginator = ( self._create_component_from_model( From 38cd6570332d96f14ad6d8a287e89c496f1a5232 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 2 Sep 2025 20:00:33 -0400 Subject: [PATCH 66/68] add migration guide --- cdk-migrations.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cdk-migrations.md b/cdk-migrations.md index d07c184d2..3445a04b8 100644 --- a/cdk-migrations.md +++ b/cdk-migrations.md @@ -1,5 +1,37 @@
 # CDK Migration Guide

+## Upgrading to 7.0.0
+
+[Version 7.0.0](https://github.com/airbytehq/airbyte-python-cdk/releases/tag/v7.0.0) of the CDK migrates the CDK to the Concurrent CDK by removing some of the Declarative CDK concepts that are better expressed in the Concurrent CDK or that are outright incompatible with it. These changes mostly impact the Python implementations, although the concept of CustomIncrementalSync has been removed from the declarative language as well.
+
+### CustomIncrementalSync
+
+Migration steps: None available
+
+Rationale: Our current interface for CustomIncrementalSync assumed that the first slice would be processed before the second, which would be processed before the third, and so on. In a concurrent world, units of work can be completed in any order. The current implementations of CustomIncrementalSync do not account for that and are therefore not compatible with the new version of the CDK. Also, we have rarely seen a CustomIncrementalSync that was actually needed. On top of that, state management is much more complex in a concurrent world, as it requires the developer to track multiple units of work and combine them into a simple representation of the state. For all those reasons, we have decided not to support CustomIncrementalSync, but if need be, feel free to reach out to our team and we will re-evaluate the need for it.
+
+### CustomRetriever State
+
+Migration steps: Ensure that you don't implement `Retriever.state` or rely on the field `SimpleRetriever.cursor`. For more information, see the point above.
+
+Rationale: As mentioned above, state has been moved outside the realm of stream responsibilities. Therefore, it does not make sense for the retriever (which is a stream-specific concept) to hold state information. This way, a connector developer who wants to implement a CustomRetriever no longer has to worry about state management.
+
+### Inheriting from SubstreamPartitionRouter
+
+Migration steps: If your custom component relies on SubstreamPartitionRouter.parent_stream_configs[x].stream, make sure you migrate from the `DeclarativeStream` interface to the `AbstractStream` one.
+
+Rationale: The `DeclarativeStream` interface is not compatible with the `AbstractStream` interface from the Concurrent CDK. In order to avoid maintaining two different instantiation flows (one for the SubstreamPartitionRouter and one for the Concurrent CDK), we decided to migrate `SubstreamPartitionRouter` to use `AbstractStream`.
+
+### CustomRetriever.stream_slices
+
+Migration steps: Ensure that you don't implement `Retriever.stream_slices` or rely on the field `SimpleRetriever.stream_slicer`. You can implement your own PartitionRouter to influence how stream slices are generated.
+
+Rationale: Generating units of work has been re-implemented as part of the Concurrent CDK because those units are parallelized. While making this change, we found no apparent reason to go through the retriever in order to get the stream slices. Hence, we are deprecating this method and will remove it.
+
+### Possible Missing Features
+
+We have seen that some custom components were created just for the RequestOptionsProvider interface. There should always be an escape path for that, which is string interpolation. If this is not enough, feel free to reach out to our team so that we can figure out a solution.
+
 ## Upgrading to 6.34.0

 [Version 6.34.0](https://github.com/airbytehq/airbyte-python-cdk/releases/tag/v6.34.0) of the CDK removes support for `stream_state` in the Jinja interpolation context. This change is breaking for any low-code connectors that use `stream_state` in the interpolation context.
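To illustrate the "CustomRetriever.stream_slices" guidance above, here is a minimal sketch of generating slices through a partition router instead of an overridden retriever method. It assumes that `SinglePartitionRouter` and `StreamSlice` keep their current import paths and that overriding `stream_slices` is all the component needs; the `StaticPartitionRouter` name and its `partition_values` field are invented for the example rather than taken from the CDK.

```python
# Minimal sketch, assuming current import paths; StaticPartitionRouter and
# partition_values are invented for this example.
from dataclasses import dataclass
from typing import Any, Iterable, Mapping

from airbyte_cdk.sources.declarative.partition_routers import SinglePartitionRouter
from airbyte_cdk.sources.types import StreamSlice


@dataclass
class StaticPartitionRouter(SinglePartitionRouter):
    """Yields one slice per configured partition instead of a single empty slice."""

    partition_values: Iterable[Mapping[str, Any]] = ()

    def stream_slices(self) -> Iterable[StreamSlice]:
        for partition in self.partition_values:
            # Only partition information goes into the slice; cursor handling stays
            # with the concurrent cursor, outside the retriever.
            yield StreamSlice(partition=dict(partition), cursor_slice={})
```

Instantiated as, for example, `StaticPartitionRouter(parameters={}, partition_values=[{"board_id": "1"}, {"board_id": "2"}])`, the router rather than the retriever decides which units of work exist, and the concurrent framework decides how they are scheduled.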
From 5bafecaae0032f1d8f2f53cf7ec029133eed9dc8 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 3 Sep 2025 13:25:44 -0400 Subject: [PATCH 67/68] code review comment --- .../declarative/parsers/model_to_component_factory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 6dea21c57..2a668b2b8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -752,7 +752,7 @@ def _init_mappings(self) -> None: OAuthAuthenticatorModel: self.create_oauth_authenticator, OffsetIncrementModel: self.create_offset_increment, PageIncrementModel: self.create_page_increment, - ParentStreamConfigModel: self._create_message_repository_substream_wrapper, + ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, PredicateValidatorModel: self.create_predicate_validator, PropertiesFromEndpointModel: self.create_properties_from_endpoint, PropertyChunkingModel: self.create_property_chunking, @@ -3688,7 +3688,7 @@ def create_substream_partition_router( if model.parent_stream_configs: parent_stream_configs.extend( [ - self._create_message_repository_substream_wrapper( + self.create_parent_stream_config_with_substream_wrapper( model=parent_stream_config, config=config, **kwargs ) for parent_stream_config in model.parent_stream_configs @@ -3701,7 +3701,7 @@ def create_substream_partition_router( config=config, ) - def _create_message_repository_substream_wrapper( + def create_parent_stream_config_with_substream_wrapper( self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any ) -> Any: # getting the parent state From 8bef9dd5b8cc14838069a535a26f78f9bcf0b9bd Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 3 Sep 2025 13:26:14 -0400 Subject: [PATCH 68/68] format --- .../parsers/model_to_component_factory.py | 28 ++++++++++++++----- .../test_concurrent_perpartitioncursor.py | 4 +-- .../test_model_to_component_factory.py | 17 +++++++++-- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 2a668b2b8..105f472de 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1748,7 +1748,11 @@ def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> if self._is_component(model_value): model_args[model_field] = self._create_nested_component( - model, model_field, model_value, config, **kwargs, + model, + model_field, + model_value, + config, + **kwargs, ) elif isinstance(model_value, list): vals = [] @@ -1760,7 +1764,15 @@ def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> if derived_type: v["type"] = derived_type if self._is_component(v): - vals.append(self._create_nested_component(model, model_field, v, config, **kwargs,)) + vals.append( + self._create_nested_component( + model, + model_field, + v, + config, + **kwargs, + ) + ) else: vals.append(v) model_args[model_field] = vals @@ -2525,7 +2537,9 @@ def create_dynamic_schema_loader( config=config, name=name, primary_key=None, - partition_router=self._build_stream_slicer_from_partition_router(model.retriever, config), + 
partition_router=self._build_stream_slicer_from_partition_router( + model.retriever, config + ), transformations=[], use_cache=True, log_formatter=( @@ -3238,7 +3252,9 @@ def _get_url(req: Requester) -> str: if not request_options_provider: request_options_provider = DefaultRequestOptionsProvider(parameters={}) - if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(partition_router, PartitionRouter): + if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( + partition_router, PartitionRouter + ): request_options_provider = partition_router paginator = ( @@ -3705,9 +3721,7 @@ def create_parent_stream_config_with_substream_wrapper( self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any ) -> Any: # getting the parent state - child_state = self._connector_state_manager.get_stream_state( - stream_name, None - ) + child_state = self._connector_state_manager.get_stream_state(stream_name, None) # This flag will be used exclusively for StateDelegatingStream when a parent stream is created has_parent_state = bool( diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 966d48fd9..3916a4da8 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -2133,9 +2133,7 @@ def test_incremental_parent_state_migration( "states": [ { "partition": {"id": 1, "parent_slice": {}}, - "cursor": { - "updated_at": START_DATE - }, + "cursor": {"updated_at": START_DATE}, } ], "lookback_window": 0, diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index f359459e0..3b2eaf03b 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -2106,12 +2106,20 @@ def test_custom_components_do_not_contain_extra_fields(): } custom_substream_partition_router = factory.create_component( - CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config, stream_name="child_stream_name", + CustomPartitionRouterModel, + custom_substream_partition_router_manifest, + input_config, + stream_name="child_stream_name", ) assert isinstance(custom_substream_partition_router, TestingCustomSubstreamPartitionRouter) assert len(custom_substream_partition_router.parent_stream_configs) == 1 - assert isinstance(custom_substream_partition_router.parent_stream_configs[0].stream.cursor._message_repository, StateFilteringMessageRepository) + assert isinstance( + custom_substream_partition_router.parent_stream_configs[ + 0 + ].stream.cursor._message_repository, + StateFilteringMessageRepository, + ) assert custom_substream_partition_router.parent_stream_configs[0].parent_key.eval({}) == "id" assert ( custom_substream_partition_router.parent_stream_configs[0].partition_field.eval({}) @@ -2174,7 +2182,10 @@ def test_parse_custom_component_fields_if_subcomponent(): } custom_substream_partition_router = factory.create_component( - CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config, stream_name="child_stream_name" + CustomPartitionRouterModel, + custom_substream_partition_router_manifest, + input_config, + stream_name="child_stream_name", ) 
assert isinstance(custom_substream_partition_router, TestingCustomSubstreamPartitionRouter) assert custom_substream_partition_router.custom_field == "here"
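The tests above resolve `parent_stream_configs[0].stream` to a concurrent stream rather than a `DeclarativeStream`. To illustrate the "Inheriting from SubstreamPartitionRouter" migration entry, here is a hedged sketch of iterating a parent stream through that interface. It assumes the `AbstractStream` and `Partition` surface remains `generate_partitions()` and `read()`; verify this against the CDK version you target, and note that `iter_parent_records` is an invented helper name.

```python
from typing import Any, Iterable, Mapping

from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream


def iter_parent_records(parent_stream: AbstractStream) -> Iterable[Mapping[str, Any]]:
    # generate_partitions() and Partition.read() are assumed to be the relevant
    # Concurrent CDK entry points; each partition is an independent unit of work.
    for partition in parent_stream.generate_partitions():
        for record in partition.read():
            yield record.data
```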