Commit ef0f3f8: "pr feedback" (1 parent: ef07959)

File tree: 3 files changed (+138, -39 lines)

airbyte_cdk/sources/declarative/retrievers/retriever.py

Lines changed: 0 additions & 1 deletion

@@ -30,7 +30,6 @@ def read_records(
         :return: The records read from the API source
         """
 
-    @abstractmethod
     @deprecated("Stream slicing is being moved to the stream level.")
     def stream_slices(self) -> Iterable[Optional[StreamSlice]]:
         """Does nothing as this method is deprecated, so underlying Retriever implementations

airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

Lines changed: 3 additions & 0 deletions

@@ -530,6 +530,9 @@ def _read_pages(
 
             yield from []
         else:
+            # coderabbit detected an interesting bug/gap where if we were to not get a child_response, we
+            # might recurse forever. This might not be the case, but it is worth noting that this code path
+            # isn't comprehensively tested.
             yield from self._read_pages(records_generator_fn, stream_slice)
 
     def _paginate(
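
The new comment flags a possible unbounded recursion in _read_pages if a child response never materializes. One defensive pattern is an explicit depth cap; the sketch below is purely illustrative (the class, the cap, and _has_next_page are hypothetical, not the CDK's actual code):

from typing import Any, Callable, Iterable

_MAX_PAGINATION_DEPTH = 1_000  # hypothetical cap, not a CDK constant


class BoundedPager:
    """Toy stand-in for SimpleRetriever showing one way to bound the recursion."""

    def _read_pages(
        self,
        records_generator_fn: Callable[[Any], Iterable[Any]],
        stream_slice: Any,
        depth: int = 0,
    ) -> Iterable[Any]:
        if depth >= _MAX_PAGINATION_DEPTH:
            raise RuntimeError("pagination never yielded a child response; aborting")
        yield from records_generator_fn(stream_slice)
        if self._has_next_page(stream_slice):
            # recurse with an incremented depth instead of an unbounded self-call
            yield from self._read_pages(records_generator_fn, stream_slice, depth + 1)

    def _has_next_page(self, stream_slice: Any) -> bool:
        return False  # placeholder termination condition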

unit_tests/sources/declarative/async_job/test_integration.py

Lines changed: 135 additions & 38 deletions

@@ -2,18 +2,25 @@
 
 
 import logging
-from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple
+from queue import Queue
+from typing import Any, Iterable, Iterator, List, Mapping, Optional, Set, Tuple
 from unittest import TestCase, mock
 
-from airbyte_cdk.legacy.sources.declarative.declarative_stream import DeclarativeStream
+from airbyte_protocol_dataclasses.models import (
+    AirbyteCatalog,
+    AirbyteMessage,
+    AirbyteStateMessage,
+    ConfiguredAirbyteCatalog,
+)
+
 from airbyte_cdk.models import ConnectorSpecification
-from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
 from airbyte_cdk.sources.declarative.async_job.job import AsyncJob
 from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
 from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
 from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
 from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
-from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
+from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
 from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector
 from airbyte_cdk.sources.declarative.partition_routers import SinglePartitionRouter
 from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
@@ -22,15 +29,23 @@
 from airbyte_cdk.sources.declarative.retrievers.async_retriever import AsyncRetriever
 from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader
 from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer
+from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
+    DeclarativePartitionFactory,
+    StreamSlicerPartitionGenerator,
+)
 from airbyte_cdk.sources.message import NoopMessageRepository
-from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.source import Source
+from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
+from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
 from airbyte_cdk.sources.types import StreamSlice
+from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
 from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
 from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder
 from airbyte_cdk.test.entrypoint_wrapper import read
 
 _A_STREAM_NAME = "a_stream_name"
-_EXTRACTOR_NOT_USED: RecordExtractor = None  # type: ignore  # the extractor should not be used. If it is the case, there is an issue that needs fixing
 _NO_LIMIT = 10000
 
 
@@ -52,57 +67,139 @@ def delete(self, job: AsyncJob) -> None:
         pass
 
 
-class MockSource(AbstractSource):
+class MockSource(Source):
     def __init__(self, stream_slicer: Optional[StreamSlicer] = None) -> None:
         self._stream_slicer = SinglePartitionRouter({}) if stream_slicer is None else stream_slicer
+        queue: Queue[QueueItem] = Queue(maxsize=10_000)
         self._message_repository = NoopMessageRepository()
+        self._config = {}
+
+        self._concurrent_source = ConcurrentSource.create(
+            num_workers=1,
+            initial_number_of_partitions_to_generate=1,
+            logger=logging.getLogger("airbyte"),
+            slice_logger=DebugSliceLogger(),
+            queue=queue,
+            message_repository=self._message_repository,
+        )
 
-    def check_connection(
+    def check(
         self, logger: logging.Logger, config: Mapping[str, Any]
     ) -> Tuple[bool, Optional[Any]]:
         return True, None
 
     def spec(self, logger: logging.Logger) -> ConnectorSpecification:
         return ConnectorSpecification(connectionSpecification={})
 
-    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
-        noop_record_selector = RecordSelector(
-            extractor=_EXTRACTOR_NOT_USED,
+    def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]:
+        # Build the partition router with the mock repository
+        partition_router = AsyncJobPartitionRouter(
+            stream_slicer=self._stream_slicer,
+            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
+                MockAsyncJobRepository(),
+                stream_slices,
+                JobTracker(_NO_LIMIT),
+                self._message_repository,
+            ),
+            config={},
+            parameters={},
+        )
+
+        # Create the extractor that extracts records from responses
+        extractor = DpathExtractor(
+            field_path=[],
+            config={},
+            parameters={},
+        )
+
+        # Create the record selector with the extractor
+        record_selector = RecordSelector(
+            extractor=extractor,
             config={},
             parameters={},
             schema_normalization=TypeTransformer(TransformConfig.NoTransform),
+            name=_A_STREAM_NAME,
             record_filter=None,
             transformations=[],
         )
-        return [
-            DeclarativeStream(
-                retriever=AsyncRetriever(
-                    config={},
-                    parameters={},
-                    record_selector=noop_record_selector,
-                    stream_slicer=AsyncJobPartitionRouter(
-                        stream_slicer=self._stream_slicer,
-                        job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
-                            MockAsyncJobRepository(),
-                            stream_slices,
-                            JobTracker(_NO_LIMIT),
-                            self._message_repository,
-                        ),
-                        config={},
-                        parameters={},
-                    ),
-                ),
-                config={},
-                parameters={},
-                name=_A_STREAM_NAME,
-                primary_key=["id"],
-                schema_loader=InlineSchemaLoader({}, {}),
-                # the interface mentions that this is Optional,
-                # but I get `'NoneType' object has no attribute 'eval'` by passing None
-                stream_cursor_field="",
-            )
+
+        # Build the retriever with the partition router
+        retriever = AsyncRetriever(
+            config={},
+            parameters={},
+            record_selector=record_selector,
+            stream_slicer=partition_router,
+        )
+
+        # Create schema loader
+        schema_loader = InlineSchemaLoader({}, {})
+
+        # Create partition factory that will create partitions from stream slices
+        partition_factory = DeclarativePartitionFactory(
+            stream_name=_A_STREAM_NAME,
+            schema_loader=schema_loader,
+            retriever=retriever,
+            message_repository=self._message_repository,
+            max_records_limit=None,
+        )
+
+        # Create partition generator that wraps the partition router
+        partition_generator = StreamSlicerPartitionGenerator(
+            partition_factory=partition_factory,
+            stream_slicer=partition_router,
+            slice_limit=None,
+            max_records_limit=None,
+        )
+
+        # Create cursor (using FinalStateCursor for full refresh)
+        cursor = FinalStateCursor(
+            stream_name=_A_STREAM_NAME,
+            stream_namespace=None,
+            message_repository=self._message_repository,
+        )
+
+        # Directly instantiate DefaultStream with all components
+        stream = DefaultStream(
+            partition_generator=partition_generator,
+            name=_A_STREAM_NAME,
+            json_schema={},
+            primary_key=["id"],
+            cursor_field=None,
+            logger=logging.getLogger("airbyte"),
+            cursor=cursor,
+        )
+
+        return [stream]
+
+    def read(
+        self,
+        logger: logging.Logger,
+        config: Mapping[str, Any],
+        catalog: ConfiguredAirbyteCatalog,
+        state: Optional[List[AirbyteStateMessage]] = None,
+    ) -> Iterator[AirbyteMessage]:
+        stream_name_to_instance = {s.name: s for s in self.streams(config=self._config)}
+        selected_concurrent_streams = [
+            stream_name_to_instance[configured_stream.stream.name]
+            for configured_stream in catalog.streams
+            if configured_stream.stream.name in stream_name_to_instance
         ]
 
+        # selected_concurrent_streams = self._select_streams(
+        #     streams=,  # type: ignore  # We are migrating away from the DeclarativeStream implementation and streams() only returns the concurrent-compatible AbstractStream. To preserve compatibility, we retain the existing method interface
+        #     configured_catalog=catalog,
+        # )
+
+        # It would appear that passing in an empty set of streams causes an infinite loop in ConcurrentReadProcessor.
+        # This is also evident in concurrent_source_adapter.py so I'll leave this out of scope to fix for now
+        if len(selected_concurrent_streams) > 0:
+            yield from self._concurrent_source.read(selected_concurrent_streams)
+
+    def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
+        return AirbyteCatalog(
+            streams=[stream.as_airbyte_stream() for stream in self.streams(config=self._config)]
+        )
+
 
 class JobDeclarativeStreamTest(TestCase):
     _CONFIG: Mapping[str, Any] = {}
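
Since the rewritten MockSource implements check/discover/read directly against the concurrent stack, a test can drive it end to end through the entrypoint wrapper this file already imports; note that read() skips ConcurrentSource entirely for an empty stream selection, sidestepping the infinite-loop issue mentioned in the comment above. A minimal usage sketch (the catalog construction and assertion are illustrative, not taken from this diff):

from airbyte_cdk.models import SyncMode
from airbyte_cdk.test.catalog_builder import CatalogBuilder
from airbyte_cdk.test.entrypoint_wrapper import read

source = MockSource()
catalog = CatalogBuilder().with_stream(_A_STREAM_NAME, SyncMode.full_refresh).build()
output = read(source, {}, catalog)  # EntrypointOutput exposing .records and .errors
assert output.errors == []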
