Commit d1548c0

remove per-partition limits in favor of global limit on SimpleRetriever
1 parent 2580057 commit d1548c0

8 files changed (+146, −148 lines)
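In short, the Connector Builder's record cap moves out of the concurrent framework (where PartitionReader counted records per partition) and onto SimpleRetriever, where a single counter spans all slices. A minimal sketch of the new wiring, assuming a TestLimits shape inferred from the `limits.max_records` accesses in the hunks below:

    # Hedged sketch, not CDK code: `TestLimits` and `build` are illustrative.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class TestLimits:
        max_records: Optional[int] = None  # assumed field, per `limits.max_records`

    @dataclass
    class Retriever:  # stand-in for SimpleRetriever's new `max_records` field
        max_records: Optional[int] = None

    def build(limits: Optional[TestLimits]) -> Retriever:
        # The cap is forwarded through ModelToComponentFactory(limit_max_records)
        # to the retriever, instead of max_records_per_partition on ConcurrentSource.
        return Retriever(max_records=limits.max_records if limits else None)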

airbyte_cdk/connector_builder/test_reader/message_grouper.py

Lines changed: 0 additions & 10 deletions

@@ -96,16 +96,6 @@ def get_message_groups(
     slice_auxiliary_requests: List[AuxiliaryRequest] = []

     while message := next(messages, None):
-        # Even though we do not emit records beyond the limit in the message group response, we still
-        # need to process messages off the queue in order to avoid a deadlock that occurs if the amount
-        # of extracted records exceeds the size of the queue (which has a default of 10,000)
-        #
-        # A few other options considered were killing the thread pool, but that doesn't kill in-progress
-        # threads. We also considered adding another event to the main queue, but this is
-        # the simplest solution for the time being.
-        if records_count >= limit:
-            continue
         json_message = airbyte_message_to_json(message)

         if is_page_http_request_for_different_stream(json_message, stream_name):
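The deleted guard drained messages past the limit because a bounded queue deadlocks if producers keep writing while the consumer stops reading; with the cap now enforced inside SimpleRetriever, nothing is produced beyond the limit, so the drain-and-discard loop is no longer needed. A generic illustration of that hazard (plain Python, not CDK code):

    from queue import Full, Queue
    from threading import Thread

    q: Queue = Queue(maxsize=2)  # the CDK queue's default maxsize is 10,000

    def producer() -> None:
        for i in range(5):
            try:
                q.put(i, timeout=1)  # without a timeout, blocks forever once full
            except Full:
                print(f"producer stuck at item {i}: consumer stopped draining")
                return

    t = Thread(target=producer)
    t.start()
    print("consumed:", q.get())  # drain a single item, then stop consuming
    t.join()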

airbyte_cdk/sources/concurrent_source/concurrent_source.py

Lines changed: 0 additions & 5 deletions

@@ -45,7 +45,6 @@ def create(
         message_repository: MessageRepository,
         queue: Optional[Queue[QueueItem]] = None,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
-        max_records_per_partition: Optional[int] = None,
     ) -> "ConcurrentSource":
         is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
         too_many_generator = (
@@ -68,7 +67,6 @@ def create(
             message_repository=message_repository,
             initial_number_partitions_to_generate=initial_number_of_partitions_to_generate,
             timeout_seconds=timeout_seconds,
-            max_records_per_partition=max_records_per_partition,
         )

     def __init__(
@@ -80,7 +78,6 @@ def __init__(
         message_repository: MessageRepository = InMemoryMessageRepository(),
         initial_number_partitions_to_generate: int = 1,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
-        max_records_per_partition: Optional[int] = None,
     ) -> None:
         """
         :param threadpool: The threadpool to submit tasks to
@@ -96,7 +93,6 @@ def __init__(
         self._message_repository = message_repository
         self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
         self._timeout_seconds = timeout_seconds
-        self._max_records_per_partition = max_records_per_partition

         # We set a maxsize so the main thread can process record items when the queue size grows. This assumes that there are
         # fewer threads generating partitions than the max number of workers. If it weren't the case, we could have threads only generating
@@ -119,7 +115,6 @@ def read(
             PartitionReader(
                 self._queue,
                 PartitionLogger(self._slice_logger, self._logger, self._message_repository),
-                self._max_records_per_partition,
             ),
         )

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

Lines changed: 1 addition & 1 deletion

@@ -126,6 +126,7 @@ def __init__(
             max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
             limit_pages_fetched_per_slice=limits.max_pages_per_slice if limits else None,
             limit_slices_fetched=limits.max_slices if limits else None,
+            limit_max_records=limits.max_records if limits else None,
             disable_retries=True if limits else False,
             disable_cache=True if limits else False,
         )
@@ -170,7 +171,6 @@ def __init__(
             slice_logger=self._slice_logger,
             queue=queue,
             message_repository=self.message_repository,
-            max_records_per_partition=limits.max_records if limits else None,
         )

         # TODO: Remove this. This property is necessary to safely migrate Stripe during the transition state.

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

Lines changed: 3 additions & 0 deletions

@@ -634,6 +634,7 @@ def __init__(
         self,
         limit_pages_fetched_per_slice: Optional[int] = None,
         limit_slices_fetched: Optional[int] = None,
+        limit_max_records: Optional[int] = None,
         emit_connector_builder_messages: bool = False,
         disable_retries: bool = False,
         disable_cache: bool = False,
@@ -645,6 +646,7 @@ def __init__(
         self._init_mappings()
         self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
         self._limit_slices_fetched = limit_slices_fetched
+        self._limit_max_records = limit_max_records
         self._emit_connector_builder_messages = emit_connector_builder_messages
         self._disable_retries = disable_retries
         self._disable_cache = disable_cache
@@ -3398,6 +3400,7 @@ def _get_url() -> str:
             ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
             additional_query_properties=query_properties,
             log_formatter=self._get_log_formatter(log_formatter, name),
+            max_records=self._limit_max_records,
             parameters=model.parameters or {},
         )

airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

Lines changed: 15 additions & 0 deletions

@@ -92,6 +92,7 @@ class SimpleRetriever(Retriever):
     ignore_stream_slicer_parameters_on_paginated_requests: bool = False
     additional_query_properties: Optional[QueryProperties] = None
     log_formatter: Optional[Callable[[requests.Response], Any]] = None
+    max_records: Optional[int] = None

     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._paginator = self.paginator or NoPagination(parameters=parameters)
@@ -101,6 +102,7 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None:
             if isinstance(self._name, str)
             else self._name
         )
+        self._total_records_read = 0

     @property  # type: ignore
     def name(self) -> str:
@@ -501,6 +503,12 @@ def read_records(
         :param stream_slice: The stream slice to read data for
         :return: The records read from the API source
         """
+
+        # For Connector Builder test read operations, if the max number of records has already been
+        # reached, we just return without attempting to extract any more records
+        if self.max_records and self._total_records_read >= self.max_records:
+            return
+
         _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check

         most_recent_record_from_slice = None
@@ -529,6 +537,13 @@ def read_records(

                 yield stream_data

+                # For Connector Builder test read operations, if the max number of records is reached, we
+                # exit the process early without emitting more records or attempting to extract more
+                if self.max_records:
+                    self._total_records_read += 1
+                    if self._total_records_read >= self.max_records:
+                        break
+
         if self.cursor:
             self.cursor.close_slice(_slice)
         return
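The two added checks share one counter that persists across calls to read_records, so the cap applies across slices rather than per slice. A toy model of that behavior (illustrative, not CDK code):

    from typing import Iterable, Iterator, Optional

    class ToyRetriever:
        def __init__(self, max_records: Optional[int] = None) -> None:
            self.max_records = max_records
            self._total_records_read = 0  # persists across read_records calls

        def read_records(self, slice_data: Iterable[int]) -> Iterator[int]:
            if self.max_records and self._total_records_read >= self.max_records:
                return  # a later slice: limit already hit, emit nothing
            for record in slice_data:
                yield record
                if self.max_records:
                    self._total_records_read += 1
                    if self._total_records_read >= self.max_records:
                        break

    r = ToyRetriever(max_records=3)
    out = [rec for s in ([1, 2], [3, 4], [5]) for rec in r.read_records(s)]
    print(out)  # [1, 2, 3] — the limit spans slices, not each slice separately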

airbyte_cdk/sources/streams/concurrent/partition_reader.py

Lines changed: 0 additions & 11 deletions

@@ -53,14 +53,12 @@ def __init__(
         self,
         queue: Queue[QueueItem],
         partition_logger: Optional[PartitionLogger] = None,
-        max_records_per_partition: Optional[int] = None,
     ) -> None:
         """
         :param queue: The queue to put the records in.
         """
         self._queue = queue
         self._partition_logger = partition_logger
-        self._max_records_per_partition = max_records_per_partition

     def process_partition(self, partition: Partition, cursor: Cursor) -> None:
         """
@@ -78,18 +76,9 @@ def process_partition(self, partition: Partition, cursor: Cursor) -> None:
             if self._partition_logger:
                 self._partition_logger.log(partition)

-            record_count = 0
             for record in partition.read():
                 self._queue.put(record)
                 cursor.observe(record)
-                record_count += 1
-                if (
-                    self._max_records_per_partition
-                    and record_count >= self._max_records_per_partition
-                ):
-                    # We stop processing a partition after exceeding the max_records for Connector
-                    # Builder test reads. The record limit only applies to an individual partition
-                    break
             cursor.close_partition(partition)
             self._queue.put(PartitionCompleteSentinel(partition, self._IS_SUCCESSFUL))
         except Exception as e:
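For contrast with the removed logic: because record_count reset for every partition, a stream with N partitions could emit up to N × max_records records in a test read, which the global counter in SimpleRetriever fixes. A toy comparison (illustrative numbers, not CDK code):

    partitions = [[1, 2, 3], [4, 5, 6]]
    max_records = 2

    # Per-partition counting: the cap applies to each partition independently.
    per_partition = [rec for part in partitions for rec in part[:max_records]]
    print(per_partition)  # [1, 2, 4, 5] — 4 records despite a limit of 2

    # Global counting: one counter shared across partitions.
    total, global_limited = 0, []
    for part in partitions:
        for rec in part:
            if total >= max_records:
                break
            global_limited.append(rec)
            total += 1
    print(global_limited)  # [1, 2] — the SimpleRetriever-style global cap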

unit_tests/connector_builder/test_message_grouper.py

Lines changed: 0 additions & 120 deletions

@@ -307,126 +307,6 @@ def test_get_grouped_messages_with_logs(mock_entrypoint_read: Mock) -> None:
         assert actual_log == expected_logs[i]


-@pytest.mark.parametrize(
-    "request_record_limit, max_record_limit, should_fail",
-    [
-        pytest.param(1, 3, False, id="test_create_request_with_record_limit"),
-        pytest.param(3, 1, True, id="test_create_request_record_limit_exceeds_max"),
-    ],
-)
-@patch("airbyte_cdk.connector_builder.test_reader.reader.AirbyteEntrypoint.read")
-def test_get_grouped_messages_record_limit(
-    mock_entrypoint_read: Mock, request_record_limit: int, max_record_limit: int, should_fail: bool
-) -> None:
-    stream_name = "hashiras"
-    url = "https://demonslayers.com/api/v1/hashiras?era=taisho"
-    request = {
-        "headers": {"Content-Type": "application/json"},
-        "method": "GET",
-        "body": {"content": '{"custom": "field"}'},
-    }
-    response = {
-        "status_code": 200,
-        "headers": {"field": "value"},
-        "body": {"content": '{"name": "field"}'},
-    }
-    mock_source = make_mock_source(
-        mock_entrypoint_read,
-        iter(
-            [
-                request_response_log_message(request, response, url, stream_name),
-                record_message(stream_name, {"name": "Shinobu Kocho"}),
-                record_message(stream_name, {"name": "Muichiro Tokito"}),
-                request_response_log_message(request, response, url, stream_name),
-                record_message(stream_name, {"name": "Mitsuri Kanroji"}),
-            ]
-        ),
-    )
-    n_records = 2
-    record_limit = min(request_record_limit, max_record_limit)

-    api = TestReader(MAX_PAGES_PER_SLICE, MAX_SLICES, max_record_limit=max_record_limit)
-    # this is the call we expect to raise an exception
-    if should_fail:
-        with pytest.raises(ValueError):
-            api.run_test_read(
-                mock_source,
-                config=CONFIG,
-                configured_catalog=create_configured_catalog(stream_name),
-                stream_name=stream_name,
-                state=_NO_STATE,
-                record_limit=request_record_limit,
-            )
-    else:
-        actual_response: StreamRead = api.run_test_read(
-            mock_source,
-            config=CONFIG,
-            configured_catalog=create_configured_catalog(stream_name),
-            stream_name=stream_name,
-            state=_NO_STATE,
-            record_limit=request_record_limit,
-        )
-        single_slice = actual_response.slices[0]
-        total_records = 0
-        for i, actual_page in enumerate(single_slice.pages):
-            total_records += len(actual_page.records)
-        assert total_records == min([record_limit, n_records])

-        assert (total_records >= max_record_limit) == actual_response.test_read_limit_reached
-
-
-@pytest.mark.parametrize(
-    "max_record_limit",
-    [
-        pytest.param(2, id="test_create_request_no_record_limit"),
-        pytest.param(1, id="test_create_request_no_record_limit_n_records_exceed_max"),
-    ],
-)
-@patch("airbyte_cdk.connector_builder.test_reader.reader.AirbyteEntrypoint.read")
-def test_get_grouped_messages_default_record_limit(
-    mock_entrypoint_read: Mock, max_record_limit: int
-) -> None:
-    stream_name = "hashiras"
-    url = "https://demonslayers.com/api/v1/hashiras?era=taisho"
-    request = {
-        "headers": {"Content-Type": "application/json"},
-        "method": "GET",
-        "body": {"content": '{"custom": "field"}'},
-    }
-    response = {
-        "status_code": 200,
-        "headers": {"field": "value"},
-        "body": {"content": '{"name": "field"}'},
-    }
-    mock_source = make_mock_source(
-        mock_entrypoint_read,
-        iter(
-            [
-                request_response_log_message(request, response, url, stream_name),
-                record_message(stream_name, {"name": "Shinobu Kocho"}),
-                record_message(stream_name, {"name": "Muichiro Tokito"}),
-                request_response_log_message(request, response, url, stream_name),
-                record_message(stream_name, {"name": "Mitsuri Kanroji"}),
-            ]
-        ),
-    )
-    n_records = 2

-    api = TestReader(MAX_PAGES_PER_SLICE, MAX_SLICES, max_record_limit=max_record_limit)
-    actual_response: StreamRead = api.run_test_read(
-        source=mock_source,
-        config=CONFIG,
-        configured_catalog=create_configured_catalog(stream_name),
-        stream_name=stream_name,
-        state=_NO_STATE,
-    )
-    single_slice = actual_response.slices[0]
-    total_records = 0
-    for i, actual_page in enumerate(single_slice.pages):
-        total_records += len(actual_page.records)
-    assert total_records == min([max_record_limit, n_records])
-
-
 @patch("airbyte_cdk.connector_builder.test_reader.reader.AirbyteEntrypoint.read")
 def test_get_grouped_messages_limit_0(mock_entrypoint_read: Mock) -> None:
     stream_name = "hashiras"
