|
1 | 1 | # |
2 | 2 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. |
3 | 3 | # |
| 4 | + |
| 5 | + |
4 | 6 | import copy |
5 | 7 | import logging |
6 | 8 | from dataclasses import InitVar, dataclass |
7 | | -from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union |
| 9 | +from typing import TYPE_CHECKING, Any, Iterable, Tuple, List, Mapping, MutableMapping, Optional, Union |
8 | 10 |
|
9 | 11 | import dpath |
10 | 12 |
|
@@ -131,6 +133,40 @@ def _get_request_option( |
131 | 133 | parent_config.request_option.inject_into_request(params, value, self.config) |
132 | 134 | return params |
133 | 135 |
|
| 136 | + def process_parent_record(self, parent_record: Union[AirbyteMessage, Record, Mapping], parent_stream_name: str) -> Tuple[Optional[Mapping], Optional[Mapping]]: |
| 137 | + """ |
| 138 | + Processes and extracts data from a parent record, handling different record types |
| 139 | + and ensuring only valid types proceed. |
| 140 | +
|
| 141 | + :param parent_record: The parent record to process. |
| 142 | + :param parent_stream_name: The parent stream name associated with the record. |
| 143 | + :return: Extracted record data and partition (if applicable). |
| 144 | + :raises AirbyteTracedException: If the record type is invalid. |
| 145 | + """ |
| 146 | + if isinstance(parent_record, AirbyteMessage): |
| 147 | + self.logger.warning( |
| 148 | + f"Parent stream {parent_stream_name} returns records of type AirbyteMessage. " |
| 149 | + f"This SubstreamPartitionRouter is not able to checkpoint incremental parent state." |
| 150 | + ) |
| 151 | + if parent_record.type == MessageType.RECORD: |
| 152 | + return parent_record.record.data, {} |
| 153 | + return None, None # Skip invalid or non-record data |
| 154 | + |
| 155 | + # Handle Record type |
| 156 | + if isinstance(parent_record, Record): |
| 157 | + parent_partition = ( |
| 158 | + parent_record.associated_slice.partition if parent_record.associated_slice else {} |
| 159 | + ) |
| 160 | + return parent_record.data, parent_partition |
| 161 | + |
| 162 | + # Validate the record type |
| 163 | + if not isinstance(parent_record, Mapping): |
| 164 | + raise AirbyteTracedException( |
| 165 | + message=f"Parent stream returned records as invalid type {type(parent_record)}" |
| 166 | + ) |
| 167 | + |
| 168 | + return parent_record, {} |
| 169 | + |
134 | 170 | def stream_slices(self) -> Iterable[StreamSlice]: |
135 | 171 | """ |
136 | 172 | Iterate over each parent stream's record and create a StreamSlice for each record. |
@@ -163,28 +199,13 @@ def stream_slices(self) -> Iterable[StreamSlice]: |
163 | 199 | # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does |
164 | 200 | # not support either substreams or RFR, but something that needs to be considered once we do |
165 | 201 | for parent_record in parent_stream.read_only_records(): |
166 | | - parent_partition = None |
167 | | - # Skip non-records (eg AirbyteLogMessage) |
168 | | - if isinstance(parent_record, AirbyteMessage): |
169 | | - self.logger.warning( |
170 | | - f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." |
171 | | - ) |
172 | | - if parent_record.type == MessageType.RECORD: |
173 | | - parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record |
174 | | - else: |
175 | | - continue |
176 | | - elif isinstance(parent_record, Record): |
177 | | - parent_partition = ( |
178 | | - parent_record.associated_slice.partition |
179 | | - if parent_record.associated_slice |
180 | | - else {} |
181 | | - ) |
182 | | - parent_record = parent_record.data |
183 | | - elif not isinstance(parent_record, Mapping): |
184 | | - # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. Anything else is invalid |
185 | | - raise AirbyteTracedException( |
186 | | - message=f"Parent stream returned records as invalid type {type(parent_record)}" |
187 | | - ) |
| 202 | + # Process the parent record |
| 203 | + parent_record, parent_partition = self.process_parent_record(parent_record, parent_stream.name) |
| 204 | + |
| 205 | + # Skip invalid or non-record data |
| 206 | + if parent_record is None: |
| 207 | + continue |
| 208 | + |
188 | 209 | try: |
189 | 210 | partition_value = dpath.get( |
190 | 211 | parent_record, # type: ignore [arg-type] |
|
0 commit comments