77import io
88import json
99import logging
10- from abc import ABC , abstractmethod
1110from dataclasses import dataclass
1211from io import BufferedIOBase , TextIOWrapper
13- from typing import Any , Generator , MutableMapping , Optional
12+ from typing import Any , Optional
1413
1514import orjson
1615import requests
1716
1817from airbyte_cdk .models import FailureType
19- from airbyte_cdk .sources .declarative .decoders .decoder import Decoder
18+ from airbyte_cdk .sources .declarative .decoders .decoder import DECODER_OUTPUT_TYPE , Decoder
19+ from airbyte_cdk .sources .declarative .decoders .decoder_parser import (
20+ PARSER_OUTPUT_TYPE ,
21+ PARSERS_BY_HEADER_TYPE ,
22+ PARSERS_TYPE ,
23+ Parser ,
24+ )
2025from airbyte_cdk .utils import AirbyteTracedException
2126
2227logger = logging .getLogger ("airbyte" )
2328
2429
25- @dataclass
26- class Parser (ABC ):
27- @abstractmethod
28- def parse (
29- self ,
30- data : BufferedIOBase ,
31- compressed : Optional [bool ] = False ,
32- ) -> Generator [MutableMapping [str , Any ], None , None ]:
33- """
34- Parse data and yield dictionaries.
35- """
36- pass
37-
38-
3930@dataclass
4031class GzipParser (Parser ):
4132 inner_parser : Parser
4233
43- def parse (
44- self ,
45- data : BufferedIOBase ,
46- compressed : Optional [bool ] = False ,
47- ) -> Generator [MutableMapping [str , Any ], None , None ]:
34+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
4835 """
4936 Decompress gzipped bytes and pass decompressed data to the inner parser.
5037
@@ -55,22 +42,15 @@ def parse(
5542 - The data is not decoded by default.
5643 """
5744
58- if compressed :
59- with gzip .GzipFile (fileobj = data , mode = "rb" ) as gzipobj :
60- yield from self .inner_parser .parse (gzipobj )
61- else :
62- yield from self .inner_parser .parse (data )
45+ with gzip .GzipFile (fileobj = data , mode = "rb" ) as gzipobj :
46+ yield from self .inner_parser .parse (gzipobj )
6347
6448
6549@dataclass
6650class JsonParser (Parser ):
6751 encoding : str = "utf-8"
6852
69- def parse (
70- self ,
71- data : BufferedIOBase ,
72- compressed : Optional [bool ] = False ,
73- ) -> Generator [MutableMapping [str , Any ], None , None ]:
53+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
7454 """
7555 Attempts to deserialize data using orjson library. As an extra layer of safety we fallback on the json library to deserialize the data.
7656 """
@@ -110,11 +90,7 @@ def _parse_json(self, raw_data: bytes) -> Optional[Any]:
11090class JsonLineParser (Parser ):
11191 encoding : Optional [str ] = "utf-8"
11292
113- def parse (
114- self ,
115- data : BufferedIOBase ,
116- compressed : Optional [bool ] = False ,
117- ) -> Generator [MutableMapping [str , Any ], None , None ]:
93+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
11894 for line in data :
11995 try :
12096 yield json .loads (line .decode (encoding = self .encoding or "utf-8" ))
@@ -138,11 +114,7 @@ def _get_delimiter(self) -> Optional[str]:
138114
139115 return self .delimiter
140116
141- def parse (
142- self ,
143- data : BufferedIOBase ,
144- compressed : Optional [bool ] = False ,
145- ) -> Generator [MutableMapping [str , Any ], None , None ]:
117+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
146118 """
147119 Parse CSV data from decompressed bytes.
148120 """
@@ -152,10 +124,9 @@ def parse(
152124 yield row
153125
154126
155- @dataclass
156127class CompositeRawDecoder (Decoder ):
157128 """
158- Decoder strategy to transform a requests.Response into a Generator[MutableMapping[str, Any], None, None]
129+ Decoder strategy to transform a requests.Response into a PARSER_OUTPUT_TYPE
159130 passed response.raw to parser(s).
160131
161132 Note: response.raw is not decoded/decompressed by default. Parsers should be instantiated recursively.
@@ -168,26 +139,80 @@ class CompositeRawDecoder(Decoder):
168139 )
169140 """
170141
171- parser : Parser
172- stream_response : bool = True
def __init__(
    self,
    parser: Parser,
    stream_response: bool = True,
    parsers_by_header: PARSERS_BY_HEADER_TYPE = None,
) -> None:
    """
    Build a decoder around a default parser and optional header-specific parsers.

    Args:
        parser (Parser): The default parser; `_select_parser` returns it when no header matches.
        stream_response (bool): The value reported by `is_stream_response`. Defaults to True.
        parsers_by_header (PARSERS_BY_HEADER_TYPE): Mapping of header name to a mapping of
            header value to parser. Any falsy value (None or an empty mapping) is replaced
            by a new empty dict.
    """
    self._stream_response = stream_response
    self._parsers_by_header = parsers_by_header or {}

    # This class is no longer a dataclass, but the default parser deliberately stays on the
    # public `parser` attribute so that the dependencies between the parsers keep being
    # resolved correctly.
    self.parser = parser
155+
@classmethod
def by_headers(
    cls,
    parsers: PARSERS_TYPE,
    stream_response: bool,
    fallback_parser: Parser,
) -> "CompositeRawDecoder":
    """
    Create a CompositeRawDecoder instance that picks its parser from response header values.

    Entries that share a header name are merged: every (header, header value) pair from
    every tuple ends up in the lookup table. If the same (header, header value) pair
    appears more than once, the parser from the later tuple wins.

    Args:
        parsers (PARSERS_TYPE): A list of tuples where each tuple contains headers, header values, and a parser.
        stream_response (bool): A flag indicating whether the response should be streamed.
        fallback_parser (Parser): A parser to use if no matching header is found.

    Returns:
        CompositeRawDecoder: An instance of CompositeRawDecoder configured with the provided parsers.
    """
    parsers_by_header: dict[str, dict[str, Parser]] = {}
    for headers, header_values, parser in parsers:
        for header in headers:
            # Merge into the existing mapping rather than assigning a new one, so that a
            # later tuple naming the same header does not discard parsers registered earlier.
            parsers_by_header.setdefault(header, {}).update(
                {header_value: parser for header_value in header_values}
            )
    return cls(fallback_parser, stream_response, parsers_by_header)
173179
def is_stream_response(self) -> bool:
    """Return the `stream_response` flag this decoder was constructed with."""
    return self._stream_response
176182
def decode(self, response: requests.Response) -> DECODER_OUTPUT_TYPE:
    """
    Parse the response body with the parser selected for this response.

    When streaming is enabled, `response.raw` is handed to the parser and is closed once
    parsing ends, whether the parser is exhausted, raises, or the consumer stops iterating
    early. Otherwise `response.content` is wrapped in an in-memory buffer and parsed from there.

    Args:
        response (requests.Response): The HTTP response to decode.

    Yields:
        Whatever the selected parser yields for the response body.
    """
    parser = self._select_parser(response)
    if self.is_stream_response():
        # urllib mentions that some interfaces don't play nice with auto_close
        # More info here: https://urllib3.readthedocs.io/en/stable/user-guide.html#using-io-wrappers-with-response-content
        # We have indeed observed some issues with CSV parsing.
        # Hence, we will manage the closing of the file ourselves until we find a better solution.
        response.raw.auto_close = False
        try:
            yield from parser.parse(
                data=response.raw,  # type: ignore[arg-type]
            )
        finally:
            # auto_close is disabled above, so nothing else will close the stream if the
            # parser fails or the generator is closed before it is exhausted.
            response.raw.close()
    else:
        yield from parser.parse(data=io.BytesIO(response.content))
197+
def _select_parser(self, response: requests.Response) -> Parser:
    """
    Pick the parser to use for a response by inspecting its headers.

    The configured headers are checked in the iteration order of `_parsers_by_header`.
    The first configured header that is present in the response and whose value is a key
    of that header's value-to-parser mapping decides the parser. Header values are
    compared by exact equality.

    Args:
        response (requests.Response): The HTTP response object containing headers to check.

    Returns:
        Parser: The parser registered for the first matching header value, or the default
        parser (`self.parser`) if nothing matches.
    """
    response_headers = response.headers
    for header_name, parser_by_value in self._parsers_by_header.items():
        if header_name not in response_headers:
            continue
        header_value = response_headers[header_name]
        if header_value in parser_by_value:
            return parser_by_value[header_value]
    return self.parser
0 commit comments