1+ #
2+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3+ #
4+
15import csv
26import gzip
37import io
48import json
59import logging
6- from abc import ABC , abstractmethod
710from dataclasses import dataclass
811from io import BufferedIOBase , TextIOWrapper
9- from typing import Any , Generator , MutableMapping , Optional
12+ from typing import Any , Optional
1013
1114import orjson
1215import requests
1316
1417from airbyte_cdk .models import FailureType
15- from airbyte_cdk .sources .declarative .decoders .decoder import Decoder
18+ from airbyte_cdk .sources .declarative .decoders .decoder import DECODER_OUTPUT_TYPE , Decoder
19+ from airbyte_cdk .sources .declarative .decoders .decoder_parser import (
20+ PARSER_OUTPUT_TYPE ,
21+ PARSERS_BY_HEADER_TYPE ,
22+ PARSERS_TYPE ,
23+ Parser ,
24+ )
1625from airbyte_cdk .utils import AirbyteTracedException
1726
1827logger = logging .getLogger ("airbyte" )
1928
2029
21- @dataclass
22- class Parser (ABC ):
23- @abstractmethod
24- def parse (
25- self ,
26- data : BufferedIOBase ,
27- ) -> Generator [MutableMapping [str , Any ], None , None ]:
28- """
29- Parse data and yield dictionaries.
30- """
31- pass
32-
33-
@dataclass
class GzipParser(Parser):
    """
    Parser that gunzips a byte stream and delegates record parsing to `inner_parser`.
    """

    # Parser that receives the decompressed byte stream.
    inner_parser: Parser

    def parse(self, data: BufferedIOBase) -> PARSER_OUTPUT_TYPE:
        """
        Decompress gzipped bytes and pass decompressed data to the inner parser.

        IMPORTANT:
            - The data is always read through `gzip.GzipFile`. This method does NOT detect
              non-gzipped input and does NOT reset the stream pointer or fall back to the
              inner parser; when the payload may be uncompressed, the caller has to pick a
              different parser before calling `parse`.

        Note:
            - The decompressed bytes are passed on without text decoding.

        Args:
            data: Binary stream holding the gzip-compressed payload.

        Yields:
            Whatever `inner_parser.parse` yields for the decompressed stream.
        """

        # The context manager closes the GzipFile wrapper once the inner parser is exhausted.
        with gzip.GzipFile(fileobj=data, mode="rb") as gzipobj:
            yield from self.inner_parser.parse(gzipobj)
4747
@@ -50,7 +50,7 @@ def parse(
5050class JsonParser (Parser ):
5151 encoding : str = "utf-8"
5252
53- def parse (self , data : BufferedIOBase ) -> Generator [ MutableMapping [ str , Any ], None , None ] :
53+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
5454 """
5555 Attempts to deserialize data using orjson library. As an extra layer of safety we fallback on the json library to deserialize the data.
5656 """
@@ -90,10 +90,7 @@ def _parse_json(self, raw_data: bytes) -> Optional[Any]:
9090class JsonLineParser (Parser ):
9191 encoding : Optional [str ] = "utf-8"
9292
93- def parse (
94- self ,
95- data : BufferedIOBase ,
96- ) -> Generator [MutableMapping [str , Any ], None , None ]:
93+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
9794 for line in data :
9895 try :
9996 yield json .loads (line .decode (encoding = self .encoding or "utf-8" ))
@@ -117,10 +114,7 @@ def _get_delimiter(self) -> Optional[str]:
117114
118115 return self .delimiter
119116
120- def parse (
121- self ,
122- data : BufferedIOBase ,
123- ) -> Generator [MutableMapping [str , Any ], None , None ]:
117+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
124118 """
125119 Parse CSV data from decompressed bytes.
126120 """
@@ -130,31 +124,95 @@ def parse(
130124 yield row
131125
132126
133- @dataclass
134127class CompositeRawDecoder (Decoder ):
135128 """
136- Decoder strategy to transform a requests.Response into a Generator[MutableMapping[str, Any], None, None]
129+ Decoder strategy to transform a requests.Response into a PARSER_OUTPUT_TYPE
137130 passed response.raw to parser(s).
138- Note: response.raw is not decoded/decompressed by default.
139- parsers should be instantiated recursively.
131+
132+ Note: response.raw is not decoded/decompressed by default. Parsers should be instantiated recursively.
133+
140134 Example:
141- composite_raw_decoder = CompositeRawDecoder(parser=GzipParser(inner_parser=JsonLineParser(encoding="iso-8859-1")))
135+ composite_raw_decoder = CompositeRawDecoder(
136+ parser=GzipParser(
137+ inner_parser=JsonLineParser(encoding="iso-8859-1")
138+ )
139+ )
142140 """
143141
144- parser : Parser
145- stream_response : bool = True
142+ def __init__ (
143+ self ,
144+ parser : Parser ,
145+ stream_response : bool = True ,
146+ parsers_by_header : PARSERS_BY_HEADER_TYPE = None ,
147+ ) -> None :
148+ # since we moved from using `dataclass` to `__init__` method,
149+ # we need to keep using the `parser` to be able to resolve the depenencies
150+ # between the parsers correctly.
151+ self .parser = parser
152+
153+ self ._parsers_by_header = parsers_by_header if parsers_by_header else {}
154+ self ._stream_response = stream_response
155+
156+ @classmethod
157+ def by_headers (
158+ cls ,
159+ parsers : PARSERS_TYPE ,
160+ stream_response : bool ,
161+ fallback_parser : Parser ,
162+ ) -> "CompositeRawDecoder" :
163+ """
164+ Create a CompositeRawDecoder instance based on header values.
165+
166+ Args:
167+ parsers (PARSERS_TYPE): A list of tuples where each tuple contains headers, header values, and a parser.
168+ stream_response (bool): A flag indicating whether the response should be streamed.
169+ fallback_parser (Parser): A parser to use if no matching header is found.
170+
171+ Returns:
172+ CompositeRawDecoder: An instance of CompositeRawDecoder configured with the provided parsers.
173+ """
174+ parsers_by_header = {}
175+ for headers , header_values , parser in parsers :
176+ for header in headers :
177+ parsers_by_header [header ] = {header_value : parser for header_value in header_values }
178+ return cls (fallback_parser , stream_response , parsers_by_header )
146179
147180 def is_stream_response (self ) -> bool :
148- return self .stream_response
181+ return self ._stream_response
149182
150- def decode (
151- self , response : requests .Response
152- ) -> Generator [MutableMapping [str , Any ], None , None ]:
183+ def decode (self , response : requests .Response ) -> DECODER_OUTPUT_TYPE :
184+ parser = self ._select_parser (response )
153185 if self .is_stream_response ():
154- # urllib mentions that some interfaces don't play nice with auto_close [here](https://urllib3.readthedocs.io/en/stable/user-guide.html#using-io-wrappers-with-response-content)
155- # We have indeed observed some issues with CSV parsing. Hence, we will manage the closing of the file ourselves until we find a better solution.
186+ # urllib mentions that some interfaces don't play nice with auto_close
187+ # More info here: https://urllib3.readthedocs.io/en/stable/user-guide.html#using-io-wrappers-with-response-content
188+ # We have indeed observed some issues with CSV parsing.
189+ # Hence, we will manage the closing of the file ourselves until we find a better solution.
156190 response .raw .auto_close = False
157- yield from self .parser .parse (data = response .raw ) # type: ignore[arg-type]
191+ yield from parser .parse (
192+ data = response .raw , # type: ignore[arg-type]
193+ )
158194 response .raw .close ()
159195 else :
160- yield from self .parser .parse (data = io .BytesIO (response .content ))
196+ yield from parser .parse (data = io .BytesIO (response .content ))
197+
198+ def _select_parser (self , response : requests .Response ) -> Parser :
199+ """
200+ Selects the appropriate parser based on the response headers.
201+
202+ This method iterates through the `_parsers_by_header` dictionary to find a matching parser
203+ based on the headers in the response. If a matching header and header value are found,
204+ the corresponding parser is returned. If no match is found, the default parser is returned.
205+
206+ Args:
207+ response (requests.Response): The HTTP response object containing headers to check.
208+
209+ Returns:
210+ Parser: The parser corresponding to the matched header value, or the default parser if no match is found.
211+ """
212+ for header , parser_by_header_value in self ._parsers_by_header .items ():
213+ if (
214+ header in response .headers
215+ and response .headers [header ] in parser_by_header_value .keys ()
216+ ):
217+ return parser_by_header_value [response .headers [header ]]
218+ return self .parser
0 commit comments