2727class ImprovedCompositeRawDecoder (CompositeRawDecoder ):
2828 """
2929 Enhanced CompositeRawDecoder with better GZIP detection and error handling.
30-
30+
3131 This addresses the StreamThreadException issue by:
3232 1. Auto-detecting GZIP content based on magic bytes
3333 2. Providing better error handling for decompression failures
3434 3. Falling back gracefully when parser selection fails
3535 """
36-
36+
3737 def __init__ (
3838 self ,
3939 parser : Parser ,
@@ -43,62 +43,62 @@ def __init__(
4343 ) -> None :
4444 super ().__init__ (parser , stream_response , parsers_by_header )
4545 self ._auto_detect_gzip = auto_detect_gzip
46-
46+
def _detect_gzip_content(self, response: requests.Response) -> bool:
    """
    Report whether the response body starts with the GZIP magic number (0x1f, 0x8b).

    This helps identify GZIP content even when the Content-Encoding header is
    missing. The probe is non-destructive: bytes are only read from
    ``response.raw`` when the stream can be rewound afterwards, or looked at
    through ``peek()``. If neither is possible the stream is left untouched
    and the method reports False, so the parser that runs next still sees the
    body from its first byte.

    Args:
        response: The HTTP response whose body is inspected.

    Returns:
        True if the first two body bytes are the GZIP magic number. False if
        auto-detection is disabled, the body is shorter than two bytes, the
        stream cannot be inspected without consuming it, or inspection fails.
    """
    if not self._auto_detect_gzip:
        return False

    gzip_magic = b"\x1f\x8b"

    try:
        raw = getattr(response, "raw", None)
        if raw:
            # Every io.IOBase subclass exposes a ``seek`` attribute, even when
            # seeking is unsupported, so ask ``seekable()`` when it exists.
            # NOTE(review): network-backed streams (e.g. the urllib3 object
            # behind a streamed requests response) are expected to answer
            # False here -- confirm against the urllib3 version in use.
            seekable = getattr(raw, "seekable", None)
            if callable(seekable):
                can_rewind = bool(seekable())
            else:
                can_rewind = hasattr(raw, "seek")

            if can_rewind:
                # Without ``tell`` the original position is unknown; fall
                # back to the start of the stream.
                start = raw.tell() if hasattr(raw, "tell") else 0
                try:
                    head = raw.read(2)
                finally:
                    # Restore the position even if the read itself failed.
                    raw.seek(start)
            elif callable(getattr(raw, "peek", None)):
                # Buffered readers can expose upcoming bytes without advancing.
                head = raw.peek(2)
            else:
                # Reading here would permanently drop the first bytes of the
                # body, so give up on detection instead.
                logger.debug(
                    "Skipping GZIP detection: raw stream cannot be rewound or peeked"
                )
                return False
        else:
            head = getattr(response, "content", None) or b""

        # Slicing copes with short bodies and with peek() returning extra bytes.
        return head[:2] == gzip_magic

    except Exception as e:
        # Detection is best-effort: any failure means "not detected".
        logger.debug(f"Failed to detect GZIP content: {e}")

    return False
77-
77+
def _select_parser(self, response: requests.Response) -> Parser:
    """
    Pick the parser for ``response``, adding GZIP handling when it is detected.

    The parent class makes the initial choice. That choice is returned
    unchanged when it already is a ``GzipParser`` or when
    ``_detect_gzip_content`` reports no GZIP payload; otherwise it is wrapped
    in a new ``GzipParser`` as its inner parser.

    Args:
        response: The HTTP response about to be decoded.

    Returns:
        The parent's parser, or a ``GzipParser`` wrapping it.
    """
    base_choice = super()._select_parser(response)

    # Already a GZIP parser: wrapping it again would be wrong.
    if isinstance(base_choice, GzipParser):
        return base_choice

    # The body does not look like GZIP (or detection is switched off).
    if not self._detect_gzip_content(response):
        return base_choice

    logger.info("Auto-detected GZIP content without Content-Encoding header, wrapping parser")
    return GzipParser(inner_parser=base_choice)
97-
97+
9898 def decode (self , response : requests .Response ):
9999 """
100100 Enhanced decode method with better error handling.
101-
101+
102102 Provides more informative error messages and graceful fallback
103103 when decompression or parsing fails.
104104 """
@@ -113,14 +113,14 @@ def decode(self, response: requests.Response):
113113 f"Original error: { e } "
114114 )
115115 logger .error (error_msg )
116-
116+
117117 if self ._auto_detect_gzip and self ._detect_gzip_content (response ):
118118 logger .info ("Attempting recovery with GZIP decompression" )
119119 gzip_parser = GzipParser (inner_parser = self .parser )
120-
121- if hasattr (response , ' raw' ) and hasattr (response .raw , ' seek' ):
120+
121+ if hasattr (response , " raw" ) and hasattr (response .raw , " seek" ):
122122 response .raw .seek (0 )
123-
123+
124124 try :
125125 if self .is_stream_response ():
126126 response .raw .auto_close = False
@@ -131,7 +131,7 @@ def decode(self, response: requests.Response):
131131 return
132132 except Exception as recovery_error :
133133 logger .error (f"GZIP recovery failed: { recovery_error } " )
134-
134+
135135 raise RuntimeError (error_msg ) from e
136136 else :
137137 raise
@@ -143,22 +143,22 @@ def decode(self, response: requests.Response):
def create_bing_ads_compatible_decoder() -> ImprovedCompositeRawDecoder:
    """
    Build an ImprovedCompositeRawDecoder set up for Bing Ads bulk streams.

    Intended for the campaign_labels stream and other bulk streams that
    deliver GZIP-compressed CSV data. Responses whose ``Content-Encoding``
    header is ``gzip`` are routed to a GZIP-wrapped CSV parser; the plain CSV
    parser is the fallback. Magic-byte GZIP auto-detection is switched on.

    Returns:
        The configured decoder, created with ``stream_response=True``.
    """
    plain_csv = CsvParser(encoding="utf-8-sig", set_values_to_none=[""])

    # Single header rule: Content-Encoding == gzip -> decompress, then parse as CSV.
    header_rules = [
        ({"Content-Encoding"}, {"gzip"}, GzipParser(inner_parser=plain_csv)),
    ]

    bulk_decoder = ImprovedCompositeRawDecoder.by_headers(
        parsers=header_rules,
        stream_response=True,
        fallback_parser=plain_csv,
    )

    # Force magic-byte detection on, whatever value construction left in place.
    bulk_decoder._auto_detect_gzip = True

    return bulk_decoder
163163
164164
0 commit comments