Skip to content

Commit db8c089

Browse files
feat: add test to reproduce GZIP UTF-8 decoding issue
- Create test script demonstrating StreamThreadException root cause
- Reproduce exact error: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
- Test both failing scenario (missing Content-Encoding) and correct GZIP handling
- Validate header-based parser selection in CompositeRawDecoder

Co-Authored-By: unknown <>
1 parent 73290de commit db8c089

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

test_gzip_utf8_issue.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
Test script to reproduce the StreamThreadException issue with GZIP data and UTF-8 decoding.
3+
4+
This test demonstrates the root cause of issue #8301 where GZIP-compressed data
5+
(whose header begins with the magic bytes 0x1f 0x8b) is incorrectly treated as UTF-8 text, causing the
6+
'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte error.
7+
"""
8+
9+
import gzip
10+
import io
11+
from unittest.mock import Mock
12+
13+
import requests
14+
15+
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
16+
CompositeRawDecoder,
17+
CsvParser,
18+
GzipParser,
19+
)
20+
21+
22+
def test_gzip_utf8_decoding_issue():
    """
    Reproduce the issue where GZIP data is incorrectly treated as UTF-8.

    This simulates the scenario in Bing Ads campaign_labels stream where:
    1. Response contains GZIP-compressed CSV data
    2. Parser selection fails to detect GZIP content-encoding
    3. Compressed data is passed to UTF-8 decoder
    4. UTF-8 decoder fails with byte 0x8b error

    The second half of the test decodes the same payload through a
    GzipParser wrapping the CsvParser and checks that the single CSV row
    is recovered.

    Raises:
        AssertionError: if the plain CSV decoder does not raise
            UnicodeDecodeError, or if any of the content checks fail.
    """
    csv_data = "Account Id,Campaign,Client Id\n123,Test Campaign,456\n"

    compressed_data = gzip.compress(csv_data.encode("utf-8"))

    # Index 1 is the second GZIP magic byte; it is the byte named in the
    # error message this test expects ("byte 0x8b in position 1").
    assert compressed_data[1] == 0x8b, f"Expected GZIP magic number 0x8b, got {hex(compressed_data[1])}"

    mock_response = Mock(spec=requests.Response)
    mock_response.content = compressed_data
    mock_response.raw = io.BytesIO(compressed_data)
    mock_response.headers = {}  # Missing Content-Encoding: gzip header

    csv_parser = CsvParser(encoding="utf-8")
    decoder = CompositeRawDecoder(parser=csv_parser, stream_response=False)

    # Only the call expected to raise lives in the try body. The "no error"
    # failure is raised explicitly from the else clause so it cannot be
    # stripped the way `assert False` is under `python -O`.
    try:
        list(decoder.decode(mock_response))
    except UnicodeDecodeError as e:
        assert "can't decode byte 0x8b" in str(e)
        assert "invalid start byte" in str(e)
        print(f"✓ Reproduced the issue: {e}")
    else:
        raise AssertionError("Expected UTF-8 decoding error but none occurred")

    gzip_parser = GzipParser(inner_parser=csv_parser)
    correct_decoder = CompositeRawDecoder(parser=gzip_parser, stream_response=False)

    # Hand the mock a fresh, unread stream before the second decode.
    mock_response.raw = io.BytesIO(compressed_data)

    records = list(correct_decoder.decode(mock_response))
    assert len(records) == 1
    assert records[0]["Account Id"] == "123"
    assert records[0]["Campaign"] == "Test Campaign"
    print("✓ Correct GZIP handling works as expected")
64+
65+
66+
def test_header_based_parser_selection():
    """
    Check that a decoder built with CompositeRawDecoder.by_headers() yields
    the expected CSV record from a GZIP-compressed body when the response
    carries a ``Content-Encoding: gzip`` header.
    """
    payload = gzip.compress("Account Id,Campaign\n123,Test\n".encode("utf-8"))

    # Mocked HTTP response advertising gzip encoding; the compressed bytes
    # are exposed both as `.content` and as a readable `.raw` stream.
    response = Mock(spec=requests.Response)
    response.content = payload
    response.raw = io.BytesIO(payload)
    response.headers = {"Content-Encoding": "gzip"}

    gzipped_csv = GzipParser(inner_parser=CsvParser(encoding="utf-8"))
    plain_csv = CsvParser(encoding="utf-8")

    # One rule: header name set, header value set, parser to use.
    header_rules = [({"Content-Encoding"}, {"gzip"}, gzipped_csv)]
    decoder = CompositeRawDecoder.by_headers(
        parsers=header_rules,
        stream_response=False,
        fallback_parser=plain_csv,
    )

    rows = [*decoder.decode(response)]
    assert len(rows) == 1
    assert rows[0]["Account Id"] == "123"
    print("✓ Header-based parser selection works correctly")
92+
93+
94+
if __name__ == "__main__":
    # Script entry point: run both checks in order without a test runner.
    print("Testing GZIP UTF-8 decoding issue reproduction...")
    for check in (test_gzip_utf8_decoding_issue, test_header_based_parser_selection):
        check()
    print("All tests completed successfully!")

0 commit comments

Comments
 (0)