1 | 1 | import hashlib |
2 | 2 | import os |
| 3 | +import tempfile |
3 | 4 | import unittest |
4 | 5 | from unittest.mock import Mock, MagicMock |
5 | 6 | from unittest.mock import patch |
6 | 7 |
7 | 8 | import pytest |
8 | 9 | import urllib3_mock |
9 | 10 |
10 | | -from utils import create_bucket, download_and_get_hash, download_url_content |
| 11 | +from utils import ( |
| 12 | + create_bucket, |
| 13 | + download_and_get_hash, |
| 14 | + download_url_content, |
| 15 | + detect_encoding, |
| 16 | +) |
11 | 17 |
12 | 18 | responses = urllib3_mock.Responses("requests.packages.urllib3") |
13 | 19 | expected_user_agent = ( |
@@ -260,3 +266,48 @@ def test_create_http_pmtiles_builder_task( |
260 | 266 | self.assertEqual(args[3], "my-project") |
261 | 267 | self.assertEqual(args[4], "northamerica-northeast1") |
262 | 268 | self.assertEqual(args[5], "pmtiles-queue") |
| 269 | + |
| 270 | + |
| 271 | +class TestDetectEncoding(unittest.TestCase): |
| 272 | + def test_utf8_encoding(self): |
| 273 | + with tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") as f: |
| 274 | + f.write("\ufeff") # Write BOM |
| 275 | + f.write("col1,col2\nval1,val2\n") |
| 276 | + fname = f.name |
| 277 | + enc = detect_encoding(fname) |
| 278 | + self.assertEqual(enc, "utf-8-sig") |
| 279 | + os.remove(fname) |
| 280 | + |
| 281 | + # Add a non-ASCII character (e.g., é, ü, ñ) to the test data |
| 282 | + def test_utf8_encoding_non_ascii(self): |
| 283 | + with tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") as f: |
| 284 | + f.write("col1,col2\nval1,valü2\n") # ü is non-ASCII |
| 285 | + fname = f.name |
| 286 | + enc = detect_encoding(fname) |
| 287 | + self.assertEqual(enc, "utf-8-sig") |
| 288 | + os.remove(fname) |
| 289 | + |
| 290 | + def test_latin1_encoding(self): |
| 291 | + # Use a longer string with several Latin-1 characters |
| 292 | + latin1_text = "col1,col2\nvalñ,valö,valü,valé,valà,valç,valø\n" |
| 293 | + with tempfile.NamedTemporaryFile( |
| 294 | + delete=False, mode="w", encoding="latin1" |
| 295 | + ) as f: |
| 296 | + f.write(latin1_text) |
| 297 | + fname = f.name |
| 298 | + enc = detect_encoding(fname) |
| 299 | +        # Several encodings can represent these Latin-1 characters; accepting any of |
| 300 | +        # them keeps the test stable across OSes and charset_normalizer versions |
| 301 | + self.assertIn( |
| 302 | + enc, ["latin_1", "iso-8859-1", "windows-1252", "latin1", "cp1250"] |
| 303 | + ) |
| 304 | + os.remove(fname) |
| 305 | + |
| 306 | + def test_encoding_detection_failure(self): |
| 307 | +        # Bytes charset_normalizer can't classify; expect the "utf-8-sig" fallback |
| 308 | + with tempfile.NamedTemporaryFile(delete=False, mode="wb") as f: |
| 309 | + f.write(b"\x00\x01\x02\x03\x04") |
| 310 | + fname = f.name |
| 311 | + enc = detect_encoding(fname) |
| 312 | + self.assertEqual(enc, "utf-8-sig") |
| 313 | + os.remove(fname) |
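
For context, these tests pin down the contract expected of `detect_encoding`: plain UTF-8 (with or without a BOM) maps to `utf-8-sig`, Latin-1-style files map to one of several compatible codecs reported by charset_normalizer, and undetectable input falls back to `utf-8-sig`. Below is a minimal sketch of a helper that would satisfy those assertions; it assumes the implementation wraps `charset_normalizer.from_path`, and the `DEFAULT_ENCODING` name and exact branching are illustrative rather than taken from `utils.py`.

```python
# Hypothetical sketch only -- the real utils.detect_encoding may differ.
from charset_normalizer import from_path

DEFAULT_ENCODING = "utf-8-sig"  # assumed fallback, matching test_encoding_detection_failure


def detect_encoding(file_path: str) -> str:
    """Guess a text file's encoding, defaulting to utf-8-sig when unsure."""
    best = from_path(file_path).best()
    if best is None:
        # charset_normalizer found no plausible match: use the default.
        return DEFAULT_ENCODING
    if best.encoding in ("utf_8", "ascii"):
        # Read plain UTF-8/ASCII as utf-8-sig so an optional BOM is stripped.
        return DEFAULT_ENCODING
    # Otherwise report whatever codec charset_normalizer ranked best
    # (e.g. cp1252 / latin_1 for the Latin-1 test data above).
    return best.encoding
```

Returning `utf-8-sig` rather than `utf-8` for the common case is what lets callers open BOM-prefixed CSV exports transparently, which is exactly what `test_utf8_encoding` checks.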