fix: YAML escape gaps, encoding fallback, simplify tokens & logging

nikolay-e · nikolay-e · commit bad17312dd56 · 2026-04-03T09:19:40.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,12 +53,13 @@ classifiers = [
 dynamic = [ "version" ]  # Version is still managed in version.py
 dependencies = [
   "pathspec>=0.11,<2.0",
-  "tiktoken>=0.7,<1.0",
+  "tiktoken>=0.8,<1.0",
 ]
 optional-dependencies.dev = [
   "black>=23.0.0,<27.0",
   # Build and release
   "build>=0.10,<2.0",
+  "charset-normalizer>=3.0,<4.0",
   "coverage>=7.0,<8.0",
   "hypothesis>=6.0,<7.0",
   "import-linter>=2.0,<3.0",
@@ -87,6 +88,7 @@ optional-dependencies.embeddings = [
   "sentence-transformers>=3.0,<6.0",
 ]
 optional-dependencies.full = [
+  "charset-normalizer>=3.0,<4.0",
   "lxml>=5.0,<7.0",
   "mistune>=3.0,<4.0",
   "pysbd>=0.3,<1.0",
diff --git a/src/treemapper/cli.py b/src/treemapper/cli.py
@@ -143,7 +143,7 @@ class ParsedArgs:
     whitelist_file: Path | None
     output_file: Path | None
     no_default_ignores: bool
-    verbosity: int
+    verbosity: int | str
     output_format: str
     max_depth: int | None
     no_content: bool
@@ -315,11 +315,7 @@ def parse_args() -> ParsedArgs:
     ignore_file = _resolve_ignore_file(args.ignore, root_dir)
     whitelist_file = _resolve_whitelist_file(args.whitelist, root_dir)
 
-    log_level_map = {"error": 0, "warning": 1, "info": 2, "debug": 3}
-    verbosity = log_level_map[args.log_level]
-
-    if args.quiet:
-        verbosity = 0
+    verbosity = "error" if args.quiet else args.log_level
 
     return ParsedArgs(
         root_dir=root_dir,
diff --git a/src/treemapper/logger.py b/src/treemapper/logger.py
@@ -2,15 +2,20 @@
 
 PACKAGE_LOGGER_NAME = "treemapper"
 
+_LOG_LEVEL_MAP = {
+    "error": logging.ERROR,
+    "warning": logging.WARNING,
+    "info": logging.INFO,
+    "debug": logging.DEBUG,
+}
 
-def setup_logging(verbosity: int) -> None:
-    level_map = {
-        0: logging.ERROR,
-        1: logging.WARNING,
-        2: logging.INFO,
-        3: logging.DEBUG,
-    }
-    level = level_map.get(verbosity, logging.INFO)
+
+def setup_logging(verbosity: int | str) -> None:
+    if isinstance(verbosity, str):
+        level = _LOG_LEVEL_MAP.get(verbosity, logging.INFO)
+    else:
+        int_to_level = {0: logging.ERROR, 1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}
+        level = int_to_level.get(verbosity, logging.INFO)
 
     pkg_logger = logging.getLogger(PACKAGE_LOGGER_NAME)
     pkg_logger.setLevel(level)
diff --git a/src/treemapper/tokens.py b/src/treemapper/tokens.py
@@ -7,11 +7,6 @@
 
 logger = logging.getLogger(__name__)
 
-CHUNK_SIZE = 500_000
-CHUNK_THRESHOLD = 1_000_000
-SAMPLE_CHAR_THRESHOLD = 50_000_000  # 50M characters - use sampling above this
-SAMPLE_COUNT = 5
-
 
 @dataclass
 class TokenCountResult:
@@ -42,43 +37,7 @@ def count_tokens(text: str, encoding: str = "o200k_base") -> TokenCountResult:
         logger.debug("tiktoken unavailable, using char/4 approximation")
         return TokenCountResult(len(text) // 4, False, "approximation")
 
-    text_len = len(text)
-    if text_len <= CHUNK_THRESHOLD:
-        logger.debug("Token counting: exact mode (%d chars)", text_len)
-        return TokenCountResult(len(encoder.encode(text)), True, encoding)
-
-    if text_len > SAMPLE_CHAR_THRESHOLD:
-        logger.debug("Token counting: sampled mode (%d chars, %d samples)", text_len, SAMPLE_COUNT)
-        return _count_tokens_sampled(text, text_len, encoder, encoding)
-
-    logger.debug("Token counting: chunked mode (%d chars)", text_len)
-    total = 0
-    for i in range(0, text_len, CHUNK_SIZE):
-        chunk = text[i : i + CHUNK_SIZE]
-        total += len(encoder.encode(chunk))
-    return TokenCountResult(total, False, encoding)
-
-
-def _count_tokens_sampled(text: str, text_len: int, encoder: Any, encoding: str) -> TokenCountResult:
-    num_chunks = text_len // CHUNK_SIZE
-    step = max(1, num_chunks // SAMPLE_COUNT)
-    sampled_tokens = 0
-    sampled_chars = 0
-
-    for i in range(0, num_chunks, step):
-        start = i * CHUNK_SIZE
-        chunk = text[start : start + CHUNK_SIZE]
-        sampled_tokens += len(encoder.encode(chunk))
-        sampled_chars += len(chunk)
-        if sampled_chars >= SAMPLE_COUNT * CHUNK_SIZE:
-            break
-
-    if sampled_chars == 0:
-        return TokenCountResult(text_len // 4, False, "approximation")
-
-    tokens_per_char = sampled_tokens / sampled_chars
-    estimated_total = int(tokens_per_char * text_len)
-    return TokenCountResult(estimated_total, False, encoding)
+    return TokenCountResult(len(encoder.encode(text)), True, encoding)
 
 
 def _format_size(byte_size: int) -> str:
diff --git a/src/treemapper/tree.py b/src/treemapper/tree.py
@@ -212,12 +212,35 @@ def _detect_binary_in_sample(file_path: Path, file_size: int) -> tuple[bytes | N
     return raw_bytes, None
 
 
+def _try_charset_normalizer(raw_bytes: bytes, file_path: Path) -> str | None:
+    """Best-effort decode via optional charset-normalizer; None if unavailable or undetected."""
+    try:
+        from charset_normalizer import from_bytes
+
+        best = from_bytes(raw_bytes).best()
+        if best is not None:
+            logger.info("Decoded %s as %s via charset-normalizer", file_path.name, best.encoding)
+            return str(best)
+    except ImportError:
+        logger.debug("charset-normalizer not installed; no fallback decoding for %s", file_path.name)
+    except Exception:  # deliberate best-effort: a detector failure must not abort the caller
+        logger.debug("charset-normalizer failed on %s", file_path.name, exc_info=True)
+    return None
+
+
 def _decode_file_content(raw_bytes: bytes, file_path: Path, file_size: int) -> str:
     if b"\x00" in raw_bytes[BINARY_DETECTION_SAMPLE_SIZE:]:
         logger.debug("Detected binary file %s (null in remainder)", file_path.name)
         return _format_binary_placeholder(file_size)
 
-    content = raw_bytes.decode("utf-8")
+    try:
+        content = raw_bytes.decode("utf-8")
+    except UnicodeDecodeError:
+        fallback = _try_charset_normalizer(raw_bytes, file_path)
+        if fallback is None:
+            raise
+        content = fallback
+
     content = content.replace("\r\n", "\n").replace("\r", "\n")
     if not content:
         return ""
diff --git a/src/treemapper/writer.py b/src/treemapper/writer.py
@@ -15,26 +15,30 @@
 
 _YAML_PROBLEMATIC_RE = re.compile(r"[\r\x00\x85\u2028\u2029]")
 
-_YAML_STRING_ESCAPE_PATTERN = re.compile(r'[\\"\n\r\x00\x85\u2028\u2029]')
+_YAML_STRING_ESCAPE_PATTERN = re.compile(r'[\\"\n\r\x00\x08\x0c\x85\u2028\u2029]')
 _YAML_STRING_ESCAPE_MAP = {
     "\\": "\\\\",
     '"': '\\"',
     "\n": "\\n",
     "\r": "\\r",
     "\x00": "\\0",
+    "\x08": "\\b",
+    "\x0c": "\\f",
     "\x85": "\\x85",
     "\u2028": "\\u2028",
     "\u2029": "\\u2029",
 }
 
-_YAML_CONTENT_ESCAPE_PATTERN = re.compile(r'[\\"\n\t\r\x00\x85\u2028\u2029]')
+_YAML_CONTENT_ESCAPE_PATTERN = re.compile(r'[\\"\n\t\r\x00\x08\x0c\x85\u2028\u2029]')
 _YAML_CONTENT_ESCAPE_MAP = {
     "\\": "\\\\",
     '"': '\\"',
     "\n": "\\n",
     "\t": "\\t",
     "\r": "\\r",
     "\x00": "\\0",
+    "\x08": "\\b",
+    "\x0c": "\\f",
     "\x85": "\\x85",
     "\u2028": "\\u2028",
     "\u2029": "\\u2029",
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -260,10 +260,20 @@ def test_unicode_content_and_encoding_errors(temp_project, run_mapper, caplog):
 
     assert cp1251_node is not None, "'cp1251.txt' not found"
     cp1251_content = cp1251_node.get("content", "")
-    assert "unreadable" in cp1251_content, f"CP1251 file should be marked unreadable, got: {cp1251_content!r}"
-    assert any(
-        "cp1251.txt" in record.message for record in caplog.records if record.levelno >= logging.WARNING
-    ), "Expected WARNING about cp1251.txt not found in logs"
+    try:
+        from charset_normalizer import from_bytes  # noqa: F401
+
+        has_charset_normalizer = True
+    except ImportError:
+        has_charset_normalizer = False
+    if has_charset_normalizer:
+        assert (
+            cp1251_content and "unreadable" not in cp1251_content
+        ), f"charset-normalizer should decode CP1251, got: {cp1251_content!r}"
+    else:
+        assert (
+            "unreadable" in cp1251_content
+        ), f"CP1251 file should be marked unreadable without charset-normalizer, got: {cp1251_content!r}"
 
     assert binary_node is not None, "'binary.bin' not found"
     binary_content = binary_node.get("content", "")
diff --git a/tests/test_complete_coverage.py b/tests/test_complete_coverage.py
@@ -472,7 +472,12 @@ def test_non_utf8_placeholder(self, tmp_path):
         node = find_node_by_path(tree, ["non_utf8.txt"])
         assert node is not None
         content = node.get("content", "")
-        assert "<unreadable content" in content
+        try:
+            from charset_normalizer import from_bytes  # noqa: F401
+
+            assert content and "<unreadable content" not in content
+        except ImportError:
+            assert "<unreadable content" in content
 
     @pytest.mark.skipif(
         sys.platform == "win32",
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
@@ -75,6 +75,24 @@ def test_newlines_tabs(self):
         result = count_tokens("line1\nline2\tline3\r\nline4")
         assert result.count > 0
 
+    def test_large_text_exact(self):
+        # Large input is expected to be exact; tolerate the approximation fallback (no encoder).
+        result = count_tokens("word " * 500_000)
+        assert result.count > 0
+        assert result.is_exact is True or result.encoding == "approximation"
+
+    def test_exact_count_matches_direct_encode(self):
+        """count_tokens must agree with a direct single-pass encode of the same text."""
+        from treemapper.tokens import _get_encoder
+
+        reference_encoder = _get_encoder("o200k_base")
+        if reference_encoder is None:
+            return  # no encoder available: nothing to compare against
+
+        sample = "word " * 5_000
+        expected = len(reference_encoder.encode(sample))
+        assert count_tokens(sample).count == expected
+
 
 class TestPrintTokenSummary:
     def test_prints_to_stderr(self):
@@ -139,67 +157,3 @@ def test_different_encodings_cached_separately(self):
         r1 = count_tokens("test", encoding="o200k_base")
         r2 = count_tokens("test", encoding="cl100k_base")
         assert r1.encoding != r2.encoding or r1.encoding == "approximation"
-
-
-class TestChunkedCounting:
-    def test_chunked_counting_for_large_text(self):
-        from treemapper.tokens import CHUNK_THRESHOLD
-
-        large_text = "word " * (CHUNK_THRESHOLD // 5 + 1000)
-        result = count_tokens(large_text)
-        assert result.count > 0
-        # Chunked counting is not exact due to BPE context sensitivity
-        # is_exact=False with real encoding, or approximation fallback
-        assert result.is_exact is False
-
-    def test_chunked_count_close_to_exact(self, monkeypatch):
-        import treemapper.tokens as tokens_module
-        from treemapper.tokens import _get_encoder
-
-        encoder = _get_encoder("o200k_base")
-        if encoder is None:
-            return
-
-        text = "word " * 5_000
-        exact_count = len(encoder.encode(text))
-
-        monkeypatch.setattr(tokens_module, "CHUNK_THRESHOLD", 1_000)
-        chunked_result = count_tokens(text)
-
-        assert abs(chunked_result.count - exact_count) / exact_count < 0.05
-
-    def test_small_text_not_chunked(self):
-        small_text = "hello world"
-        result = count_tokens(small_text)
-        assert result.count > 0
-
-
-class TestSampledCounting:
-    def test_sampling_threshold_is_reasonable(self):
-        from treemapper.tokens import SAMPLE_CHAR_THRESHOLD
-
-        assert SAMPLE_CHAR_THRESHOLD >= 1_000_000
-
-    def test_very_large_text_uses_sampling(self, monkeypatch):
-        import treemapper.tokens as tokens_module
-        from treemapper.tokens import _count_tokens_sampled, _get_encoder
-
-        encoder = _get_encoder("o200k_base")
-        if encoder is None:
-            return
-
-        monkeypatch.setattr(tokens_module, "SAMPLE_CHAR_THRESHOLD", 10_000)
-        large_text = "x" * 15_000
-        result = _count_tokens_sampled(large_text, len(large_text), encoder, "o200k_base")
-        assert result.is_exact is False
-        assert result.count > 0
-
-    def test_sampled_result_is_approximate(self, monkeypatch):
-        import treemapper.tokens as tokens_module
-
-        monkeypatch.setattr(tokens_module, "SAMPLE_CHAR_THRESHOLD", 10_000)
-        monkeypatch.setattr(tokens_module, "CHUNK_THRESHOLD", 1_000)
-        text = "word " * 5_000
-        result = count_tokens(text)
-        if result.encoding != "approximation":
-            assert result.is_exact is False