Commit f1cf609

fix: code review improvements - Python 3.13 compat, Unicode handling, performance
- Update dependencies for Python 3.13: pathspec>=0.11, pyyaml>=6.0.2
- Handle Unicode line separators (U+2028, U+2029) in YAML output
- Add middle-slash pattern detection for proper gitignore anchoring
- Add BrokenPipeError handling for stdout piping
- Optimize file reading: single binary read instead of double read
- Remove obsolete type ignore comments (pathspec now has types)
- Thread-safe YAML representer registration with Lock
- Use IsADirectoryError and proper PermissionError handling
- Update to Production/Stable status, add Typing::Typed classifier
- Simplify CI matrix: ubuntu-latest instead of ubuntu-22.04/24.04
- Add tests for Unicode, binary boundary, deep nesting, middle-slash
1 parent 4fa0411 commit f1cf609

10 files changed: +230, -67 lines

.github/workflows/ci.yml (4 additions, 4 deletions)

@@ -86,8 +86,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-22.04, ubuntu-24.04, macos-latest, windows-latest]
-        python-version: [3.9, '3.10', '3.11', '3.12']
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: [3.9, '3.10', '3.11', '3.12', '3.13']

     runs-on: ${{ matrix.os }}

@@ -137,7 +137,7 @@ jobs:

       - name: Upload coverage for SonarCloud
        uses: actions/upload-artifact@v6
-        if: matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.12'
+        if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'
        with:
          name: coverage-report
          path: |

@@ -336,7 +336,7 @@ jobs:
           sonar.tests=tests
           sonar.python.coverage.reportPaths=coverage.xml
           sonar.python.xunit.reportPath=test-results.xml
-          sonar.python.version=3.9,3.10,3.11,3.12
+          sonar.python.version=3.9,3.10,3.11,3.12,3.13
           EOF

       - name: SonarCloud Scan

pyproject.toml (14 additions, 2 deletions)

@@ -21,14 +21,26 @@ description = "Export codebase structure and contents for AI/LLM context"
 readme = "README.md"
 requires-python = ">=3.9"
 license = { file = "LICENSE" }
+keywords = ["code-analysis", "directory-tree", "yaml", "json", "llm", "ai", "codebase", "context", "chatgpt", "claude", "code-context", "export", "tree"]
 classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: Console",
+    "Intended Audience :: Developers",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "License :: OSI Approved :: Apache Software License",
     "Operating System :: OS Independent",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Utilities",
+    "Typing :: Typed",
 ]
 dependencies = [
-    "pathspec>=0.9,<1.0",
-    "pyyaml>=5.4,<7.0",
+    "pathspec>=0.11,<2.0",
+    "pyyaml>=6.0.2,<8.0",
 ]

 [project.urls]

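The dependency bumps are what the commit message ties to Python 3.13 support and to pathspec now shipping its own type information. Below is a quick sanity check of the new floors against an installed environment, as a hypothetical helper (check_dependency_floors is not part of this commit; it assumes plain numeric release versions and uses only the standard library):

# Hypothetical helper, not part of this commit: verify that the installed
# distributions satisfy the floors declared in pyproject.toml.
from importlib.metadata import version


def check_dependency_floors() -> None:
    floors = {"pathspec": (0, 11), "PyYAML": (6, 0, 2)}
    for dist, floor in floors.items():
        # Naive parse: assumes release versions like "0.12.1" or "6.0.2".
        installed = tuple(int(part) for part in version(dist).split(".")[:3])
        print(f"{dist}: installed {installed}, floor {floor}")
        assert installed >= floor, f"{dist} is older than the declared floor"


if __name__ == "__main__":
    check_dependency_floors()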
src/treemapper/ignore.py (7 additions, 4 deletions)

@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import List, Optional

-import pathspec  # type: ignore
+import pathspec


 def read_ignore_file(file_path: Path) -> List[str]:

@@ -67,10 +67,13 @@ def _aggregate_ignore_patterns(root: Path, ignore_filename: str) -> List[str]:
         neg = line.startswith("!")
         pat = line[1:] if neg else line

-        if pat.startswith("/"):
-            full = f"/{rel}{pat}" if rel else pat
+        if pat.startswith("/") or "/" in pat:
+            anchored_pat = pat.lstrip("/")
+            full = f"/{rel}/{anchored_pat}" if rel else f"/{anchored_pat}"
+        elif rel:
+            full = f"{rel}/**/{pat}"
         else:
-            full = f"{rel}/{pat}" if rel else pat
+            full = pat

         out.append(("!" + full) if neg else full)

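The new branch mirrors gitignore semantics: a pattern containing a slash (leading or in the middle) is anchored to the directory of the ignore file that declares it, while a bare pattern such as *.bak matches at any depth below that directory. A standalone sketch of the two rewrites, feeding the aggregated pattern strings from the diff straight into pathspec's gitwildmatch matcher (illustration only, not the project's code):

# Aggregated patterns produced for a .gitignore located in "subdir"
# (i.e. rel == "subdir"); pathspec's gitwildmatch applies gitignore rules.
import pathspec

# "docs/api.txt" contains a middle slash, so it is rewritten to the anchored
# form "/subdir/docs/api.txt" and matches only directly under subdir/.
anchored = pathspec.PathSpec.from_lines("gitwildmatch", ["/subdir/docs/api.txt"])
assert anchored.match_file("subdir/docs/api.txt")
assert not anchored.match_file("subdir/other/docs/api.txt")

# "*.bak" has no slash, so it keeps its match-anywhere-below meaning via the
# "subdir/**/*.bak" rewrite.
recursive = pathspec.PathSpec.from_lines("gitwildmatch", ["subdir/**/*.bak"])
assert recursive.match_file("subdir/deep/nested/junk.bak")
assert not recursive.match_file("elsewhere/junk.bak")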
src/treemapper/logger.py (0 additions, 1 deletion)

@@ -2,7 +2,6 @@


 def setup_logging(verbosity: int) -> None:
-    """Configure the logging level based on verbosity."""
     level_map = {
         0: logging.ERROR,
         1: logging.WARNING,

src/treemapper/tree.py (10 additions, 13 deletions)

@@ -3,10 +3,12 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional

-import pathspec  # type: ignore
+import pathspec

 from .ignore import should_ignore

+BINARY_DETECTION_SAMPLE_SIZE = 8192
+

 @dataclass
 class TreeBuildContext:

@@ -49,7 +51,7 @@ def _process_entry(entry: Path, ctx: TreeBuildContext, current_depth: int) -> Op
     try:
         relative_path = entry.relative_to(ctx.base_dir).as_posix()
         is_dir = entry.is_dir()
-    except OSError as e:
+    except (OSError, ValueError) as e:
         logging.warning(f"Could not process path for entry {entry}: {e}")
         return None

@@ -94,11 +96,14 @@ def _read_file_content(file_path: Path, max_file_bytes: Optional[int]) -> str:
             logging.info(f"Skipping large file {file_path.name}: {file_size} bytes > {max_file_bytes} bytes")
             return f"<file too large: {file_size} bytes>\n"

-        if _is_binary_file(file_path):
+        with file_path.open("rb") as f:
+            raw_bytes = f.read()
+
+        if b"\x00" in raw_bytes[:BINARY_DETECTION_SAMPLE_SIZE]:
             logging.debug(f"Detected binary file {file_path.name}")
             return f"<binary file: {file_size} bytes>\n"

-        content = file_path.read_text(encoding="utf-8")
+        content = raw_bytes.decode("utf-8")
         cleaned = content.replace("\x00", "")
         if cleaned != content:
             logging.warning(f"Removed NULL bytes from content of {file_path.name}")

@@ -114,14 +119,6 @@ def _read_file_content(file_path: Path, max_file_bytes: Optional[int]) -> str:
     except UnicodeDecodeError:
         logging.error(f"Cannot decode {file_path.name} as UTF-8. Marking as unreadable.")
         return "<unreadable content: not utf-8>\n"
-    except IOError as e:
+    except OSError as e:
         logging.error(f"Could not read {file_path.name}: {e}")
         return "<unreadable content>\n"
-
-
-def _is_binary_file(file_path: Path, sample_size: int = 8192) -> bool:
-    try:
-        with file_path.open("rb") as f:
-            return b"\x00" in f.read(sample_size)
-    except (OSError, IOError):
-        return False

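The reading path now performs a single binary read and reuses those bytes for both the NULL-byte probe and the UTF-8 decode, instead of opening the file once for _is_binary_file() and again for read_text(). A simplified standalone sketch of that flow (read_content_once is illustrative, not the project's function; the size limit and logging of the real _read_file_content are omitted):

# Simplified sketch of the single-read strategy; not the project's code.
from pathlib import Path

BINARY_DETECTION_SAMPLE_SIZE = 8192  # same constant the diff introduces


def read_content_once(file_path: Path) -> str:
    raw_bytes = file_path.read_bytes()  # one read instead of probe + read_text

    # Probe only the first 8 KiB for NULL bytes, matching the old
    # _is_binary_file() heuristic without reopening the file.
    if b"\x00" in raw_bytes[:BINARY_DETECTION_SAMPLE_SIZE]:
        return f"<binary file: {len(raw_bytes)} bytes>\n"

    try:
        return raw_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return "<unreadable content: not utf-8>\n"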
src/treemapper/writer.py (33 additions, 25 deletions)

@@ -2,11 +2,14 @@
 import json
 import logging
 import sys
+import threading
 from pathlib import Path
 from typing import Any, Dict, Optional, TextIO

 import yaml

+YAML_PROBLEMATIC_CHARS = frozenset({"\x85", "\u2028", "\u2029"})
+

 class LiteralStr(str):
     pass

@@ -16,32 +19,34 @@ class QuotedStr(str):
     pass


+_yaml_representer_lock = threading.Lock()
 _yaml_representer_registered = False


 def _ensure_yaml_representer() -> None:
     global _yaml_representer_registered
     if _yaml_representer_registered:
         return
+    with _yaml_representer_lock:
+        if _yaml_representer_registered:
+            return

-    def literal_representer(dumper: yaml.SafeDumper, data: LiteralStr) -> yaml.ScalarNode:
-        style = "|" if data and not data.endswith("\n") else "|+"
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
+        def literal_representer(dumper: yaml.SafeDumper, data: LiteralStr) -> yaml.ScalarNode:
+            style = "|+" if data.endswith("\n") else "|"
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

-    def quoted_representer(dumper: yaml.SafeDumper, data: QuotedStr) -> yaml.ScalarNode:
-        # Use double-quote style to properly escape NEL (U+0085) and other special chars
-        return dumper.represent_scalar("tag:yaml.org,2002:str", str(data), style='"')
+        def quoted_representer(dumper: yaml.SafeDumper, data: QuotedStr) -> yaml.ScalarNode:
+            return dumper.represent_scalar("tag:yaml.org,2002:str", str(data), style='"')

-    yaml.add_representer(LiteralStr, literal_representer, Dumper=yaml.SafeDumper)
-    yaml.add_representer(QuotedStr, quoted_representer, Dumper=yaml.SafeDumper)
-    _yaml_representer_registered = True
+        yaml.add_representer(LiteralStr, literal_representer, Dumper=yaml.SafeDumper)
+        yaml.add_representer(QuotedStr, quoted_representer, Dumper=yaml.SafeDumper)
+        _yaml_representer_registered = True


 def _prepare_tree_for_yaml(node: Dict[str, Any]) -> Dict[str, Any]:
     result: Dict[str, Any] = {}
     for key, value in node.items():
-        if isinstance(value, str) and "\x85" in value:
-            # NEL (U+0085) must be quoted to preserve roundtrip - check FIRST
+        if isinstance(value, str) and any(c in value for c in YAML_PROBLEMATIC_CHARS):
             result[key] = QuotedStr(value)
         elif key == "content" and isinstance(value, str) and "\n" in value:
             result[key] = LiteralStr(value)

@@ -109,16 +114,19 @@ def write_tree_content(f: TextIO) -> None:
         except AttributeError:
             buf = None

-        if buf:
-            utf8_stdout = io.TextIOWrapper(buf, encoding="utf-8", newline="")
-            try:
-                write_tree_content(utf8_stdout)
-                utf8_stdout.flush()
-            finally:
-                utf8_stdout.detach()
-        else:
-            write_tree_content(sys.stdout)
-            sys.stdout.flush()
+        try:
+            if buf:
+                utf8_stdout = io.TextIOWrapper(buf, encoding="utf-8", newline="")
+                try:
+                    write_tree_content(utf8_stdout)
+                    utf8_stdout.flush()
+                finally:
+                    utf8_stdout.detach()
+            else:
+                write_tree_content(sys.stdout)
+                sys.stdout.flush()
+        except BrokenPipeError:
+            pass

         logging.info(f"Directory tree written to stdout in {output_format} format")
     else:

@@ -127,14 +135,14 @@ def write_tree_content(f: TextIO) -> None:

         if output_file.is_dir():
             logging.error(f"Cannot write to '{output_file}': is a directory")
-            raise IOError(f"Is a directory: {output_file}")
+            raise IsADirectoryError(f"Is a directory: {output_file}")

         with output_file.open("w", encoding="utf-8") as f:
             write_tree_content(f)
         logging.info(f"Directory tree saved to {output_file} in {output_format} format")
-    except PermissionError as e:
+    except PermissionError:
         logging.error(f"Unable to write to file '{output_file}': Permission denied")
-        raise IOError(f"Permission denied: {output_file}") from e
-    except IOError as e:
+        raise
+    except OSError as e:
         logging.error(f"Unable to write to file '{output_file}': {e}")
         raise

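YAML treats U+0085, U+2028 and U+2029 as line-break characters, so left unescaped in a plain or block scalar they risk being folded or normalized on load; forcing double-quoted style makes PyYAML write them as the \N, \L and \P escapes, which safe_load() turns back into the exact original characters. A minimal round-trip check, using default_style='"' as a stand-in for the QuotedStr representer the module registers (the real code quotes only the affected values, not every scalar):

# Standalone demonstration of the escaping behaviour QuotedStr relies on.
import yaml

text = "line1\u2028line2\u2029line3"

# Force double-quoted scalars; U+2028 and U+2029 are emitted as \L and \P.
dumped = yaml.safe_dump({"content": text}, default_style='"')
print(dumped)

# The escapes decode back to the original separator characters.
assert yaml.safe_load(dumped)["content"] == text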
tests/test_coverage_gaps.py (118 additions, 1 deletion)

@@ -1,6 +1,5 @@
 # tests/test_coverage_gaps.py
 import sys
-from pathlib import Path

 import pytest

@@ -316,6 +315,51 @@ def test_yaml_with_special_unicode_nel(tmp_path):
     assert parsed is not None


+def test_yaml_with_unicode_line_separators(tmp_path):
+    import yaml
+
+    project = tmp_path / "project"
+    project.mkdir()
+
+    ls_char = "\u2028"
+    ps_char = "\u2029"
+    (project / "line_sep.txt").write_text(f"line1{ls_char}line2")
+    (project / "para_sep.txt").write_text(f"para1{ps_char}para2")
+
+    tree = map_directory(project)
+    yaml_output = to_yaml(tree)
+
+    parsed = yaml.safe_load(yaml_output)
+    assert parsed is not None
+
+    line_sep_node = find_node_by_path(parsed, ["line_sep.txt"])
+    para_sep_node = find_node_by_path(parsed, ["para_sep.txt"])
+
+    assert line_sep_node is not None
+    assert ls_char in line_sep_node.get("content", "")
+    assert para_sep_node is not None
+    assert ps_char in para_sep_node.get("content", "")
+
+
+def test_yaml_literal_style_without_trailing_newline(tmp_path):
+    project = tmp_path / "project"
+    project.mkdir()
+
+    (project / "no_newline.txt").write_text("content without newline")
+
+    tree = map_directory(project)
+    yaml_output = to_yaml(tree)
+
+    assert "no_newline.txt" in yaml_output
+
+    import yaml
+
+    parsed = yaml.safe_load(yaml_output)
+    node = find_node_by_path(parsed, ["no_newline.txt"])
+    assert node is not None
+    assert "content without newline" in node.get("content", "")
+
+
 def test_empty_directory_handling(tmp_path):
     project = tmp_path / "project"
     project.mkdir()

@@ -363,3 +407,76 @@ def test_all_verbosity_levels(tmp_path):
     finally:
         root_logger.setLevel(original_level)
         root_logger.handlers = original_handlers
+
+
+def test_binary_detection_at_exact_boundary(tmp_path):
+    from treemapper.tree import BINARY_DETECTION_SAMPLE_SIZE
+
+    project = tmp_path / "project"
+    project.mkdir()
+
+    null_at_boundary = project / "null_at_boundary.bin"
+    content = b"x" * (BINARY_DETECTION_SAMPLE_SIZE - 1) + b"\x00" + b"y" * 100
+    null_at_boundary.write_bytes(content)
+
+    null_after_boundary = project / "null_after_boundary.txt"
+    content2 = b"x" * BINARY_DETECTION_SAMPLE_SIZE + b"\x00" + b"y" * 100
+    null_after_boundary.write_bytes(content2)
+
+    tree = map_directory(project)
+
+    boundary_node = find_node_by_path(tree, ["null_at_boundary.bin"])
+    after_node = find_node_by_path(tree, ["null_after_boundary.txt"])
+
+    assert boundary_node is not None
+    assert "<binary file:" in boundary_node.get("content", "")
+
+    assert after_node is not None
+    assert "x" * 100 in after_node.get("content", "")
+
+
+def test_deep_nesting_with_ignore_patterns(tmp_path):
+    project = tmp_path / "project"
+    project.mkdir()
+
+    depth = 12
+    current = project
+    for i in range(depth):
+        current = current / f"level{i}"
+        current.mkdir()
+        (current / f"keep{i}.txt").write_text(f"keep{i}")
+        (current / f"ignore{i}.bak").write_text(f"ignore{i}")
+
+    (project / "level0" / ".gitignore").write_text("*.bak\n")
+
+    tree = map_directory(project)
+    names = get_all_files_in_tree(tree)
+
+    for i in range(depth):
+        assert f"keep{i}.txt" in names
+
+    assert "ignore0.bak" not in names
+    assert "ignore5.bak" not in names
+    assert "ignore11.bak" not in names
+
+
+def test_middle_slash_pattern_is_anchored(tmp_path):
+    project = tmp_path / "project"
+    project.mkdir()
+
+    (project / "subdir").mkdir()
+    (project / "subdir" / "docs").mkdir()
+    (project / "subdir" / "docs" / "api.txt").write_text("should be ignored")
+    (project / "subdir" / "other").mkdir()
+    (project / "subdir" / "other" / "docs").mkdir()
+    (project / "subdir" / "other" / "docs" / "api.txt").write_text("should be kept")
+
+    (project / "subdir" / ".gitignore").write_text("docs/api.txt\n")
+
+    tree = map_directory(project)
+
+    direct_node = find_node_by_path(tree, ["subdir", "docs", "api.txt"])
+    nested_node = find_node_by_path(tree, ["subdir", "other", "docs", "api.txt"])
+
+    assert direct_node is None
+    assert nested_node is not None
