Skip to content

Commit f8e0d17

Browse files
committed
feat: add token counting, markdown format, and YAML escaping fixes
- Add token counting module with tiktoken support and fallback approximation
- Add o200k_harmony encoding for newer models
- Add warning when --token-encoding is used without --tokens
- Fix YAML escaping for \n, \r, \0, \x85, \u2028, \u2029 in filenames
- Add markdown output format with language-aware code fences
- Add comprehensive tests for tokens (23), markdown (56), YAML escaping (11)
1 parent 7b087b8 commit f8e0d17

15 files changed

+1067
-120
lines changed

CLAUDE.md

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,27 +43,44 @@ treemapper . # YAML to stdout
4343
treemapper . -o tree.yaml # save to file
4444
treemapper . -o - # explicit stdout output
4545
treemapper . --format json # JSON format
46-
treemapper . --format text # tree-style text
46+
treemapper . --format text # plain text with indentation
47+
treemapper . --format md # Markdown with headings and fenced code blocks
4748
treemapper . --no-content # structure only (no file contents)
4849
treemapper . --max-depth 3 # limit directory depth
4950
treemapper . --max-file-bytes 10000 # skip files larger than 10KB
5051
treemapper . -i custom.ignore # custom ignore patterns
5152
treemapper . --no-default-ignores # disable .gitignore/.treemapperignore (custom -i still works)
5253
treemapper . -v 2 # verbose output (0=ERROR, 1=WARNING, 2=INFO, 3=DEBUG)
53-
treemapper . -c # copy output to clipboard (also outputs to stdout)
54-
treemapper . --copy-only # copy to clipboard only (no stdout output)
54+
treemapper . -c # copy to clipboard (no stdout)
55+
treemapper . -c -o tree.yaml # copy to clipboard + save to file
56+
treemapper . --tokens # show token count (for LLM context planning)
57+
treemapper . --tokens --token-encoding cl100k_base # use GPT-4 tokenizer
5558
treemapper --version # show version
5659
```
5760

61+
## Token Counting
62+
63+
Show token count for LLM context planning with `--tokens`:
64+
65+
```bash
66+
treemapper . --tokens # 12,847 tokens (o200k_base)
67+
treemapper . --tokens --copy # tokens + clipboard
68+
```
69+
70+
**Encodings:**
71+
- `o200k_base` (default) — GPT-4o tokenizer
72+
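- `o200k_harmony` — tokenizer for OpenAI gpt-oss models (Harmony format); requires a tiktoken release that ships this encoding, otherwise the count falls back to the approximation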
73+
- `cl100k_base` — GPT-4/GPT-3.5 tokenizer
74+
75+
Token count is displayed on stderr only when connected to a TTY (won't break pipes).
76+
5877
## Clipboard Support
5978

6079
Copy output directly to clipboard with `-c` or `--copy`:
6180

6281
```bash
63-
treemapper . -c # copy to clipboard + stdout
82+
treemapper . -c # copy to clipboard (no stdout)
6483
treemapper . -c -o tree.yaml # copy to clipboard + save to file
65-
treemapper . --copy-only # copy to clipboard only
66-
treemapper . --copy-only -o tree.yaml # copy to clipboard + save to file (no stdout)
6784
```
6885

6986
**System Requirements:**
@@ -75,7 +92,7 @@ treemapper . --copy-only -o tree.yaml # copy to clipboard + save to file (no std
7592
## Python API
7693

7794
```python
78-
from treemapper import map_directory, to_yaml, to_json, to_text
95+
from treemapper import map_directory, to_yaml, to_json, to_text, to_markdown
7996

8097
# Full function signature
8198
tree = map_directory(
@@ -96,6 +113,7 @@ tree = map_directory(".", max_file_bytes=50000, ignore_file="custom.ignore")
96113
yaml_str = to_yaml(tree)
97114
json_str = to_json(tree)
98115
text_str = to_text(tree)
116+
md_str = to_markdown(tree) # or to_md(tree)
99117
```
100118

101119
## Ignore Patterns
@@ -134,8 +152,9 @@ Integration tests only - test against real filesystem. No mocking.
134152
src/treemapper/
135153
├── cli.py # argument parsing
136154
├── ignore.py # gitignore/treemapperignore handling
155+
├── tokens.py # token counting (tiktoken, with approximate fallback)
137156
├── tree.py # directory traversal
138-
├── writer.py # YAML/JSON/text output
157+
├── writer.py # YAML/JSON/text/Markdown output
139158
└── treemapper.py # main entry point
140159
```
141160

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ classifiers = [
7070
dependencies = [
7171
"pathspec>=0.11,<2.0",
7272
"pyyaml>=6.0.2,<8.0",
73+
"tiktoken>=0.7,<1.0",
7374
]
7475

7576
[project.urls]

src/treemapper/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
from .ignore import get_ignore_specs
66
from .tree import TreeBuildContext, build_tree
77
from .version import __version__
8-
from .writer import write_tree_json, write_tree_text, write_tree_yaml
8+
from .writer import write_tree_json, write_tree_markdown, write_tree_text, write_tree_yaml
99

1010
__all__ = [
1111
"__version__",
1212
"map_directory",
1313
"to_json",
14+
"to_markdown",
15+
"to_md",
1416
"to_text",
1517
"to_yaml",
1618
]
@@ -63,3 +65,12 @@ def to_text(tree: dict[str, Any]) -> str:
6365
buf = io.StringIO()
6466
write_tree_text(buf, tree)
6567
return buf.getvalue()
68+
69+
70+
def to_markdown(tree: dict[str, Any]) -> str:
    """Return the text that ``write_tree_markdown`` produces for *tree*.

    The writer is pointed at an in-memory text buffer, and the accumulated
    contents of that buffer are returned as a single string.
    """
    with io.StringIO() as stream:
        write_tree_markdown(stream, tree)
        return stream.getvalue()


# Short alias: ``to_md`` is the same function object as ``to_markdown``.
to_md = to_markdown

src/treemapper/cli.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ class ParsedArgs:
1919
no_content: bool
2020
max_file_bytes: Optional[int]
2121
copy: bool
22-
copy_only: bool
22+
show_tokens: bool
23+
token_encoding: str
2324

2425

2526
DEFAULT_IGNORES_HELP = """
@@ -53,15 +54,12 @@ def parse_args() -> ParsedArgs:
5354
parser.add_argument("directory", nargs="?", default=".", help="The directory to analyze")
5455
parser.add_argument("-i", "--ignore-file", default=None, help="Path to custom ignore file")
5556
parser.add_argument("-o", "--output-file", default=None, help="Output file (default: stdout, use '-' to force stdout)")
56-
parser.add_argument("--format", choices=["yaml", "json", "text"], default="yaml", help="Output format")
57+
parser.add_argument("--format", choices=["yaml", "json", "text", "md"], default="yaml", help="Output format")
5758
parser.add_argument("--no-default-ignores", action="store_true", help="Disable all default ignores")
5859
parser.add_argument("--max-depth", type=int, default=None, metavar="N", help="Maximum traversal depth")
5960
parser.add_argument("--no-content", action="store_true", help="Skip file contents (structure only)")
6061
parser.add_argument("--max-file-bytes", type=int, default=None, metavar="N", help="Skip files larger than N bytes")
61-
parser.add_argument("-c", "--copy", action="store_true", help="Copy output to clipboard")
62-
parser.add_argument(
63-
"--copy-only", action="store_true", help="Copy to clipboard, suppress stdout (file output with -o still works)"
64-
)
62+
parser.add_argument("-c", "--copy", action="store_true", help="Copy to clipboard (suppresses stdout unless -o is used)")
6563
parser.add_argument(
6664
"-v",
6765
"--verbosity",
@@ -71,9 +69,19 @@ def parse_args() -> ParsedArgs:
7169
metavar="[0-3]",
7270
help="Verbosity: 0=ERROR, 1=WARNING, 2=INFO, 3=DEBUG",
7371
)
72+
parser.add_argument("--tokens", action="store_true", help="Show token count for output")
73+
parser.add_argument(
74+
"--token-encoding",
75+
choices=["o200k_base", "o200k_harmony", "cl100k_base"],
76+
default="o200k_base",
77+
help="Tokenizer encoding (default: o200k_base)",
78+
)
7479

7580
args = parser.parse_args()
7681

82+
if args.token_encoding != "o200k_base" and not args.tokens:
83+
print("Warning: --token-encoding has no effect without --tokens", file=sys.stderr)
84+
7785
if args.max_depth is not None and args.max_depth < 0:
7886
print(f"Error: --max-depth must be non-negative, got {args.max_depth}", file=sys.stderr)
7987
sys.exit(1)
@@ -97,6 +105,9 @@ def parse_args() -> ParsedArgs:
97105
output_file = None
98106
if args.output_file and args.output_file != "-":
99107
output_file = Path(args.output_file).resolve()
108+
if output_file.is_dir():
109+
print(f"Error: '{args.output_file}' is a directory, not a file.", file=sys.stderr)
110+
sys.exit(1)
100111

101112
ignore_file = None
102113
if args.ignore_file:
@@ -115,6 +126,7 @@ def parse_args() -> ParsedArgs:
115126
max_depth=args.max_depth,
116127
no_content=args.no_content,
117128
max_file_bytes=args.max_file_bytes,
118-
copy=args.copy or args.copy_only,
119-
copy_only=args.copy_only,
129+
copy=args.copy,
130+
show_tokens=args.tokens,
131+
token_encoding=args.token_encoding,
120132
)

src/treemapper/tokens.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import sys
2+
from dataclasses import dataclass
3+
from functools import lru_cache
4+
from typing import Any, Optional
5+
6+
7+
@dataclass
class TokenCountResult:
    """Outcome of counting the tokens in a piece of text."""

    # Number of tokens (exact or estimated, see ``is_exact``).
    count: int
    # True when the count came from a real encoder, False for an estimate.
    is_exact: bool
    # Label describing how the count was produced (e.g. the encoding name).
    encoding: str
12+
13+
14+
@lru_cache(maxsize=4)
15+
def _get_encoder(encoding: str) -> Optional[Any]:
16+
try:
17+
import tiktoken
18+
19+
return tiktoken.get_encoding(encoding)
20+
except (ImportError, Exception):
21+
return None
22+
23+
24+
def count_tokens(text: str, encoding: str = "o200k_base") -> TokenCountResult:
    """Count the tokens in *text* using the encoder named *encoding*.

    When ``_get_encoder`` yields no usable encoder, the count is estimated
    as ``len(text) // 4``; the result is then flagged as inexact and its
    ``encoding`` field is set to ``"approximation"`` rather than the
    requested encoding name.
    """
    encoder = _get_encoder(encoding)
    if not encoder:
        # No encoder available: rough estimate of four characters per token.
        return TokenCountResult(count=len(text) // 4, is_exact=False, encoding="approximation")
    token_ids = encoder.encode(text)
    return TokenCountResult(count=len(token_ids), is_exact=True, encoding=encoding)
29+
30+
31+
def print_token_summary(text: str, encoding: str = "o200k_base") -> None:
    """Write a one-line token count for *text* to stderr.

    Nothing is counted or printed unless stderr is attached to a TTY.
    An estimated (inexact) count is prefixed with ``~``.
    """
    if sys.stderr.isatty():
        result = count_tokens(text, encoding)
        prefix = "~" if not result.is_exact else ""
        print(f"{prefix}{result.count:,} tokens ({result.encoding})", file=sys.stderr)

src/treemapper/treemapper.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def main() -> None:
1313
from .clipboard import ClipboardError, copy_to_clipboard
1414
from .ignore import get_ignore_specs
1515
from .logger import setup_logging
16+
from .tokens import print_token_summary
1617
from .tree import TreeBuildContext, build_tree
1718
from .writer import tree_to_string, write_string_to_file, write_tree_to_file
1819

@@ -34,22 +35,24 @@ def main() -> None:
3435
"children": build_tree(args.root_dir, ctx),
3536
}
3637

37-
output_content = None
38-
if args.copy:
38+
output_content: str = ""
39+
if args.copy or args.output_file or args.show_tokens:
3940
output_content = tree_to_string(directory_tree, args.output_format)
41+
42+
if args.show_tokens:
43+
print_token_summary(output_content, args.token_encoding)
44+
45+
if args.copy:
4046
try:
4147
byte_size = copy_to_clipboard(output_content)
4248
print(f"Copied to clipboard ({_format_size(byte_size)})", file=sys.stderr)
4349
except ClipboardError as e:
4450
logging.warning(f"Clipboard: {e}")
4551

46-
if args.copy_only and args.output_file is None:
47-
return
48-
49-
if output_content is not None:
52+
if args.output_file:
5053
write_string_to_file(output_content, args.output_file, args.output_format)
51-
else:
52-
write_tree_to_file(directory_tree, args.output_file, args.output_format)
54+
elif not args.copy:
55+
write_tree_to_file(directory_tree, None, args.output_format)
5356

5457

5558
if __name__ == "__main__":

0 commit comments

Comments
 (0)