diff --git a/README.md b/README.md index 06e1dad..4bbe135 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,12 @@ Install this tool using `pip`: pip install files-to-prompt ``` +For more accurate token counting with the `--stats` flag, install with tiktoken support: + +```bash +pip install 'files-to-prompt[stats]' +``` + ## Usage To use `files-to-prompt`, provide the path to one or more files or directories you want to process: @@ -98,6 +104,34 @@ This will output the contents of every file, with each file preceded by its rela find . -name "*.py" -print0 | files-to-prompt --null ``` +- `--stats`: Show statistics about processed files, including file count, token count, and directory breakdown. Statistics are printed to stderr. + + ```bash + files-to-prompt path/to/directory --stats + ``` + + Example output (to stderr): + ``` + Summary: + ======== + Files processed: 69 + Files ignored: 3 + Total tokens: 202,477 + Total lines: 21,363 + + Top 20 files by token count: + 17,563 engine/analysis.py + 15,036 ui/run_benchmark_tab.py + 11,975 tests/test_benchmark.py + ... + + Token count by directory: + tests 62,835 tokens (31.0%) + engine 62,082 tokens (30.7%) + ui 25,627 tokens (12.7%) + ... 
from collections import defaultdict
from pathlib import Path

# Sentinel meaning "encoder lookup not attempted yet"; None means "tiktoken
# is not installed", so the two states must be distinguishable.
_ENCODER_NOT_LOADED = object()
_token_encoder = _ENCODER_NOT_LOADED


def _get_token_encoder():
    """Return the cached tiktoken encoder, or None when tiktoken is missing."""
    global _token_encoder
    if _token_encoder is _ENCODER_NOT_LOADED:
        try:
            import tiktoken
        except ImportError:
            # Remember the failure so the import is not retried per file.
            _token_encoder = None
        else:
            # cl100k_base is the encoding used by GPT-3.5/4.
            _token_encoder = tiktoken.get_encoding("cl100k_base")
    return _token_encoder


def count_tokens(content):
    """Count the tokens in ``content``.

    Uses tiktoken's ``cl100k_base`` encoding when tiktoken is installed and
    falls back to the ``len(content) // 4`` approximation otherwise.

    Args:
        content: Text of a single file.

    Returns:
        The (possibly approximate) number of tokens as an int.
    """
    encoder = _get_token_encoder()
    if encoder is None:
        # Fallback: roughly four characters per token.
        return len(content) // 4
    # Source files may legitimately contain text such as "<|endoftext|>".
    # With the default disallowed_special="all" tiktoken raises ValueError on
    # it; an empty tuple makes it encode that text like any other.
    return len(encoder.encode(content, disallowed_special=()))


class FileStats:
    """Collect statistics about processed files."""

    def __init__(self):
        self.file_tokens = {}  # path -> token count
        self.file_lines = {}  # path -> line count
        self.total_lines = 0
        self.files_processed = 0
        self.files_ignored = 0

    def add_file(self, path, content):
        """Record token and line counts for one processed file.

        Args:
            path: Path of the file as it appears in the output.
            content: Full text of the file.
        """
        tokens = count_tokens(content)

        # One line per newline, plus a final unterminated line if present.
        # An empty file therefore has 0 lines and "a\n" has exactly 1.
        lines = content.count("\n")
        if content and not content.endswith("\n"):
            lines += 1

        self.file_tokens[path] = tokens
        self.file_lines[path] = lines
        self.total_lines += lines
        self.files_processed += 1

    def increment_ignored(self):
        """Increment the ignored file counter."""
        self.files_ignored += 1

    def get_top_files(self, n=20):
        """Return the ``n`` largest files as ``(path, tokens)`` pairs.

        The result is sorted by token count, largest first.
        """
        ranked = sorted(
            self.file_tokens.items(), key=lambda item: item[1], reverse=True
        )
        return ranked[:n]

    def get_directory_summary(self):
        """Aggregate token counts by the first component of each path.

        Paths with a single component are grouped under ``"(root)"``.

        Returns:
            A list of ``(directory, tokens)`` pairs, largest first.
        """
        dir_tokens = defaultdict(int)

        for path, tokens in self.file_tokens.items():
            # Path.parts copes with the platform separator and drops a
            # leading "./".
            parts = Path(path).parts
            bucket = parts[0] if len(parts) > 1 else "(root)"
            dir_tokens[bucket] += tokens

        return sorted(dir_tokens.items(), key=lambda item: item[1], reverse=True)

    def get_total_tokens(self):
        """Return the total token count across all recorded files."""
        return sum(self.file_tokens.values())

    def print_summary(self, writer=None, top_n=20):
        """Write the statistics summary.

        Args:
            writer: Callable taking one string per output line. Defaults to
                ``click.echo(..., err=True)``, i.e. stderr.
            top_n: How many of the largest files to list.
        """
        if writer is None:
            writer = lambda s: click.echo(s, err=True)

        total_tokens = self.get_total_tokens()

        writer("\nSummary:")
        writer("========")
        writer(f"Files processed: {self.files_processed:,}")
        writer(f"Files ignored: {self.files_ignored:,}")
        writer(f"Total tokens: {total_tokens:,}")
        writer(f"Total lines: {self.total_lines:,}")

        # Largest files first.
        writer(f"\nTop {top_n} files by token count:")
        for path, tokens in self.get_top_files(top_n):
            writer(f"{tokens:8,} {path}")

        # Per-directory totals; guard the percentage against an empty run.
        writer("\nToken count by directory:")
        for dir_name, tokens in self.get_directory_summary():
            percentage = (tokens / total_tokens * 100) if total_tokens > 0 else 0
            writer(f"{dir_name:15} {tokens:8,} tokens ({percentage:4.1f}%)")
def test_stats_basic(tmpdir):
    """--stats leaves stdout untouched and writes the summary to stderr."""
    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        os.makedirs("test_dir")
        with open("test_dir/file1.py", "w") as f:
            f.write("def hello():\n return 'world'")
        with open("test_dir/file2.py", "w") as f:
            f.write("# A comment\n" * 50)

        result = runner.invoke(cli, ["test_dir", "--stats"])
        assert result.exit_code == 0

        # Check stdout has normal output
        assert "test_dir/file1.py" in result.stdout
        assert "def hello():" in result.stdout

        # Check stderr has stats
        assert "Summary:" in result.stderr
        assert "Files processed: 2" in result.stderr
        assert "Total tokens:" in result.stderr
        assert "Total lines:" in result.stderr
        assert "Top 20 files by token count:" in result.stderr
        assert "test_dir/file1.py" in result.stderr
        assert "test_dir/file2.py" in result.stderr
        assert "Token count by directory:" in result.stderr


def test_stats_with_subdirectories(tmpdir):
    """Token counts are aggregated by first-level directory."""
    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        os.makedirs("src/models")
        os.makedirs("src/utils")
        os.makedirs("tests")

        with open("src/models/user.py", "w") as f:
            f.write("class User:\n pass\n" * 20)
        with open("src/utils/helpers.py", "w") as f:
            f.write("def helper():\n pass\n" * 10)
        with open("tests/test_user.py", "w") as f:
            f.write("def test_user():\n pass\n" * 5)
        with open("README.md", "w") as f:
            f.write("# Project README\n" * 3)

        result = runner.invoke(cli, [".", "--stats"])
        assert result.exit_code == 0

        # Only look at the directory section: "src" and "tests" also occur in
        # the per-file listing, so a whole-stderr check would pass even if
        # the aggregation were broken.
        _, marker, directory_section = result.stderr.partition(
            "Token count by directory:"
        )
        assert marker
        directory_names = {
            line.split()[0]
            for line in directory_section.splitlines()
            if line.strip()
        }
        assert {"src", "tests", "(root)"} <= directory_names  # (root): README.md
        assert "tokens (" in directory_section  # Check percentage display


def test_stats_with_ignored_files(tmpdir):
    """Files skipped for UnicodeDecodeError are counted as ignored."""
    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        os.makedirs("test_dir")

        # Create a binary file that will be skipped
        with open("test_dir/binary.bin", "wb") as f:
            f.write(b"\xff\xfe\xfd")

        # Create text files
        with open("test_dir/text1.txt", "w") as f:
            f.write("This is text")
        with open("test_dir/text2.txt", "w") as f:
            f.write("More text here")

        result = runner.invoke(cli, ["test_dir", "--stats"])
        assert result.exit_code == 0

        # Should show ignored files
        assert "Files ignored: 1" in result.stderr
        assert "Files processed: 2" in result.stderr


def test_stats_with_extensions_filter(tmpdir):
    """Only files matching the -e filters are included in the statistics."""
    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        os.makedirs("test_dir")

        with open("test_dir/code.py", "w") as f:
            f.write("print('hello')")
        with open("test_dir/doc.md", "w") as f:
            f.write("# Documentation")
        with open("test_dir/data.json", "w") as f:
            f.write('{"key": "value"}')

        result = runner.invoke(cli, ["test_dir", "--stats", "-e", "py", "-e", "md"])
        assert result.exit_code == 0

        # Only .py and .md files should be processed
        assert "Files processed: 2" in result.stderr
        assert "test_dir/code.py" in result.stderr
        assert "test_dir/doc.md" in result.stderr
        assert "test_dir/data.json" not in result.stderr


def test_stats_with_output_file(tmpdir):
    """With -o the prompt goes to the file while stats still go to stderr."""
    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        os.makedirs("test_dir")
        with open("test_dir/file.txt", "w") as f:
            f.write("Content")

        result = runner.invoke(cli, ["test_dir", "--stats", "-o", "output.txt"])
        assert result.exit_code == 0

        # Stats should still go to stderr
        assert "Summary:" in result.stderr
        assert "Files processed: 1" in result.stderr

        # Main output should be in file
        with open("output.txt", "r") as f:
            content = f.read()
        assert "test_dir/file.txt" in content
        assert "Content" in content


def test_stats_empty_directory(tmpdir):
    """An empty directory yields zero counts rather than an error."""
    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        os.makedirs("empty_dir")

        result = runner.invoke(cli, ["empty_dir", "--stats"])
        assert result.exit_code == 0

        assert "Files processed: 0" in result.stderr
        assert "Total tokens: 0" in result.stderr


def test_stats_token_counting_accuracy(tmpdir):
    """Test that token counting is working (tiktoken or fallback)."""
    import re

    runner = CliRunner(mix_stderr=False)
    with tmpdir.as_cwd():
        # Create a file with known content
        content = "The quick brown fox jumps over the lazy dog. " * 10

        with open("test.txt", "w") as f:
            f.write(content)

        result = runner.invoke(cli, ["test.txt", "--stats"])
        assert result.exit_code == 0

        # The exact count depends on whether tiktoken is installed, so only
        # a plausible range is checked.
        assert "Total tokens:" in result.stderr
        match = re.search(r"Total tokens: ([\d,]+)", result.stderr)
        assert match
        token_count = int(match.group(1).replace(",", ""))
        # Should be reasonable - not 0, not huge
        assert 50 < token_count < 500  # Reasonable range for repeated sentence