Skip to content

Add --stats flag for token-count statistics #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ Install this tool using `pip`:
pip install files-to-prompt
```

For more accurate token counting with the `--stats` flag, install with tiktoken support:

```bash
pip install 'files-to-prompt[stats]'
```

## Usage

To use `files-to-prompt`, provide the path to one or more files or directories you want to process:
Expand Down Expand Up @@ -98,6 +104,34 @@ This will output the contents of every file, with each file preceded by its rela
find . -name "*.py" -print0 | files-to-prompt --null
```

- `--stats`: Show statistics about processed files, including file count, token count, and directory breakdown. Statistics are printed to stderr.

```bash
files-to-prompt path/to/directory --stats
```

Example output (to stderr):
```
Summary:
========
Files processed: 69
Files ignored: 3
Total tokens: 202,477
Total lines: 21,363

Top 20 files by token count:
17,563 engine/analysis.py
15,036 ui/run_benchmark_tab.py
11,975 tests/test_benchmark.py
...

Token count by directory:
tests 62,835 tokens (31.0%)
engine 62,082 tokens (30.7%)
ui 25,627 tokens (12.7%)
...
```

### Example

Suppose you have a directory structure like this:
Expand Down
123 changes: 121 additions & 2 deletions files_to_prompt/cli.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,105 @@
import os
import sys
from fnmatch import fnmatch
from collections import defaultdict
from pathlib import Path

import click

global_index = 1

# Token counting function with tiktoken fallback
def count_tokens(content):
"""Count tokens with tiktoken fallback to char approximation."""
try:
import tiktoken
# Use cl100k_base encoding (GPT-3.5/4)
enc = tiktoken.get_encoding("cl100k_base")
return len(enc.encode(content))
except ImportError:
# Fallback: chars/4 approximation
return len(content) // 4


class FileStats:
"""Collect statistics about processed files."""

def __init__(self):
self.file_tokens = {} # path -> token_count
self.file_lines = {} # path -> line_count
self.total_lines = 0
self.files_processed = 0
self.files_ignored = 0

def add_file(self, path, content):
"""Add a file's statistics."""
tokens = count_tokens(content)
lines = content.count('\n') + 1

self.file_tokens[path] = tokens
self.file_lines[path] = lines
self.total_lines += lines
self.files_processed += 1

def increment_ignored(self):
"""Increment the ignored file counter."""
self.files_ignored += 1

def get_top_files(self, n=20):
"""Return top N files by token count."""
sorted_files = sorted(self.file_tokens.items(), key=lambda x: x[1], reverse=True)
return sorted_files[:n]

def get_directory_summary(self):
"""Aggregate token counts by first-level directories."""
dir_tokens = defaultdict(int)

for path, tokens in self.file_tokens.items():
# Normalize path separators
path_parts = Path(path).parts

if len(path_parts) > 1:
# Use first directory in path
first_dir = path_parts[0]
dir_tokens[first_dir] += tokens
else:
# File in root
dir_tokens["(root)"] += tokens

# Sort by token count descending
return sorted(dir_tokens.items(), key=lambda x: x[1], reverse=True)

def get_total_tokens(self):
"""Get total token count across all files."""
return sum(self.file_tokens.values())

def print_summary(self, writer=None):
"""Print the statistics summary to stderr."""
if writer is None:
writer = lambda s: click.echo(s, err=True)

total_tokens = self.get_total_tokens()

writer("\nSummary:")
writer("========")
writer(f"Files processed: {self.files_processed:,}")
writer(f"Files ignored: {self.files_ignored:,}")
writer(f"Total tokens: {total_tokens:,}")
writer(f"Total lines: {self.total_lines:,}")

# Top files
writer("\nTop 20 files by token count:")
for path, tokens in self.get_top_files(20):
writer(f"{tokens:8,} {path}")

# Directory summary
writer("\nToken count by directory:")
dir_summary = self.get_directory_summary()
for dir_name, tokens in dir_summary:
percentage = (tokens / total_tokens * 100) if total_tokens > 0 else 0
writer(f"{dir_name:15} {tokens:8,} tokens ({percentage:4.1f}%)")


EXT_TO_LANG = {
"py": "python",
"c": "c",
Expand Down Expand Up @@ -110,14 +204,20 @@ def process_path(
claude_xml,
markdown,
line_numbers=False,
stats=None,
):
if os.path.isfile(path):
try:
with open(path, "r") as f:
print_path(writer, path, f.read(), claude_xml, markdown, line_numbers)
content = f.read()
print_path(writer, path, content, claude_xml, markdown, line_numbers)
if stats:
stats.add_file(path, content)
except UnicodeDecodeError:
warning_message = f"Warning: Skipping file {path} due to UnicodeDecodeError"
click.echo(click.style(warning_message, fg="red"), err=True)
if stats:
stats.increment_ignored()
elif os.path.isdir(path):
for root, dirs, files in os.walk(path):
if not include_hidden:
Expand Down Expand Up @@ -157,19 +257,24 @@ def process_path(
file_path = os.path.join(root, file)
try:
with open(file_path, "r") as f:
content = f.read()
print_path(
writer,
file_path,
f.read(),
content,
claude_xml,
markdown,
line_numbers,
)
if stats:
stats.add_file(file_path, content)
except UnicodeDecodeError:
warning_message = (
f"Warning: Skipping file {file_path} due to UnicodeDecodeError"
)
click.echo(click.style(warning_message, fg="red"), err=True)
if stats:
stats.increment_ignored()


def read_paths_from_stdin(use_null_separator):
Expand Down Expand Up @@ -244,6 +349,11 @@ def read_paths_from_stdin(use_null_separator):
is_flag=True,
help="Use NUL character as separator when reading from stdin",
)
@click.option(
"--stats",
is_flag=True,
help="Show statistics about processed files (file count, token count, etc.)",
)
@click.version_option()
def cli(
paths,
Expand All @@ -257,6 +367,7 @@ def cli(
markdown,
line_numbers,
null,
stats,
):
"""
Takes one or more paths to files or directories and outputs every file,
Expand Down Expand Up @@ -308,6 +419,9 @@ def cli(
if output_file:
fp = open(output_file, "w", encoding="utf-8")
writer = lambda s: print(s, file=fp)

# Initialize stats collector if requested
file_stats = FileStats() if stats else None
for path in paths:
if not os.path.exists(path):
raise click.BadArgumentUsage(f"Path does not exist: {path}")
Expand All @@ -327,8 +441,13 @@ def cli(
claude_xml,
markdown,
line_numbers,
file_stats,
)
if claude_xml:
writer("</documents>")
if fp:
fp.close()

# Print statistics summary to stderr if requested
if file_stats:
file_stats.print_summary()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ files-to-prompt = "files_to_prompt.cli:cli"

[project.optional-dependencies]
test = ["pytest"]
stats = ["tiktoken>=0.5.0"]
Loading