simonw · danshapiro · Jul 13, 2025
diff --git a/README.md b/README.md
@@ -17,6 +17,12 @@ Install this tool using `pip`:
 pip install files-to-prompt
 ```
 
+For more accurate token counting with the `--stats` flag, install with tiktoken support:
+
+```bash
+pip install 'files-to-prompt[stats]'
+```
+
 ## Usage
 
 To use `files-to-prompt`, provide the path to one or more files or directories you want to process:
@@ -98,6 +104,34 @@ This will output the contents of every file, with each file preceded by its rela
   find . -name "*.py" -print0 | files-to-prompt --null
   ```
 
+- `--stats`: Show statistics about processed files, including file count, token count, and directory breakdown. Statistics are printed to stderr.
+
+  ```bash
+  files-to-prompt path/to/directory --stats
+  ```
+
+  Example output (to stderr):
+  ```
+  Summary:
+  ========
+  Files processed: 69
+  Files ignored: 3
+  Total tokens: 202,477
+  Total lines: 21,363
+
+  Top 20 files by token count:
+    17,563  engine/analysis.py
+    15,036  ui/run_benchmark_tab.py
+    11,975  tests/test_benchmark.py
+    ...
+
+  Token count by directory:
+  tests        62,835 tokens (31.0%)
+  engine       62,082 tokens (30.7%)
+  ui           25,627 tokens (12.7%)
+  ...
+  ```
+
 ### Example
 
 Suppose you have a directory structure like this:

diff --git a/files_to_prompt/cli.py b/files_to_prompt/cli.py
@@ -1,11 +1,105 @@
 import os
 import sys
 from fnmatch import fnmatch
+from collections import defaultdict
+from pathlib import Path
 
 import click
 
 global_index = 1
 
+# Token counting function with tiktoken fallback
+def count_tokens(content):
+    """Count tokens with tiktoken fallback to char approximation."""
+    try:
+        import tiktoken
+        # Use cl100k_base encoding (GPT-3.5/4)
+        enc = tiktoken.get_encoding("cl100k_base")
+        return len(enc.encode(content))
+    except ImportError:
+        # Fallback: chars/4 approximation
+        return len(content) // 4
+
+
+class FileStats:
+    """Collect statistics about processed files."""
+
+    def __init__(self):
+        self.file_tokens = {}  # path -> token_count
+        self.file_lines = {}   # path -> line_count
+        self.total_lines = 0
+        self.files_processed = 0
+        self.files_ignored = 0
+
+    def add_file(self, path, content):
+        """Add a file's statistics."""
+        tokens = count_tokens(content)
+        lines = content.count('\n') + 1
+
+        self.file_tokens[path] = tokens
+        self.file_lines[path] = lines
+        self.total_lines += lines
+        self.files_processed += 1
+
+    def increment_ignored(self):
+        """Increment the ignored file counter."""
+        self.files_ignored += 1
+
+    def get_top_files(self, n=20):
+        """Return top N files by token count."""
+        sorted_files = sorted(self.file_tokens.items(), key=lambda x: x[1], reverse=True)
+        return sorted_files[:n]
+
+    def get_directory_summary(self):
+        """Aggregate token counts by first-level directories."""
+        dir_tokens = defaultdict(int)
+
+        for path, tokens in self.file_tokens.items():
+            # Normalize path separators
+            path_parts = Path(path).parts
+
+            if len(path_parts) > 1:
+                # Use first directory in path
+                first_dir = path_parts[0]
+                dir_tokens[first_dir] += tokens
+            else:
+                # File in root
+                dir_tokens["(root)"] += tokens
+
+        # Sort by token count descending
+        return sorted(dir_tokens.items(), key=lambda x: x[1], reverse=True)
+
+    def get_total_tokens(self):
+        """Get total token count across all files."""
+        return sum(self.file_tokens.values())
+
+    def print_summary(self, writer=None):
+        """Print the statistics summary to stderr."""
+        if writer is None:
+            writer = lambda s: click.echo(s, err=True)
+
+        total_tokens = self.get_total_tokens()
+
+        writer("\nSummary:")
+        writer("========")
+        writer(f"Files processed: {self.files_processed:,}")
+        writer(f"Files ignored: {self.files_ignored:,}")
+        writer(f"Total tokens: {total_tokens:,}")
+        writer(f"Total lines: {self.total_lines:,}")
+
+        # Top files
+        writer("\nTop 20 files by token count:")
+        for path, tokens in self.get_top_files(20):
+            writer(f"{tokens:8,}  {path}")
+
+        # Directory summary
+        writer("\nToken count by directory:")
+        dir_summary = self.get_directory_summary()
+        for dir_name, tokens in dir_summary:
+            percentage = (tokens / total_tokens * 100) if total_tokens > 0 else 0
+            writer(f"{dir_name:15} {tokens:8,} tokens ({percentage:4.1f}%)")
+
+
 EXT_TO_LANG = {
     "py": "python",
     "c": "c",
@@ -110,14 +204,20 @@ def process_path(
     claude_xml,
     markdown,
     line_numbers=False,
+    stats=None,
 ):
     if os.path.isfile(path):
         try:
             with open(path, "r") as f:
-                print_path(writer, path, f.read(), claude_xml, markdown, line_numbers)
+                content = f.read()
+                print_path(writer, path, content, claude_xml, markdown, line_numbers)
+                if stats:
+                    stats.add_file(path, content)
         except UnicodeDecodeError:
             warning_message = f"Warning: Skipping file {path} due to UnicodeDecodeError"
             click.echo(click.style(warning_message, fg="red"), err=True)
+            if stats:
+                stats.increment_ignored()
     elif os.path.isdir(path):
         for root, dirs, files in os.walk(path):
             if not include_hidden:
@@ -157,19 +257,24 @@ def process_path(
                 file_path = os.path.join(root, file)
                 try:
                     with open(file_path, "r") as f:
+                        content = f.read()
                         print_path(
                             writer,
                             file_path,
-                            f.read(),
+                            content,
                             claude_xml,
                             markdown,
                             line_numbers,
                         )
+                        if stats:
+                            stats.add_file(file_path, content)
                 except UnicodeDecodeError:
                     warning_message = (
                         f"Warning: Skipping file {file_path} due to UnicodeDecodeError"
                     )
                     click.echo(click.style(warning_message, fg="red"), err=True)
+                    if stats:
+                        stats.increment_ignored()
 
 
 def read_paths_from_stdin(use_null_separator):
@@ -244,6 +349,11 @@ def read_paths_from_stdin(use_null_separator):
     is_flag=True,
     help="Use NUL character as separator when reading from stdin",
 )
+@click.option(
+    "--stats",
+    is_flag=True,
+    help="Show statistics about processed files (file count, token count, etc.)",
+)
 @click.version_option()
 def cli(
     paths,
@@ -257,6 +367,7 @@ def cli(
     markdown,
     line_numbers,
     null,
+    stats,
 ):
     """
     Takes one or more paths to files or directories and outputs every file,
@@ -308,6 +419,9 @@ def cli(
     if output_file:
         fp = open(output_file, "w", encoding="utf-8")
         writer = lambda s: print(s, file=fp)
+
+    # Initialize stats collector if requested
+    file_stats = FileStats() if stats else None
     for path in paths:
         if not os.path.exists(path):
             raise click.BadArgumentUsage(f"Path does not exist: {path}")
@@ -327,8 +441,13 @@ def cli(
             claude_xml,
             markdown,
             line_numbers,
+            file_stats,
         )
     if claude_xml:
         writer("</documents>")
     if fp:
         fp.close()
+
+    # Print statistics summary to stderr if requested
+    if file_stats:
+        file_stats.print_summary()
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,3 +24,4 @@ files-to-prompt = "files_to_prompt.cli:cli"
 
 [project.optional-dependencies]
 test = ["pytest"]
+stats = ["tiktoken>=0.5.0"]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,3 +24,4 @@ files-to-prompt = "files_to_prompt.cli:cli"

		[project.optional-dependencies]
		test = ["pytest"]
		stats = ["tiktoken>=0.5.0"]