|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse, os, sys, pathlib, io, subprocess, shutil |
| 3 | + |
# Directory names pruned from the os.walk() fallback traversal.
DEFAULT_EXCLUDE_DIRS = {
    # version-control metadata
    ".git", ".hg", ".svn",
    # editor / IDE settings
    ".idea", ".vscode",
    # build output and tool caches
    "dist", "build", "out", "target", ".next", ".nuxt", ".tox", "__pycache__",
}

# Glob patterns for files that are never dumped.
DEFAULT_EXCLUDE_GLOBS = {
    # generated or bundled artifacts
    "*.min.js", "*.map", "*.lock",
    # archives and disk images
    "*.jar", "*.zip", "*.gz", "*.tgz", "*.bz2", "*.7z", "*.iso",
    # images and documents
    "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.ico", "*.pdf",
    # fonts
    "*.woff", "*.woff2", "*.ttf", "*.otf",
    # audio / video
    "*.mp4", "*.mov", "*.avi", "*.mp3", "*.flac", "*.wav",
    # opaque or sensitive blobs
    "*.bin", "*.secret",
}

# Lower-case file suffixes that are eligible for the dump.
DEFAULT_INCLUDE_EXTS = {
    # general-purpose languages
    ".go", ".ts", ".tsx", ".js", ".jsx", ".py", ".rs", ".java", ".rb", ".php", ".pl",
    # C family
    ".c", ".h", ".cpp", ".hpp", ".cc", ".m", ".mm",
    # shell and batch scripts
    ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat",
    # configuration and build files
    ".json", ".yml", ".yaml", ".toml", ".ini", ".env", ".properties", ".gradle", ".dockerfile",
    # schemas, queries and plain documentation
    ".proto", ".graphql", ".sql", ".md", ".txt",
}

# Lower-cased file names that are kept even though their suffix is not listed above.
ALSO_ALLOW_NAME = {"dockerfile", "makefile", "makefile.win"}
| 14 | + |
def is_text_file(path: str, max_probe: int = 65536) -> bool:
    """Return True when the beginning of *path* looks like UTF-8 text.

    At most ``max_probe`` bytes are read. The file is treated as binary when
    the probe contains a NUL byte or is not valid UTF-8. A multi-byte
    character that is merely cut in half by the probe boundary does not count
    as invalid; an incomplete sequence at the real end of the file does.

    Args:
        path: File system path to inspect.
        max_probe: Maximum number of leading bytes to examine.

    Returns:
        True if the probed bytes are NUL-free, valid UTF-8; False otherwise,
        including when the file cannot be opened or read.
    """
    import codecs  # local import: not part of the file's top-level imports

    try:
        with open(path, "rb") as f:
            chunk = f.read(max_probe)
            # Only a probe that stopped short of EOF may legitimately end in
            # the middle of a character, so find out whether more data follows.
            at_eof = len(chunk) < max_probe or not f.read(1)
    except (OSError, ValueError):
        # Unreadable file or unusable path (e.g. embedded NUL): not dumpable.
        return False

    if b"\x00" in chunk:
        return False

    decoder = codecs.getincrementaldecoder("utf-8")(errors="strict")
    try:
        # final=False lets a trailing partial sequence stay buffered instead
        # of raising; malformed bytes elsewhere still raise.
        decoder.decode(chunk, final=at_eof)
    except UnicodeDecodeError:
        return False
    return True
| 25 | + |
def should_keep_by_ext(p: pathlib.Path, include_exts) -> bool:
    """Decide whether *p* passes the extension allow-list.

    A falsy ``include_exts`` disables the filter. Otherwise the file is kept
    when its lower-cased suffix is in ``include_exts`` or its lower-cased
    name is listed in ``ALSO_ALLOW_NAME``.
    """
    if not include_exts:
        return True
    if p.suffix.lower() in include_exts:
        return True
    # Suffix not allowed: fall back to the well-known extension-less names.
    return p.name.lower() in ALSO_ALLOW_NAME
| 31 | + |
def should_exclude_by_glob(rel: str, globs=None) -> bool:
    """Return True when the relative path *rel* matches an exclusion glob.

    Args:
        rel: Path relative to the repo root.
        globs: Optional iterable of glob patterns to test against. When
            omitted (``None``) the module-level ``DEFAULT_EXCLUDE_GLOBS``
            is used, which is the original behaviour.

    Returns:
        True if ``PurePath(rel).match(pattern)`` succeeds for any pattern.
    """
    patterns = DEFAULT_EXCLUDE_GLOBS if globs is None else globs
    candidate = pathlib.PurePath(rel)
    return any(candidate.match(pattern) for pattern in patterns)
| 39 | + |
def git_available() -> bool:
    """Report whether a ``git`` executable can be located via ``shutil.which``."""
    git_path = shutil.which("git")
    return git_path is not None
| 42 | + |
def iter_files_git(repo_root: pathlib.Path):
    """
    Yields repo files not ignored by .gitignore/.git/info/exclude/global ignores.
    Uses: git ls-files --cached --others --exclude-standard

    Each path is yielded at most once, as ``repo_root / <relative path>``.
    Entries that are not regular files on disk are skipped.

    Raises subprocess.CalledProcessError when git exits with a non-zero
    status. The command runs when the generator is first advanced, not when
    this function is called.
    """
    cmd = ["git", "-C", str(repo_root), "ls-files", "-z", "--cached", "--others", "--exclude-standard", "--"]
    out = subprocess.check_output(cmd)
    # During a merge conflict the index holds several stages of the same path
    # and ls-files prints the name once per stage; remember what was emitted.
    seen = set()
    for rel_b in out.split(b"\x00"):
        if not rel_b or rel_b in seen:
            continue
        seen.add(rel_b)
        rel = rel_b.decode("utf-8", errors="replace")
        p = repo_root / rel
        if p.is_file():
            yield p
| 57 | + |
def iter_files_walk(repo_root: pathlib.Path):
    """
    Fallback lister: traverse the tree with os.walk() and filter by hand.

    Directories named in DEFAULT_EXCLUDE_DIRS are not descended into, and
    files matching an exclusion glob are dropped. This does NOT perfectly
    mirror .gitignore.
    """
    for current_dir, subdirs, filenames in os.walk(repo_root):
        # In-place slice assignment so os.walk() skips the pruned directories.
        kept = [name for name in subdirs if name not in DEFAULT_EXCLUDE_DIRS]
        subdirs[:] = kept
        base = pathlib.Path(current_dir)
        for filename in filenames:
            candidate = base / filename
            rel_posix = candidate.relative_to(repo_root).as_posix()
            if not should_exclude_by_glob(rel_posix):
                yield candidate
| 71 | + |
def iter_repo_files(repo_root: pathlib.Path, use_git: bool):
    """Yield candidate files under *repo_root* using git when possible.

    git is used when ``use_git`` is true, or when ``repo_root/.git`` exists
    and a git executable is available; otherwise the os.walk() fallback is
    used.
    """
    # Short-circuit keeps the probing order: explicit flag, then .git, then PATH.
    prefer_git = use_git or ((repo_root / ".git").exists() and git_available())
    lister = iter_files_git if prefer_git else iter_files_walk
    yield from lister(repo_root)
| 81 | + |
def write_repo(repo_root: pathlib.Path, out_prefix: pathlib.Path, max_mb: float, force_git: bool):
    """Concatenate the repo's text source files into one or more dump files.

    Every kept file is written as ``File: <rel>\\nContents:\\n<text>\\n\\n``.
    The first chunk is written to ``out_prefix``; later chunks go to
    ``<out_prefix>.partN`` (N starting at 2). A new chunk is started when
    adding the next file would push a non-empty chunk past ``max_mb``.

    Args:
        repo_root: Directory to dump; resolved to an absolute path first.
        out_prefix: Path of the first output file and name prefix of the rest.
        max_mb: Size limit per output file, in MiB.
        force_git: Passed to ``iter_repo_files`` to force git-based listing.

    Returns:
        ``(chunk_count, file_count)``.

    Raises:
        Whatever the file listing raises (for instance
        ``subprocess.CalledProcessError`` from the git lister); the open
        output file is closed before the exception propagates.
    """
    repo_root = repo_root.resolve()
    max_bytes = int(max_mb * (1024**2))
    out_resolved = out_prefix.resolve()

    def chunk_path(idx):
        # Chunk 1 is the prefix itself; later chunks append ".partN" to its name.
        if idx == 1:
            return out_prefix
        return out_prefix.with_name(f"{out_prefix.name}.part{idx}")

    def open_chunk(idx):
        return open(chunk_path(idx), "w", encoding="utf-8", newline="\n")

    def is_own_output(candidate):
        # The output may live inside the repo; dumping it would feed the
        # half-written dump back into itself.
        resolved = candidate.resolve()
        if resolved == out_resolved:
            return True
        if resolved.parent != out_resolved.parent:
            return False
        part_prefix = out_resolved.name + ".part"
        return resolved.name.startswith(part_prefix) and resolved.name[len(part_prefix):].isdigit()

    chunk_idx = 1
    bytes_in_chunk = 0
    count = 0
    fh = open_chunk(chunk_idx)
    try:
        for p in iter_repo_files(repo_root, force_git):
            rel = p.relative_to(repo_root).as_posix()
            if should_exclude_by_glob(rel):
                continue
            if not should_keep_by_ext(p, DEFAULT_INCLUDE_EXTS):
                continue
            if is_own_output(p):
                continue
            if not is_text_file(str(p)):
                continue

            header = f"File: {rel}\nContents:\n"
            try:
                with open(p, "r", encoding="utf-8") as rf:
                    content = rf.read()
            except UnicodeDecodeError:
                # Only the head of the file was probed; bytes further in may
                # not be UTF-8. latin-1 maps every byte, so this cannot fail.
                with open(p, "r", encoding="latin-1") as rf:
                    content = rf.read()

            block = header + content.rstrip() + "\n\n"
            block_bytes = len(block.encode("utf-8"))

            # Roll over only when the chunk already holds data, so a single
            # oversized file still lands in a chunk of its own.
            if bytes_in_chunk + block_bytes > max_bytes and bytes_in_chunk > 0:
                fh.close()
                chunk_idx += 1
                fh = open_chunk(chunk_idx)
                bytes_in_chunk = 0

            fh.write(block)
            bytes_in_chunk += block_bytes
            count += 1
    finally:
        fh.close()
    return chunk_idx, count
| 128 | + |
def main():
    """Command-line entry point: parse arguments and write the dump.

    Returns:
        0 on success, 1 when the file listing fails even after the retry.
    """
    ap = argparse.ArgumentParser(description="Dump repo sources to 'File: ...\\nContents:\\n...' format, honoring .gitignore.")
    ap.add_argument("--repo", default=".", help="Path to repo root (default: .)")
    ap.add_argument("--out", default="src.txt", help="Output filename/prefix")
    ap.add_argument("--max-mb", type=float, default=100.0, help="Max size per output file in MB (default: 100MB)")
    ap.add_argument("--git-mode", action="store_true", help="Force using 'git ls-files' (best accuracy for .gitignore).")
    args = ap.parse_args()

    repo_root = pathlib.Path(args.repo)
    out_prefix = pathlib.Path(args.out)

    use_git = args.git_mode
    if use_git and shutil.which("git") is None:
        # Forcing git without a git executable would raise FileNotFoundError
        # from subprocess; degrade to auto-detection instead.
        print("Warning: --git-mode requested but 'git' was not found; falling back to walk() (may include gitignored files).", file=sys.stderr)
        use_git = False

    try:
        chunks, files = write_repo(repo_root, out_prefix, args.max_mb, use_git)
    except subprocess.CalledProcessError:
        print("Warning: Git mode failed; falling back to walk() (may include gitignored files).", file=sys.stderr)
        try:
            chunks, files = write_repo(repo_root, out_prefix, args.max_mb, force_git=False)
        except subprocess.CalledProcessError as err:
            # With force_git=False write_repo still auto-detects git, so the
            # retry can hit the same failure; report it instead of crashing.
            print(f"Error: could not list repository files: {err}", file=sys.stderr)
            return 1

    print(f"Wrote {chunks} file(s); included {files} text source files. Upload the first file and any '.partN' files too.")
    return 0
| 147 | + |
# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
| 150 | + |
0 commit comments