|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | + |
| 4 | +import ast |
| 5 | +import sys |
| 6 | +import re |
| 7 | +from pathlib import Path |
| 8 | + |
# Paths are resolved relative to this script's location (scripts/<file> -> repo root).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
REQ_FILE = PROJECT_ROOT / "requirements.txt"

# Directory names skipped entirely when scanning the tree for *.py files
# (VCS metadata, virtualenvs, build output, vendored/static content).
EXCLUDE_DIRS = {
    ".git", ".github", ".venv", "venv", "env", "__pycache__", "site-packages",
    "dist", "build", "docs", "data", "assets", "node_modules"
}

# Map distribution name -> list of top level import names commonly used.
# Needed because PyPI distribution names often differ from the importable
# module name (e.g. the "PyMuPDF" distribution is imported as "fitz").
KNOWN_IMPORTS = {
    "tabula-py": ["tabula"],
    "textblob": ["textblob"],
    "nltk": ["nltk"],
    "xlsxwriter": ["xlsxwriter"],
    "PyPDF2": ["PyPDF2"],
    "PyMuPDF": ["fitz", "pymupdf"],
    "requests": ["requests"],
    "beautifulsoup4": ["bs4"],
    "GitPython": ["git"],
    "scipy": ["scipy"],
    # add here if project uses others
}
| 30 | + |
def canonicalize_name(name: str) -> str:
    """Normalize a distribution name (PEP 503 style) without the packaging dep.

    Runs of ``-``, ``_`` and ``.`` collapse to a single ``-``; result is
    lowercased with surrounding whitespace removed.
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    return collapsed.lower().strip()
| 34 | + |
def parse_requirements_lines(path: Path):
    """Parse a requirements file.

    Returns a tuple ``(listed, lines_meta)`` where ``listed`` is the set of
    canonical distribution names found, and ``lines_meta`` is a list of
    ``(kind, original_line, canonical_name_or_None)`` tuples preserving the
    file's original order and text so it can be rewritten verbatim.
    """
    listed = set()
    lines_meta = []
    # A requirement line starts with the distribution name; extras, version
    # specifiers and markers follow and are ignored here. Requiring a leading
    # alphanumeric keeps option flags out of the capture group.
    req_re = re.compile(r"^\s*([A-Za-z0-9][A-Za-z0-9_.\-]*)")
    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            lines_meta.append(("comment_or_blank", raw, None))
            continue
        # pip option lines (-r other.txt, -e ., --hash=...) are not
        # distribution names; the previous regex wrongly captured "-r"/"-e"
        # and polluted the listed set with bogus entries.
        if line.startswith("-"):
            lines_meta.append(("other", raw, None))
            continue
        m = req_re.match(line)
        if not m:
            lines_meta.append(("other", raw, None))
            continue
        dist = canonicalize_name(m.group(1))
        listed.add(dist)
        lines_meta.append(("requirement", raw, dist))
    return listed, lines_meta
| 55 | + |
def iter_python_files(root: Path):
    """Yield each *.py file under ``root`` whose path contains no excluded directory."""
    for candidate in root.rglob("*.py"):
        relative_parts = candidate.relative_to(root).parts
        # Skip the file if any component of its relative path is excluded.
        if EXCLUDE_DIRS.isdisjoint(relative_parts):
            yield candidate
| 62 | + |
def collect_local_packages(root: Path):
    """Return top-level names belonging to this project.

    These are excluded from third-party detection: importable local packages
    (directories containing ``__init__.py``) plus top-level ``*.py`` scripts.
    """
    names = set()
    for entry in root.iterdir():
        if entry.is_dir() and (entry / "__init__.py").exists():
            names.add(entry.name.split(".")[0])
    # Top-level scripts are importable as modules too.
    names.update(script.stem for script in root.glob("*.py"))
    return names
| 75 | + |
def collect_imports(pyfile: Path):
    """Return the set of top-level module names imported by ``pyfile``.

    Best effort: files that fail to read or parse yield an empty set.
    Relative imports are skipped since they always name local modules.
    """
    try:
        source = pyfile.read_text(encoding="utf-8")
        tree = ast.parse(source, filename=str(pyfile))
    except Exception:
        return set()
    found = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            # "import a.b.c" binds the top-level package "a".
            found.update(alias.name.partition(".")[0] for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            if not node.level and node.module:
                found.add(node.module.partition(".")[0])
    return found
| 95 | + |
def build_reverse_map():
    """Invert KNOWN_IMPORTS: top-level import name -> set of canonical dist names."""
    inverted = {}
    for dist_name, import_names in KNOWN_IMPORTS.items():
        canonical = canonicalize_name(dist_name)
        for import_name in import_names:
            inverted.setdefault(import_name, set()).add(canonical)
    return inverted
| 102 | + |
def main():
    """Scan the project for imports and compare them against requirements.txt.

    Prints a report of possibly-unused and missing requirements, then writes
    a trimmed copy to ``requirements.used.txt`` next to the original.
    Exits with status 2 when requirements.txt is missing.
    """
    root = PROJECT_ROOT
    if not REQ_FILE.exists():
        print(f"requirements.txt not found at {REQ_FILE}", file=sys.stderr)
        sys.exit(2)

    listed, lines_meta = parse_requirements_lines(REQ_FILE)
    reverse_map = build_reverse_map()
    # sys.stdlib_module_names exists only on 3.10+; fall back to an empty set.
    stdlib = set(getattr(sys, "stdlib_module_names", set()))

    local_pkgs = collect_local_packages(root)

    # Walk the tree once and reuse the list for both the import scan and the
    # summary count (previously the tree was rglob-walked a second time just
    # to print the file count).
    py_files = list(iter_python_files(root))
    imports = set()
    for f in py_files:
        imports |= collect_imports(f)

    # Classify imports: mapped third party, guessed third party, unknown.
    third_party_imports = set()
    unknown_imports = set()
    import_to_dists = {}

    for imp in sorted(imports):
        if imp in stdlib or imp in local_pkgs:
            continue
        dists = reverse_map.get(imp)
        if dists:
            import_to_dists[imp] = dists
            third_party_imports.add(imp)
        else:
            # Heuristic: assume the distribution name equals the import name.
            guessed = canonicalize_name(imp)
            import_to_dists[imp] = {guessed}
            third_party_imports.add(imp)
            # Unverified guess: could be a stdlib alias or a missing mapping.
            if guessed not in listed:
                unknown_imports.add(imp)

    # Distributions actually referenced by the scanned code.
    used_dists = {d for dists in import_to_dists.values() for d in dists}

    # Of used distributions, keep only those that look like real third party
    # packages (already listed, or present in the known map); this avoids
    # flagging local packages that slipped through.
    known_dist_names = {canonicalize_name(k) for k in KNOWN_IMPORTS}
    used_real = {d for d in used_dists if d in listed or d in known_dist_names}

    unused_listed = sorted(listed - used_real)
    missing_direct = sorted(used_real - listed)

    print("\n=== Import scan summary ===")
    print(f"Python files scanned: {len(py_files)}")
    print(f"Total unique imports found: {len(imports)}")
    print(f"Third party import roots detected: {sorted(third_party_imports)}")
    print()

    if unused_listed:
        print("Possibly unused in code, listed in requirements.txt:")
        for d in unused_listed:
            print(f" - {d}")
    else:
        print("No obviously unused packages from requirements.txt")

    if missing_direct:
        print("\nDirect imports in code that are not in requirements.txt:")
        for d in missing_direct:
            print(f" - {d}")
        print("These may be satisfied as transitive deps, but best practice is to list direct imports you use.")
    else:
        print("\nNo missing direct packages based on import scan")

    unknown_third_party = sorted(
        {u for u in unknown_imports if canonicalize_name(u) not in listed}
    )
    if unknown_third_party:
        print("\nUnknown imports not mapped to a distribution name:")
        for u in unknown_third_party:
            print(f" - {u} (add to KNOWN_IMPORTS if this is third party)")
        print("Some of these could be stdlib modules on your Python version, or local modules.")

    # Emit a trimmed requirements file containing only used entries; comments
    # and blank lines are preserved verbatim so the file stays reviewable.
    out_lines = []
    for kind, raw, dist in lines_meta:
        if kind != "requirement" or dist in used_real:
            out_lines.append(raw)

    out_path = REQ_FILE.with_name("requirements.used.txt")
    out_path.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
    print(f"\nWrote trimmed requirements to {out_path}")
    if unused_listed:
        print("Review manually before replacing requirements.txt")


if __name__ == "__main__":
    main()