Skip to content

Commit a29b717

Browse files
committed
new to review lib use within script
1 parent d1da475 commit a29b717

File tree

1 file changed

+208
-0
lines changed

1 file changed

+208
-0
lines changed
Lines changed: 208 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,208 @@
1+
#!/usr/bin/env python3
2+
3+
4+
import ast
5+
import sys
6+
import re
7+
from pathlib import Path
8+
9+
# Project root is the parent of the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Requirements file that the scan is checked against.
REQ_FILE = PROJECT_ROOT / "requirements.txt"

# Path components that cause a Python file to be skipped during the scan.
EXCLUDE_DIRS = {
    ".git",
    ".github",
    ".venv",
    "venv",
    "env",
    "__pycache__",
    "site-packages",
    "dist",
    "build",
    "docs",
    "data",
    "assets",
    "node_modules",
}

# Map distribution name -> list of top level import names commonly used
KNOWN_IMPORTS = {
    "tabula-py": ["tabula"],
    "textblob": ["textblob"],
    "nltk": ["nltk"],
    "xlsxwriter": ["xlsxwriter"],
    "PyPDF2": ["PyPDF2"],
    "PyMuPDF": ["fitz", "pymupdf"],
    "requests": ["requests"],
    "beautifulsoup4": ["bs4"],
    "GitPython": ["git"],
    "scipy": ["scipy"],
    # add here if project uses others
}
30+
31+
def canonicalize_name(name: str) -> str:
    """Normalize a distribution name so two spellings compare equal.

    Every run of ``-``, ``_`` and ``.`` is collapsed into a single ``-``, the
    result is lower-cased, and surrounding whitespace is removed. Similar to
    ``packaging.utils.canonicalize_name``, without the dependency.
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    lowered = collapsed.lower()
    return lowered.strip()
34+
35+
def parse_requirements_lines(path: Path):
    """Parse a requirements file into canonical names plus per-line metadata.

    Args:
        path: Requirements file to read (decoded as UTF-8).

    Returns:
        A ``(listed, lines_meta)`` tuple. ``listed`` is the set of canonical
        distribution names found. ``lines_meta`` holds one
        ``(kind, raw, dist)`` tuple per input line, in file order, where
        ``kind`` is ``"comment_or_blank"``, ``"other"`` or ``"requirement"``,
        ``raw`` is the untouched line, and ``dist`` is the canonical name for
        requirement lines and ``None`` otherwise.

    Lines that do not start with a plain distribution name -- pip options
    such as ``-r base.txt``, ``-e .`` or ``--index-url ...``, local paths, and
    URL/VCS references such as ``git+https://...`` -- are recorded as
    ``"other"`` and never added to ``listed``.
    """
    listed = set()
    lines_meta = []
    # A name must start with an alphanumeric character (so "-r"/"--opt" and
    # "./path" are rejected) and must be followed by end of line, whitespace,
    # or a character that starts extras, a version specifier, a marker, a
    # direct reference or a comment (so "git+https://..." is rejected).
    req_re = re.compile(r"^([A-Za-z0-9][A-Za-z0-9_.\-]*)(?=$|[\s\[<>=!~;@#(,])")
    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            lines_meta.append(("comment_or_blank", raw, None))
            continue
        m = req_re.match(line)
        if not m:
            lines_meta.append(("other", raw, None))
            continue
        dist = canonicalize_name(m.group(1))
        listed.add(dist)
        lines_meta.append(("requirement", raw, dist))
    return listed, lines_meta
55+
56+
def iter_python_files(root: Path):
    """Yield each ``*.py`` file found recursively under *root*.

    A file is skipped when any component of its path relative to *root*
    (directory names as well as the file name itself) is in ``EXCLUDE_DIRS``.
    """
    for candidate in root.rglob("*.py"):
        components = set(candidate.relative_to(root).parts)
        if components.isdisjoint(EXCLUDE_DIRS):
            yield candidate
62+
63+
def collect_local_packages(root: Path):
    """Return top-level names that should be treated as local code.

    The result combines two groups found directly inside *root*:
    directories that contain an ``__init__.py`` (name taken up to the first
    dot) and the stems of ``*.py`` files.
    """
    package_names = {
        entry.name.split(".")[0]
        for entry in root.iterdir()
        if entry.is_dir() and (entry / "__init__.py").exists()
    }
    # also include top level scripts as local modules
    script_names = {script.stem for script in root.glob("*.py")}
    return package_names | script_names
75+
76+
def collect_imports(pyfile: Path):
    """Return the top-level module names imported by *pyfile*.

    Args:
        pyfile: Path of the Python source file to inspect.

    Returns:
        A set holding the first dotted component of every ``import x.y`` and
        absolute ``from x.y import z`` statement anywhere in the file.
        Relative imports (``from . import a``) are treated as local and
        ignored. If the file cannot be read or parsed, a warning is printed
        to stderr and an empty set is returned.
    """
    try:
        # Hand the parser raw bytes so it honours a PEP 263 coding cookie or
        # a UTF-8 BOM instead of assuming every source file is plain UTF-8.
        tree = ast.parse(pyfile.read_bytes(), filename=str(pyfile))
    except Exception as exc:
        # Best effort: one unreadable file must not abort the whole scan, but
        # it must be reported because its imports are missing from the result.
        print(f"warning: skipping {pyfile}: {exc}", file=sys.stderr)
        return set()

    used = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            used.update(alias.name.split(".", 1)[0] for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            # level > 0 marks a relative import, which is local by definition;
            # module is None for a bare "from . import x".
            if not node.level and node.module:
                used.add(node.module.split(".", 1)[0])
    return used
95+
96+
def build_reverse_map():
    """Invert ``KNOWN_IMPORTS`` into a lookup keyed by import name.

    Returns:
        A dict mapping each top-level import name to the set of canonical
        distribution names that list it.
    """
    reverse = {}
    for dist_name, import_names in KNOWN_IMPORTS.items():
        canonical = canonicalize_name(dist_name)
        for import_name in import_names:
            if import_name not in reverse:
                reverse[import_name] = set()
            reverse[import_name].add(canonical)
    return reverse
102+
103+
def main():
    """Compare imports used in the project against ``requirements.txt``.

    Steps:
        1. Parse ``REQ_FILE``; exit with status 2 if it does not exist.
        2. Collect every top-level import from the Python files under
           ``PROJECT_ROOT``.
        3. Drop stdlib and local names, then map the rest to distribution
           names via ``KNOWN_IMPORTS`` or, failing that, by guessing that the
           distribution is named like the import.
        4. Print a report of possibly unused requirements, direct imports
           missing from the requirements, and imports that could not be
           mapped.
        5. Write ``requirements.used.txt`` next to ``REQ_FILE``, keeping
           every non-requirement line and only the requirement lines whose
           distribution is in use.
    """
    root = PROJECT_ROOT
    if not REQ_FILE.exists():
        print(f"requirements.txt not found at {REQ_FILE}", file=sys.stderr)
        sys.exit(2)

    listed, lines_meta = parse_requirements_lines(REQ_FILE)
    reverse_map = build_reverse_map()
    stdlib = set(getattr(sys, "stdlib_module_names", ()))  # available on 3.10+
    if not stdlib:
        # Without this list every stdlib import falls through to the
        # "unknown" bucket, so tell the user why the report is noisy.
        print(
            "warning: sys.stdlib_module_names is unavailable (Python < 3.10); "
            "standard library imports will be reported as unknown",
            file=sys.stderr,
        )

    local_pkgs = collect_local_packages(root)

    # Walk the tree once, counting files as we go, instead of scanning the
    # whole project a second time just to report the number of files.
    imports = set()
    files_scanned = 0
    for f in iter_python_files(root):
        files_scanned += 1
        imports |= collect_imports(f)

    # Classify imports
    third_party_imports = set()
    unknown_imports = set()
    import_to_dists = {}

    for imp in sorted(imports):
        if imp in stdlib or imp in local_pkgs:
            continue
        third_party_imports.add(imp)
        dists = reverse_map.get(imp)
        if dists:
            import_to_dists[imp] = dists
            continue
        # try heuristic where dist name equals import name
        guessed = canonicalize_name(imp)
        import_to_dists[imp] = {guessed}
        # mark as unknown if not in known map, this may be stdlib alias or missing mapping
        if guessed not in listed:
            unknown_imports.add(imp)

    # Compute used distributions by mapping known imports
    used_dists = set().union(*import_to_dists.values())

    # Of used distributions, keep only those that look like real third party packages
    # avoids flagging local packages that slipped through
    # consider real if either it is listed already, or it is in known map values
    known_dist_names = {canonicalize_name(k) for k in KNOWN_IMPORTS}
    used_real = {d for d in used_dists if d in listed or d in known_dist_names}

    unused_listed = sorted(listed - used_real)
    missing_direct = sorted(used_real - listed)

    print("\n=== Import scan summary ===")
    print(f"Python files scanned: {files_scanned}")
    print(f"Total unique imports found: {len(imports)}")
    print(f"Third party import roots detected: {sorted(third_party_imports)}")
    print()

    if unused_listed:
        print("Possibly unused in code, listed in requirements.txt:")
        for d in unused_listed:
            print(f" - {d}")
    else:
        print("No obviously unused packages from requirements.txt")

    if missing_direct:
        print("\nDirect imports in code that are not in requirements.txt:")
        for d in missing_direct:
            print(f" - {d}")
        print("These may be satisfied as transitive deps, but best practice is to list direct imports you use.")
    else:
        print("\nNo missing direct packages based on import scan")

    # Every entry of unknown_imports already failed the "guessed name is
    # listed" check when it was added, so no second filter is needed.
    unknown_third_party = sorted(unknown_imports)
    if unknown_third_party:
        print("\nUnknown imports not mapped to a distribution name:")
        for u in unknown_third_party:
            print(f" - {u} (add to KNOWN_IMPORTS if this is third party)")
        print("Some of these could be stdlib modules on your Python version, or local modules.")

    # trimmed requirements file based on used_real: comments, blanks and
    # other non-requirement lines are kept verbatim, unused requirements dropped
    out_lines = [
        raw
        for kind, raw, dist in lines_meta
        if kind != "requirement" or dist in used_real
    ]

    out_path = REQ_FILE.with_name("requirements.used.txt")
    out_path.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
    print(f"\nWrote trimmed requirements to {out_path}")
    if unused_listed:
        print("Review manually before replacing requirements.txt")
207+
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)