Skip to content

Commit 9c476e3

Browse files
kennethreitzclaude
andcommitted
Add shallow commentary detection script
Detects:
- Generic templated questions
- Boilerplate historical sections
- Missing Greek/Hebrew terms
- Short analysis sections
- Templated analysis patterns

Usage:
  python scripts/detect_shallow_commentary.py                # Full scan
  python scripts/detect_shallow_commentary.py --worst        # Only 3+ issues
  python scripts/detect_shallow_commentary.py --book romans

Found 1,911 severe cases (3+ issues), mostly in Romans and 1-2 Corinthians.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 194a905 commit 9c476e3

File tree

1 file changed

+224
-0
lines changed

1 file changed

+224
-0
lines changed
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Detect shallow/generic commentary entries that need improvement.
4+
5+
Flags entries that have:
6+
- Generic templated questions ("How does X:Y deepen my understanding...")
7+
- Boilerplate historical sections
8+
- Missing Greek/Hebrew terms
9+
- Very short analysis sections
10+
- Templated analysis patterns
11+
"""
12+
13+
import json
14+
import os
15+
import re
16+
from pathlib import Path
17+
18+
# Regular expressions that identify templated (shallow) commentary text.
# Each list is searched with ``re.search``; a single hit is enough to flag
# the corresponding section of an entry.

# Stock reflection questions.
GENERIC_QUESTION_PATTERNS = [
    r"How does .+ deepen my understanding of the gospel",
    r"What specific action or attitude change does this verse call me to make",
    r"How can I more sacrificially love the people",
    r"How does this passage point to Christ and His redemptive work\?$",
]

# Boilerplate sentences found in the "historical" section.
GENERIC_HISTORICAL_PATTERNS = [
    r"<strong>Historical Setting:</strong> .+ was written around \d+ CE from",
    r"<strong>Occasion:</strong> Preparing for visit to Rome",
    r"The Greco-Roman world valued rhetoric, philosophy, and social status",
    r"First-century believers lived in a pluralistic, pagan society",
    r"Paul's instructions addressed both timeless theological truths and specific cultural situations",
]

# Boilerplate sentences found in the "analysis" section.
GENERIC_ANALYSIS_PATTERNS = [
    r"This verse contributes to .+'s overall purpose in",
    r"The key themes of justification by faith, law and grace, Israel and the church are evident",
    r"Paul carefully explains the law's role: revealing sin and pointing to Christ",
    r"The Holy Spirit empowers believers for holiness and service",
    r"Christ is the center of Paul's theology and message",
    r"Paul's discussion of Israel's role in God's redemptive plan\.$",
    r"Paul's teaching on sanctification and life in the Spirit\.$",
]

# Signs of original-language engagement. An entry is flagged when none of
# the patterns for the relevant language is found in its analysis.
GOOD_INDICATORS = {
    "greek": [
        r"<em>[^<]+</em>",   # any emphasised span
        r"[Gg]reek",         # the word "Greek"/"greek"
        r"[α-ωΑ-Ω]",         # a Greek letter
    ],
    "hebrew": [
        r"<em>[^<]+</em>",   # any emphasised span
        r"[Hh]ebrew",        # the word "Hebrew"/"hebrew"
        r"[\u0590-\u05FF]",  # a character in the Hebrew Unicode block
    ],
}
49+
50+
51+
# Normalised (lower-case, underscore-separated) New Testament book names.
# Any book not listed here is treated as Old Testament.
_NT_BOOKS = frozenset({
    'matthew', 'mark', 'luke', 'john', 'acts', 'romans',
    '1_corinthians', '2_corinthians', 'galatians', 'ephesians',
    'philippians', 'colossians', '1_thessalonians', '2_thessalonians',
    '1_timothy', '2_timothy', 'titus', 'philemon', 'hebrews',
    'james', '1_peter', '2_peter', '1_john', '2_john', '3_john',
    'jude', 'revelation',
})

# An analysis with fewer characters than this is reported as too short.
_MIN_ANALYSIS_CHARS = 400


def _matches_any(patterns: list[str], text: str) -> bool:
    """Return True if at least one regex in *patterns* is found in *text*."""
    return any(re.search(pattern, text) for pattern in patterns)


def check_entry(book: str, chapter: str, verse: str, entry: dict) -> list[str]:
    """Check a single commentary entry for quality issues.

    Each category of problem is reported at most once per entry, so the
    length of the result is the number of distinct problem categories:

    - a question matching ``GENERIC_QUESTION_PATTERNS``
    - historical text matching ``GENERIC_HISTORICAL_PATTERNS``
    - analysis text matching ``GENERIC_ANALYSIS_PATTERNS``
    - an analysis shorter than ``_MIN_ANALYSIS_CHARS`` characters
    - no Greek indicator (New Testament book) or no Hebrew indicator
      (any other book) in the analysis, per ``GOOD_INDICATORS``

    Args:
        book: Book name; lower-cased with spaces turned into underscores
            before it is compared with the New Testament list.
        chapter: Chapter identifier, used only to build the reference.
        verse: Verse identifier, used only to build the reference.
        entry: Commentary entry; the ``analysis``, ``historical`` and
            ``questions`` keys are read. Missing or ``None`` values are
            treated as empty.

    Returns:
        A list of messages of the form ``"<book> <chapter>:<verse>: <problem>"``;
        empty when nothing was flagged.
    """
    # ``or`` also covers keys that are present but null in the JSON.
    analysis = entry.get('analysis') or ''
    historical = entry.get('historical') or ''
    questions = entry.get('questions') or []
    if isinstance(questions, str):
        # A bare string would otherwise be iterated character by character.
        questions = [questions]

    ref = f"{book} {chapter}:{verse}"
    issues = []

    # One message for the whole category, however many questions match.
    # Repeating the identical message per question would only inflate the
    # issue count that callers use to rank severity.
    if any(isinstance(q, str) and _matches_any(GENERIC_QUESTION_PATTERNS, q)
           for q in questions):
        issues.append(f"{ref}: Generic question pattern detected")

    if _matches_any(GENERIC_HISTORICAL_PATTERNS, historical):
        issues.append(f"{ref}: Generic historical boilerplate detected")

    if _matches_any(GENERIC_ANALYSIS_PATTERNS, analysis):
        issues.append(f"{ref}: Generic analysis pattern detected")

    if len(analysis) < _MIN_ANALYSIS_CHARS:
        issues.append(f"{ref}: Analysis too short ({len(analysis)} chars)")

    # Original-language check: Greek for the New Testament, Hebrew otherwise.
    book_key = book.lower().replace(' ', '_')
    if book_key in _NT_BOOKS:
        if not _matches_any(GOOD_INDICATORS['greek'], analysis):
            issues.append(f"{ref}: Missing Greek terms (NT book)")
    elif not _matches_any(GOOD_INDICATORS['hebrew'], analysis):
        issues.append(f"{ref}: Missing Hebrew terms (OT book)")

    return issues
108+
109+
110+
def scan_book(filepath: Path) -> list[str]:
    """Scan a single book's commentary file.

    The file is expected to hold a JSON object with an optional ``book``
    name and a ``commentary`` mapping of chapter -> verse -> entry. Chapters
    that are not mappings, and entries that are not mappings or have no
    ``analysis`` key, are skipped.

    Args:
        filepath: Path of the JSON file. When the file has no ``book`` key
            the name is derived from the file stem (underscores become
            spaces, title-cased).

    Returns:
        Every issue message produced by ``check_entry`` for the file, in
        file order; empty when nothing is flagged or nothing is scannable.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Commentary text contains Greek and Hebrew characters, so decode
    # explicitly as UTF-8 instead of relying on the platform's locale.
    with open(filepath, encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, dict):
        return []

    book = data.get('book', filepath.stem.replace('_', ' ').title())
    commentary = data.get('commentary') or {}
    if not isinstance(commentary, dict):
        return []

    all_issues = []
    for chapter, verses in commentary.items():
        if not isinstance(verses, dict):
            continue
        for verse, entry in verses.items():
            if not isinstance(entry, dict) or 'analysis' not in entry:
                continue
            all_issues.extend(check_entry(book, chapter, verse, entry))

    return all_issues
130+
131+
132+
def main():
    """Scan the commentary files and print a report of shallow entries.

    Command-line options:
        --worst   report only verses with 3 or more issues
        --book    scan only files whose name contains this text
                  (case-insensitive)
        --export  write the references of verses with 3+ issues to this
                  file, one per line

    Raises:
        SystemExit: if the commentary directory does not exist or no
            commentary file was selected, so that a misconfigured run is
            not reported as "no issues detected".
    """
    import argparse

    severe_threshold = 3   # issues per verse that count as "severe"
    max_shown = 50         # cap on the severe verses listed in full

    parser = argparse.ArgumentParser(description='Detect shallow commentary')
    parser.add_argument('--worst', action='store_true', help='Show only worst offenders (3+ issues)')
    parser.add_argument('--book', type=str, help='Check specific book only')
    parser.add_argument('--export', type=str, help='Export problem verses to file')
    args = parser.parse_args()

    commentary_dir = Path('kjvstudy_org/data/verse_commentary')
    if not commentary_dir.is_dir():
        raise SystemExit(
            f"Commentary directory not found: {commentary_dir} "
            "(the path is relative to the current working directory)"
        )

    # Track issues per verse: ref -> list of issue descriptions.
    verse_issues = {}
    files_scanned = 0

    for filepath in sorted(commentary_dir.glob('*.json')):
        if args.book and args.book.lower() not in filepath.stem.lower():
            continue
        files_scanned += 1

        # scan_book returns "<ref>: <description>" strings; regroup them by
        # reference. A reference seen again in a later file replaces the
        # earlier one, as each file's results are merged with update().
        per_file = {}
        for issue in scan_book(filepath):
            ref, _, description = issue.partition(': ')
            per_file.setdefault(ref, []).append(description)
        verse_issues.update(per_file)

    if not files_scanned:
        target = f" matching --book {args.book!r}" if args.book else ""
        raise SystemExit(f"No commentary files{target} found in {commentary_dir}")

    # Summary
    print(f"\n{'='*60}")
    print("SHALLOW COMMENTARY DETECTION REPORT")
    print(f"{'='*60}\n")

    if not verse_issues:
        print("✅ No issues detected! All commentary appears to be high quality.")
        return

    # Filter to worst offenders if requested
    if args.worst:
        verse_issues = {
            ref: issues for ref, issues in verse_issues.items()
            if len(issues) >= severe_threshold
        }
        print("Showing only verses with 3+ issues:\n")

    # Sort by number of issues (worst first); the sort is stable, so ties
    # keep their scan order.
    sorted_verses = sorted(verse_issues.items(), key=lambda item: -len(item[1]))
    severe_verses = [
        (ref, issues) for ref, issues in sorted_verses
        if len(issues) >= severe_threshold
    ]

    # Count by severity
    severe = len(severe_verses)
    moderate = sum(1 for issues in verse_issues.values() if len(issues) == 2)
    minor = sum(1 for issues in verse_issues.values() if len(issues) == 1)

    print("📊 Issue Summary:")
    print(f" 🔴 Severe (3+ issues): {severe} verses")
    print(f" 🟡 Moderate (2 issues): {moderate} verses")
    print(f" 🟢 Minor (1 issue): {minor} verses")
    print(f" Total: {len(verse_issues)} verses with issues\n")

    # Show worst offenders
    print("\n🔴 WORST OFFENDERS (need immediate attention):")
    print("-" * 60)

    for ref, issues in severe_verses[:max_shown]:
        print(f"\n{ref} ({len(issues)} issues):")
        for issue in issues:
            print(f" • {issue}")

    remaining = severe - max_shown
    if remaining > 0:
        print(f"\n ... and {remaining} more severe cases")

    # Export if requested
    if args.export:
        # References are written as UTF-8 regardless of the platform locale.
        with open(args.export, 'w', encoding='utf-8') as f:
            f.writelines(f"{ref}\n" for ref, _ in severe_verses)
        print(f"\n📁 Exported {severe} severe cases to {args.export}")

    print(f"\n{'='*60}")
221+
222+
223+
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)