Add shallow commentary detection script

kennethreitz · claude · kennethreitz · commit 9c476e3582b5 · 2025-12-09T10:45:27.000-05:00
Detects:
- Generic templated questions
- Boilerplate historical sections
- Missing Greek/Hebrew terms
- Short analysis sections
- Templated analysis patterns

Usage:
    python scripts/detect_shallow_commentary.py                # Full scan
    python scripts/detect_shallow_commentary.py --worst        # Only 3+ issues
    python scripts/detect_shallow_commentary.py --book romans

Found 1,911 severe cases (3+ issues), mostly in Romans and 1-2 Corinthians.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/scripts/detect_shallow_commentary.py b/scripts/detect_shallow_commentary.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+"""
+Detect shallow/generic commentary entries that need improvement.
+
+Flags entries that have:
+- Generic templated questions ("How does X:Y deepen my understanding...")
+- Boilerplate historical sections
+- Missing Greek/Hebrew terms
+- Very short analysis sections
+- Templated analysis patterns
+"""
+
+import json
+import os
+import re
+from pathlib import Path
+
+# Patterns that indicate generic/shallow commentary (each is used with re.search)
+GENERIC_QUESTION_PATTERNS = [  # searched in every item of an entry's 'questions'
+    r"How does .+ deepen my understanding of the gospel",
+    r"What specific action or attitude change does this verse call me to make",
+    r"How can I more sacrificially love the people",
+    r"How does this passage point to Christ and His redemptive work\?$",
+]
+
+GENERIC_HISTORICAL_PATTERNS = [  # searched in an entry's 'historical' text
+    r"<strong>Historical Setting:</strong> .+ was written around \d+ CE from",
+    r"<strong>Occasion:</strong> Preparing for visit to Rome",
+    r"The Greco-Roman world valued rhetoric, philosophy, and social status",
+    r"First-century believers lived in a pluralistic, pagan society",
+    r"Paul's instructions addressed both timeless theological truths and specific cultural situations",
+]
+
+GENERIC_ANALYSIS_PATTERNS = [  # searched in an entry's 'analysis' text
+    r"This verse contributes to .+'s overall purpose in",
+    r"The key themes of justification by faith, law and grace, Israel and the church are evident",
+    r"Paul carefully explains the law's role: revealing sin and pointing to Christ",
+    r"The Holy Spirit empowers believers for holiness and service",
+    r"Christ is the center of Paul's theology and message",
+    r"Paul's discussion of Israel's role in God's redemptive plan\.$",
+    r"Paul's teaching on sanctification and life in the Spirit\.$",
+]
+
+# Good indicators (if none of a list matches the 'analysis' text, flag the entry)
+GOOD_INDICATORS = {
+    'greek': [r'<em>[^<]+</em>', r'[Gg]reek', r'[α-ωΑ-Ω]'],  # any <em> span, the word "Greek", or a Greek letter
+    'hebrew': [r'<em>[^<]+</em>', r'[Hh]ebrew', r'[\u0590-\u05FF]'],  # any <em> span, the word "Hebrew", or a Hebrew-block character
+}
+
+
+def check_entry(book: str, chapter: str, verse: str, entry: dict) -> list[str]:
+    """Check a single commentary entry for quality issues.
+
+    Args:
+        book: Book name; case and spaces are normalised before it is looked
+            up in the New Testament list.
+        chapter: Chapter label, used only to build the reference.
+        verse: Verse label, used only to build the reference.
+        entry: Commentary dict; 'analysis', 'historical' and 'questions'
+            are read, and missing or null values count as empty.
+
+    Returns:
+        Messages shaped "<book> <chapter>:<verse>: <problem>", at most one
+        per check so the count reflects distinct problems.
+    """
+    ref = f"{book} {chapter}:{verse}"
+    # "or" also covers keys that are present but null in the JSON.
+    analysis = entry.get('analysis') or ''
+    historical = entry.get('historical') or ''
+    questions = [q for q in (entry.get('questions') or []) if isinstance(q, str)]
+
+    def matches_any(patterns: list[str], text: str) -> bool:
+        """Return True if any regex in patterns is found in text."""
+        return any(re.search(p, text) for p in patterns)
+
+    issues = []
+
+    # Several templated questions on one verse are a single problem, so
+    # report them once instead of inflating the verse's issue count.
+    if any(matches_any(GENERIC_QUESTION_PATTERNS, q) for q in questions):
+        issues.append(f"{ref}: Generic question pattern detected")
+
+    if matches_any(GENERIC_HISTORICAL_PATTERNS, historical):
+        issues.append(f"{ref}: Generic historical boilerplate detected")
+
+    if matches_any(GENERIC_ANALYSIS_PATTERNS, analysis):
+        issues.append(f"{ref}: Generic analysis pattern detected")
+
+    # Flag analyses shorter than 400 characters as suspiciously short.
+    if len(analysis) < 400:
+        issues.append(f"{ref}: Analysis too short ({len(analysis)} chars)")
+
+    nt_books = {
+        'matthew', 'mark', 'luke', 'john', 'acts', 'romans',
+        '1_corinthians', '2_corinthians', 'galatians', 'ephesians',
+        'philippians', 'colossians', '1_thessalonians', '2_thessalonians',
+        '1_timothy', '2_timothy', 'titus', 'philemon', 'hebrews',
+        'james', '1_peter', '2_peter', '1_john', '2_john', '3_john',
+        'jude', 'revelation',
+    }
+    # Anything not in the NT list is treated as Old Testament.
+    is_nt = book.lower().replace(' ', '_') in nt_books
+    language, testament = ('Greek', 'NT') if is_nt else ('Hebrew', 'OT')
+    if not matches_any(GOOD_INDICATORS[language.lower()], analysis):
+        issues.append(f"{ref}: Missing {language} terms ({testament} book)")
+
+    return issues
+
+
+def scan_book(filepath: Path) -> list[str]:
+    """Scan a single book's commentary file.
+
+    Returns every message check_entry produces for the file. Chapters that
+    are not dicts, and verses that are not dicts with an 'analysis' key,
+    are skipped.
+    """
+    # Read as UTF-8 explicitly rather than relying on the platform default.
+    with open(filepath, encoding='utf-8') as f:
+        data = json.load(f)
+
+    book = data.get('book', filepath.stem.replace('_', ' ').title())
+    all_issues = []
+    for chapter, verses in data.get('commentary', {}).items():
+        if not isinstance(verses, dict):
+            continue
+        for verse, entry in verses.items():
+            if isinstance(entry, dict) and 'analysis' in entry:
+                all_issues.extend(check_entry(book, chapter, verse, entry))
+    return all_issues
+
+
+def main():
+    """Scan all commentary files and report issues.
+
+    Files are read from 'kjvstudy_org/data/verse_commentary', resolved
+    against the current working directory.
+
+    Command-line options:
+        --worst   show only verses with 3+ issues
+        --book    scan only files whose name contains this text
+        --export  write the references of verses with 3+ issues to a file
+
+    The report prints a severity summary followed by at most 50 of the
+    verses with 3+ issues.
+
+    Exits through argparse's error path when there is nothing to scan, so
+    a wrong directory or a mistyped --book is not reported as a clean run.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Detect shallow commentary')
+    parser.add_argument('--worst', action='store_true', help='Show only worst offenders (3+ issues)')
+    parser.add_argument('--book', type=str, help='Check specific book only')
+    parser.add_argument('--export', type=str, help='Export problem verses to file')
+    args = parser.parse_args()
+
+    commentary_dir = Path('kjvstudy_org/data/verse_commentary')
+    files = [
+        path for path in sorted(commentary_dir.glob('*.json'))
+        if not args.book or args.book.lower() in path.stem.lower()
+    ]
+    # An empty scan would otherwise print the "no issues" success banner.
+    if not files:
+        parser.error(f"no commentary files to scan under {commentary_dir}")
+
+    # Track issues per verse
+    verse_issues = {}  # ref -> list of issues
+    for filepath in files:
+        for message in scan_book(filepath):
+            # Messages look like "<ref>: <problem>"; group the problems by ref.
+            ref, _, problem = message.partition(': ')
+            verse_issues.setdefault(ref, []).append(problem)
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("SHALLOW COMMENTARY DETECTION REPORT")
+    print(f"{'='*60}\n")
+
+    if not verse_issues:
+        print("✅ No issues detected! All commentary appears to be high quality.")
+        return
+
+    # Filter to worst offenders if requested
+    if args.worst:
+        verse_issues = {k: v for k, v in verse_issues.items() if len(v) >= 3}
+        print("Showing only verses with 3+ issues:\n")
+
+    # Sort by number of issues (worst first)
+    sorted_verses = sorted(verse_issues.items(), key=lambda x: -len(x[1]))
+    severe_verses = [(ref, issues) for ref, issues in sorted_verses if len(issues) >= 3]
+
+    # Count by severity
+    severe = len(severe_verses)
+    moderate = sum(1 for v in verse_issues.values() if len(v) == 2)
+    minor = sum(1 for v in verse_issues.values() if len(v) == 1)
+
+    print("📊 Issue Summary:")
+    print(f"   🔴 Severe (3+ issues): {severe} verses")
+    print(f"   🟡 Moderate (2 issues): {moderate} verses")
+    print(f"   🟢 Minor (1 issue): {minor} verses")
+    print(f"   Total: {len(verse_issues)} verses with issues\n")
+
+    # Show worst offenders (only the first 50 are listed in full)
+    print("\n🔴 WORST OFFENDERS (need immediate attention):")
+    print("-" * 60)
+
+    for ref, issues in severe_verses[:50]:
+        print(f"\n{ref} ({len(issues)} issues):")
+        for issue in issues:
+            print(f"   • {issue}")
+    if severe > 50:
+        print(f"\n   ... and {severe - 50} more severe cases")
+
+    # Export if requested
+    if args.export:
+        with open(args.export, 'w', encoding='utf-8') as f:
+            f.writelines(f"{ref}\n" for ref, _ in severe_verses)
+        print(f"\n📁 Exported {severe} severe cases to {args.export}")
+
+    print(f"\n{'='*60}")
+
+
+if __name__ == '__main__':
+    main()