Skip to content

Commit 719db3c

Browse files
committed
Automatic checking of glossary terms on CI. Warning for now
1 parent 6a6d429 commit 719db3c

File tree

4 files changed

+134
-31
lines changed

4 files changed

+134
-31
lines changed

.github/workflows/check-build.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ jobs:
1616
runs-on: ubuntu-latest
1717
strategy:
1818
matrix:
19-
check_type: [spellcheck, kbcheck, md-lint]
19+
check_type: [spellcheck, kbcheck, md-lint, glossary-check]
2020
steps:
2121
# Add setup steps per check here
2222
- uses: actions/checkout@v4
2323
- name: Install Aspell
2424
if: matrix.check_type == 'spellcheck'
2525
run: sudo apt-get update && sudo apt-get install -y aspell aspell-en
2626
- name: Set up Python
27-
if: matrix.check_type == 'kbcheck'
27+
if: matrix.check_type == 'kbcheck' || matrix.check_type == 'glossary-check'
2828
run: |
2929
curl -Ls https://astral.sh/uv/install.sh | sh
3030
uv clean
@@ -51,6 +51,12 @@ jobs:
5151
elif [[ "${{ matrix.check_type }}" == "md-lint" ]]; then
5252
yarn check-markdown
5353
exit_code=$?
54+
elif [[ "${{ matrix.check_type }}" == "glossary-check" ]]; then
55+
echo "Extracting glossary from markdown..."
56+
python3 scripts/glossary/extract_glossary.py
57+
echo "Checking glossary coverage..."
58+
python3 scripts/glossary/wrap_glossary_terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)"
59+
exit_code=0 # Always succeed for glossary check
5460
fi
5561
5662
if [[ $exit_code -ne 0 ]]; then
@@ -74,5 +80,4 @@ jobs:
7480
if: needs.stylecheck.result != 'success'
7581
run: |
7682
echo "::error::One or more checks of the style check failed."
77-
exit 1
78-
83+
exit 1
File renamed without changes.
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env python3
2+
3+
import re
4+
import json
5+
import argparse
6+
7+
def extract_glossary_terms(markdown_content):
    """Extract terms and definitions from ClickHouse glossary markdown.

    A glossary entry is a level-2 heading that carries an explicit anchor,
    ``## Term {#anchor}``, followed by its definition. The definition runs
    until the next line that starts with ``## `` or the end of the text.

    Args:
        markdown_content: Full text of the glossary markdown file.

    Returns:
        A dict mapping each term to its definition, with every run of
        whitespace (including line breaks) collapsed to a single space.
        Level-2 headings without a ``{#anchor}`` and entries whose
        definition is empty are skipped. If a term occurs more than once,
        the last definition wins.
    """
    # The heading is matched against a single line, so a term can never span
    # lines: a section whose heading has no {#anchor} is skipped instead of
    # being merged into the term of the next entry.
    heading_re = re.compile(r'## ([^{]+?)\s*\{#[^}]+\}\s*')

    # Every line starting with "## " opens a new section, whether or not it
    # has an anchor; each section ends where the next one begins.
    section_starts = [
        m.start() for m in re.finditer(r'^## ', markdown_content, re.MULTILINE)
    ]
    section_ends = section_starts[1:] + [len(markdown_content)]

    glossary = {}

    for start, end in zip(section_starts, section_ends):
        heading, _, body = markdown_content[start:end].partition('\n')

        match = heading_re.fullmatch(heading)
        if match is None:
            continue

        # Normalize line breaks and repeated whitespace to single spaces.
        definition = re.sub(r'\s+', ' ', body.strip())
        if not definition:
            # A heading with no text before the next heading has nothing to
            # show in a tooltip.
            continue

        glossary[match.group(1).strip()] = definition

    return glossary
31+
32+
def main():
33+
parser = argparse.ArgumentParser(description='Convert ClickHouse glossary.md to JSON')
34+
parser.add_argument('--input', '-i', default='./docs/concepts/glossary.md',
35+
help='Input markdown file')
36+
parser.add_argument('--output', '-o', default='./src/components/GlossaryTooltip/glossary.json',
37+
help='Output JSON file')
38+
39+
args = parser.parse_args()
40+
41+
# Read the markdown file
42+
try:
43+
with open(args.input, 'r', encoding='utf-8') as f:
44+
content = f.read()
45+
except FileNotFoundError:
46+
print(f"❌ Input file not found: {args.input}")
47+
return
48+
except Exception as e:
49+
print(f"❌ Error reading file: {e}")
50+
return
51+
52+
# Extract glossary terms
53+
glossary = extract_glossary_terms(content)
54+
55+
if not glossary:
56+
print("❌ No glossary terms found")
57+
return
58+
59+
print(f"✅ Extracted {len(glossary)} terms:")
60+
for term in sorted(glossary.keys()):
61+
print(f" - {term}")
62+
63+
# Write JSON file
64+
try:
65+
with open(args.output, 'w', encoding='utf-8') as f:
66+
json.dump(glossary, f, indent=2, ensure_ascii=False)
67+
68+
print(f"💾 Saved to: {args.output}")
69+
70+
except Exception as e:
71+
print(f"❌ Error writing JSON file: {e}")
72+
73+
if __name__ == '__main__':
74+
main()

scripts/wrap-glossary-terms/wrap-glossary-terms.py renamed to scripts/glossary/wrap-glossary-terms.py

Lines changed: 51 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,12 @@ def main():
173173
parser.add_argument('--glossary', default='./src/components/GlossaryTooltip/glossary.json', help='Glossary JSON file')
174174
parser.add_argument('--dry-run', action='store_true', help='Show changes without writing files')
175175
parser.add_argument('--force', action='store_true', help='Process files even if they already have glossary syntax')
176+
parser.add_argument('--check', action='store_true', help='Check for unwrapped terms and show warnings (non-blocking)')
176177

177178
args = parser.parse_args()
178179

179-
print("🚀 Starting Glossary Term Wrapper...\n")
180+
if not args.check:
181+
print("🚀 Starting Glossary Term Wrapper...\n")
180182

181183
# Load glossary
182184
if not os.path.exists(args.glossary):
@@ -192,45 +194,67 @@ def main():
192194
# Filter out skip patterns
193195
files = [f for f in all_files if not should_skip_file(f)]
194196

195-
print(f"📁 Found {len(all_files)} MDX files, processing {len(files)} files")
196-
print(f"⏭️ Skipped {len(all_files) - len(files)} files based on skip patterns")
197-
198-
if args.force:
199-
print("💪 FORCE MODE - Processing files even with existing glossary syntax")
200-
201-
if args.dry_run:
202-
print("🔍 DRY RUN MODE - No files will be modified")
203-
204-
print()
197+
if not args.check:
198+
print(f"📁 Found {len(all_files)} MDX files, processing {len(files)} files")
199+
print(f"⏭️ Skipped {len(all_files) - len(files)} files based on skip patterns")
200+
201+
if args.force:
202+
print("💪 FORCE MODE - Processing files even with existing glossary syntax")
203+
204+
if args.dry_run:
205+
print("🔍 DRY RUN MODE - No files will be modified")
206+
207+
print()
205208

206209
# Process files
207210
stats = {'modified': 0, 'unchanged': 0, 'skipped': 0, 'error': 0, 'terms_wrapped': 0}
211+
file_details = [] # Track which files had terms for warning display
208212

209213
for file_path in files:
210214
rel_path = os.path.relpath(file_path, args.docs_dir)
211-
status, changes = process_file(file_path, terms, args.dry_run, args.force)
215+
# For check mode, always use dry_run=True to avoid writing files
216+
status, changes = process_file(file_path, terms, args.dry_run or args.check, args.force)
212217

213-
if status == 'modified':
214-
print(f"✅ Modified {rel_path} ({changes} terms)")
215-
elif status == 'unchanged':
218+
if status == 'modified' and changes > 0:
219+
file_details.append((rel_path, changes))
220+
if not args.check:
221+
print(f"✅ Modified {rel_path} ({changes} terms)")
222+
elif status == 'unchanged' and not args.check:
216223
print(f"➖ No changes needed for {rel_path}")
217-
elif status == 'skipped':
224+
elif status == 'skipped' and not args.check:
218225
print(f"⏭️ Skipped {rel_path} (already has glossary syntax)")
219226

220227
stats[status] += 1
221228
stats['terms_wrapped'] += changes
222229

223-
# Print summary
224-
print(f"\n📊 Summary:")
225-
print(f" Files processed: {stats['modified'] + stats['unchanged']}")
226-
print(f" Files modified: {stats['modified']}")
227-
print(f" Files skipped: {stats['skipped']}")
228-
print(f" Terms wrapped: {stats['terms_wrapped']}")
229-
230-
if args.dry_run:
231-
print("\n💡 Run without --dry-run to apply changes")
232-
if not args.force and stats['skipped'] > 0:
233-
print("💡 Use --force to process files with existing glossary syntax")
230+
# Show results
231+
if args.check:
232+
# Check mode: show warning if terms found
233+
if stats['terms_wrapped'] > 0:
234+
print(f"⚠️ GLOSSARY WARNING: Found {stats['terms_wrapped']} unwrapped glossary terms in {len(file_details)} files")
235+
print("💡 Run 'python3 scripts/wrap_glossary_terms.py' to add glossary tooltips")
236+
237+
# Show files with opportunities (limit to top 10 to avoid spam)
238+
if file_details:
239+
print(" Files with unwrapped terms:")
240+
for rel_path, count in sorted(file_details, key=lambda x: x[1], reverse=True)[:10]:
241+
print(f" - {rel_path} ({count} terms)")
242+
if len(file_details) > 10:
243+
print(f" ... and {len(file_details) - 10} more files")
244+
else:
245+
print("✅ All glossary terms are properly wrapped")
246+
else:
247+
# Normal mode: show detailed summary
248+
print(f"\n📊 Summary:")
249+
print(f" Files processed: {stats['modified'] + stats['unchanged']}")
250+
print(f" Files modified: {stats['modified']}")
251+
print(f" Files skipped: {stats['skipped']}")
252+
print(f" Terms wrapped: {stats['terms_wrapped']}")
253+
254+
if args.dry_run:
255+
print("\n💡 Run without --dry-run to apply changes")
256+
if not args.force and stats['skipped'] > 0:
257+
print("💡 Use --force to process files with existing glossary syntax")
234258

235259
if __name__ == '__main__':
236260
main()

0 commit comments

Comments
 (0)