|
| 1 | +import os |
| 2 | +import re |
| 3 | +import json |
| 4 | + |
# Root of the clickhouse-docs checkout. The default is the location the script
# was originally written for; set the CLICKHOUSE_DOCS_ROOT environment variable
# to run it against a checkout that lives somewhere else.
_DOCS_REPO_ROOT = (
    os.environ.get("CLICKHOUSE_DOCS_ROOT") or "/home/dtran/clickhouse-docs"
).rstrip("/")

# JSON file that load_glossary() reads; inject_tooltips() looks matched words
# up in the loaded mapping to obtain their definition.
GLOSSARY_JSON_PATH = _DOCS_REPO_ROOT + "/src/components/GlossaryTooltip/glossary.json"

# Directory tree that is walked for ``.md`` files.
DOCS_PATH = _DOCS_REPO_ROOT + "/docs"

# Directory names that are pruned from the walk (matched against the bare
# directory name at any depth).
IGNORE_DIRS = {
    "changelog", "changelogs", "i18n", "scripts", "static", "styles",
    "contribute", "about-us", "_placeholders",
}

# Import statement added to every file that receives at least one tooltip.
GLOSSARY_IMPORT = "import GlossaryTooltip from '@site/src/components/GlossaryTooltip/GlossaryTooltip.jsx';"
| 14 | + |
def load_glossary(path):
    """Read the UTF-8 encoded JSON file at *path* and return the decoded value."""
    with open(path, encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)
| 18 | + |
def mask_ignores(text):
    """Replace regions that must not receive tooltips with placeholder keys.

    Each pattern below is applied in turn to the progressively masked text.
    Every match is swapped for a key of the form ``__MASKED_<NAME>_<i>__`` and
    the matched text is stored under that key.

    Because later patterns run on already-masked text, a stored value may
    itself contain keys created by an earlier pattern (for example an HTML
    block that wraps inline code). Restoring therefore has to expand the
    most recently created keys first.

    Args:
        text: The document text.

    Returns:
        A ``(masked_text, placeholders)`` tuple, where ``placeholders`` maps
        each key to the exact text it replaced, in creation order.
    """
    placeholders = {}
    # NOTE(review): with re.MULTILINE the 'frontmatter' pattern matches between
    # any two runs of '---' that start a line, not only a leading front matter
    # block -- confirm that is intended.
    patterns = {
        'codeblocks': r'```[\s\S]*?```',
        'inline_code': r'`[^`\n]+`',
        'frontmatter': r'^---[\s\S]+?---',
        'imports': r'^import .*?;$',
        'headers': r'^(#+ .*)$',
        'html_blocks': r'<(div|details|summary)[\s\S]*?<\/\1>',
        'blockquotes': r'^\s*>.*$',
        'links': r'\[([^\]]+)\]\([^)]+\)',
        'images': r'!\[[^\]]*\]\([^)]+\)',
        'comments': r'<!--[\s\S]*?-->',
    }

    for name, pattern in patterns.items():
        regex = re.compile(pattern, re.MULTILINE)
        pieces = []
        cursor = 0
        # Substitute by match position rather than with str.replace(): a
        # global replace of the matched text can also rewrite part of a
        # different, longer match (e.g. "# Foo" inside "## Foo bar"), which
        # leaves the remainder of that region unmasked.
        for i, match in enumerate(regex.finditer(text)):
            key = f"__MASKED_{name.upper()}_{i}__"
            placeholders[key] = match.group(0)
            pieces.append(text[cursor:match.start()])
            pieces.append(key)
            cursor = match.end()
        pieces.append(text[cursor:])
        text = "".join(pieces)

    return text, placeholders
| 43 | + |
def unmask_ignores(text, placeholders):
    """Expand placeholder keys in *text* back to the text they stand for.

    Args:
        text: Text containing placeholder keys.
        placeholders: Mapping of key -> original text, in the order the keys
            were created (as returned by ``mask_ignores``).

    Returns:
        The text with every placeholder expanded.
    """
    # Expand the most recently created keys first. A later placeholder's value
    # can contain earlier keys (masking runs on already-masked text); walking
    # the mapping in creation order would try those inner keys before the
    # outer value has been put back, leaving them in the output.
    for key, value in reversed(list(placeholders.items())):
        text = text.replace(key, value)
    return text
| 48 | + |
def inject_tooltips(text, glossary):
    """Wrap whole-word occurrences of glossary terms in ``<GlossaryTooltip>``.

    Args:
        text: Text to scan (callers normally pass already-masked text).
        glossary: Mapping of term -> definition. Matching is case-sensitive
            and terms with an empty definition are left untouched.

    Returns:
        The text with each matched term replaced by a ``<GlossaryTooltip>``
        element carrying the term and its definition as attributes.
    """
    import html  # local import keeps this block self-contained

    # Longest terms first: regex alternation takes the first alternative that
    # matches, so a short term would otherwise shadow a longer term that
    # starts with it. Empty keys are dropped because an empty alternative
    # matches at every word boundary.
    terms = sorted((term for term in glossary if term), key=len, reverse=True)
    if not terms:
        return text

    def replacement(match):
        word = match.group(0)
        definition = glossary.get(word)
        if not definition:
            return word
        # Escape attribute values so quotes, '&' or '<' in a term or
        # definition cannot terminate the attribute or break the markup.
        safe_term = html.escape(word, quote=True)
        safe_definition = html.escape(str(definition), quote=True)
        return (
            f'<GlossaryTooltip term="{safe_term}" '
            f'definition="{safe_definition}">{word}</GlossaryTooltip>'
        )

    pattern = r'\b(' + '|'.join(re.escape(term) for term in terms) + r')\b'
    return re.sub(pattern, replacement, text)
| 59 | + |
def _insert_glossary_import(text):
    """Return *text* with GLOSSARY_IMPORT added after any leading front matter."""
    front_matter = re.match(
        r'---[ \t]*\r?\n[\s\S]*?^---[ \t]*(?:\r?\n|\Z)', text, re.MULTILINE
    )
    if front_matter is None:
        return GLOSSARY_IMPORT + "\n" + text

    # Front matter has to remain the very first thing in the file, so the
    # import goes directly below it, separated by blank lines.
    head = text[:front_matter.end()]
    body = text[front_matter.end():]
    if not head.endswith("\n"):
        head += "\n"
    return head + "\n" + GLOSSARY_IMPORT + "\n\n" + body.lstrip("\r\n")


def process_file(path, glossary):
    """Inject glossary tooltips into one file and save the result as ``.mdx``.

    The file is masked, scanned for glossary terms and unmasked again. If at
    least one ``<GlossaryTooltip`` is present and the text changed, the result
    is written next to the original with an ``.mdx`` extension and the
    original file is removed; otherwise the file is left untouched.

    Args:
        path: Path of the file to process.
        glossary: Mapping of term -> definition.
    """
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    masked_text, placeholders = mask_ignores(content)
    replaced = inject_tooltips(masked_text, glossary)
    final_text = unmask_ignores(replaced, placeholders)

    has_tooltip = '<GlossaryTooltip' in final_text
    if has_tooltip and GLOSSARY_IMPORT not in final_text:
        final_text = _insert_glossary_import(final_text)

    if not has_tooltip or final_text == content:
        print(f"– Skipped (no change): {path}")
        return

    # Swap only the extension; str.replace(".md", ".mdx") would also rewrite
    # any ".md" occurring elsewhere in the path.
    stem, _extension = os.path.splitext(path)
    new_path = stem + ".mdx"
    with open(new_path, 'w', encoding='utf-8') as f:
        f.write(final_text)
    # Never delete the file that was just written (path already ends in .mdx).
    if new_path != path:
        os.remove(path)
    print(f"✔ Renamed and updated: {path} -> {new_path}")
| 79 | + |
def process_directory(base_path, glossary):
    """Walk *base_path* and run process_file() on every eligible ``.md`` file.

    Directories named in IGNORE_DIRS are pruned from the walk, and files whose
    name starts with an underscore are skipped.
    """
    for current_dir, subdirs, filenames in os.walk(base_path):
        # Prune in place so os.walk does not descend into ignored directories.
        subdirs[:] = [name for name in subdirs if name not in IGNORE_DIRS]
        for filename in filenames:
            if filename.startswith("_") or not filename.endswith(".md"):
                continue
            process_file(os.path.join(current_dir, filename), glossary)
| 87 | + |
# Script entry point: load the glossary once, then process the whole docs tree.
if __name__ == "__main__":
    glossary = load_glossary(path=GLOSSARY_JSON_PATH)
    process_directory(base_path=DOCS_PATH, glossary=glossary)
0 commit comments