-
-
Notifications
You must be signed in to change notification settings - Fork 41
feat: Add translation tag checker for issue #1102 #1997
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
725972f
d6b3263
0040e14
abe6585
fce2bf3
dc51433
0ba0fa4
6cddfa8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,229 @@ | ||
| """ | ||
| Translation Tag Checker for OWASP Cornucopia | ||
|
|
||
| This script checks that translation files have the same T0xxx tags as the English version. | ||
| It detects: | ||
| - Missing tags in translations | ||
| - Untranslated tags (text identical to English) | ||
| - Empty tag values | ||
| """ | ||
|
|
||
| import sys | ||
| import yaml | ||
| from pathlib import Path | ||
| from typing import Dict, List | ||
| from collections import defaultdict | ||
|
|
||
|
|
||
class TranslationChecker:
    """Check translations for missing, untranslated, or empty tags.

    Card YAML files found directly inside ``source_dir`` are grouped by
    their base name; within each group every non-English file is compared
    against the ``en`` file, and problems are collected in ``self.results``.
    """

    # Display names used in the report headings. Codes that are not listed
    # here fall back to the upper-cased language code.
    LANGUAGE_NAMES = {
        'es': 'Spanish',
        'fr': 'French',
        'hu': 'Hungarian',
        'it': 'Italian',
        'nl': 'Dutch',
        'no_nb': 'Norwegian',
        'pt_br': 'Portuguese (Brazil)',
        'pt_pt': 'Portuguese (Portugal)',
        'ru': 'Russian',
    }

    def __init__(self, source_dir: Path):
        """Remember the directory to scan and start with no results."""
        self.source_dir = source_dir
        # results[base_name][language] -> issue dict (see check_translations)
        self.results = defaultdict(lambda: defaultdict(dict))

    def extract_tags(self, yaml_file: Path) -> Dict[str, str]:
        """Extract T0xxx tags and their text from a YAML file.

        Only entries of the top-level ``common_ids`` list whose ``id`` is a
        string starting with ``T0`` are returned. Unreadable or unparsable
        files are reported on stderr and yield an empty mapping; malformed
        individual entries are skipped without discarding the rest.

        NOTE(review): a project collaborator reported that ``common_ids``
        does not exist in the card YAML files, in which case this method
        always returns an empty mapping. Confirm the real section/key names
        against the source files before relying on this check.
        """
        tags: Dict[str, str] = {}
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            print(f"Error reading {yaml_file}: {e}", file=sys.stderr)
            return tags

        if not isinstance(data, dict):
            return tags

        entries = data.get('common_ids')
        if not isinstance(entries, list):
            return tags

        for item in entries:
            # One malformed entry must not throw away every other tag.
            if not isinstance(item, dict):
                continue
            tag_id = item.get('id', '')
            if isinstance(tag_id, str) and tag_id.startswith('T0'):
                tags[tag_id] = item.get('text', '')

        return tags

    @staticmethod
    def _language_of(yaml_file: Path) -> str:
        """Return the language code, i.e. the last '-'-separated stem part."""
        return yaml_file.stem.split('-')[-1]

    @staticmethod
    def _is_language_code(lang: str) -> bool:
        """Return True for codes shaped like ``en`` or ``pt_br``."""
        if len(lang) == 2:
            return True
        return '_' in lang and all(len(part) == 2 for part in lang.split('_'))

    def get_file_groups(self) -> Dict[str, List[Path]]:
        """Group YAML files by their base name (e.g., webapp-cards-2.2).

        Expected file name format:
        ``{edition}-{component}-{version}-{lang}.yaml``. Only files whose
        base name contains ``cards`` and whose last part looks like a
        language code are kept.
        """
        file_groups = defaultdict(list)

        # sorted() makes the grouping order independent of the filesystem.
        for yaml_file in sorted(self.source_dir.glob('*-*.yaml')):
            # Skip archived files. Only the file name is inspected: testing
            # the whole path would drop every file whenever the repository
            # itself lives under a directory whose name contains "archive".
            if 'archive' in yaml_file.name:
                continue

            parts = yaml_file.stem.split('-')
            if len(parts) < 3:
                continue

            lang = parts[-1]
            base_name = '-'.join(parts[:-1])
            if 'cards' in base_name and self._is_language_code(lang):
                file_groups[base_name].append(yaml_file)

        return file_groups

    @staticmethod
    def _compare_tags(english_tags: Dict[str, str],
                      trans_tags: Dict[str, str]) -> Dict[str, List[str]]:
        """Classify every English tag against one translation.

        Returns a dict with sorted tag-id lists under ``missing`` (tag absent
        from the translation), ``empty`` (falsy or whitespace-only text) and
        ``untranslated`` (text identical to the English text).
        """
        missing = []
        untranslated = []
        empty = []

        for tag_id, eng_text in english_tags.items():
            if tag_id not in trans_tags:
                missing.append(tag_id)
                continue
            text = trans_tags[tag_id]
            if not text or (isinstance(text, str) and not text.strip()):
                empty.append(tag_id)
            elif text == eng_text:
                untranslated.append(tag_id)

        return {
            'missing': sorted(missing),
            'untranslated': sorted(untranslated),
            'empty': sorted(empty),
        }

    def check_translations(self) -> Dict[str, Dict[str, Dict[str, List[str]]]]:
        """
        Check all translation files against English versions.

        Groups without an English file, or whose English file yields no
        tags, are skipped with a warning on stderr.

        Returns:
            Dict with structure:
            {
                'base_name': {
                    'language': {
                        'missing': ['T00145', ...],
                        'untranslated': ['T00100', ...],
                        'empty': ['T00200', ...],
                        'file': 'name-of-translation-file.yaml'
                    }
                }
            }
            Only languages with at least one issue are included.
        """
        for base_name, files in self.get_file_groups().items():
            english_file = None
            translation_files = []
            for f in files:
                if self._language_of(f) == 'en':
                    english_file = f
                else:
                    translation_files.append(f)

            if not english_file:
                print(f"Warning: No English file found for {base_name}", file=sys.stderr)
                continue

            english_tags = self.extract_tags(english_file)
            if not english_tags:
                # Say so explicitly: otherwise the report would claim
                # success although nothing was actually compared.
                print(
                    f"Warning: No T0xxx tags found in {english_file.name}; "
                    f"skipping {base_name}",
                    file=sys.stderr,
                )
                continue

            for trans_file in translation_files:
                issues = self._compare_tags(english_tags, self.extract_tags(trans_file))
                if issues['missing'] or issues['untranslated'] or issues['empty']:
                    issues['file'] = str(trans_file.name)
                    self.results[base_name][self._language_of(trans_file)] = issues

        return dict(self.results)

    def generate_markdown_report(self) -> str:
        """Generate a Markdown report of translation issues.

        Reads ``self.results``; call ``check_translations`` first. When there
        are no recorded issues a short success report is returned.
        """
        report_lines = ["# Translation Check Report\n"]

        if not self.results:
            # Wording requested in review: the check covers missing,
            # untranslated and empty tags, not just tag presence.
            report_lines.append("✅ All existing translations have been completed.\n")
            return '\n'.join(report_lines)

        report_lines.append("The following sentences/tags have issues in the translations:\n")

        sections = (
            ('missing', "### Missing Tags\n",
             "The following tags are present in the English version but missing in this translation:\n"),
            ('untranslated', "### Untranslated Tags\n",
             "The following tags have identical text to English (not translated):\n"),
            ('empty', "### Empty Tags\n",
             "The following tags are empty:\n"),
        )

        for base_name in sorted(self.results.keys()):
            languages = self.results[base_name]

            for lang in sorted(languages.keys()):
                # Fallback for language codes without a display name.
                lang_name = self.LANGUAGE_NAMES.get(lang, lang.upper())
                issues = languages[lang]
                filename = issues.get('file', '')

                report_lines.append(f"\n## {lang_name}\n")
                report_lines.append(f"**File:** `{filename}`\n")

                for key, heading, explanation in sections:
                    if issues[key]:
                        report_lines.append(heading)
                        report_lines.append(explanation)
                        tags_str = ', '.join(issues[key])
                        report_lines.append(f"{tags_str}\n")

        return '\n'.join(report_lines)
|
|
||
|
|
||
def main():
    """Run the translation check, print and save the report, set exit code.

    The ``source`` directory is looked up next to this script's parent
    directory. The Markdown report goes to stdout and to
    ``translation_check_report.md`` in that same parent directory. The
    process exits with status 1 when the source directory is missing or
    when any issue was found, and with status 0 otherwise.
    """
    # Locate the directories relative to this script.
    base_dir = Path(__file__).parent.parent
    source_dir = base_dir / 'source'

    if not source_dir.exists():
        print(f"Error: Source directory not found: {source_dir}", file=sys.stderr)
        sys.exit(1)

    # Collect the issues, then render them.
    checker = TranslationChecker(source_dir)
    results = checker.check_translations()
    report = checker.generate_markdown_report()

    # Report goes to stdout and to a file.
    print(report)
    output_file = base_dir / 'translation_check_report.md'
    output_file.write_text(report, encoding='utf-8')

    # The status note goes to stderr so it stays out of the report stream.
    print(f"\n---\nReport written to: {output_file}", file=sys.stderr)

    # Non-zero exit status signals that issues were found.
    sys.exit(1 if results else 0)


if __name__ == '__main__':
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you pipe the result? The script is creating the file translation_check_report.md right?
You shouldn't append errors to that file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, thank you for mentioning it. I just noticed it; I will make sure to change that.