diff --git a/.github/workflows/sync_docs_analyze.yml b/.github/workflows/sync_docs_analyze.yml new file mode 100644 index 000000000..30fc5e876 --- /dev/null +++ b/.github/workflows/sync_docs_analyze.yml @@ -0,0 +1,391 @@ +name: Analyze Documentation Changes + +on: + pull_request: + branches: [main, revamp] + types: [opened, synchronize, reopened] + paths: + # IMPORTANT: These paths should match the language directories defined in tools/translate/config.json + # Currently configured for: en (source), cn, jp (targets) + # If you add/remove languages in config.json, update these paths accordingly + - 'docs.json' + - 'en/**/*.md' + - 'en/**/*.mdx' + - 'en/**/openapi*.json' + - 'cn/**/*.md' + - 'cn/**/*.mdx' + - 'cn/**/openapi*.json' + - 'jp/**/*.md' + - 'jp/**/*.mdx' + - 'jp/**/openapi*.json' + - 'versions/**/*.md' + - 'versions/**/*.mdx' + +permissions: + contents: read + pull-requests: read + +jobs: + analyze: + runs-on: ubuntu-latest + steps: + - name: Checkout PR + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Determine comparison range + id: determine-range + env: + GH_TOKEN: ${{ github.token }} + run: | + echo "Determining comparison range..." + + PR_NUMBER="${{ github.event.pull_request.number }}" + EVENT_ACTION="${{ github.event.action }}" + PR_BASE="${{ github.event.pull_request.base.sha }}" + PR_HEAD="${{ github.event.pull_request.head.sha }}" + + if [ "$EVENT_ACTION" = "synchronize" ]; then + echo "🔄 Synchronize event - detecting incremental changes" + + # Try to get last processed commit from translation PR + TRANSLATION_PR=$(gh pr list \ + --search "head:docs-sync-pr-${PR_NUMBER} state:open" \ + --json number \ + --jq '.[0].number // empty' 2>/dev/null || echo "") + + LAST_PROCESSED="" + if [ -n "$TRANSLATION_PR" ]; then + echo "Found translation PR #${TRANSLATION_PR}" + + # Extract last processed commit from comments (reverse order to get latest) + LAST_PROCESSED=$(gh pr view "$TRANSLATION_PR" \ + --json comments \ + --jq '.comments | reverse | .[] | .body' 2>/dev/null \ + | grep -oP 'Last-Processed-Commit: \K[a-f0-9]+' \ + | head -1 || echo "") + + if [ -n "$LAST_PROCESSED" ]; then + echo "✅ Found tracked commit in translation PR: $LAST_PROCESSED" + fi + fi + + # Use tracked commit if available, otherwise fall back to github.event.before + if [ -n "$LAST_PROCESSED" ]; then + COMPARE_BASE="$LAST_PROCESSED" + echo "Using last processed commit: $COMPARE_BASE" + elif [ -n "${{ github.event.before }}" ] && [ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]; then + COMPARE_BASE="${{ github.event.before }}" + echo "Using github.event.before: $COMPARE_BASE" + else + # Fallback to PR base (first push after PR creation) + COMPARE_BASE="$PR_BASE" + echo "⚠️ No previous commit found, using PR base: $COMPARE_BASE" + fi + + COMPARE_HEAD="$PR_HEAD" + IS_INCREMENTAL="true" + + else + echo "🆕 New PR event - analyzing full changes" + COMPARE_BASE="$PR_BASE" + COMPARE_HEAD="$PR_HEAD" + IS_INCREMENTAL="false" + fi + + echo "compare_base=$COMPARE_BASE" >> $GITHUB_OUTPUT + echo "compare_head=$COMPARE_HEAD" >> $GITHUB_OUTPUT + echo "is_incremental=$IS_INCREMENTAL" >> $GITHUB_OUTPUT + + echo "📊 Comparison range: $COMPARE_BASE...$COMPARE_HEAD" + + - name: Categorize and validate PR changes + id: categorize + run: | + echo "Categorizing PR changes..." 
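+          # Assumed contract with tools/translate/pr_analyzer.py (not shown in this diff):
+          # it prints shell-style key=value lines that are `source`d below, e.g.
+          #   pr_type=english          # english | translation | none; mixed PRs exit non-zero
+          #   should_skip=false
+          # plus an error_message=... line when categorization fails.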
+
+          # Get comparison range from previous step
+          BASE_SHA="${{ steps.determine-range.outputs.compare_base }}"
+          HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}"
+
+          echo "Base SHA: $BASE_SHA"
+          echo "Head SHA: $HEAD_SHA"
+
+          # Run PR analyzer
+          cd tools/translate
+          # Capture the exit code explicitly; under the default `bash -e` shell a bare
+          # failing command would end the step before the check below ever runs.
+          ANALYZER_EXIT=0
+          python pr_analyzer.py "$BASE_SHA" "$HEAD_SHA" > /tmp/pr_analysis_output.txt 2>&1 || ANALYZER_EXIT=$?
+
+          # Parse analyzer output
+          if [ "$ANALYZER_EXIT" -eq 0 ]; then
+            # Successful analysis
+            source /tmp/pr_analysis_output.txt
+            echo "PR categorization successful"
+            echo "PR Type: $pr_type"
+            echo "Should Skip: $should_skip"
+
+            # Set GitHub outputs
+            echo "pr_type=$pr_type" >> $GITHUB_OUTPUT
+            echo "should_skip=$should_skip" >> $GITHUB_OUTPUT
+
+            if [ "$should_skip" = "true" ]; then
+              if [ "$pr_type" = "translation" ]; then
+                echo "✅ Translation-only PR detected. Skipping automation (direct review process)."
+              elif [ "$pr_type" = "none" ]; then
+                echo "✅ No relevant documentation changes detected. Skipping workflow."
+              fi
+              exit 0
+            fi
+          else
+            # Analysis failed - likely mixed PR
+            echo "PR categorization failed - likely mixed content PR"
+            ERROR_MESSAGE=$(cat /tmp/pr_analysis_output.txt | grep "error_message=" | cut -d'=' -f2- || echo "Mixed content PR detected")
+            echo "error=mixed_pr" >> $GITHUB_OUTPUT
+            echo "error_message<<EOF" >> $GITHUB_OUTPUT
+            echo "$ERROR_MESSAGE" >> $GITHUB_OUTPUT
+            echo "EOF" >> $GITHUB_OUTPUT
+            exit 1
+          fi
+
+      - name: Analyze English changes for translation
+        if: steps.categorize.outputs.pr_type == 'english'
+        id: analyze
+        run: |
+          echo "Analyzing English changes for automatic translation..."
+
+          # Use comparison range from determine-range step
+          BASE_SHA="${{ steps.determine-range.outputs.compare_base }}"
+          HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}"
+          IS_INCREMENTAL="${{ steps.determine-range.outputs.is_incremental }}"
+
+          echo "Comparison: $BASE_SHA...$HEAD_SHA"
+          echo "Incremental: $IS_INCREMENTAL"
+
+          # Get all changed files (not just English ones for file analysis)
+          CHANGED_FILES=$(git diff --name-only $BASE_SHA $HEAD_SHA)
+
+          # Count changes for security limits
+          FILE_COUNT=$(echo "$CHANGED_FILES" | wc -l)
+          echo "Changed files count: $FILE_COUNT"
+
+          # Security check: Limit number of files
+          MAX_FILES=50
+          if [ "$FILE_COUNT" -gt "$MAX_FILES" ]; then
+            echo "Error: Too many files changed ($FILE_COUNT > $MAX_FILES)"
+            echo "error=too_many_files" >> $GITHUB_OUTPUT
+            exit 1
+          fi
+
+          # Create analysis report
+          cat > /tmp/analysis.json < /tmp/changed_files.txt
+
+          # Analyze file types and sizes for English files that need translation
+          > /tmp/file_analysis.txt
+          > /tmp/openapi_analysis.txt
+          while IFS= read -r file; do
+            if [[ "$file" =~ ^en/.*\.(md|mdx)$ ]] && [ -f "$file" ]; then
+              SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo "0")
+              echo "$file|$SIZE|markdown" >> /tmp/file_analysis.txt
+
+              # Security check: File size limit (10MB)
+              MAX_SIZE=$((10 * 1024 * 1024))
+              if [ "$SIZE" -gt "$MAX_SIZE" ]; then
+                echo "Error: File $file exceeds size limit ($SIZE > $MAX_SIZE)"
+                echo "error=file_too_large" >> $GITHUB_OUTPUT
+                exit 1
+              fi
+            elif [[ "$file" =~ ^en/.*/openapi.*\.json$ ]] && [ -f "$file" ]; then
+              SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo "0")
+              echo "$file|$SIZE|openapi_json" >> /tmp/openapi_analysis.txt
+
+              # Security check: File size limit for OpenAPI JSON (10MB)
+              MAX_SIZE=$((10 * 1024 * 1024))
+              if [ "$SIZE" -gt "$MAX_SIZE" ]; then
+                echo "Error: OpenAPI file $file exceeds size limit ($SIZE > $MAX_SIZE)"
+                echo
"error=file_too_large" >> $GITHUB_OUTPUT + exit 1 + fi + fi + done <<< "$CHANGED_FILES" + + # Check for docs.json changes + if echo "$CHANGED_FILES" | grep -q '^docs\.json$'; then + echo "true" > /tmp/docs_json_changed.txt + + # Use PR analyzer's docs.json analysis + cd tools/translate + python3 - < /tmp/docs_json_changed.txt + echo '{"structure_changed": false, "navigation_modified": false, "languages_affected": []}' > /tmp/structure_changes.json + fi + + echo "has_changes=true" >> $GITHUB_OUTPUT + echo "Analysis complete" + + - name: Validate file paths + if: steps.analyze.outputs.has_changes == 'true' + run: | + echo "Validating English file paths for translation..." + + # Security: Validate English files that will be translated + while IFS='|' read -r file size; do + if [ -n "$file" ]; then + # Check for directory traversal attempts + if echo "$file" | grep -q '\.\./'; then + echo "Error: Invalid file path detected: $file" + exit 1 + fi + + # Check file extension for English files + if ! echo "$file" | grep -qE '\.(md|mdx)$'; then + echo "Error: Invalid file type for translation: $file" + exit 1 + fi + + # Check path starts with en/ (only English files need translation) + if ! echo "$file" | grep -qE '^en/'; then + echo "Error: Non-English file in translation list: $file" + exit 1 + fi + fi + done < /tmp/file_analysis.txt + + # Validate OpenAPI JSON files + if [ -f "/tmp/openapi_analysis.txt" ] && [ -s "/tmp/openapi_analysis.txt" ]; then + while IFS='|' read -r file size file_type; do + if [ -n "$file" ]; then + # Check for directory traversal + if echo "$file" | grep -q '\.\./'; then + echo "Error: Invalid file path: $file" + exit 1 + fi + + # Check file extension + if ! echo "$file" | grep -qE '\.json$'; then + echo "Error: Invalid OpenAPI file type: $file" + exit 1 + fi + + # Check path starts with en/ + if ! echo "$file" | grep -qE '^en/'; then + echo "Error: Non-English OpenAPI file in translation list: $file" + exit 1 + fi + + # Check pattern match (configurable via openapi*.json) + if ! echo "$file" | grep -qE 'openapi.*\.json$'; then + echo "Error: File doesn't match OpenAPI pattern: $file" + exit 1 + fi + fi + done < /tmp/openapi_analysis.txt + fi + + echo "All English file paths validated for translation" + + - name: Create analysis summary + if: steps.analyze.outputs.has_changes == 'true' + run: | + echo "Creating analysis summary for English changes..." 
+ + BASE_SHA="${{ steps.determine-range.outputs.compare_base }}" + HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}" + PR_NUMBER=${{ github.event.pull_request.number }} + IS_INCREMENTAL="${{ steps.determine-range.outputs.is_incremental }}" + + # Use SyncPlanGenerator for consistent logic across workflows + cd tools/translate + python3 - <> $GITHUB_OUTPUT + echo "✅ PR has ${SOURCE_DIR}/ docs changes - checking for sync PR" + else + echo "has_english_changes=false" >> $GITHUB_OUTPUT + echo "ℹ️ No ${SOURCE_DIR}/ docs changes - skipping" + fi + env: + GH_TOKEN: ${{ github.token }} + + - name: Find and close sync PR + if: steps.check-english.outputs.has_english_changes == 'true' + env: + GH_TOKEN: ${{ github.token }} + run: | + PR_NUMBER=${{ github.event.pull_request.number }} + PR_MERGED=${{ github.event.pull_request.merged }} + SYNC_BRANCH="docs-sync-pr-${PR_NUMBER}" + + echo "Looking for sync PR with branch: $SYNC_BRANCH" + + # Search for sync PR + SYNC_PR_DATA=$(gh pr list \ + --search "head:${SYNC_BRANCH}" \ + --json number,state \ + --jq '.[0] // empty' 2>/dev/null || echo "") + + if [ -z "$SYNC_PR_DATA" ] || [ "$SYNC_PR_DATA" = "null" ]; then + echo "ℹ️ No sync PR found for PR #${PR_NUMBER}" + exit 0 + fi + + SYNC_PR_NUMBER=$(echo "$SYNC_PR_DATA" | jq -r '.number') + SYNC_PR_STATE=$(echo "$SYNC_PR_DATA" | jq -r '.state') + + if [ "$SYNC_PR_STATE" != "OPEN" ]; then + echo "ℹ️ Sync PR #${SYNC_PR_NUMBER} is already ${SYNC_PR_STATE}" + exit 0 + fi + + echo "Found open sync PR #${SYNC_PR_NUMBER}" + + # Comment and close sync PR + if [ "$PR_MERGED" = "true" ]; then + gh pr close ${SYNC_PR_NUMBER} --comment "✅ Original PR #${PR_NUMBER} was merged. You can still merge this sync PR independently if translations are ready." + else + gh pr close ${SYNC_PR_NUMBER} --comment "❌ Original PR #${PR_NUMBER} was closed. If it reopens, sync will resume automatically." + fi + + echo "✅ Closed sync PR #${SYNC_PR_NUMBER}" diff --git a/.github/workflows/sync_docs_execute.yml b/.github/workflows/sync_docs_execute.yml new file mode 100644 index 000000000..3d458cdea --- /dev/null +++ b/.github/workflows/sync_docs_execute.yml @@ -0,0 +1,547 @@ +# Workflow for executing documentation translations +name: Execute Documentation Sync + +on: + workflow_run: + workflows: ["Analyze Documentation Changes"] + types: + - completed + workflow_dispatch: + inputs: + pr_number: + description: 'PR number to process' + required: true + type: string + +permissions: + contents: write + pull-requests: write + actions: read + +concurrency: + group: docs-translation-${{ github.event.workflow_run.head_branch || github.event.inputs.pr_number }} + cancel-in-progress: false + +jobs: + execute-sync: + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' + steps: + - name: Check workflow source + id: check-source + run: | + echo "Checking workflow source..." 
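+          # This workflow runs with write permissions and repository secrets, so it
+          # never trusts the PR head directly: it only consumes artifacts produced by
+          # the unprivileged analyze workflow and re-checks below whether the head
+          # repository is a fork before deciding how to proceed.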
+ echo "Event: ${{ github.event.workflow_run.event }}" + echo "Repository: ${{ github.event.workflow_run.repository.full_name }}" + echo "Head Repository: ${{ github.event.workflow_run.head_repository.full_name }}" + echo "Head Branch: ${{ github.event.workflow_run.head_branch }}" + + # Security check: Only process PRs from the same repository or trusted forks + if [[ "${{ github.event.workflow_run.event }}" != "pull_request" ]]; then + echo "Not a pull request event, skipping" + echo "should_process=false" >> $GITHUB_OUTPUT + exit 0 + fi + + # Check if this is from a fork + IS_FORK="false" + if [[ "${{ github.event.workflow_run.repository.full_name }}" != "${{ github.event.workflow_run.head_repository.full_name }}" ]]; then + IS_FORK="true" + fi + + echo "is_fork=$IS_FORK" >> $GITHUB_OUTPUT + echo "should_process=true" >> $GITHUB_OUTPUT + + - name: Download analysis artifacts + if: steps.check-source.outputs.should_process == 'true' || github.event_name == 'workflow_dispatch' + uses: actions/github-script@v7 + id: download-artifacts + with: + script: | + const fs = require('fs'); + + // Determine which workflow run to get artifacts from + let runId; + let prNumber; + + if (context.eventName === 'workflow_dispatch') { + // Manual trigger - use the pr_number input + prNumber = '${{ github.event.inputs.pr_number }}'; + console.log(`Manual trigger for PR #${prNumber}`); + + // Find the most recent analyze workflow run for this specific PR + const runs = await github.rest.actions.listWorkflowRuns({ + owner: context.repo.owner, + repo: context.repo.repo, + workflow_id: 'sync_docs_analyze.yml', + per_page: 100 + }); + + // Find run that matches our specific PR number + let matchingRun = null; + for (const run of runs.data.workflow_runs) { + if (run.conclusion === 'success' && run.event === 'pull_request' && run.pull_requests.length > 0) { + const pullRequest = run.pull_requests[0]; + if (pullRequest.number.toString() === prNumber) { + matchingRun = run; + break; + } + } + } + + if (!matchingRun) { + console.log(`No successful analyze workflow run found for PR #${prNumber}`); + return false; + } + + runId = matchingRun.id; + console.log(`Found analyze workflow run: ${runId} for PR #${prNumber}`); + } else { + // Triggered by workflow_run + runId = context.payload.workflow_run.id; + console.log(`Workflow run trigger, run ID: ${runId}`); + } + + // List artifacts from the analyze workflow run + const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: runId + }); + + console.log(`Found ${artifacts.data.artifacts.length} artifacts`); + artifacts.data.artifacts.forEach(a => console.log(` - ${a.name}`)); + + const matchArtifact = artifacts.data.artifacts.find(artifact => { + if (context.eventName === 'workflow_dispatch') { + return artifact.name === `docs-sync-analysis-${prNumber}`; + } else { + return artifact.name.startsWith('docs-sync-analysis-'); + } + }); + + if (!matchArtifact) { + console.log('No matching analysis artifact found'); + return false; + } + + console.log(`Downloading artifact: ${matchArtifact.name}`); + + const download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip' + }); + + fs.writeFileSync('/tmp/artifacts.zip', Buffer.from(download.data)); + console.log('Artifact downloaded successfully'); + + // Extract PR number from artifact name + if (!prNumber) { + prNumber = 
matchArtifact.name.split('-').pop(); + } + + core.setOutput('pr_number', prNumber); + core.setOutput('artifact_found', 'true'); + + return true; + + - name: Extract and validate artifacts + if: steps.download-artifacts.outputs.artifact_found == 'true' + id: extract-artifacts + run: | + echo "Extracting artifacts..." + + # Create secure temporary directory + WORK_DIR=$(mktemp -d /tmp/sync-XXXXXX) + echo "work_dir=$WORK_DIR" >> $GITHUB_OUTPUT + + # Extract to temporary directory + cd "$WORK_DIR" + unzip /tmp/artifacts.zip + + # Validate extracted files + REQUIRED_FILES="analysis.json sync_plan.json changed_files.txt" + for file in $REQUIRED_FILES; do + if [ ! -f "$file" ]; then + echo "Error: Required file $file not found" + exit 1 + fi + done + + # Validate JSON structure + python3 -c " + import json + import sys + + try: + with open('analysis.json') as f: + analysis = json.load(f) + with open('sync_plan.json') as f: + sync_plan = json.load(f) + + # Validate required fields + assert 'pr_number' in analysis + assert 'files_to_sync' in sync_plan + assert 'target_languages' in sync_plan + + print('Artifacts validated successfully') + except Exception as e: + print(f'Validation error: {e}') + sys.exit(1) + " + + # Extract PR number and other metadata + PR_NUMBER=$(python3 -c "import json; print(json.load(open('analysis.json'))['pr_number'])") + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + + # Extract head SHA to checkout the PR branch (needed for new files) + HEAD_SHA=$(python3 -c "import json; print(json.load(open('analysis.json'))['head_sha'])") + echo "head_sha=$HEAD_SHA" >> $GITHUB_OUTPUT + + # Extract base SHA for comparison + BASE_SHA=$(python3 -c "import json; print(json.load(open('analysis.json'))['base_sha'])") + echo "base_sha=$BASE_SHA" >> $GITHUB_OUTPUT + + # Extract incremental flag + IS_INCREMENTAL=$(python3 -c "import json; print(str(json.load(open('analysis.json'))['is_incremental']).lower())") + echo "is_incremental=$IS_INCREMENTAL" >> $GITHUB_OUTPUT + + # Check if sync is required + SYNC_REQUIRED=$(python3 -c "import json; print(str(json.load(open('sync_plan.json'))['sync_required']).lower())") + echo "sync_required=$SYNC_REQUIRED" >> $GITHUB_OUTPUT + + - name: Checkout PR branch + if: steps.extract-artifacts.outputs.sync_required == 'true' + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + fetch-depth: 0 + ref: ${{ steps.extract-artifacts.outputs.head_sha }} # Checkout PR's head commit to access new files + + - name: Check if translation branch exists + if: steps.extract-artifacts.outputs.sync_required == 'true' + id: check-branch + run: | + PR_NUMBER="${{ steps.extract-artifacts.outputs.pr_number }}" + SYNC_BRANCH="docs-sync-pr-${PR_NUMBER}" + + # Check if translation branch exists on remote (after repo checkout) + if git ls-remote --exit-code --heads origin "$SYNC_BRANCH" >/dev/null 2>&1; then + echo "✅ Translation branch exists: $SYNC_BRANCH" + echo "branch_exists=true" >> $GITHUB_OUTPUT + else + echo "🆕 Translation branch does not exist yet" + echo "branch_exists=false" >> $GITHUB_OUTPUT + fi + + - name: Skip if translation PR already exists + if: steps.extract-artifacts.outputs.sync_required == 'true' && steps.check-branch.outputs.branch_exists == 'true' + run: | + PR_NUMBER="${{ steps.extract-artifacts.outputs.pr_number }}" + echo "ℹ️ Translation PR already exists for PR #${PR_NUMBER}" + echo "The 'Update Translation PR' workflow will handle incremental updates." + echo "Skipping execution to prevent duplicate commits." 
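+          # exit 0 only ends this step; the remaining steps are skipped through their
+          # `branch_exists != 'true'` conditions rather than by stopping the job here.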
+ exit 0 + + - name: Set up Python + if: steps.extract-artifacts.outputs.sync_required == 'true' && steps.check-branch.outputs.branch_exists != 'true' + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + if: steps.extract-artifacts.outputs.sync_required == 'true' && steps.check-branch.outputs.branch_exists != 'true' + run: | + cd tools/translate + pip install httpx aiofiles python-dotenv + + - name: Check for manual approval requirement + if: steps.extract-artifacts.outputs.sync_required == 'true' && steps.check-branch.outputs.branch_exists != 'true' && steps.check-source.outputs.is_fork == 'true' + id: check-approval + uses: actions/github-script@v7 + with: + script: | + const prNumber = ${{ steps.extract-artifacts.outputs.pr_number }}; + + // Get PR details + const pr = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber + }); + + const author = pr.data.user.login; + const authorAssociation = pr.data.author_association; + + // Check if author is trusted + const trustedAssociations = ['OWNER', 'MEMBER', 'COLLABORATOR']; + const trustedContributors = process.env.TRUSTED_CONTRIBUTORS?.split(',') || []; + + const isTrusted = trustedAssociations.includes(authorAssociation) || + trustedContributors.includes(author); + + if (!isTrusted) { + // Check for approval from maintainer + const reviews = await github.rest.pulls.listReviews({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber + }); + + const hasApproval = reviews.data.some(review => + review.state === 'APPROVED' && + trustedAssociations.includes(review.author_association) + ); + + if (!hasApproval) { + console.log('PR requires manual approval from a maintainer'); + core.setOutput('needs_approval', 'true'); + + // Comment on PR + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '⏸️ **Documentation sync is pending approval**\n\n' + + 'This PR requires approval from a maintainer before automatic synchronization can proceed.\n\n' + + 'Once approved, the documentation will be automatically translated and synchronized.' + }); + + return; + } + } + + core.setOutput('needs_approval', 'false'); + + - name: Run translation and commit + if: steps.extract-artifacts.outputs.sync_required == 'true' && steps.check-branch.outputs.branch_exists != 'true' && steps.check-approval.outputs.needs_approval != 'true' + id: translate + env: + DIFY_API_KEY: ${{ secrets.DIFY_API_KEY }} + GH_TOKEN: ${{ github.token }} + run: | + echo "Running translation workflow..." 
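+          # translate_pr.py is expected to emit its normal logs followed by a
+          # "RESULT_JSON:" marker, a JSON object (success, has_changes,
+          # translation_pr_number, translation_pr_url, created, translation_results)
+          # and a "========" terminator line; the parsing below relies on that shape.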
+ + WORK_DIR="${{ steps.extract-artifacts.outputs.work_dir }}" + PR_NUMBER="${{ steps.extract-artifacts.outputs.pr_number }}" + HEAD_SHA="${{ steps.extract-artifacts.outputs.head_sha }}" + BASE_SHA="${{ steps.extract-artifacts.outputs.base_sha }}" + PR_TITLE=$(gh pr view ${PR_NUMBER} --json title --jq '.title' 2>/dev/null || echo "Documentation changes") + IS_INCREMENTAL="${{ steps.extract-artifacts.outputs.is_incremental }}" + + echo "PR: #${PR_NUMBER}" + echo "Comparison: ${BASE_SHA:0:8}...${HEAD_SHA:0:8}" + echo "Incremental: ${IS_INCREMENTAL}" + + # Call the Python script to handle translation + cd tools/translate + python translate_pr.py \ + --pr-number "$PR_NUMBER" \ + --head-sha "$HEAD_SHA" \ + --base-sha "$BASE_SHA" \ + --pr-title "$PR_TITLE" \ + --work-dir "$WORK_DIR" \ + ${IS_INCREMENTAL:+--is-incremental} \ + 2>&1 | tee /tmp/translation_output.log + + SCRIPT_EXIT_CODE=${PIPESTATUS[0]} + + # Extract JSON result from output + RESULT_JSON=$(grep -A 1000 "RESULT_JSON:" /tmp/translation_output.log | tail -n +2 | grep -B 1000 "^========" | head -n -1) + + if [ -n "$RESULT_JSON" ]; then + echo "$RESULT_JSON" > /tmp/translation_result.json + + # Parse key fields for workflow outputs + SUCCESS=$(echo "$RESULT_JSON" | jq -r '.success') + HAS_CHANGES=$(echo "$RESULT_JSON" | jq -r '.has_changes // false') + TRANSLATION_PR_NUMBER=$(echo "$RESULT_JSON" | jq -r '.translation_pr_number // ""') + TRANSLATION_PR_URL=$(echo "$RESULT_JSON" | jq -r '.translation_pr_url // ""') + PR_CREATED=$(echo "$RESULT_JSON" | jq -r '.created // false') + + # Set outputs for subsequent steps + echo "has_changes=$HAS_CHANGES" >> $GITHUB_OUTPUT + echo "translation_pr_number=$TRANSLATION_PR_NUMBER" >> $GITHUB_OUTPUT + echo "translation_pr_url=$TRANSLATION_PR_URL" >> $GITHUB_OUTPUT + echo "creation_successful=$([ -n "$TRANSLATION_PR_NUMBER" ] && echo true || echo false)" >> $GITHUB_OUTPUT + + # Extract translation results for comment + echo "$RESULT_JSON" | jq -r '.translation_results' > /tmp/sync_results.json 2>/dev/null || echo '{"translated":[],"failed":[],"skipped":[]}' > /tmp/sync_results.json + + echo "✅ Translation workflow completed successfully" + else + echo "❌ Could not parse result JSON" + echo "has_changes=false" >> $GITHUB_OUTPUT + echo "creation_successful=false" >> $GITHUB_OUTPUT + exit 1 + fi + + exit $SCRIPT_EXIT_CODE + + + - name: Comment on original PR with translation PR link + if: steps.extract-artifacts.outputs.sync_required == 'true' && steps.check-branch.outputs.branch_exists != 'true' && steps.check-approval.outputs.needs_approval != 'true' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const prNumber = ${{ steps.extract-artifacts.outputs.pr_number }}; + const hasChanges = '${{ steps.translate.outputs.has_changes }}' === 'true'; + const translationPrNumber = '${{ steps.translate.outputs.translation_pr_number }}'; + const translationPrUrl = '${{ steps.translate.outputs.translation_pr_url }}'; + const creationSuccessful = '${{ steps.translate.outputs.creation_successful }}' === 'true'; + const branchExists = '${{ steps.check-branch.outputs.branch_exists }}' === 'true'; + const headSha = '${{ steps.extract-artifacts.outputs.head_sha }}'; + + let comment = '## 🌐 Multi-language Sync\n\n'; + + if (hasChanges && creationSuccessful && translationPrNumber) { + // Load sync results if available + let results = { translated: [], failed: [], skipped: [] }; + try { + results = JSON.parse(fs.readFileSync('/tmp/sync_results.json', 'utf8')); + } catch (e) { + results = { 
translated: [], failed: [], skipped: [] }; + } + + if (branchExists) { + comment += `✅ Synced to PR [#${translationPrNumber}](${translationPrUrl || `https://github.com/${{ github.repository }}/pull/${translationPrNumber}`})\n\n`; + } else { + comment += `✅ Created sync PR [#${translationPrNumber}](${translationPrUrl || `https://github.com/${{ github.repository }}/pull/${translationPrNumber}`})\n\n`; + } + + if (results.translated && results.translated.length > 0) { + comment += `**Synced ${results.translated.length} file${results.translated.length > 1 ? 's' : ''}** to cn + jp\n\n`; + } + + if (results.failed && results.failed.length > 0) { + comment += `⚠️ **${results.failed.length} file${results.failed.length > 1 ? 's' : ''} failed:**\n`; + results.failed.slice(0, 3).forEach(file => { + comment += `- \`${file}\`\n`; + }); + if (results.failed.length > 3) { + comment += `- ... and ${results.failed.length - 3} more\n`; + } + comment += '\n'; + } + + comment += '_Both PRs can merge independently. Future commits here will auto-update the sync PR._'; + + } else if (hasChanges && !creationSuccessful) { + comment += '⚠️ **Sync PR creation failed**\n\nCheck workflow logs or contact a maintainer.'; + + } else { + comment += '✅ **No sync needed** - translations are up to date.'; + } + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: comment + }); + + - name: Comment on translation PR with original PR link + if: steps.translate.outputs.creation_successful == 'true' && steps.translate.outputs.translation_pr_number && steps.check-branch.outputs.branch_exists == 'false' + uses: actions/github-script@v7 + continue-on-error: true + with: + script: | + const prNumber = ${{ steps.extract-artifacts.outputs.pr_number }}; + const translationPrNumber = ${{ steps.translate.outputs.translation_pr_number }}; + + const backLinkComment = `🔗 Auto-synced from PR #${prNumber}\n\n` + + `Updates to #${prNumber} will automatically update this PR. 
Both can merge independently.`; + + try { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: translationPrNumber, + body: backLinkComment + }); + console.log(`Successfully linked translation PR #${translationPrNumber} to original PR #${prNumber}`); + } catch (error) { + console.log(`Could not comment on translation PR #${translationPrNumber}:`, error.message); + } + + handle-cancellation: + runs-on: ubuntu-latest + needs: execute-sync + if: always() && needs.execute-sync.result == 'cancelled' + steps: + - name: Notify about cancelled workflow + uses: actions/github-script@v7 + continue-on-error: true + with: + script: | + console.log('⚠️ Execute workflow was cancelled - likely due to newer commit'); + + // Try to get PR number from workflow run artifacts + const workflowRunId = context.payload.workflow_run.id; + const headBranch = context.payload.workflow_run.head_branch; + + try { + // List artifacts from the analyze workflow + const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: workflowRunId + }); + + // Find analysis artifact + const analysisArtifact = artifacts.data.artifacts.find(a => + a.name.startsWith('docs-sync-analysis-') + ); + + if (!analysisArtifact) { + console.log('No analysis artifact found - cannot determine PR number'); + return; + } + + // Extract PR number from artifact name (format: docs-sync-analysis-PR_NUMBER) + const prNumber = analysisArtifact.name.split('-').pop(); + + console.log(`Found PR #${prNumber} for cancelled workflow`); + + // Get repository info for workflow dispatch link + const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`; + const workflowDispatchUrl = `${repoUrl}/actions/workflows/sync_docs_execute.yml`; + + const comment = '## ⚠️ Sync Skipped\n\n' + + 'This commit was not synced because a newer commit arrived. 
**Your latest commit will be synced automatically.**\n\n' + + '**If you need this specific commit synced:**\n' + + `Go to [Actions → Execute Documentation Sync](${workflowDispatchUrl}) and manually run with PR number **${prNumber}**\n\n` + + '_When you push multiple commits quickly, only the first and last get synced to avoid backlog._'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: parseInt(prNumber), + body: comment + }); + + console.log(`✅ Posted cancellation notice to PR #${prNumber}`); + + } catch (error) { + console.log(`Failed to notify PR: ${error.message}`); + } + + handle-failure: + runs-on: ubuntu-latest + if: github.event.workflow_run.conclusion == 'failure' + steps: + - name: Report analysis failure + uses: actions/github-script@v7 + with: + script: | + // Try to extract PR number from workflow run + const workflowRun = context.payload.workflow_run; + + console.log('Analysis workflow failed'); + console.log('Attempting to notify PR if possible...'); + + // This is a best-effort attempt to notify + // In practice, you might want to store PR number differently \ No newline at end of file diff --git a/.github/workflows/sync_docs_update.yml b/.github/workflows/sync_docs_update.yml new file mode 100644 index 000000000..72d9759a5 --- /dev/null +++ b/.github/workflows/sync_docs_update.yml @@ -0,0 +1,382 @@ +# Workflow for updating translation PRs on sync events +# Triggered by Analyze workflow for security validation +name: Update Translation PR + +on: + workflow_run: + workflows: ["Analyze Documentation Changes"] + types: [completed] + branches-ignore: + - 'docs-sync-pr-*' + +permissions: + contents: write + pull-requests: write + actions: read + +concurrency: + group: docs-translation-${{ github.event.workflow_run.head_branch }} + cancel-in-progress: false + +jobs: + update-translation: + runs-on: ubuntu-latest + # Only run if analyze workflow succeeded + if: github.event.workflow_run.conclusion == 'success' + steps: + - name: Download analysis artifacts + uses: actions/download-artifact@v4 + with: + name: docs-sync-analysis-${{ github.event.workflow_run.id }} + path: /tmp/analysis + github-token: ${{ secrets.GITHUB_TOKEN }} + run-id: ${{ github.event.workflow_run.id }} + + - name: Load and validate analysis + id: load-analysis + run: | + echo "Loading validated analysis from secure workflow..." + + # Check if sync_plan.json exists (created by analyze workflow) + if [ ! 
-f "/tmp/analysis/sync_plan.json" ]; then + echo "❌ No sync plan found - analyze workflow may have skipped this PR" + echo "should_proceed=false" >> $GITHUB_OUTPUT + exit 0 + fi + + # Load analysis metadata + PR_NUMBER=$(jq -r '.metadata.pr_number' /tmp/analysis/sync_plan.json) + PR_TYPE=$(jq -r '.metadata.pr_type' /tmp/analysis/sync_plan.json) + IS_INCREMENTAL=$(jq -r '.metadata.is_incremental' /tmp/analysis/sync_plan.json) + BASE_SHA=$(jq -r '.metadata.base_sha' /tmp/analysis/sync_plan.json) + HEAD_SHA=$(jq -r '.metadata.head_sha' /tmp/analysis/sync_plan.json) + + # Verify this is an English-only PR (already validated by analyze workflow) + if [ "$PR_TYPE" != "english" ]; then + echo "ℹ️ Not an English-only PR (type: $PR_TYPE) - skipping translation update" + echo "should_proceed=false" >> $GITHUB_OUTPUT + exit 0 + fi + + echo "✅ Validated analysis loaded from secure workflow" + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + echo "is_incremental=$IS_INCREMENTAL" >> $GITHUB_OUTPUT + echo "base_sha=$BASE_SHA" >> $GITHUB_OUTPUT + echo "head_sha=$HEAD_SHA" >> $GITHUB_OUTPUT + echo "should_proceed=true" >> $GITHUB_OUTPUT + + # Display summary + FILE_COUNT=$(jq -r '.metadata.file_count // 0' /tmp/analysis/sync_plan.json) + echo "📊 Analysis Summary:" + echo " - PR: #$PR_NUMBER" + echo " - Type: $PR_TYPE" + echo " - Files: $FILE_COUNT" + echo " - Incremental: $IS_INCREMENTAL" + echo " - Range: ${BASE_SHA:0:8}...${HEAD_SHA:0:8}" + + - name: Checkout PR + if: steps.load-analysis.outputs.should_proceed == 'true' + uses: actions/checkout@v4 + with: + ref: ${{ steps.load-analysis.outputs.head_sha }} + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + if: steps.load-analysis.outputs.should_proceed == 'true' + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Find associated translation PR + if: steps.load-analysis.outputs.should_proceed == 'true' + id: find-translation-pr + env: + GH_TOKEN: ${{ github.token }} + run: | + PR_NUMBER="${{ steps.load-analysis.outputs.pr_number }}" + echo "Looking for translation PR associated with PR #${PR_NUMBER}..." 
+ + # Search for translation PR by branch name pattern + TRANSLATION_PR_DATA=$(gh pr list \ + --search "head:docs-sync-pr-${PR_NUMBER}" \ + --json number,title,url,state \ + --jq '.[0] // empty' 2>/dev/null || echo "") + + if [ -n "$TRANSLATION_PR_DATA" ] && [ "$TRANSLATION_PR_DATA" != "null" ]; then + TRANSLATION_PR_NUMBER=$(echo "$TRANSLATION_PR_DATA" | jq -r '.number') + TRANSLATION_PR_STATE=$(echo "$TRANSLATION_PR_DATA" | jq -r '.state') + TRANSLATION_PR_URL=$(echo "$TRANSLATION_PR_DATA" | jq -r '.url') + + if [ "$TRANSLATION_PR_STATE" = "OPEN" ]; then + echo "✅ Found active translation PR #${TRANSLATION_PR_NUMBER}" + echo "translation_pr_number=$TRANSLATION_PR_NUMBER" >> $GITHUB_OUTPUT + echo "translation_pr_url=$TRANSLATION_PR_URL" >> $GITHUB_OUTPUT + echo "found_translation_pr=true" >> $GITHUB_OUTPUT + else + echo "ℹ️ Found translation PR #${TRANSLATION_PR_NUMBER} but it's ${TRANSLATION_PR_STATE} - skipping update" + echo "found_translation_pr=false" >> $GITHUB_OUTPUT + fi + else + echo "ℹ️ No translation PR found for PR #${PR_NUMBER} - this might be the first update" + echo "found_translation_pr=false" >> $GITHUB_OUTPUT + fi + + - name: Determine update range + if: steps.find-translation-pr.outputs.found_translation_pr == 'true' + id: update-range + env: + GH_TOKEN: ${{ github.token }} + run: | + PR_NUMBER="${{ steps.load-analysis.outputs.pr_number }}" + HEAD_SHA="${{ steps.load-analysis.outputs.head_sha }}" + BASE_SHA="${{ steps.load-analysis.outputs.base_sha }}" + + echo "Determining incremental update range..." + + # Get last processed commit from translation branch commit messages + SYNC_BRANCH="docs-sync-pr-${PR_NUMBER}" + git fetch origin "$SYNC_BRANCH" 2>/dev/null || true + + LAST_PROCESSED=$(git log "origin/$SYNC_BRANCH" --format=%B -1 \ + | grep -oP 'Last-Processed-Commit: \K[a-f0-9]+' \ + | head -1 || echo "") + + if [ -n "$LAST_PROCESSED" ]; then + echo "✅ Found last processed commit: $LAST_PROCESSED" + COMPARE_BASE="$LAST_PROCESSED" + else + echo "⚠️ No last processed commit found, using analysis base SHA" + COMPARE_BASE="$BASE_SHA" + fi + + COMPARE_HEAD="$HEAD_SHA" + + echo "compare_base=$COMPARE_BASE" >> $GITHUB_OUTPUT + echo "compare_head=$COMPARE_HEAD" >> $GITHUB_OUTPUT + + echo "📊 Incremental update range: $COMPARE_BASE...$COMPARE_HEAD" + + - name: Install dependencies + if: steps.find-translation-pr.outputs.found_translation_pr == 'true' + run: | + cd tools/translate + pip install httpx aiofiles python-dotenv + + - name: Run translation and commit + if: steps.find-translation-pr.outputs.found_translation_pr == 'true' + id: update-translations + env: + DIFY_API_KEY: ${{ secrets.DIFY_API_KEY }} + GH_TOKEN: ${{ github.token }} + run: | + echo "Running incremental translation update with validated inputs..." 
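+          # Incremental range: the base is the Last-Processed-Commit trailer recorded
+          # on the sync branch's latest commit (falling back to the analysis base SHA),
+          # and the head is the analyzed head SHA, so only newly pushed English
+          # changes are re-translated.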
+ + PR_NUMBER="${{ steps.load-analysis.outputs.pr_number }}" + HEAD_SHA="${{ steps.update-range.outputs.compare_head }}" + BASE_SHA="${{ steps.update-range.outputs.compare_base }}" + + # Get PR title from workflow run event + PR_TITLE="${{ github.event.workflow_run.pull_requests[0].title }}" + + echo "PR: #${PR_NUMBER}" + echo "Comparison: ${BASE_SHA:0:8}...${HEAD_SHA:0:8}" + echo "Using validated sync plan from analyze workflow" + + # Call the Python script for incremental translation + cd tools/translate + python translate_pr.py \ + --pr-number "$PR_NUMBER" \ + --head-sha "$HEAD_SHA" \ + --base-sha "$BASE_SHA" \ + --pr-title "$PR_TITLE" \ + --is-incremental \ + 2>&1 | tee /tmp/translation_output.log + + SCRIPT_EXIT_CODE=${PIPESTATUS[0]} + + # Extract JSON result + RESULT_JSON=$(grep -A 1000 "RESULT_JSON:" /tmp/translation_output.log | tail -n +2 | grep -B 1000 "^========" | head -n -1) + + if [ -n "$RESULT_JSON" ]; then + echo "$RESULT_JSON" > /tmp/translation_result.json + + # Parse outputs + SUCCESS=$(echo "$RESULT_JSON" | jq -r '.success') + HAS_CHANGES=$(echo "$RESULT_JSON" | jq -r '.has_changes // false') + + echo "has_changes=$HAS_CHANGES" >> $GITHUB_OUTPUT + echo "commit_successful=$([ "$SUCCESS" = "true" ] && echo true || echo false)" >> $GITHUB_OUTPUT + + # Extract translation results + echo "$RESULT_JSON" | jq -r '.translation_results' > /tmp/update_results.json 2>/dev/null || echo '{"translated":[],"failed":[],"skipped":[]}' > /tmp/update_results.json + + echo "✅ Translation update completed" + else + echo "❌ Could not parse result JSON" + echo "has_changes=false" >> $GITHUB_OUTPUT + echo "commit_successful=false" >> $GITHUB_OUTPUT + exit 1 + fi + + exit $SCRIPT_EXIT_CODE + + - name: Comment on original PR about update + if: steps.update-translations.outputs.has_changes == 'true' && steps.update-translations.outputs.commit_successful == 'true' + uses: actions/github-script@v7 + continue-on-error: true + with: + script: | + const fs = require('fs'); + const prNumber = ${{ steps.load-analysis.outputs.pr_number }}; + const translationPrNumber = '${{ steps.find-translation-pr.outputs.translation_pr_number }}'; + const translationPrUrl = '${{ steps.find-translation-pr.outputs.translation_pr_url }}'; + + // Load update results + let results = { translated: [], failed: [], skipped: [] }; + try { + results = JSON.parse(fs.readFileSync('/tmp/update_results.json', 'utf8')); + } catch (e) { + console.log('Could not load update results'); + } + + let comment = `## 🌐 Multi-language Sync\n\n`; + comment += `✅ Updated sync PR [#${translationPrNumber}](${translationPrUrl})\n\n`; + + if (results.translated && results.translated.length > 0) { + comment += `**Synced ${results.translated.length} file${results.translated.length > 1 ? 's' : ''}** to cn + jp\n\n`; + } + + if (results.failed && results.failed.length > 0) { + comment += `⚠️ **${results.failed.length} file${results.failed.length > 1 ? 
's' : ''} failed**\n\n`; + } + + comment += `_Future commits will auto-update the sync PR._`; + + try { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: comment + }); + } catch (error) { + console.log('Could not comment on original PR:', error.message); + } + + - name: Comment on translation PR about update + if: steps.update-translations.outputs.has_changes == 'true' && steps.update-translations.outputs.commit_successful == 'true' + uses: actions/github-script@v7 + continue-on-error: true + with: + script: | + const fs = require('fs'); + const prNumber = ${{ steps.load-analysis.outputs.pr_number }}; + const translationPrNumber = '${{ steps.find-translation-pr.outputs.translation_pr_number }}'; + + // Load update results + let results = { translated: [], failed: [], skipped: [] }; + try { + results = JSON.parse(fs.readFileSync('/tmp/update_results.json', 'utf8')); + } catch (e) { + console.log('Could not load update results'); + } + + const fileCount = results.translated ? results.translated.length : 0; + const updateComment = `✅ Synced ${fileCount} file${fileCount !== 1 ? 's' : ''} from PR #${prNumber}` + + (results.failed && results.failed.length > 0 ? `\n\n⚠️ ${results.failed.length} file${results.failed.length !== 1 ? 's' : ''} failed` : ''); + + try { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: translationPrNumber, + body: updateComment + }); + } catch (error) { + console.log('Could not comment on translation PR:', error.message); + } + + - name: Handle no updates needed + if: steps.find-translation-pr.outputs.found_translation_pr == 'true' && steps.update-translations.outputs.has_changes != 'true' + uses: actions/github-script@v7 + continue-on-error: true + with: + script: | + const prNumber = ${{ steps.load-analysis.outputs.pr_number }}; + const translationPrNumber = '${{ steps.find-translation-pr.outputs.translation_pr_number }}'; + + const comment = `✅ Sync PR [#${translationPrNumber}](https://github.com/${{ github.repository }}/pull/${translationPrNumber}) is already up to date.`; + + try { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: comment + }); + } catch (error) { + console.log('Could not comment on original PR:', error.message); + } + + handle-cancellation: + runs-on: ubuntu-latest + needs: update-translation + if: always() && needs.update-translation.result == 'cancelled' + steps: + - name: Notify about cancelled workflow + uses: actions/github-script@v7 + continue-on-error: true + with: + script: | + console.log('⚠️ Update workflow was cancelled - likely due to newer commit'); + + // Try to get PR number from workflow run artifacts + const workflowRunId = context.payload.workflow_run.id; + const headBranch = context.payload.workflow_run.head_branch; + + try { + // List artifacts from the analyze workflow + const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: workflowRunId + }); + + // Find analysis artifact + const analysisArtifact = artifacts.data.artifacts.find(a => + a.name.startsWith('docs-sync-analysis-') + ); + + if (!analysisArtifact) { + console.log('No analysis artifact found - cannot determine PR number'); + return; + } + + // Extract PR number from artifact name + const prNumber = analysisArtifact.name.split('-').pop(); + + 
console.log(`Found PR #${prNumber} for cancelled workflow`); + + // Get repository info for workflow dispatch link + const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`; + const workflowDispatchUrl = `${repoUrl}/actions/workflows/sync_docs_execute.yml`; + + const comment = '## ⚠️ Sync Update Skipped\n\n' + + 'This commit was not synced because a newer commit arrived. **Your latest commit will be synced automatically.**\n\n' + + '**If you need this specific commit synced:**\n' + + `Go to [Actions → Execute Documentation Sync](${workflowDispatchUrl}) and manually run with PR number **${prNumber}**\n\n` + + '_When you push multiple commits quickly, only the first and last get synced to avoid backlog._'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: parseInt(prNumber), + body: comment + }); + + console.log(`✅ Posted cancellation notice to PR #${prNumber}`); + + } catch (error) { + console.log(`Failed to notify PR: ${error.message}`); + } diff --git a/tools/apply_docs_json.py b/tools/apply_docs_json.py index 9bb858c0f..0ea4d6da2 100644 --- a/tools/apply_docs_json.py +++ b/tools/apply_docs_json.py @@ -1,546 +1,367 @@ import json import os -import re -from collections import defaultdict from pathlib import Path +from collections import defaultdict # --- Script Base Paths --- SCRIPT_DIR = Path(__file__).resolve().parent BASE_DIR = SCRIPT_DIR.parent # --- Configuration --- -refresh = False # Flag to control whether to clear existing tabs before processing -DOCS_JSON_PATH = BASE_DIR / "docs.json" # Path to the main documentation structure JSON file - -# --- Language Configurations --- -# IMPORTANT: The string values for LANGUAGE_CODE, TARGET_TAB_NAME, and content within -# PWX_TO_GROUP_MAP and DESIRED_GROUP_ORDER are i18n-specific and MUST NOT be translated. 
- -# --- MODIFICATION START for FILENAME_PATTERN and FILE_EXTENSION_SUFFIX --- -DEV_ZH = { - "DOCS_DIR_RELATIVE": "plugin-dev-zh", "LANGUAGE_CODE": "简体中文", "FILE_EXTENSION_SUFFIX": "", # MODIFIED: No longer a distinct suffix in filename base - "TARGET_TAB_NAME": "插件开发", "FILENAME_PATTERN": re.compile(r"^(\d{4})-(.*?)\.mdx$"), # MODIFIED: Pattern no longer expects before .mdx - "PWX_TO_GROUP_MAP": { - ("0", "1", "1"): ("插件开发", "概念与入门", "概览"), ("0", "1", "3"): ("插件开发", "概念与入门", None), - ("0", "2", "1"): ("插件开发", "开发实践", "快速开始"),("0", "2", "2"): ("插件开发", "开发实践", "开发 Dify 插件"), - ("0", "3", "1"): ("插件开发", "贡献与发布", "行为准则与规范"),("0", "3", "2"): ("插件开发", "贡献与发布", "发布与上架"),("0", "3", "3"): ("插件开发", "贡献与发布", "常见问题解答"), - ("0", "4", "3"): ("插件开发", "实践案例与示例", "开发示例"), - ("9", "2", "2"): ("插件开发", "高级开发", "Extension 与 Agent"),("9", "2", "3"): ("插件开发", "高级开发", "Extension 与 Agent"),("9", "4", "3"): ("插件开发", "高级开发", "Extension 与 Agent"),("9", "2", "4"): ("插件开发", "高级开发", "反向调用"), - ("0", "4", "1"): ("插件开发", "Reference & Specifications", "核心规范与功能"), +refresh = False # Flag to control whether to clear existing dropdowns before processing +DOCS_JSON_PATH = BASE_DIR / "docs.json" # Path to the main documentation structure JSON file + +# --- Sync Configurations --- +# Define which dropdowns to sync between languages for each version +# Skip "Develop" dropdown as requested +SYNC_CONFIGS = [ + { + "VERSION_CODE": "Latest", + "BASE_PATHS": { + "en": "en", + "cn": "cn", + "ja": "jp" + }, + "DROPDOWNS_TO_SYNC": [ + { + "en": {"name": "Documentation", "path": "documentation"}, + "cn": {"name": "文档", "path": "documentation"}, + "ja": {"name": "ドキュメント", "path": "documentation"} + }, + { + "en": {"name": "Self Hosting", "path": "self-hosting"}, + "cn": {"name": "自托管", "path": "self-hosting"}, + "ja": {"name": "セルフホスティング", "path": "self-hosting"} + }, + { + "en": {"name": "API Reference", "path": "api-reference", "type": "openapi"}, + "cn": {"name": "访问 API", "path": "", "type": "openapi"}, + "ja": {"name": "APIアクセス", "path": "", "type": "openapi"} + } + ] }, - "DESIRED_GROUP_ORDER": ["概念与入门", "开发实践", "贡献与发布", "实践案例与示例", "高级开发", "Reference & Specifications"], -} -DEV_EN = { - "DOCS_DIR_RELATIVE": "plugin-dev-en", "LANGUAGE_CODE": "English", "FILE_EXTENSION_SUFFIX": "", # MODIFIED - "TARGET_TAB_NAME": "Plugin Development", "FILENAME_PATTERN": re.compile(r"^(\d{4})-(.*?)\.mdx$"), # MODIFIED - "PWX_TO_GROUP_MAP": { - ("0", "1", "1"): ("Plugin Development", "Concepts & Getting Started", "Overview"),("0", "1", "3"): ("Plugin Development", "Concepts & Getting Started", None), - ("0", "2", "1"): ("Plugin Development", "Development Practices", "Quick Start"),("0", "2", "2"): ("Plugin Development", "Development Practices", "Developing Dify Plugins"), - ("0", "3", "1"): ("Plugin Development", "Contribution & Publishing", "Code of Conduct & Standards"),("0", "3", "2"): ("Plugin Development", "Contribution & Publishing", "Publishing & Listing"),("0", "3", "3"): ("Plugin Development", "Contribution & Publishing", "FAQ"), - ("0", "4", "3"): ("Plugin Development", "Examples & Use Cases", "Development Examples"), - ("9", "2", "2"): ("Plugin Development", "Advanced Development", "Extension & Agent"),("9", "2", "3"): ("Plugin Development", "Advanced Development", "Extension & Agent"),("9", "4", "3"): ("Plugin Development", "Advanced Development", "Extension & Agent"),("9", "2", "4"): ("Plugin Development", "Advanced Development", "Reverse Calling"), - ("0", "4", "1"): ("Plugin Development", "Reference & Specifications", "Core 
Specifications & Features"), + { + "VERSION_CODE": "3.3.x (Enterprise)", + "BASE_PATHS": { + "en": "versions/3-3-x/en", + "cn": "versions/3-3-x/cn", + "ja": "versions/3-3-x/jp" + }, + "DROPDOWNS_TO_SYNC": [] # Add dropdowns for this version if needed }, - "DESIRED_GROUP_ORDER": ["Concepts & Getting Started", "Development Practices", "Contribution & Publishing", "Examples & Use Cases", "Advanced Development", "Reference & Specifications"], -} -DEV_JA = { - "DOCS_DIR_RELATIVE": "plugin-dev-ja", "LANGUAGE_CODE": "日本語", "FILE_EXTENSION_SUFFIX": "", # MODIFIED - "TARGET_TAB_NAME": "プラグイン開発", "FILENAME_PATTERN": re.compile(r"^(\d{4})-(.*?)\.mdx$"), # MODIFIED - "PWX_TO_GROUP_MAP": { - ("0", "1", "1"): ("プラグイン開発", "概念と概要", "概要"),("0", "1", "3"): ("プラグイン開発", "概念と概要", None), - ("0", "2", "1"): ("プラグイン開発", "開発実践", "クイックスタート"),("0", "2", "2"): ("プラグイン開発", "開発実践", "Difyプラグインの開発"), - ("0", "3", "1"): ("プラグイン開発", "貢献と公開", "行動規範と基準"),("0", "3", "2"): ("プラグイン開発", "貢献と公開", "公開と掲載"),("0", "3", "3"): ("プラグイン開発", "貢献と公開", "よくある質問 (FAQ)"), - ("0", "4", "3"): ("プラグイン開発", "実践例とユースケース", "開発例"), - ("9", "2", "2"): ("プラグイン開発", "高度な開発", "Extension と Agent"),("9", "2", "3"): ("プラグイン開発", "高度な開発", "Extension と Agent"),("9", "4", "3"): ("プラグイン開発", "高度な開発", "Extension と Agent"),("9", "2", "4"): ("プラグイン開発", "高度な開発", "リバースコール"), - ("0", "4", "1"): ("プラグイン開発", "リファレンスと仕様", "コア仕様と機能"), + { + "VERSION_CODE": "3.2.x (Enterprise)", + "BASE_PATHS": { + "en": "versions/3-2-x/en", + "cn": "versions/3-2-x/cn", + "ja": "versions/3-2-x/jp" + }, + "DROPDOWNS_TO_SYNC": [] # Add dropdowns for this version if needed }, - "DESIRED_GROUP_ORDER": ["概念と概要", "開発実践", "貢献と公開", "実践例とユースケース", "高度な開発", "リファレンスと仕様"], -} -# --- MODIFICATION END for FILENAME_PATTERN and FILE_EXTENSION_SUFFIX --- - + { + "VERSION_CODE": "3.0.x (Enterprise)", + "BASE_PATHS": { + "en": "versions/3-0-x/en", + "cn": "versions/3-0-x/cn", + "ja": "versions/3-0-x/jp" + }, + "DROPDOWNS_TO_SYNC": [] # Add dropdowns for this version if needed + } +] # --- Helper Functions --- -# Defines log issue types considered critical enough to be included in the commit message summary. -CRITICAL_ISSUE_TYPES = {"Error", "Critical", "ConfigError", "SeriousWarning", "InternalError"} # Added InternalError from process_single_config +CRITICAL_ISSUE_TYPES = {"Error", "Critical", "ConfigError", "SeriousWarning", "InternalError"} -def _log_issue(reports_list_for_commit_message: list, lang_code: str, issue_type: str, message: str, details: str = ""): +def _log_issue(reports_list_for_commit_message: list, context: str, issue_type: str, message: str, details: str = ""): """ Logs a detailed message to the console and adds a concise version to a list for commit messages if the issue_type is critical. - - Args: - reports_list_for_commit_message: List to accumulate messages for the commit summary. - lang_code: Language code or identifier for the context of the log (e.g., "简体中文", "GLOBAL"). - issue_type: Type of the issue (e.g., "Info", "Warning", "Error", "Critical"). - message: The main message of the log. - details: Optional additional details for the log. 
""" - full_log_message = f"[{issue_type.upper()}] Lang '{lang_code}': {message}" + full_log_message = f"[{issue_type.upper()}] {context}: {message}" if details: full_log_message += f" Details: {details}" - print(full_log_message) + print(full_log_message) if issue_type in CRITICAL_ISSUE_TYPES: - commit_msg_part = f"- Lang '{lang_code}': [{issue_type}] {message}" + commit_msg_part = f"- {context}: [{issue_type}] {message}" reports_list_for_commit_message.append(commit_msg_part) -def clear_tabs_if_refresh(navigation_data: dict, version_code: str, target_tab_name: str, do_refresh: bool, commit_message_reports_list: list) -> bool: - if not do_refresh: - return False - if not navigation_data or "versions" not in navigation_data: - _log_issue(commit_message_reports_list, version_code, "Warning", "'navigation.versions' not found, cannot clear tabs.") - return False - - version_found, tab_cleared = False, False - for version_nav in navigation_data.get("versions", []): - if version_nav.get("version") == version_code: - version_found = True - target_tab = next((t for t in version_nav.get("tabs", []) if isinstance(t, dict) and t.get("tab") == target_tab_name), None) - if target_tab: - target_tab["groups"] = [] - _log_issue(commit_message_reports_list, version_code, "Info", f"Cleared groups for Tab '{target_tab_name}'.") - tab_cleared = True - else: - _log_issue(commit_message_reports_list, version_code, "Info", f"Tab '{target_tab_name}' not found to clear groups (will be created if needed).") - break - if not version_found: - _log_issue(commit_message_reports_list, version_code, "Warning", f"Version '{version_code}' not found, cannot clear any Tab.") - return tab_cleared - -def get_page_path_from_filename(filename: str, docs_dir_name: str) -> str: - """ - Constructs the documentation page path from its filename and directory name. - Example: - Old: "0001-intro.mdx", "plugin-dev-en" -> "plugin-dev-en/0001-intro.en" - New: "0001-intro.mdx", "plugin-dev-en" -> "plugin-dev-en/0001-intro" - - Args: - filename: The .mdx filename (e.g., "0001-intro.mdx"). - docs_dir_name: The relative directory name for this set of docs (e.g., "plugin-dev-en"). - - Returns: - The page path string used in docs.json. - - Raises: - ValueError: If the filename does not end with ".mdx". 
- """ - if not filename.endswith(".mdx"): - raise ValueError(f"Internal Error: Filename '{filename}' received by get_page_path_from_filename does not end with '.mdx'.") - base_filename = filename[:-len(".mdx")] - return f"{docs_dir_name}/{base_filename}" - - -def extract_existing_pages(navigation_data: dict, version_code: str, target_tab_name: str, commit_message_reports_list: list): - existing_pages = set() - target_version_nav, target_tab_nav = None, None - - if not navigation_data or "versions" not in navigation_data: - return existing_pages, None, None - - target_version_nav = next((v for v in navigation_data.get("versions", []) if v.get("version") == version_code), None) - if not target_version_nav: - return existing_pages, None, None - - if "tabs" in target_version_nav and isinstance(target_version_nav["tabs"], list): - target_tab_nav = next((t for t in target_version_nav["tabs"] if isinstance(t,dict) and t.get("tab") == target_tab_name), None) - if target_tab_nav: - for group in target_tab_nav.get("groups", []): - if isinstance(group, dict): - _recursive_extract(group, existing_pages) - - return existing_pages, target_version_nav, target_tab_nav - -def _recursive_extract(group_item: dict, pages_set: set): - if not isinstance(group_item, dict): return - for page in group_item.get("pages", []): - if isinstance(page, str): - pages_set.add(page) - elif isinstance(page, dict) and "group" in page: - _recursive_extract(page, pages_set) - - -def remove_obsolete_pages(target_tab_data: dict, pages_to_remove: set, commit_message_reports_list: list, lang_code: str): - if not isinstance(target_tab_data, dict) or "groups" not in target_tab_data or not isinstance(target_tab_data.get("groups"), list): - _log_issue(commit_message_reports_list, lang_code, "Warning", "Attempted to remove obsolete pages from invalid target_tab_data structure.", f"Tab data: {target_tab_data}") - return - - groups = target_tab_data["groups"] - i = 0 - while i < len(groups): - group_item = groups[i] - if isinstance(group_item, dict): - _remove_obsolete_from_group(group_item, pages_to_remove, commit_message_reports_list, lang_code) - if not group_item.get("pages"): - _log_issue(commit_message_reports_list, lang_code, "Info", f"Group '{group_item.get('group', 'Unknown')}' emptied after removing obsolete pages; structure retained.") - i += 1 - else: - _log_issue(commit_message_reports_list, lang_code, "Warning", f"Encountered non-dict item in groups list of Tab '{target_tab_data.get('tab','Unknown')}' during obsolete page removal. Item: {group_item}") - i += 1 - -def _remove_obsolete_from_group(group_dict: dict, pages_to_remove: set, commit_message_reports_list: list, lang_code: str): - if not isinstance(group_dict, dict) or "pages" not in group_dict or not isinstance(group_dict.get("pages"), list): - group_name_for_log_err = group_dict.get('group', 'Unnamed Group with structural issue') if isinstance(group_dict, dict) else 'Non-dict item' - _log_issue(commit_message_reports_list, lang_code, "Warning", f"Group '{group_name_for_log_err}' has invalid 'pages' structure; cannot remove obsolete pages from it. 
Structure: {group_dict}") - return - - new_pages = [] - group_name_for_log = group_dict.get('group', 'Unknown') - for page_item in group_dict["pages"]: - if isinstance(page_item, str): - if page_item not in pages_to_remove: - new_pages.append(page_item) - else: - _log_issue(commit_message_reports_list, lang_code, "Info", f"Removed obsolete page '{page_item}' from Group '{group_name_for_log}'.") - elif isinstance(page_item, dict) and "group" in page_item: - _remove_obsolete_from_group(page_item, pages_to_remove, commit_message_reports_list, lang_code) - if page_item.get("pages"): - new_pages.append(page_item) - else: - _log_issue(commit_message_reports_list, lang_code, "Info", f"Nested group '{page_item.get('group', 'Unknown')}' in Group '{group_name_for_log}' emptied; structure retained.") - new_pages.append(page_item) - else: - _log_issue(commit_message_reports_list, lang_code, "Warning", f"Encountered unexpected item type in 'pages' list of Group '{group_name_for_log}'. Preserving item: {page_item}") - new_pages.append(page_item) - group_dict["pages"] = new_pages - - -def find_or_create_target_group(target_version_nav: dict, tab_name: str, group_name: str, nested_group_name: str | None, commit_message_reports_list: list, lang_code: str) -> list: - target_version_nav.setdefault("tabs", []) - if not isinstance(target_version_nav["tabs"], list): - _log_issue(commit_message_reports_list, lang_code, "Critical", f"Internal state error: version.tabs is not a list for version '{target_version_nav.get('version')}'. Attempting to recover by creating a new list.") - target_version_nav["tabs"] = [] - - target_tab = next((t for t in target_version_nav["tabs"] if isinstance(t,dict) and t.get("tab") == tab_name), None) - if not target_tab: - target_tab = {"tab": tab_name, "groups": []} - target_version_nav["tabs"].append(target_tab) - _log_issue(commit_message_reports_list, lang_code, "Info", f"Created new Tab '{tab_name}'.") - - target_tab.setdefault("groups", []) - if not isinstance(target_tab["groups"], list): - _log_issue(commit_message_reports_list, lang_code, "Critical", f"Internal state error: tab.groups is not a list for Tab '{tab_name}'. Attempting to recover.") - target_tab["groups"] = [] - - target_group = next((g for g in target_tab["groups"] if isinstance(g,dict) and g.get("group") == group_name), None) - if not target_group: - target_group = {"group": group_name, "pages": []} - target_tab["groups"].append(target_group) - _log_issue(commit_message_reports_list, lang_code, "Info", f"Created new Group '{group_name}' in Tab '{tab_name}'.") - - target_group.setdefault("pages", []) - if not isinstance(target_group["pages"], list): - _log_issue(commit_message_reports_list, lang_code, "Critical", f"Internal state error: group.pages is not a list for Group '{group_name}'. 
Attempting to recover.") - target_group["pages"] = [] - - container_for_pages = target_group["pages"] - - if nested_group_name: - nested_group = next((item for item in target_group["pages"] if isinstance(item, dict) and item.get("group") == nested_group_name), None) - if not nested_group: - nested_group = {"group": nested_group_name, "pages": []} - target_group["pages"].append(nested_group) - _log_issue(commit_message_reports_list, lang_code, "Info", f"Created new Nested Group '{nested_group_name}' in Group '{group_name}'.") - - nested_group.setdefault("pages", []) - if not isinstance(nested_group["pages"], list): - _log_issue(commit_message_reports_list, lang_code, "Critical", f"Internal state error: nested_group.pages is not a list for Nested Group '{nested_group_name}'. Attempting to recover.") - nested_group["pages"] = [] - container_for_pages = nested_group["pages"] - - return container_for_pages - -def get_group_sort_key(group_dict: dict, desired_order_list: list) -> int: - group_name = group_dict.get("group", "") - try: - return desired_order_list.index(group_name) - except ValueError: - return len(desired_order_list) - -# --- Main Logic --- -def process_single_config(docs_config: dict, navigation_data: dict, commit_message_reports_list: list): - lang_code = docs_config["LANGUAGE_CODE"] - docs_dir_relative = docs_config["DOCS_DIR_RELATIVE"] - docs_dir_abs = BASE_DIR / docs_dir_relative - pwx_map = docs_config["PWX_TO_GROUP_MAP"] - filename_pattern = docs_config["FILENAME_PATTERN"] - target_tab_name = docs_config["TARGET_TAB_NAME"] - desired_group_order = docs_config["DESIRED_GROUP_ORDER"] - # FILE_EXTENSION_SUFFIX is in docs_config but no longer directly used in this function's logic - # for deriving page paths, as get_page_path_from_filename handles the new simpler .mdx ending. - - _log_issue(commit_message_reports_list, lang_code, "Info", f"Processing Tab '{target_tab_name}'. Docs dir: '{docs_dir_abs}'") - - clear_tabs_if_refresh(navigation_data, lang_code, target_tab_name, refresh, commit_message_reports_list) - - existing_pages, target_version_nav, target_tab_nav = extract_existing_pages(navigation_data, lang_code, target_tab_name, commit_message_reports_list) - - if target_version_nav is None: - _log_issue(commit_message_reports_list, lang_code, "Info", f"Version '{lang_code}' not found in docs.json, creating it.") - navigation_data.setdefault("versions", []) - if not isinstance(navigation_data["versions"], list): - _log_issue(commit_message_reports_list, lang_code, "Critical", "Top-level 'navigation.versions' is not a list. Re-initializing.") - navigation_data["versions"] = [] - target_version_nav = {"version": lang_code, "tabs": []} - navigation_data["versions"].append(target_version_nav) - existing_pages = set() - target_tab_nav = None - - if target_tab_nav is None: - _log_issue(commit_message_reports_list, lang_code, "Info", f"Tab '{target_tab_name}' not found in version '{lang_code}'. It will be created if pages are added to it.") - existing_pages = set() - target_version_nav.setdefault("tabs", []) - if not isinstance(target_version_nav["tabs"], list): - _log_issue(commit_message_reports_list, lang_code, "Critical", f"Version '{lang_code}' 'tabs' attribute is not a list. 
Re-initializing.") - target_version_nav["tabs"] = [] - - _log_issue(commit_message_reports_list, lang_code, "Info", f"{len(existing_pages)} existing pages found in docs.json for Tab '{target_tab_name}'.") - - filesystem_pages_map = {} - valid_filenames_for_processing = [] - - if not docs_dir_abs.is_dir(): - _log_issue(commit_message_reports_list, lang_code, "Error", f"Documentation directory '{docs_dir_abs}' not found. Skipping file processing for this configuration.") - return - - for filename in os.listdir(docs_dir_abs): - if not filename.endswith(".mdx"): - continue - - match = filename_pattern.match(filename) # MODIFIED: use match result directly - if match: # MODIFIED: check if match is not None - try: - page_path = get_page_path_from_filename(filename, docs_dir_relative) - filesystem_pages_map[filename] = page_path - valid_filenames_for_processing.append(filename) - except ValueError as e: - _log_issue(commit_message_reports_list, lang_code, "Error", f"Error generating page path for '{filename}': {e}. Skipping this file.") - else: - _log_issue(commit_message_reports_list, lang_code, "SeriousWarning", f"File '{filename}' in '{docs_dir_relative}' is .mdx but does not match FILENAME_PATTERN. Skipping this file.") - - filesystem_page_paths_set = set(filesystem_pages_map.values()) - _log_issue(commit_message_reports_list, lang_code, "Info", f"{len(filesystem_page_paths_set)} valid .mdx files matching pattern found in '{docs_dir_relative}'.") - - new_page_paths = filesystem_page_paths_set - existing_pages - removed_page_paths = existing_pages - filesystem_page_paths_set - - if new_page_paths: - _log_issue(commit_message_reports_list, lang_code, "Info", f"{len(new_page_paths)} new page(s) to add to Tab '{target_tab_name}'.") - if removed_page_paths: - _log_issue(commit_message_reports_list, lang_code, "Info", f"{len(removed_page_paths)} obsolete page(s) to remove from Tab '{target_tab_name}'.") - - _current_tab_for_removal = next((t for t in target_version_nav.get("tabs", []) if isinstance(t, dict) and t.get("tab") == target_tab_name), None) - if removed_page_paths and _current_tab_for_removal: - remove_obsolete_pages(_current_tab_for_removal, removed_page_paths, commit_message_reports_list, lang_code) - elif removed_page_paths: - _log_issue(commit_message_reports_list, lang_code, "Warning", f"Obsolete pages detected for Tab '{target_tab_name}', but the tab was not found in the current version structure. Removal skipped.") - - if new_page_paths: - files_to_add_sorted = sorted([fn for fn, pp in filesystem_pages_map.items() if pp in new_page_paths]) - - for filename in files_to_add_sorted: - match_for_add = filename_pattern.match(filename) # Re-match, or reuse 'match' if it was stored from earlier loop. Re-matching is safer. - if not match_for_add: - _log_issue(commit_message_reports_list, lang_code, "InternalError", f"File '{filename}' was marked for addition but failed pattern match. Skipping.") - continue - - pwxy_str = match_for_add.group(1) - page_path = filesystem_pages_map[filename] - - if len(pwxy_str) < 3: # This check for P, W, X assumes they are single digits from filename. - # If FILENAME_PATTERN's group(1) captures more/less, this needs adjustment. - # Current pattern (\d{4}) captures 4 digits for PWXY. - _log_issue(commit_message_reports_list, lang_code, "Error", f"File '{filename}' has an invalid PWXY prefix '{pwxy_str}' (too short, expected 3+). Skipping this file.") - continue - - # Assuming PWXY is the first 4 digits, P, W, X are the first, second, third digits. 
- # The original code used pwxy_str[0], pwxy_str[1], pwxy_str[2] which implies PWX from the *first three* chars of the prefix. - # If the filename is 0123-title.mdx, and pwxy_str is "0123" (from (\d{4})), then: - # P = "0", W = "1", X = "2". (Y = "3" is not used for map key) - p, w, x = pwxy_str[0], pwxy_str[1], pwxy_str[2] - group_key = (p, w, x) - - if group_key in pwx_map: - map_val = pwx_map[group_key] - if not (isinstance(map_val, tuple) and (len(map_val) == 2 or len(map_val) == 3)): - _log_issue(commit_message_reports_list, lang_code, "ConfigError", f"PWX_TO_GROUP_MAP entry for key {group_key} has invalid format: {map_val}. Expected tuple of 2 or 3 strings. Skipping file '{filename}'.") - continue - - _tab_name_in_map, group_name_from_map = map_val[0], map_val[1] - nested_group_name_from_map = map_val[2] if len(map_val) == 3 else None - - if _tab_name_in_map != target_tab_name: - _log_issue(commit_message_reports_list, lang_code, "Warning", f"File '{filename}' (PWX key {group_key}) maps to Tab '{_tab_name_in_map}' in PWX_TO_GROUP_MAP, but current processing is for Tab '{target_tab_name}'. Page will be added to '{target_tab_name}' under group '{group_name_from_map}'.") - - target_pages_container_list = find_or_create_target_group( - target_version_nav, target_tab_name, group_name_from_map, nested_group_name_from_map, - commit_message_reports_list, lang_code - ) - if page_path not in target_pages_container_list: - target_pages_container_list.append(page_path) - _log_issue(commit_message_reports_list, lang_code, "Info", f"Added page '{page_path}' to Group '{group_name_from_map}' (Nested: {nested_group_name_from_map or 'No'}).") - else: - _log_issue(commit_message_reports_list, lang_code, "Info", f"Page '{page_path}' already exists in Group '{group_name_from_map}' (Nested: {nested_group_name_from_map or 'No'}). Skipping addition.") - else: - _log_issue(commit_message_reports_list, lang_code, "SeriousWarning", f"File '{filename}' (PWX prefix ({p},{w},{x})) has no corresponding entry in PWX_TO_GROUP_MAP. Skipping this file.") - - final_target_tab_nav = next((t for t in target_version_nav.get("tabs", []) if isinstance(t, dict) and t.get("tab") == target_tab_name), None) - - if final_target_tab_nav and "groups" in final_target_tab_nav and isinstance(final_target_tab_nav["groups"], list): - if final_target_tab_nav["groups"]: - final_target_tab_nav["groups"].sort(key=lambda g: get_group_sort_key(g, desired_group_order)) - _log_issue(commit_message_reports_list, lang_code, "Info", f"Sorted groups in Tab '{target_tab_name}'.") - else: - _log_issue(commit_message_reports_list, lang_code, "Info", f"No groups to sort in Tab '{target_tab_name}' (tab is empty or contains no group structures).") - elif final_target_tab_nav: - _log_issue(commit_message_reports_list, lang_code, "Warning", f"Tab '{target_tab_name}' exists but has no valid 'groups' list to sort.") - else: - _log_issue(commit_message_reports_list, lang_code, "Info", f"Tab '{target_tab_name}' does not exist in the final structure; no sorting needed.") - - -def load_docs_data_robust(path: Path, commit_message_reports_list: list, lang_for_report: str = "GLOBAL") -> dict: +def load_docs_data_robust(path: Path, commit_message_reports_list: list) -> dict: + """Load docs.json with error handling""" default_structure = {"navigation": {"versions": []}} try: if not path.exists(): - _log_issue(commit_message_reports_list, lang_for_report, "Info", f"File '{path}' not found. 
Initializing with a new default structure.") + _log_issue(commit_message_reports_list, "GLOBAL", "Info", f"File '{path}' not found. Initializing with default structure.") return default_structure with open(path, "r", encoding="utf-8") as f: data = json.load(f) if not isinstance(data, dict) or \ "navigation" not in data or not isinstance(data["navigation"], dict) or \ "versions" not in data["navigation"] or not isinstance(data["navigation"]["versions"], list): - _log_issue(commit_message_reports_list, lang_for_report, "Error", f"File '{path}' has an invalid root structure. Key 'navigation.versions' (as a list) is missing or malformed. Using default structure.") + _log_issue(commit_message_reports_list, "GLOBAL", "Error", f"Invalid structure in '{path}'. Using default.") return default_structure return data except json.JSONDecodeError as e: - _log_issue(commit_message_reports_list, lang_for_report, "Error", f"Failed to parse JSON from '{path}': {e}. Using default structure.") + _log_issue(commit_message_reports_list, "GLOBAL", "Error", f"Failed to parse JSON: {e}") return default_structure - except Exception as e: - _log_issue(commit_message_reports_list, lang_for_report, "Critical", f"Unexpected error loading file '{path}': {e}. Using default structure.") + except Exception as e: + _log_issue(commit_message_reports_list, "GLOBAL", "Critical", f"Unexpected error: {e}") return default_structure -def save_docs_data_robust(path: Path, data: dict, commit_message_reports_list: list, lang_for_report: str = "GLOBAL") -> bool: + +def save_docs_data_robust(path: Path, data: dict, commit_message_reports_list: list) -> bool: + """Save docs.json with error handling""" try: with open(path, "w", encoding="utf-8") as f: - json.dump(data, f, ensure_ascii=False, indent=4) - _log_issue(commit_message_reports_list, lang_for_report, "Info", f"Successfully saved updates to '{path}'.") + json.dump(data, f, ensure_ascii=False, indent=2) + _log_issue(commit_message_reports_list, "GLOBAL", "Info", f"Successfully saved to '{path}'.") return True except Exception as e: - _log_issue(commit_message_reports_list, lang_for_report, "Critical", f"Failed to save updates to '{path}': {e}.") + _log_issue(commit_message_reports_list, "GLOBAL", "Critical", f"Failed to save: {e}") return False -def validate_config(config: dict, config_name: str, commit_message_reports_list: list) -> bool: - is_valid = True - required_keys = [ - "DOCS_DIR_RELATIVE", "LANGUAGE_CODE", "FILE_EXTENSION_SUFFIX", # FILE_EXTENSION_SUFFIX still checked for presence - "TARGET_TAB_NAME", "FILENAME_PATTERN", "PWX_TO_GROUP_MAP", "DESIRED_GROUP_ORDER" - ] - for key in required_keys: - if key not in config: - _log_issue(commit_message_reports_list, config_name, "ConfigError", f"Configuration is missing required key '{key}'.") - is_valid = False - - if not is_valid: - _log_issue(commit_message_reports_list, config_name, "Info", f"Skipping configuration '{config_name}' due to missing required keys.") - return False - if not (isinstance(config["DOCS_DIR_RELATIVE"], str) and config["DOCS_DIR_RELATIVE"]): - _log_issue(commit_message_reports_list, config_name, "ConfigError", f"Key 'DOCS_DIR_RELATIVE' must be a non-empty string. Found: '{config.get('DOCS_DIR_RELATIVE')}'.") - is_valid = False - if not isinstance(config["FILENAME_PATTERN"], re.Pattern): - _log_issue(commit_message_reports_list, config_name, "ConfigError", f"Key 'FILENAME_PATTERN' must be a compiled regular expression (re.Pattern). 
Found type: {type(config.get('FILENAME_PATTERN'))}.") - is_valid = False - if not (isinstance(config["PWX_TO_GROUP_MAP"], dict) and config["PWX_TO_GROUP_MAP"]): - _log_issue(commit_message_reports_list, config_name, "ConfigError", f"Key 'PWX_TO_GROUP_MAP' must be a non-empty dictionary. Found: '{config.get('PWX_TO_GROUP_MAP')}'.") - is_valid = False - if not isinstance(config["DESIRED_GROUP_ORDER"], list): - _log_issue(commit_message_reports_list, config_name, "ConfigError", f"Key 'DESIRED_GROUP_ORDER' must be a list. Found type: {type(config.get('DESIRED_GROUP_ORDER'))}.") - is_valid = False - - # Validate FILE_EXTENSION_SUFFIX can be an empty string now - if "FILE_EXTENSION_SUFFIX" in config and not isinstance(config["FILE_EXTENSION_SUFFIX"], str): - _log_issue(commit_message_reports_list, config_name, "ConfigError", f"Key 'FILE_EXTENSION_SUFFIX' must be a string (can be empty). Found type: {type(config.get('FILE_EXTENSION_SUFFIX'))}.") - is_valid = False - - - if not is_valid: - _log_issue(commit_message_reports_list, config_name, "Info", f"Skipping configuration '{config_name}' due to type or content errors in its definition.") - return is_valid - - -def process_all_configs(configs_to_process: list[dict], docs_json_path: Path) -> list[str]: - commit_message_reports = [] - - docs_data = load_docs_data_robust(docs_json_path, commit_message_reports) - - navigation_data_to_modify = docs_data.setdefault("navigation", {}) - if not isinstance(navigation_data_to_modify, dict): - _log_issue(commit_message_reports, "GLOBAL", "Critical", "'navigation' key in docs.json is not a dictionary. Resetting to default structure.") - docs_data["navigation"] = {"versions": []} - navigation_data_to_modify = docs_data["navigation"] - - navigation_data_to_modify.setdefault("versions", []) - if not isinstance(navigation_data_to_modify.get("versions"), list): - _log_issue(commit_message_reports, "GLOBAL", "Error", "'navigation.versions' in docs.json was not a list. Resetting it to an empty list.") - navigation_data_to_modify["versions"] = [] - - processed_any_config_successfully = False - for i, config_item in enumerate(configs_to_process): - config_id = config_item.get("LANGUAGE_CODE", f"UnnamedConfig_{i+1}") - - _log_issue(commit_message_reports, config_id, "Info", f"Starting validation for configuration '{config_id}'.") - if validate_config(config_item, config_id, commit_message_reports): - _log_issue(commit_message_reports, config_id, "Info", f"Configuration '{config_id}' validated successfully. Starting processing.") - try: - process_single_config(config_item, navigation_data_to_modify, commit_message_reports) - processed_any_config_successfully = True - except Exception as e: - _log_issue(commit_message_reports, config_id, "Critical", f"Unhandled exception during processing of configuration '{config_id}': {e}.") - import traceback - tb_str = traceback.format_exc() - print(f"TRACEBACK for configuration '{config_id}':\n{tb_str}") - else: - _log_issue(commit_message_reports, config_id, "Info", f"Configuration '{config_id}' failed validation. 
Skipping processing.") - - - if processed_any_config_successfully: - _log_issue(commit_message_reports, "GLOBAL", "Info", "Attempting to save changes to docs.json.") - save_docs_data_robust(docs_json_path, docs_data, commit_message_reports) - elif not configs_to_process: - _log_issue(commit_message_reports, "GLOBAL", "Info", "No configurations were provided to process.") - else: - _log_issue(commit_message_reports, "GLOBAL", "Info", "No valid configurations were processed successfully. docs.json will not be modified.") - - return commit_message_reports +def find_or_create_version(navigation_data: dict, version_code: str, commit_reports: list) -> dict: + """Find or create a version in the navigation structure""" + navigation_data.setdefault("versions", []) + + for version in navigation_data["versions"]: + if version.get("version") == version_code: + return version + + # Create new version + new_version = {"version": version_code, "languages": []} + navigation_data["versions"].append(new_version) + _log_issue(commit_reports, version_code, "Info", f"Created new version '{version_code}'") + return new_version + + +def find_or_create_language(version_data: dict, lang_code: str, commit_reports: list) -> dict: + """Find or create a language in the version structure""" + version_data.setdefault("languages", []) + + for language in version_data["languages"]: + if language.get("language") == lang_code: + return language + + # Create new language + new_language = {"language": lang_code, "dropdowns": []} + version_data["languages"].append(new_language) + _log_issue(commit_reports, f"{version_data.get('version')}/{lang_code}", "Info", f"Created new language '{lang_code}'") + return new_language + + +def find_or_create_dropdown(language_data: dict, dropdown_name: str, commit_reports: list) -> dict: + """Find or create a dropdown in the language structure""" + language_data.setdefault("dropdowns", []) + + for dropdown in language_data["dropdowns"]: + if dropdown.get("dropdown") == dropdown_name: + return dropdown + + # Create new dropdown + new_dropdown = {"dropdown": dropdown_name} + language_data["dropdowns"].append(new_dropdown) + context = f"{language_data.get('language')}/{dropdown_name}" + _log_issue(commit_reports, context, "Info", f"Created new dropdown '{dropdown_name}'") + return new_dropdown + + +def extract_pages_from_structure(item, visited=None): + """Recursively extract all page paths from a dropdown/group structure""" + if visited is None: + visited = set() + + # Avoid infinite recursion by tracking visited items + item_id = id(item) + if item_id in visited: + return set() + visited.add(item_id) + + pages = set() + + if isinstance(item, str): + pages.add(item) + elif isinstance(item, dict): + # Handle 'pages' list + if "pages" in item and isinstance(item["pages"], list): + for page in item["pages"]: + pages.update(extract_pages_from_structure(page, visited)) + # Handle 'groups' list + if "groups" in item and isinstance(item["groups"], list): + for group in item["groups"]: + pages.update(extract_pages_from_structure(group, visited)) + elif isinstance(item, list): + for sub_item in item: + pages.update(extract_pages_from_structure(sub_item, visited)) + + return pages + + +def discover_files_in_directory(base_path: Path, dropdown_path: str) -> set: + """Discover all .mdx files in a directory and return their relative paths""" + files = set() + full_path = base_path / dropdown_path if dropdown_path else base_path + + if not full_path.exists(): + return files + + for mdx_file in 
full_path.rglob("*.mdx"): + # Get relative path from base directory + rel_path = mdx_file.relative_to(BASE_DIR) + # Remove .mdx extension for the page path + page_path = str(rel_path)[:-4] + files.add(page_path) + + return files + + +def sync_dropdown_between_languages( + version_config: dict, + dropdown_config: dict, + navigation_data: dict, + commit_reports: list +): + """Sync a specific dropdown between languages for a version""" + version_code = version_config["VERSION_CODE"] + base_paths = version_config["BASE_PATHS"] + + # Get English dropdown structure as source of truth + version_nav = find_or_create_version(navigation_data, version_code, commit_reports) + en_lang = find_or_create_language(version_nav, "en", commit_reports) + en_dropdown_name = dropdown_config["en"]["name"] + en_dropdown = None + + for dropdown in en_lang.get("dropdowns", []): + if dropdown.get("dropdown") == en_dropdown_name: + en_dropdown = dropdown + break + + if not en_dropdown: + _log_issue(commit_reports, f"{version_code}/en", "Warning", + f"English dropdown '{en_dropdown_name}' not found, skipping sync") + return + + # Extract pages from English structure + en_pages = extract_pages_from_structure(en_dropdown) + + # Skip if this is an OpenAPI type (handled differently) + if dropdown_config["en"].get("type") == "openapi": + _log_issue(commit_reports, f"{version_code}", "Info", + f"Skipping OpenAPI dropdown '{en_dropdown_name}'") + return + + # Sync to other languages + for lang_code in ["cn", "ja"]: + if lang_code not in dropdown_config: + continue + + lang_config = dropdown_config[lang_code] + lang_dropdown_name = lang_config["name"] + + # Find or create language and dropdown + lang_nav = find_or_create_language(version_nav, lang_code, commit_reports) + lang_dropdown = find_or_create_dropdown(lang_nav, lang_dropdown_name, commit_reports) + + # For now, copy the entire structure from English and adjust paths + # This ensures the navigation structure matches + copy_dropdown_structure(en_dropdown, lang_dropdown, "en", lang_code, base_paths) + + _log_issue(commit_reports, f"{version_code}/{lang_code}/{lang_dropdown_name}", + "Info", f"Synced dropdown structure from English") + + +def copy_dropdown_structure(source_dropdown: dict, target_dropdown: dict, + source_lang: str, target_lang: str, + base_paths: dict): + """Copy the structure from source dropdown to target, adjusting paths""" + + def adjust_path(path: str) -> str: + """Adjust a page path from source language to target language""" + # Replace source language path with target language path + if path.startswith(f"{source_lang}/"): + return path.replace(f"{source_lang}/", f"{base_paths[target_lang]}/", 1) + elif path.startswith(base_paths[source_lang]): + return path.replace(base_paths[source_lang], base_paths[target_lang], 1) + return path + + def copy_structure(source_item): + """Recursively copy and adjust structure""" + if isinstance(source_item, str): + return adjust_path(source_item) + elif isinstance(source_item, dict): + result = {} + for key, value in source_item.items(): + if key in ["pages", "groups"]: + result[key] = [copy_structure(item) for item in value] + elif key == "group" or key == "dropdown" or key == "tab": + result[key] = value # Keep group names as is + elif key == "icon": + result[key] = value # Keep icons + else: + result[key] = copy_structure(value) + return result + elif isinstance(source_item, list): + return [copy_structure(item) for item in source_item] + return source_item + + # Copy all keys from source to target, adjusting 
paths + for key, value in source_dropdown.items(): + if key == "dropdown": + continue # Keep target dropdown name + target_dropdown[key] = copy_structure(value) + + +def process_all_configs(configs: list, docs_json_path: Path) -> list[str]: + """Process all sync configurations""" + commit_reports = [] + + # Load existing docs.json + docs_data = load_docs_data_robust(docs_json_path, commit_reports) + navigation_data = docs_data.setdefault("navigation", {}) + + # Process each version configuration + for version_config in configs: + version_code = version_config["VERSION_CODE"] + _log_issue(commit_reports, version_code, "Info", f"Processing version '{version_code}'") + + # Skip if no dropdowns to sync + if not version_config.get("DROPDOWNS_TO_SYNC"): + _log_issue(commit_reports, version_code, "Info", "No dropdowns configured for sync") + continue + + # Sync each configured dropdown + for dropdown_config in version_config["DROPDOWNS_TO_SYNC"]: + sync_dropdown_between_languages( + version_config, + dropdown_config, + navigation_data, + commit_reports + ) + + # Save updated docs.json + save_docs_data_robust(docs_json_path, docs_data, commit_reports) + + return commit_reports + def main_apply_docs_json() -> str: + """Main function to sync documentation structure""" print(f"Script base directory: {BASE_DIR}") print(f"Docs JSON path: {DOCS_JSON_PATH}") - print(f"Refresh mode: {refresh}") - - CONFIGS_TO_PROCESS = [ - DEV_ZH, - DEV_EN, - DEV_JA, - ] - - commit_message_parts = process_all_configs(CONFIGS_TO_PROCESS, DOCS_JSON_PATH) - - if not commit_message_parts: - return "success" + print(f"Refresh mode: {refresh}") + + commit_message_parts = process_all_configs(SYNC_CONFIGS, DOCS_JSON_PATH) + + if not commit_message_parts: + return "Documentation sync completed successfully" else: - num_critical_issues = len(commit_message_parts) - commit_summary_line = f"docs.json processed with {num_critical_issues} critical issue(s) reported." - - max_lines_for_commit_detail = 10 - if len(commit_message_parts) > max_lines_for_commit_detail: - detailed_issues_str = "\n".join(commit_message_parts[:max_lines_for_commit_detail]) + \ - f"\n... and {len(commit_message_parts) - max_lines_for_commit_detail} more critical issues (see full console logs for details)." 
- else: - detailed_issues_str = "\n".join(commit_message_parts) - - return f"{commit_summary_line}\n\nDetails of critical issues:\n{detailed_issues_str}" + num_critical_issues = len([p for p in commit_message_parts if any(t in p for t in CRITICAL_ISSUE_TYPES)]) + if num_critical_issues > 0: + return f"Documentation sync completed with {num_critical_issues} critical issue(s)" + return "Documentation sync completed with warnings" if __name__ == "__main__": diff --git a/tools/contributing_in_page.py b/tools/contributing_in_page.py index b4cd5e546..6727f14d7 100644 --- a/tools/contributing_in_page.py +++ b/tools/contributing_in_page.py @@ -193,8 +193,56 @@ def loop(dict): ) +def main_remove_help_cip(): + help_docs = { + "zh_help": { + "target_dir_relative": "cn", + }, + "en_help": { + "target_dir_relative": "en", + }, + "ja_help": { + "target_dir_relative": "jp", + }, + # "zh_plugin_dev": { + # "target_dir_relative": "plugin-dev-zh", + # }, + # "en_plugin_dev": { + # "target_dir_relative": "plugin-dev-en", + # }, + # "ja_plugin_dev": { + # "target_dir_relative": "plugin-dev-ja", + # }, + } + try: + for config_name, config_data in help_docs.items(): + remove_contributing_section(config_data["target_dir_relative"]) + return "Successfully removed CIP from help documentation" + except Exception as e: + return f"Error: {str(e)}" + + def main_contributing_in_page(): process = { + # # Help Documentation + # "zh_help": { + # "target_dir_relative": "cn", + # "repo_owner": "langgenius", + # "repo_name": "dify-docs", + # "language": "zh", + # }, + # "en_help": { + # "target_dir_relative": "en", + # "repo_owner": "langgenius", + # "repo_name": "dify-docs", + # "language": "en", + # }, + # "ja_help": { + # "target_dir_relative": "jp", + # "repo_owner": "langgenius", + # "repo_name": "dify-docs", + # "language": "ja", + # }, # Plugin Development "zh_plugin_dev": { "target_dir_relative": "plugin-dev-zh", @@ -222,6 +270,7 @@ def main_contributing_in_page(): return (f"{str(e)}") if __name__ == "__main__": + result_message = main_remove_help_cip() result_message = main_contributing_in_page() print("\n--- Script Execution Result ---") print(result_message) diff --git a/tools/translate/.env.example b/tools/translate/.env.example new file mode 100644 index 000000000..3e36dd65b --- /dev/null +++ b/tools/translate/.env.example @@ -0,0 +1 @@ +dify_api_key=your_dify_api_key_here \ No newline at end of file diff --git a/tools/translate/.gitignore b/tools/translate/.gitignore new file mode 100644 index 000000000..183c5a975 --- /dev/null +++ b/tools/translate/.gitignore @@ -0,0 +1,2 @@ +.env +test_*.py diff --git a/tools/translate/README.md b/tools/translate/README.md new file mode 100644 index 000000000..d2cb02ccb --- /dev/null +++ b/tools/translate/README.md @@ -0,0 +1,193 @@ +# Automatic Document Translation + +Multi-language document auto-translation system based on GitHub Actions and Dify AI, supporting English, Chinese, and Japanese. + +## How It Works + +### Workflow Triggers + +1. **Execute Workflow** (New PRs): + - Triggers when PR is opened with `.md/.mdx` changes in `en/` directory + - Creates translation PR with fresh translations for all changed files + - Translation PR tracks the source PR + +2. 
**Update Workflow** (Incremental Changes): + - Triggers on new commits to source PR + - Updates existing translation PR with incremental changes + - **Context-aware translation**: Uses existing translation + git diff for modified files + - **Surgical reconciliation**: Detects and applies move/rename operations + +### Translation Operations + +- ✅ **New files**: Fresh translation to all target languages +- ✅ **Modified files**: Context-aware update using existing translation + git diff +- ✅ **Deleted files**: Removed from all language sections + physical files +- ✅ **Moved files**: Detected via `group_path` changes, applied with index-based navigation +- ✅ **Renamed files**: Detected when deleted+added in same location, preserves file extensions + +### Surgical Reconciliation + +Automatically detects structural changes in `docs.json`: + +- **Move detection**: Same file, different `group_path` → moves cn/jp files to same nested location using index-based navigation +- **Rename detection**: File deleted+added in same location → renames cn/jp files with extension preserved +- **Index-based navigation**: Groups matched by position, not name (works across translations: "Nodes" ≠ "节点") + +## System Features + +- 🌐 **Multi-language Support**: Configuration-based language mapping (`config.json`) +- 📚 **Terminology Consistency**: Built-in professional terminology database (`termbase_i18n.md`) +- 🔄 **Incremental Updates**: Context-aware translation using git diff for modified files +- 🎯 **Surgical Reconciliation**: Automatic detection and application of move/rename operations +- 🛡️ **Fault Tolerance**: Retry mechanism with exponential backoff +- ⚡ **Efficient Processing**: Only processes changed files since last commit + +## Language Directories + +- **General docs**: `en/` (source) → `cn/`, `jp/` (targets) +- **Plugin dev docs**: `plugin-dev-en/` → `plugin-dev-zh/`, `plugin-dev-ja/` +- **Versioned docs**: `versions/{version}/en-us/` → `versions/{version}/zh-cn/`, `versions/{version}/jp/` + +Configuration in `tools/translate/config.json`. + +## Usage + +### For Document Writers + +1. Create branch from main +2. Add/modify/delete files in `en/` directory +3. Update `docs.json` if adding/removing/moving/renaming files +4. Push to branch → workflow creates translation PR automatically +5. Make additional changes → workflow updates translation PR incrementally +6. Review and merge translation PR + +### Testing Moves & Renames + +**Move**: Edit `docs.json` to move file between groups (e.g., Getting Started → Nodes) +```json +// Before: en/test-file in "Getting Started" group +// After: en/test-file in "Nodes" group +``` + +**Rename**: Rename file + update `docs.json` entry +```bash +git mv en/old-name.md en/new-name.md +# Update docs.json: "en/old-name" → "en/new-name" +``` + +Logs will show: +``` +INFO: Detected 1 moves, 0 renames, 0 adds, 0 deletes +INFO: Moving en/test-file from 'Dropdown > GroupA' to 'Dropdown > GroupB' +SUCCESS: Moved cn/test-file to new location +SUCCESS: Moved jp/test-file to new location +``` + +## Configuration + +### Language Settings + +Edit `tools/translate/config.json`: + +```json +{ + "source_language": "en", + "target_languages": ["cn", "jp"], + "languages": { + "en": {"code": "en", "name": "English", "directory": "en"}, + "cn": { + "code": "cn", + "name": "Chinese", + "directory": "cn", + "translation_notice": "⚠️ AI translation..." + } + } +} +``` + +### Terminology Database + +Edit `tools/translate/termbase_i18n.md` to update professional terminology translations. 
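+
+Both files are read directly by the scripts in `tools/translate/` (`main.py` loads `config.json` and the termbase). As a quick illustration — a sketch for local experimentation, not part of the toolchain — the language mapping in `config.json` can be loaded to derive the target-language counterpart of an English document path:
+
+```python
+import json
+from pathlib import Path
+
+# Assumes the snippet is run from the repository root.
+CONFIG = json.loads(Path("tools/translate/config.json").read_text(encoding="utf-8"))
+
+def counterpart_path(source_path: str, target_lang: str) -> str:
+    """Map an English doc path to the corresponding target-language path."""
+    src_dir = CONFIG["languages"][CONFIG["source_language"]]["directory"]  # "en"
+    dst_dir = CONFIG["languages"][target_lang]["directory"]                # e.g. "cn"
+    return source_path.replace(f"{src_dir}/", f"{dst_dir}/", 1)
+
+# Hypothetical file name, shown only to illustrate the mapping:
+print(counterpart_path("en/getting-started/install.mdx", "cn"))
+# -> cn/getting-started/install.mdx
+```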
+ +### Translation Model + +Configure in Dify Studio - adjust prompts or change base models. + +## Local Development + +### Setup + +```bash +# Create virtual environment +python -m venv venv +source venv/bin/activate # macOS/Linux +# venv\Scripts\activate # Windows + +# Install dependencies +pip install -r tools/translate/requirements.txt + +# Configure API key +echo "DIFY_API_KEY=your_key" > tools/translate/.env +``` + +### Run Translation + +```bash +# Interactive mode +python tools/translate/main.py + +# Specify file +python tools/translate/main.py path/to/file.mdx +``` + +### Test Surgical Reconciliation + +```bash +# Test locally with git refs +cd tools/translate +python -c " +from sync_and_translate import DocsSynchronizer +import asyncio +import os + +api_key = os.getenv('DIFY_API_KEY') +sync = DocsSynchronizer(api_key) + +# Test with specific commits +logs = sync.reconcile_docs_json_structural_changes('base_sha', 'head_sha') +for log in logs: + print(log) +" +``` + +## Troubleshooting + +### Translation Issues + +- **HTTP 504**: Verify `response_mode: "streaming"` in `main.py` +- **Missing output**: Check Dify workflow has output variable `output1` +- **Failed workflow**: Review Dify workflow logs for node errors + +### Move/Rename Issues + +- **Not detected**: Check logs for "INFO: Detected X moves, Y renames" - verify `group_path` changed +- **Wrong location**: Structure mismatch between languages - verify group indices align +- **File not found**: Ensure file has .md or .mdx extension + +## Key Files + +- `config.json` - Language configuration (single source of truth) +- `termbase_i18n.md` - Translation terminology database +- `sync_and_translate.py` - Core translation + surgical reconciliation logic +- `main.py` - Local translation tool with Dify API integration +- `translate_pr.py` - PR workflow orchestration +- `.github/workflows/sync_docs_execute.yml` - Execute workflow (new PRs) +- `.github/workflows/sync_docs_update.yml` - Update workflow (incremental changes) + +## Technical Details + +- Concurrent translation limited to 2 tasks for API stability +- Supports `.md` and `.mdx` file formats +- Based on Dify API streaming mode +- Index-based navigation for language-independent group matching +- Extension detection and preservation for rename operations diff --git a/tools/translate/config.json b/tools/translate/config.json new file mode 100644 index 000000000..8ebe9a15f --- /dev/null +++ b/tools/translate/config.json @@ -0,0 +1,125 @@ +{ + "source_language": "en", + "target_languages": ["cn", "jp"], + + "processing_limits": { + "max_files_per_run": 10, + "max_openapi_files_per_run": 5 + }, + + "openapi": { + "enabled": true, + "file_patterns": ["openapi*.json"], + "directories": ["api-reference"], + "translatable_fields": ["title", "summary", "description"] + }, + + "languages": { + "en": { + "code": "en", + "name": "English", + "directory": "en" + }, + "cn": { + "code": "cn", + "name": "Chinese", + "directory": "cn", + "translation_notice": " ⚠️ 本文档由 AI 自动翻译。如有任何不准确之处,请参考[英文原版]({source_path})。\n\n" + }, + "jp": { + "code": "jp", + "name": "Japanese", + "directory": "jp", + "translation_notice": " ⚠️ このドキュメントはAIによって自動翻訳されています。不正確な部分がある場合は、[英語版]({source_path})を参照してください。\n\n" + } + }, + + "versioned_docs": { + "2-8-x": { + "en": "versions/2-8-x/en-us", + "cn": "versions/2-8-x/zh-cn", + "jp": "versions/2-8-x/jp" + }, + "3-0-x": { + "en": "versions/3-0-x/en-us", + "cn": "versions/3-0-x/zh-cn", + "jp": "versions/3-0-x/jp" + }, + "3-1-x": { + "en": "versions/3-1-x/en-us", + "cn": 
"versions/3-1-x/zh-cn", + "jp": "versions/3-1-x/jp" + } + }, + + "label_translations": { + "Getting Started": { + "cn": "快速开始", + "jp": "はじめに" + }, + "Documentation": { + "cn": "文档", + "jp": "ドキュメント" + }, + "Build": { + "cn": "构建", + "jp": "ビルド" + }, + "Debug": { + "cn": "调试", + "jp": "デバッグ" + }, + "Publish": { + "cn": "发布", + "jp": "公開" + }, + "Monitor": { + "cn": "监控", + "jp": "モニタリング" + }, + "Knowledge": { + "cn": "知识库", + "jp": "ナレッジベース" + }, + "Workspace": { + "cn": "工作区", + "jp": "ワークスペース" + }, + "Tutorials": { + "cn": "教程", + "jp": "チュートリアル" + }, + "FAQ": { + "cn": "常见问题", + "jp": "よくある質問" + }, + "Introduction": { + "cn": "介绍", + "jp": "紹介" + }, + "Quick Start": { + "cn": "快速开始", + "jp": "クイックスタート" + }, + "Key Concepts": { + "cn": "核心概念", + "jp": "主要概念" + }, + "Nodes": { + "cn": "节点", + "jp": "ノード" + }, + "Self Hosting": { + "cn": "自部署", + "jp": "セルフホスティング" + }, + "API Reference": { + "cn": "API 参考", + "jp": "API リファレンス" + }, + "Develop": { + "cn": "开发", + "jp": "開発" + } + } +} diff --git a/tools/translate/json_formatter.py b/tools/translate/json_formatter.py new file mode 100644 index 000000000..8e8e15bc8 --- /dev/null +++ b/tools/translate/json_formatter.py @@ -0,0 +1,306 @@ +""" +Format-preserving JSON serialization utilities. + +This module detects and preserves the exact formatting of existing JSON files, +allowing surgical edits without reformatting the entire file. +""" + +import json +import re +from typing import Any, Dict, Optional, Tuple +from pathlib import Path + + +class JSONFormat: + """Detected JSON formatting style""" + + def __init__(self): + self.indent_char = ' ' # ' ' or '\t' + self.indent_size = 4 # Number of indent chars per level + self.indent_pattern = 'consistent' # 'consistent' or 'mixed' + self.indent_increments = [4] # List of space counts per level + self.trailing_newline = True + self.key_spacing = True # Space after colon: "key": value vs "key":value + + def __repr__(self): + return (f"JSONFormat(char={repr(self.indent_char)}, " + f"size={self.indent_size}, pattern={self.indent_pattern}, " + f"increments={self.indent_increments})") + + +def detect_json_format(file_path: str) -> JSONFormat: + """ + Detect the formatting style of an existing JSON file. + + Analyzes indentation pattern, whitespace, and structural formatting + to enable format-preserving edits. 
+ """ + fmt = JSONFormat() + + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + lines = content.split('\n') + + # Check trailing newline + fmt.trailing_newline = content.endswith('\n') + + # Detect indent character and pattern by tracking absolute indent levels + indent_levels = {} # Maps absolute space count to frequency + + for line_num, line in enumerate(lines[:300]): # Sample first 300 lines + if not line.strip() or line.strip().startswith('//'): + continue + + # Count leading whitespace + stripped = line.lstrip(' \t') + if not stripped: + continue + + spaces = len(line) - len(stripped) + tabs = line[:spaces].count('\t') + + # Detect tab vs space + if tabs > 0: + fmt.indent_char = '\t' + indent_count = tabs + else: + indent_count = spaces + + if indent_count > 0: + indent_levels[indent_count] = indent_levels.get(indent_count, 0) + 1 + + if not indent_levels: + # Fallback to default + return fmt + + # Sort indent levels to build the actual progression + sorted_levels = sorted(indent_levels.keys()) + + # Build increment pattern from actual levels seen + increments = [] + if sorted_levels: + prev_level = 0 + for level in sorted_levels: + increment = level - prev_level + increments.append(increment) + prev_level = level + + # Check if consistent (all increments the same) + unique_increments = list(set(increments)) + + if len(unique_increments) == 1: + fmt.indent_pattern = 'consistent' + fmt.indent_size = unique_increments[0] + fmt.indent_increments = [unique_increments[0]] + else: + fmt.indent_pattern = 'mixed' + fmt.indent_increments = increments + + # Detect key spacing (": " vs ":") + colon_samples = [line for line in lines[:100] if '":' in line] + if colon_samples: + with_space = sum(1 for line in colon_samples if '": ' in line) + fmt.key_spacing = with_space > len(colon_samples) // 2 + + return fmt + + +def get_indent_for_level(fmt: JSONFormat, level: int) -> str: + """ + Get the indent string for a specific nesting level. + Handles both consistent and mixed indent patterns. + """ + if level == 0: + return '' + + if fmt.indent_pattern == 'consistent': + count = fmt.indent_size * level + else: + # For mixed patterns, sum up increments up to this level + # increments[0] is the increment from level 0 to level 1 + # increments[1] is the increment from level 1 to level 2, etc. + count = 0 + for i in range(level): + if i < len(fmt.indent_increments): + count += fmt.indent_increments[i] + else: + # If we run out of recorded increments, use the last one + count += fmt.indent_increments[-1] if fmt.indent_increments else 2 + + return fmt.indent_char * count + + +def format_preserving_json_dump(data: Any, fmt: JSONFormat, level: int = 0) -> str: + """ + Serialize JSON data while preserving the detected formatting style. + + This custom serializer respects: + - Detected indent pattern (consistent vs mixed) + - Space vs tab indentation + - Key spacing preferences + - Trailing newline conventions + + Note: level indicates the nesting depth of the current structure's opening brace. 
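+
+    Illustrative example with the default four-space, space-indented format
+    (a sketch, not output captured from the toolchain):
+
+        fmt = JSONFormat()
+        print(format_preserving_json_dump({"a": [1, 2]}, fmt))
+        # {
+        #     "a": [
+        #         1,
+        #         2
+        #     ]
+        # }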
+ """ + indent = get_indent_for_level(fmt, level) + child_indent = get_indent_for_level(fmt, level + 1) + colon = ': ' if fmt.key_spacing else ':' + + if isinstance(data, dict): + if not data: + return '{}' + + lines = ['{'] + items = list(data.items()) + + for i, (key, value) in enumerate(items): + is_last = (i == len(items) - 1) + # Serialize child values at the same structural level (they'll handle their own nesting) + serialized_value = format_preserving_json_dump(value, fmt, level + 1) + + # Check if value is multiline + if '\n' in serialized_value: + # Multiline value (object or array) - needs special handling + value_lines = serialized_value.split('\n') + comma = '' if is_last else ',' + # First line goes on same line as key + lines.append(f'{child_indent}"{key}"{colon}{value_lines[0]}') + # Remaining lines keep their indentation + for vline in value_lines[1:-1]: + lines.append(vline) + # Last line gets the comma + lines.append(value_lines[-1] + comma) + else: + # Single line value + comma = '' if is_last else ',' + lines.append(f'{child_indent}"{key}"{colon}{serialized_value}{comma}') + + lines.append(f'{indent}}}') + return '\n'.join(lines) + + elif isinstance(data, list): + if not data: + return '[]' + + lines = ['['] + + for i, item in enumerate(data): + is_last = (i == len(data) - 1) + serialized_item = format_preserving_json_dump(item, fmt, level + 1) + + # Check if item is multiline + if '\n' in serialized_item: + # Multiline item needs proper indentation + item_lines = serialized_item.split('\n') + comma = '' if is_last else ',' + # First line gets child indent + lines.append(f'{child_indent}{item_lines[0]}') + # Remaining lines keep their indentation + for iline in item_lines[1:-1]: + lines.append(iline) + # Last line gets the comma + lines.append(item_lines[-1] + comma) + else: + # Single line item + comma = '' if is_last else ',' + lines.append(f'{child_indent}{serialized_item}{comma}') + + lines.append(f'{indent}]') + return '\n'.join(lines) + + elif isinstance(data, str): + # Escape special characters + escaped = json.dumps(data, ensure_ascii=False) + return escaped + + elif isinstance(data, bool): + return 'true' if data else 'false' + + elif data is None: + return 'null' + + elif isinstance(data, (int, float)): + return str(data) + + else: + # Fallback to standard JSON serialization + return json.dumps(data, ensure_ascii=False) + + +def save_json_with_preserved_format(file_path: str, data: Dict[str, Any], + reference_file: Optional[str] = None) -> bool: + """ + Save JSON data to file while preserving the original formatting style. + + Args: + file_path: Path to JSON file to write + data: Dictionary to serialize + reference_file: Optional path to reference file for format detection. + If not provided, uses file_path for detection. 
+ + Returns: + True if successful, False otherwise + """ + try: + # Detect format from reference file or existing target file + format_source = reference_file if reference_file else file_path + + if Path(format_source).exists(): + fmt = detect_json_format(format_source) + else: + # Use sensible defaults for new files + fmt = JSONFormat() + fmt.indent_size = 4 + fmt.indent_pattern = 'consistent' + + # Serialize with preserved format + content = format_preserving_json_dump(data, fmt, level=0) + + # Add trailing newline if detected in original + if fmt.trailing_newline and not content.endswith('\n'): + content += '\n' + + # Write to file + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + + return True + + except Exception as e: + print(f"Error saving JSON with preserved format: {e}") + return False + + +def validate_format_preservation(original_path: str, new_path: str) -> Dict[str, Any]: + """ + Validate that formatting was preserved between two JSON files. + + Returns a report with: + - matching: bool (whether formats match) + - differences: list of detected differences + - original_format: detected format from original + - new_format: detected format from new file + """ + original_fmt = detect_json_format(original_path) + new_fmt = detect_json_format(new_path) + + differences = [] + + if original_fmt.indent_char != new_fmt.indent_char: + differences.append(f"Indent char: {repr(original_fmt.indent_char)} → {repr(new_fmt.indent_char)}") + + if original_fmt.indent_pattern != new_fmt.indent_pattern: + differences.append(f"Indent pattern: {original_fmt.indent_pattern} → {new_fmt.indent_pattern}") + + if original_fmt.indent_size != new_fmt.indent_size: + differences.append(f"Indent size: {original_fmt.indent_size} → {new_fmt.indent_size}") + + if original_fmt.trailing_newline != new_fmt.trailing_newline: + differences.append(f"Trailing newline: {original_fmt.trailing_newline} → {new_fmt.trailing_newline}") + + return { + 'matching': len(differences) == 0, + 'differences': differences, + 'original_format': original_fmt, + 'new_format': new_fmt + } diff --git a/tools/translate/main.py b/tools/translate/main.py new file mode 100644 index 000000000..1c3fbae73 --- /dev/null +++ b/tools/translate/main.py @@ -0,0 +1,588 @@ +import httpx +import os +import sys +import asyncio +import aiofiles +import json +from pathlib import Path + +# Load translation config +SCRIPT_DIR = Path(__file__).resolve().parent +CONFIG_PATH = SCRIPT_DIR / "config.json" + +def load_translation_config(): + """Load language configuration""" + if CONFIG_PATH.exists(): + with open(CONFIG_PATH, 'r', encoding='utf-8') as f: + return json.load(f) + return None + +TRANSLATION_CONFIG = load_translation_config() + +def build_docs_structure(): + """Build docs structure from config and hardcoded plugin-dev paths""" + structure = {} + + # General docs from config + if TRANSLATION_CONFIG and "languages" in TRANSLATION_CONFIG: + general_help = {} + for lang_code, lang_info in TRANSLATION_CONFIG["languages"].items(): + general_help[lang_info["name"]] = lang_info["directory"] + structure["general_help"] = general_help + else: + # Fallback if config not available + structure["general_help"] = { + "English": "en", + "Chinese": "cn", + "Japanese": "jp" + } + + # Plugin dev paths (keep hardcoded for now as requested) + structure["plugin_dev"] = { + "English": "plugin-dev-en", + "Chinese": "plugin-dev-zh", + "Japanese": "plugin-dev-ja" + } + + # Versioned docs from config + if TRANSLATION_CONFIG and "versioned_docs" in 
TRANSLATION_CONFIG: + for version_key, version_paths in TRANSLATION_CONFIG["versioned_docs"].items(): + # Convert version key (e.g., "2-8-x") to structure key (e.g., "version_28x") + structure_key = f"version_{version_key.replace('-', '')}" + version_structure = {} + + # Map language codes to language names + for lang_code, path in version_paths.items(): + if lang_code in TRANSLATION_CONFIG["languages"]: + lang_name = TRANSLATION_CONFIG["languages"][lang_code]["name"] + version_structure[lang_name] = path + + structure[structure_key] = version_structure + else: + # Fallback if versioned_docs not in config + structure["version_28x"] = { + "English": "versions/2-8-x/en-us", + "Chinese": "versions/2-8-x/zh-cn", + "Japanese": "versions/2-8-x/jp" + } + structure["version_30x"] = { + "English": "versions/3-0-x/en-us", + "Chinese": "versions/3-0-x/zh-cn", + "Japanese": "versions/3-0-x/jp" + } + structure["version_31x"] = { + "English": "versions/3-1-x/en-us", + "Chinese": "versions/3-1-x/zh-cn", + "Japanese": "versions/3-1-x/jp" + } + + return structure + +docs_structure = build_docs_structure() + + +async def translate_text(file_path, dify_api_key, original_language, target_language1, termbase_path=None, max_retries=5, the_doc_exist=None, diff_original=None): + """ + Translate text using Dify API with termbase from `tools/translate/termbase_i18n.md` + Includes retry logic with exponential backoff for handling API timeouts and gateway errors. + + Args: + file_path: Path to the document to translate + dify_api_key: Dify API key + original_language: Source language name + target_language1: Target language name + termbase_path: Optional path to terminology database + max_retries: Maximum number of retry attempts + the_doc_exist: Optional existing translation (for modified files) + diff_original: Optional git diff of the original file (for modified files) + """ + if termbase_path is None: + # Get project root directory + script_dir = os.path.dirname(os.path.abspath(__file__)) + base_dir = os.path.dirname(os.path.dirname(script_dir)) # Two levels up + termbase_path = os.path.join(base_dir, "tools", "translate", "termbase_i18n.md") + + url = "https://api.dify.ai/v1/workflows/run" + + termbase = await load_md_mdx(termbase_path) + the_doc = await load_md_mdx(file_path) + + # Build inputs - always include base inputs + inputs = { + "original_language": original_language, + "output_language1": target_language1, + "the_doc": the_doc, + "termbase": termbase + } + + # Add optional inputs for modified files + if the_doc_exist is not None: + inputs["the_doc_exist"] = the_doc_exist + if diff_original is not None: + inputs["diff_original"] = diff_original + + payload = { + "response_mode": "streaming", # Use streaming to avoid gateway timeouts + "user": "Dify", + "inputs": inputs + } + + headers = { + "Authorization": "Bearer " + dify_api_key, + "Content-Type": "application/json" + } + + # Retry mechanism with exponential backoff + for attempt in range(max_retries): + try: + # Add exponential backoff with jitter for retries + if attempt > 0: + # Exponential backoff: 30s, 60s, 120s, 240s, 300s with ±20% jitter + # Modified files take 2-3 minutes, so we need longer waits + import random + base_delay = min(30 * (2 ** (attempt - 1)), 300) # Cap at 300s (5 min) + jitter = random.uniform(0.8, 1.2) + delay = base_delay * jitter + print(f"⏳ Retry attempt {attempt + 1}/{max_retries} after {delay:.1f}s delay...") + await asyncio.sleep(delay) + + # Streaming mode: no gateway timeout issues + # Set timeout to 600s (10 min) 
for the entire stream + async with httpx.AsyncClient(timeout=600.0) as client: + async with client.stream("POST", url, json=payload, headers=headers) as response: + # Check initial response status + if response.status_code != 200: + print(f"❌ HTTP Error: {response.status_code}") + error_text = await response.aread() + print(f"Response: {error_text.decode('utf-8')[:500]}") + if response.status_code in [502, 503, 504]: + if attempt < max_retries - 1: + print(f"Will retry... ({max_retries - attempt - 1} attempts remaining)") + continue + return "" + + # Parse streaming response (Server-Sent Events format) + print(f"📥 Receiving streaming response...") + output1 = None + workflow_run_id = None + final_status = None + + async for line in response.aiter_lines(): + line = line.strip() + if not line or not line.startswith("data: "): + continue + + try: + # Remove "data: " prefix and parse JSON + json_str = line[6:] # Remove "data: " + event_data = json.loads(json_str) + event_type = event_data.get("event", "") + + # Track workflow ID + if "workflow_run_id" in event_data: + workflow_run_id = event_data["workflow_run_id"] + + # Handle different event types + if event_type == "workflow_started": + print(f"🔄 Workflow started: {workflow_run_id}") + elif event_type == "workflow_finished": + final_status = event_data.get("data", {}).get("status", "unknown") + print(f"🔄 Workflow finished with status: {final_status}") + # Extract output1 from final event + outputs = event_data.get("data", {}).get("outputs", {}) + output1 = outputs.get("output1", "") + elif event_type == "node_started": + node_type = event_data.get("data", {}).get("node_type", "") + print(f" ⚙️ Node started: {node_type}") + elif event_type == "error": + error_msg = event_data.get("message", "Unknown error") + print(f"❌ Workflow error: {error_msg}") + return "" + except json.JSONDecodeError as e: + # Skip invalid JSON lines + continue + + # Check final status and output + if final_status == "failed": + print(f"❌ Workflow execution failed") + return "" + + if not output1: + print(f"⚠️ Warning: No output1 found in workflow_finished event") + if attempt < max_retries - 1: + print(f"Will retry... ({max_retries - attempt - 1} attempts remaining)") + continue + return "" + + print(f"✅ Translation completed successfully (length: {len(output1)} chars)") + return output1 + + except httpx.ReadTimeout as e: + print(f"⏱️ Stream timeout after 600s (attempt {attempt + 1}/{max_retries})") + if attempt < max_retries - 1: + print(f"Will retry... 
({max_retries - attempt - 1} attempts remaining)") + else: + print(f"❌ All {max_retries} attempts failed due to timeout") + return "" + + except httpx.ConnectTimeout as e: + print(f"🔌 Connection timeout (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: + print(f"❌ All {max_retries} attempts failed due to connection timeout") + return "" + + except httpx.HTTPError as e: + print(f"🌐 HTTP error (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: + print(f"❌ All {max_retries} attempts failed due to HTTP errors") + return "" + + except Exception as e: + print(f"❌ Unexpected error (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: + print(f"❌ All {max_retries} attempts failed due to unexpected errors") + return "" + + return "" + + +async def load_md_mdx(file_path): + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + return content + + +def determine_doc_type_and_language(file_path): + """ + Determine document type and current language based on file path + Returns (doc_type, current_language, language_name) + """ + # Normalize path separators + normalized_path = file_path.replace(os.sep, '/') + + # Collect all possible matches and find the longest one + matches = [] + for doc_type, languages in docs_structure.items(): + for lang_name, lang_code in languages.items(): + # Normalize lang_code path separators too + normalized_lang_code = lang_code.replace(os.sep, '/') + if normalized_lang_code in normalized_path: + matches.append((len(normalized_lang_code), doc_type, lang_code, lang_name)) + + # Return the match with the longest lang_code (most specific) + if matches: + matches.sort(reverse=True) # Sort by length descending + _, doc_type, lang_code, lang_name = matches[0] + return doc_type, lang_code, lang_name + + return None, None, None + + +def get_language_code_name_map(doc_type): + """ + Get mapping from language code to language name + """ + code_to_name = {} + for lang_name, lang_code in docs_structure[doc_type].items(): + code_to_name[lang_code] = lang_name + return code_to_name + + +def generate_target_path(file_path, current_lang_code, target_lang_code): + """ + Generate target language file path + """ + return file_path.replace(current_lang_code, target_lang_code) + + +async def save_translated_content(content, file_path): + """ + Save translated content to file + """ + try: + print(f"Attempting to save to: {file_path}") + print(f"Content length: {len(content)} characters") + + # Ensure directory exists + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + # Save file + async with aiofiles.open(file_path, "w", encoding="utf-8") as f: + await f.write(content) + + # Verify file was saved successfully + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + print(f"✓ Translated content saved to {file_path} (size: {file_size} bytes)") + else: + print(f"✗ Failed to save file: {file_path}") + except Exception as e: + print(f"Error saving file {file_path}: {str(e)}") + + +async def translate_single_file(file_path, dify_api_key, current_lang_name, target_lang_code, target_lang_name, current_lang_code, semaphore): + """ + Async translate single file (using semaphore to control concurrency) + """ + async with semaphore: # Control concurrency + # Generate target file path + target_file_path = generate_target_path(file_path, current_lang_code, target_lang_code) + + print(f"Source: {file_path}") + print(f"Target: {target_file_path}") + + # 
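Everything in `translate_single_file` keys off path mapping: `determine_doc_type_and_language` picks the longest matching language prefix (so a versioned `versions/3-0-x/en-us` path beats a bare `en`), and `generate_target_path` swaps that prefix with a plain `str.replace`, which substitutes every occurrence of the code. A standalone sketch with a simplified, partly assumed structure:

```python
# Simplified structure for illustration; the real docs_structure is built from
# tools/translate/config.json plus the versioned fallback shown above.
docs_structure = {
    "docs": {"English": "en", "Chinese": "cn", "Japanese": "jp"},
    "version_30x": {"English": "versions/3-0-x/en-us", "Chinese": "versions/3-0-x/zh-cn"},
}

def detect(file_path):
    path = file_path.replace("\\", "/")
    matches = []
    for doc_type, languages in docs_structure.items():
        for lang_name, lang_code in languages.items():
            if lang_code in path:
                matches.append((len(lang_code), doc_type, lang_code, lang_name))
    if not matches:
        return None, None, None
    matches.sort(reverse=True)  # the longest, most specific prefix wins
    _, doc_type, lang_code, lang_name = matches[0]
    return doc_type, lang_code, lang_name

print(detect("versions/3-0-x/en-us/guides/workflow.md"))
# ('version_30x', 'versions/3-0-x/en-us', 'English')  (not the bare 'en' match)
print("en/guides/workflow.mdx".replace("en", "cn"))
# cn/guides/workflow.mdx  (how generate_target_path swaps the language segment)
```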
Check if target file exists + if os.path.exists(target_file_path): + print(f"Target file already exists: {target_file_path}") + return + + print(f"Translating from {current_lang_name} to {target_lang_name}...") + + try: + # Call translation function + translated_content = await translate_text( + file_path, + dify_api_key, + current_lang_name, + target_lang_name + ) + + print(f"Translation result length: {len(translated_content)} characters") + + if translated_content and translated_content.strip(): + # Save translation result + await save_translated_content(translated_content, target_file_path) + else: + print(f"Error: Translation failed for {target_lang_name} - empty or no content returned") + except Exception as e: + print(f"Error translating to {target_lang_name}: {str(e)}") + import traceback + traceback.print_exc() + + +async def main_async(file_path, dify_api_key=None): + """ + Async main function + """ + # Get script directory + script_dir = os.path.dirname(os.path.abspath(__file__)) + + # Try to load API key from .env file + env_path = os.path.join(script_dir, '.env') + if os.path.exists(env_path) and dify_api_key is None: + try: + # Import dotenv only when needed + import importlib.util + dotenv_spec = importlib.util.find_spec("dotenv") + if dotenv_spec is not None: + from dotenv import load_dotenv + load_dotenv(env_path) + dify_api_key = os.getenv('DIFY_API_KEY') or os.getenv('dify_api_key') + else: + raise ImportError + except ImportError: + # Manual parsing of .env file if dotenv is not available + with open(env_path, 'r') as f: + for line in f: + if line.strip().startswith('DIFY_API_KEY=') or line.strip().startswith('dify_api_key='): + dify_api_key = line.strip().split('=', 1)[1].strip('"\'') + break + + if not dify_api_key: + print("Error: DIFY_API_KEY not found. Please provide it as parameter or in .env file.") + return + + # Determine document type and current language + doc_type, current_lang_code, current_lang_name = determine_doc_type_and_language(file_path) + + if not doc_type: + print(f"Error: Unable to determine document type and language for {file_path}") + return + + print(f"Document type: {doc_type}, Current language: {current_lang_name} ({current_lang_code})") + + # Get all languages for current document type + code_to_name = get_language_code_name_map(doc_type) + + # Create semaphore to limit concurrency (avoid excessive API pressure) + semaphore = asyncio.Semaphore(2) + + # Create all translation tasks + tasks = [] + for target_lang_code, target_lang_name in code_to_name.items(): + # Skip current language + if target_lang_code == current_lang_code: + continue + + task = translate_single_file( + file_path, + dify_api_key, + current_lang_name, + target_lang_code, + target_lang_name, + current_lang_code, + semaphore + ) + tasks.append(task) + + # Execute all translation tasks + if tasks: + print("Running translations concurrently...") + await asyncio.gather(*tasks) + print("All translations completed!") + else: + print("No translations needed.") + + +def get_file_path_interactive(): + """ + Interactive file path input + """ + while True: + print("Please enter the file path to translate:") + print("请输入要翻译的文件路径:") + print("翻訳するファイルパスを入力してください:") + file_path = input("File path / 文件路径 / ファイルパス: ").strip() + + if not file_path: + print("File path cannot be empty. 
Please try again.") + print("文件路径不能为空,请重新输入。") + print("ファイルパスは空にできません。再度入力してください。") + continue + + # Remove quotes if user copy-pasted with quotes + file_path = file_path.strip('\'"') + + # Check if file exists + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + print(f"文件不存在: {file_path}") + print(f"ファイルが存在しません: {file_path}") + print("Please check if the path is correct.") + print("请检查路径是否正确。") + print("パスが正しいか確認してください。") + continue + + # Check if it's a file + if not os.path.isfile(file_path): + print(f"The specified path is not a file: {file_path}") + print(f"指定的路径不是文件: {file_path}") + print(f"指定されたパスはファイルではありません: {file_path}") + continue + + # Check file extension + if not (file_path.endswith('.md') or file_path.endswith('.mdx')): + print(f"Warning: File is not .md or .mdx format: {file_path}") + print(f"警告: 文件不是 .md 或 .mdx 格式: {file_path}") + print(f"警告: ファイルは .md または .mdx 形式ではありません: {file_path}") + confirm = input("Continue anyway? (y/n) / 是否继续? (y/n) / 続行しますか? (y/n): ").strip().lower() + if confirm not in ['y', 'yes', 'Y', 'YES']: + continue + + return file_path + + +def load_local_api_key(): + """ + Load API key from local .env file + """ + script_dir = os.path.dirname(os.path.abspath(__file__)) + env_path = os.path.join(script_dir, '.env') + + if not os.path.exists(env_path): + print(f"Error: .env file not found: {env_path}") + print(f"错误: 未找到 .env 文件: {env_path}") + print(f"エラー: .env ファイルが見つかりません: {env_path}") + print("Please create .env file and add: DIFY_API_KEY=your_api_key") + print("请在当前目录创建 .env 文件并添加: DIFY_API_KEY=your_api_key") + print(".env ファイルを作成し、DIFY_API_KEY=your_api_key を追加してください") + return None + + try: + # Try using dotenv + import importlib.util + dotenv_spec = importlib.util.find_spec("dotenv") + if dotenv_spec is not None: + from dotenv import load_dotenv + load_dotenv(env_path) + api_key = os.getenv('DIFY_API_KEY') or os.getenv('dify_api_key') + else: + # Manual parsing of .env file + api_key = None + with open(env_path, 'r') as f: + for line in f: + line = line.strip() + if line.startswith('DIFY_API_KEY=') or line.startswith('dify_api_key='): + api_key = line.split('=', 1)[1].strip('"\'') + break + except Exception as e: + print(f"Error reading .env file: {e}") + print(f"读取 .env 文件时出错: {e}") + print(f".env ファイルの読み取りエラー: {e}") + return None + + if not api_key: + print("Error: DIFY_API_KEY not found in .env file") + print("错误: 在 .env 文件中未找到 DIFY_API_KEY") + print("エラー: .env ファイルに DIFY_API_KEY が見つかりません") + print("Please ensure .env file contains: DIFY_API_KEY=your_api_key") + print("请确保 .env 文件包含: DIFY_API_KEY=your_api_key") + print(".env ファイルに DIFY_API_KEY=your_api_key が含まれていることを確認してください") + return None + + print("✓ Successfully loaded local API key") + print("✓ 成功加载本地 API key") + print("✓ ローカル API キーの読み込みに成功しました") + return api_key + + +def main(file_path, dify_api_key=None): + """ + Sync wrapper function to run async main function + """ + asyncio.run(main_async(file_path, dify_api_key)) + + +if __name__ == "__main__": + # If no parameters provided, enter interactive mode + if len(sys.argv) == 1: + print("=== Dify Documentation Translation Tool ===") + print("=== Dify 文档翻译工具 ===") + print("=== Dify ドキュメント翻訳ツール ===") + print() + + # Interactive file path input + file_path = get_file_path_interactive() + + # Load local API key + dify_api_key = load_local_api_key() + if not dify_api_key: + sys.exit(1) + + print() + print(f"Starting translation for file: {file_path}") + print(f"开始翻译文件: {file_path}") + print(f"ファイルの翻訳を開始: 
{file_path}") + main(file_path, dify_api_key) + + # Command line argument mode + elif len(sys.argv) >= 2: + file_path = sys.argv[1] + dify_api_key = None + + # Parse command line arguments + for i, arg in enumerate(sys.argv[2:], 2): + if dify_api_key is None: + dify_api_key = arg + + main(file_path, dify_api_key) + + else: + print("Usage: python main.py [file_path] [dify_api_key]") + print(" No arguments: Enter interactive mode") + print(" file_path: File path to translate") + print(" dify_api_key: (Optional) Dify API key") + sys.exit(1) + + + diff --git a/tools/translate/openapi/__init__.py b/tools/translate/openapi/__init__.py new file mode 100644 index 000000000..c06f6559f --- /dev/null +++ b/tools/translate/openapi/__init__.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +OpenAPI Translation Pipeline + +Complete pipeline for translating OpenAPI JSON files: +Extract → Translate → Re-hydrate +""" + +from .extractor import OpenAPIExtractor +from .translator import OpenAPITranslator +from .rehydrator import OpenAPIRehydrator +import os +from pathlib import Path +import tempfile + + +async def translate_openapi_file_async(source_file: str, target_lang: str, output_file: str, dify_api_key: str = None) -> bool: + """ + Complete pipeline to translate an OpenAPI JSON file (async version). + + Pipeline stages: + 1. Extract: Pull all translatable fields into markdown + 2. Translate: Send to Dify API for translation + 3. Re-hydrate: Merge translations back into JSON structure + + Args: + source_file: Path to source English OpenAPI JSON file + target_lang: Target language code (cn, jp) + output_file: Path to save translated JSON file + dify_api_key: Optional Dify API key (if None, loads from env) + + Returns: + True if successful, False otherwise + """ + # Create temp directory for intermediate files + temp_dir = tempfile.mkdtemp(prefix=f"openapi_translation_{os.path.basename(source_file)}_") + + try: + print(f"\n{'='*60}") + print(f"🌐 Translating OpenAPI: {os.path.basename(source_file)} → {target_lang}") + print(f"{'='*60}\n") + + # Define temp file paths + extraction_map_path = os.path.join(temp_dir, "extraction_map.json") + markdown_path = os.path.join(temp_dir, "translation_input.md") + translated_md_path = os.path.join(temp_dir, f"translation_output_{target_lang}.md") + + # Step 1: Extract translatable fields + print(f"📤 Step 1/3: Extracting translatable fields from {source_file}...") + extractor = OpenAPIExtractor(source_file) + fields, markdown = extractor.extract() + + extractor.save_extraction_map(extraction_map_path) + extractor.save_markdown(markdown_path, markdown) + + print(f" ✓ Extracted {len(fields)} fields") + print(f" ✓ Saved extraction map: {extraction_map_path}") + print(f" ✓ Saved markdown for translation: {markdown_path}") + + # Step 2: Translate via Dify API (use async version) + print(f"\n🌐 Step 2/3: Translating to {target_lang}...") + translator = OpenAPITranslator(markdown_path, target_lang, dify_api_key) + translated_text = await translator.translate_async() + + translator.save_translation(translated_md_path, translated_text) + print(f" ✓ Translation complete") + print(f" ✓ Saved translated markdown: {translated_md_path}") + + # Step 3: Re-hydrate JSON structure + print(f"\n💧 Step 3/3: Re-hydrating JSON structure...") + rehydrator = OpenAPIRehydrator(source_file, extraction_map_path) + rehydrator.load_translation(translated_md_path) + stats = rehydrator.rehydrate(output_file) + + print(f" ✓ Created translated JSON: {output_file}") + print(f" 📊 Stats: 
{stats['updated']}/{stats['total']} fields translated") + + if stats['missing'] > 0: + print(f" ⚠️ {stats['missing']} fields kept in English (missing translations)") + + print(f"\n{'='*60}") + print(f"✅ Translation pipeline completed successfully!") + print(f" Source: {source_file}") + print(f" Output: {output_file}") + print(f" Language: {target_lang}") + print(f" Fields translated: {stats['updated']}/{stats['total']}") + print(f"{'='*60}\n") + + return True + + except Exception as e: + print(f"\n{'='*60}") + print(f"❌ Translation pipeline failed!") + print(f" Error: {str(e)}") + print(f"{'='*60}\n") + + import traceback + traceback.print_exc() + + return False + + finally: + # Optional: Cleanup temp files + # Uncomment to enable cleanup + # import shutil + # shutil.rmtree(temp_dir, ignore_errors=True) + print(f"🗂️ Temp files kept for debugging: {temp_dir}") + + +def translate_openapi_file(source_file: str, target_lang: str, output_file: str, dify_api_key: str = None) -> bool: + """ + Complete pipeline to translate an OpenAPI JSON file (sync wrapper). + + Pipeline stages: + 1. Extract: Pull all translatable fields into markdown + 2. Translate: Send to Dify API for translation + 3. Re-hydrate: Merge translations back into JSON structure + + Args: + source_file: Path to source English OpenAPI JSON file + target_lang: Target language code (cn, jp) + output_file: Path to save translated JSON file + dify_api_key: Optional Dify API key (if None, loads from env) + + Returns: + True if successful, False otherwise + """ + import asyncio + return asyncio.run(translate_openapi_file_async(source_file, target_lang, output_file, dify_api_key)) + + +# Export main functions +__all__ = ['translate_openapi_file', 'translate_openapi_file_async', 'OpenAPIExtractor', 'OpenAPITranslator', 'OpenAPIRehydrator'] diff --git a/tools/translate/openapi/extractor.py b/tools/translate/openapi/extractor.py new file mode 100644 index 000000000..9f436bc95 --- /dev/null +++ b/tools/translate/openapi/extractor.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +""" +OpenAPI Field Extractor + +Recursively extracts translatable fields (title, summary, description) from OpenAPI JSON files. +Generates a markdown file for translation and an extraction map for re-hydration. +""" + +import json +from typing import List, Dict, Tuple +from pathlib import Path + + +class OpenAPIExtractor: + """Extracts translatable fields from OpenAPI JSON structure.""" + + def __init__(self, json_file_path: str, translatable_fields: List[str] = None): + """ + Initialize the extractor. + + Args: + json_file_path: Path to the source OpenAPI JSON file + translatable_fields: List of field names to extract (default: title, summary, description) + """ + self.source_path = json_file_path + self.translatable_fields = translatable_fields or ["title", "summary", "description"] + self.fields = [] # List of {id, path, value} + + def extract(self) -> Tuple[List[Dict], str]: + """ + Extract all translatable fields from the JSON file. + + Returns: + Tuple of (extraction_map list, markdown content string) + """ + with open(self.source_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Recursively walk and extract + self._walk(data, path=[]) + + # Generate markdown + markdown = self._generate_markdown() + + return self.fields, markdown + + def _walk(self, obj, path: List): + """ + Recursively walk JSON tree to find translatable fields. 
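To make the walk concrete: for a toy document, the extractor records one `{id, path, value}` entry per translatable string and renders them as FIELD blocks in the layout described in `_generate_markdown` further below. The sample document here is invented:

```python
# Invented toy document; only "title", "summary" and "description" values are extracted.
sample = {
    "info": {"title": "Chat App API", "version": "1.0"},
    "paths": {"/chat-messages": {"post": {"summary": "Send a chat message"}}},
}

# The recursive walk yields one entry per translatable string:
fields = [
    {"id": "FIELD_0000", "path": ["info", "title"], "value": "Chat App API"},
    {"id": "FIELD_0001",
     "path": ["paths", "/chat-messages", "post", "summary"],
     "value": "Send a chat message"},
]

# ...and the markdown handed to the translation workflow should look like:
expected_markdown = """# OpenAPI Translation Input

## FIELD_0000
[PATH: info.title]
Chat App API

## FIELD_0001
[PATH: paths./chat-messages.post.summary]
Send a chat message
"""
```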
+ + Args: + obj: Current object (dict, list, or primitive) + path: Current path as list of keys/indices + """ + if isinstance(obj, dict): + for key, value in obj.items(): + current_path = path + [key] + + # Check if this is a translatable field + if key in self.translatable_fields and isinstance(value, str) and value.strip(): + field_id = f"FIELD_{len(self.fields):04d}" + self.fields.append({ + "id": field_id, + "path": current_path.copy(), + "value": value + }) + else: + # Recurse deeper + self._walk(value, current_path) + + elif isinstance(obj, list): + for idx, item in enumerate(obj): + current_path = path + [f"[{idx}]"] + self._walk(item, current_path) + + def _generate_markdown(self) -> str: + """ + Generate markdown content for translation. + + Format: + ## FIELD_0000 + [PATH: info.title] + Chat App API + + Returns: + Markdown string ready for translation + """ + lines = ["# OpenAPI Translation Input\n"] + + for field in self.fields: + path_str = ".".join(str(p) for p in field["path"]) + lines.append(f"## {field['id']}") + lines.append(f"[PATH: {path_str}]") + lines.append(field["value"]) + lines.append("") # blank line separator + + return "\n".join(lines) + + def save_extraction_map(self, output_path: str): + """ + Save extraction metadata as JSON for later re-hydration. + + Args: + output_path: Path to save the extraction map JSON file + """ + extraction_data = { + "source_file": str(self.source_path), + "field_count": len(self.fields), + "translatable_fields": self.translatable_fields, + "fields": self.fields + } + + # Ensure output directory exists + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(extraction_data, f, indent=2, ensure_ascii=False) + + def save_markdown(self, output_path: str, markdown: str): + """ + Save generated markdown to file. + + Args: + output_path: Path to save the markdown file + markdown: Markdown content string + """ + # Ensure output directory exists + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + f.write(markdown) diff --git a/tools/translate/openapi/rehydrator.py b/tools/translate/openapi/rehydrator.py new file mode 100644 index 000000000..54c773079 --- /dev/null +++ b/tools/translate/openapi/rehydrator.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +""" +OpenAPI Re-hydrator + +Merges translated text back into the original JSON structure using extraction map. +""" + +import json +import re +from typing import Dict, List +from pathlib import Path + + +class OpenAPIRehydrator: + """Re-hydrates OpenAPI JSON with translated text.""" + + def __init__(self, original_json_path: str, extraction_map_path: str): + """ + Initialize the re-hydrator. + + Args: + original_json_path: Path to the original English JSON file + extraction_map_path: Path to the extraction map JSON created during extraction + """ + self.original_json_path = original_json_path + self.extraction_map_path = extraction_map_path + self.translation_map = {} # field_id -> translated_text + + def load_translation(self, translated_md_path: str): + """ + Parse translated markdown into field_id -> text mapping. 
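Going the other way, the parser turns a translated markdown file back into a `field_id -> text` map using the same `\n## (FIELD_\d+)\n` split as the method body below. A compact sketch on an invented two-field sample:

```python
import re

# Invented translated markdown in the FIELD-block format shown above.
translated = """# OpenAPI Translation Input

## FIELD_0000
[PATH: info.title]
聊天应用 API

## FIELD_0001
[PATH: paths./chat-messages.post.summary]
发送聊天消息
"""

translation_map = {}
sections = re.split(r'\n## (FIELD_\d+)\n', translated)
# sections = [preamble, "FIELD_0000", body, "FIELD_0001", body]
for i in range(1, len(sections), 2):
    field_id, body = sections[i], sections[i + 1]
    path_seen, text_lines = False, []
    for line in body.split("\n"):
        if line.startswith("[PATH:"):
            path_seen = True
            continue
        if path_seen and line.strip():
            text_lines.append(line)
    translation_map[field_id] = "\n".join(text_lines).strip()

print(translation_map)
# {'FIELD_0000': '聊天应用 API', 'FIELD_0001': '发送聊天消息'}
```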
+ + Args: + translated_md_path: Path to the translated markdown file + + Expected format: + ## FIELD_0000 + [PATH: info.title] + Translated text here + + ## FIELD_0001 + [PATH: info.description] + Translated description here + """ + with open(translated_md_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Split by field markers + sections = re.split(r'\n## (FIELD_\d+)\n', content) + + # Process sections (pattern: text, field_id, section_content, field_id, section_content, ...) + for i in range(1, len(sections), 2): + if i + 1 >= len(sections): + break + + field_id = sections[i] + section_content = sections[i + 1] + + # Extract translated text after [PATH: ...] line + lines = section_content.split('\n') + path_line_found = False + translated_text_lines = [] + + for line in lines: + if line.startswith('[PATH:'): + path_line_found = True + continue + if path_line_found and line.strip(): + translated_text_lines.append(line) + + # Join lines and strip whitespace + translated_text = '\n'.join(translated_text_lines).strip() + + if translated_text: + self.translation_map[field_id] = translated_text + + print(f"📝 Parsed {len(self.translation_map)} translated fields from markdown") + + def rehydrate(self, output_path: str) -> Dict[str, int]: + """ + Create translated JSON file by merging translations into original structure. + + Args: + output_path: Path to save the translated JSON file + + Returns: + Statistics dict with keys: updated, missing, total + """ + # Load original JSON + with open(self.original_json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Load extraction map + with open(self.extraction_map_path, 'r', encoding='utf-8') as f: + extraction_map = json.load(f) + + # Apply translations + fields_updated = 0 + fields_missing = [] + + for field_info in extraction_map['fields']: + field_id = field_info['id'] + path = field_info['path'] + original_value = field_info['value'] + + # Get translated text + translated_text = self.translation_map.get(field_id) + + if translated_text: + # Navigate and update + try: + self._set_nested_value(data, path, translated_text) + fields_updated += 1 + except Exception as e: + print(f"⚠️ Error setting {field_id} at path {path}: {e}") + fields_missing.append(field_id) + else: + # Fallback to English + fields_missing.append(field_id) + print(f"⚠️ Missing translation for {field_id} (path: {'.'.join(str(p) for p in path)})") + print(f" Keeping English: {original_value[:80]}...") + + # Ensure output directory exists + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + # Save translated JSON + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + stats = { + "updated": fields_updated, + "missing": len(fields_missing), + "total": len(extraction_map['fields']) + } + + print(f"✅ Re-hydration complete: {fields_updated}/{stats['total']} fields updated") + if fields_missing: + print(f"⚠️ {len(fields_missing)} fields kept in English") + + return stats + + def _set_nested_value(self, obj, path: List, value: str): + """ + Navigate JSON path and set value. 
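Extraction paths mix plain dictionary keys with `[N]` tokens for list indices (for example `['servers', '[0]', 'description']`), so navigation has to branch on the token form. A standalone sketch of that rule on an invented fragment; the method body below performs the same walk inside the re-hydrator:

```python
# Invented fragment; mirrors the '[0]' index convention used in the extraction map.
doc = {"servers": [{"url": "https://api.dify.ai/v1", "description": "Production"}]}

def set_nested(obj, path, value):
    current = obj
    for key in path[:-1]:
        if isinstance(key, str) and key.startswith("[") and key.endswith("]"):
            current = current[int(key[1:-1])]  # list index token, e.g. "[0]"
        else:
            current = current[key]             # plain dict key
    last = path[-1]
    if isinstance(last, str) and last.startswith("[") and last.endswith("]"):
        current[int(last[1:-1])] = value
    else:
        current[last] = value

set_nested(doc, ["servers", "[0]", "description"], "生产环境")
print(doc["servers"][0]["description"])  # 生产环境
```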
+ + Args: + obj: Root JSON object + path: List of keys/indices representing the path + value: Value to set + + Example path: ['paths', '/chat-messages', 'post', 'summary'] + Example path with array: ['servers', '[0]', 'description'] + """ + current = obj + + # Navigate to parent + for key in path[:-1]: + if isinstance(key, str) and key.startswith('[') and key.endswith(']'): + # Array index + idx = int(key[1:-1]) + current = current[idx] + else: + current = current[key] + + # Set final value + final_key = path[-1] + if isinstance(final_key, str) and final_key.startswith('[') and final_key.endswith(']'): + # Array index + idx = int(final_key[1:-1]) + current[idx] = value + else: + current[final_key] = value diff --git a/tools/translate/openapi/translator.py b/tools/translate/openapi/translator.py new file mode 100644 index 000000000..4f16106b1 --- /dev/null +++ b/tools/translate/openapi/translator.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +OpenAPI Translator + +Coordinates translation of OpenAPI markdown via Dify API. +Wraps the existing translation infrastructure for OpenAPI-specific needs. +""" + +import httpx +import asyncio +import json +import os +from pathlib import Path + + +class OpenAPITranslator: + """Manages translation of OpenAPI markdown via Dify API.""" + + def __init__(self, markdown_path: str, target_lang: str, dify_api_key: str = None): + """ + Initialize the translator. + + Args: + markdown_path: Path to the markdown file to translate + target_lang: Target language code (cn, jp) + dify_api_key: Dify API key (if None, loads from env) + """ + self.markdown_path = markdown_path + self.target_lang = target_lang + self.dify_api_key = dify_api_key or self._load_api_key() + + # Load termbase + self.termbase_path = Path(__file__).parent.parent / "termbase_i18n.md" + + def _load_api_key(self) -> str: + """Load Dify API key from environment or .env file.""" + # Try environment variable first (case-insensitive) + api_key = os.getenv("DIFY_API_KEY") or os.getenv("dify_api_key") + if api_key: + return api_key + + # Try .env file (support both uppercase and lowercase) + env_file = Path(__file__).parent.parent / ".env" + if env_file.exists(): + with open(env_file) as f: + for line in f: + line = line.strip() + if line.startswith("DIFY_API_KEY=") or line.startswith("dify_api_key="): + return line.split("=", 1)[1].strip() + + raise ValueError("DIFY_API_KEY (or dify_api_key) not found in environment or .env file") + + async def translate_async(self, max_retries: int = 5) -> str: + """ + Translate the markdown file via Dify API (async version). 
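The retry loop in this method uses the same backoff policy as `translate_text` in `main.py` above: exponential delays starting at 30 s, doubling per retry, capped at 300 s, with ±20% jitter. A small sketch of the delay schedule that policy produces (jitter shown as a range rather than a random draw):

```python
# (min, max) delay window in seconds before the Nth retry, per the policy above.
def backoff_delay(attempt: int, base: float = 30.0, cap: float = 300.0) -> tuple:
    nominal = min(base * (2 ** (attempt - 1)), cap)
    return nominal * 0.8, nominal * 1.2

for attempt in range(1, 6):
    lo, hi = backoff_delay(attempt)
    print(f"retry {attempt}: {lo:.0f}s - {hi:.0f}s")
# retry 1: 24s - 36s
# retry 2: 48s - 72s
# retry 3: 96s - 144s
# retry 4: 192s - 288s
# retry 5: 240s - 360s
```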
+ + Args: + max_retries: Maximum number of retry attempts + + Returns: + Translated markdown text + """ + # Read markdown content + with open(self.markdown_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Read termbase + termbase = "" + if self.termbase_path.exists(): + with open(self.termbase_path, 'r', encoding='utf-8') as f: + termbase = f.read() + + # Map language codes to full names + lang_map = { + "cn": "Chinese", + "jp": "Japanese" + } + target_language_name = lang_map.get(self.target_lang, self.target_lang) + + # Prepare API payload + url = "https://api.dify.ai/v1/workflows/run" + + inputs = { + "original_language": "English", + "output_language1": target_language_name, + "the_doc": content, + "termbase": termbase + } + + payload = { + "response_mode": "streaming", # Critical: avoid gateway timeouts + "user": "OpenAPI-Translator", + "inputs": inputs + } + + headers = { + "Authorization": f"Bearer {self.dify_api_key}", + "Content-Type": "application/json" + } + + # Retry mechanism with exponential backoff + for attempt in range(max_retries): + try: + if attempt > 0: + import random + base_delay = min(30 * (2 ** (attempt - 1)), 300) # Cap at 5 min + jitter = random.uniform(0.8, 1.2) + delay = base_delay * jitter + print(f"⏳ Retry attempt {attempt + 1}/{max_retries} after {delay:.1f}s delay...") + await asyncio.sleep(delay) + + # Stream the response (timeout: 10 minutes) + async with httpx.AsyncClient(timeout=600.0) as client: + async with client.stream("POST", url, json=payload, headers=headers) as response: + # Check initial status + if response.status_code != 200: + print(f"❌ HTTP Error: {response.status_code}") + error_text = await response.aread() + print(f"Response: {error_text.decode('utf-8')[:500]}") + if response.status_code in [502, 503, 504]: + if attempt < max_retries - 1: + print(f"Will retry... ({max_retries - attempt - 1} attempts remaining)") + continue + raise Exception(f"HTTP {response.status_code}") + + # Parse streaming response (SSE format) + print(f"📥 Receiving streaming response...") + output1 = None + workflow_run_id = None + final_status = None + + async for line in response.aiter_lines(): + line = line.strip() + if not line or not line.startswith("data: "): + continue + + try: + json_str = line[6:] # Remove "data: " prefix + event_data = json.loads(json_str) + event_type = event_data.get("event", "") + + if "workflow_run_id" in event_data: + workflow_run_id = event_data["workflow_run_id"] + + if event_type == "workflow_started": + print(f"🔄 Workflow started: {workflow_run_id}") + elif event_type == "workflow_finished": + final_status = event_data.get("data", {}).get("status", "unknown") + print(f"🔄 Workflow finished with status: {final_status}") + outputs = event_data.get("data", {}).get("outputs", {}) + output1 = outputs.get("output1", "") + elif event_type == "node_started": + node_type = event_data.get("data", {}).get("node_type", "") + print(f" ⚙️ Node started: {node_type}") + elif event_type == "error": + error_msg = event_data.get("message", "Unknown error") + print(f"❌ Workflow error: {error_msg}") + raise Exception(f"Workflow error: {error_msg}") + except json.JSONDecodeError: + continue + + # Check final status + if final_status == "failed": + print(f"❌ Workflow execution failed") + if attempt < max_retries - 1: + continue + raise Exception("Workflow execution failed") + + if not output1: + print(f"⚠️ Warning: No output1 found in workflow_finished event") + if attempt < max_retries - 1: + print(f"Will retry... 
({max_retries - attempt - 1} attempts remaining)") + continue + raise Exception("No output received from workflow") + + print(f"✅ Translation completed successfully (length: {len(output1)} chars)") + return output1 + + except httpx.ReadTimeout: + print(f"⏱️ Stream timeout after 600s (attempt {attempt + 1}/{max_retries})") + if attempt == max_retries - 1: + raise Exception(f"All {max_retries} attempts failed due to timeout") + + except httpx.ConnectTimeout as e: + print(f"🔌 Connection timeout (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: + raise Exception(f"All {max_retries} attempts failed due to connection timeout") + + except httpx.HTTPError as e: + print(f"🌐 HTTP error (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: + raise Exception(f"All {max_retries} attempts failed due to HTTP errors") + + except Exception as e: + if "Workflow" in str(e) or "HTTP" in str(e): + if attempt < max_retries - 1: + continue + print(f"❌ Unexpected error (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: + raise + + raise Exception(f"Translation failed after {max_retries} attempts") + + def translate(self, max_retries: int = 5) -> str: + """ + Translate the markdown file via Dify API (sync wrapper). + + Args: + max_retries: Maximum number of retry attempts + + Returns: + Translated markdown text + """ + return asyncio.run(self.translate_async(max_retries=max_retries)) + + def save_translation(self, output_path: str, translated_text: str): + """ + Save translated markdown to file. + + Args: + output_path: Path to save the translated markdown + translated_text: Translated markdown content + """ + # Ensure output directory exists + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + f.write(translated_text) diff --git a/tools/translate/pr_analyzer.py b/tools/translate/pr_analyzer.py new file mode 100644 index 000000000..076b3353d --- /dev/null +++ b/tools/translate/pr_analyzer.py @@ -0,0 +1,512 @@ +#!/usr/bin/env python3 +""" +PR Analyzer for Documentation Translation Workflow + +This utility analyzes pull request changes to categorize them and validate +they follow the proper workflow requirements for English vs translation content. 
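A note on the streaming responses used by both translators above: payloads arrive as Server-Sent Events, each line starting with `data: ` followed by a JSON event, and the translated text is read from `outputs.output1` of the final `workflow_finished` event. A self-contained sketch of that parsing on invented event lines:

```python
import json

# Invented SSE lines of the shape the Dify workflow API streams back.
stream = [
    'data: {"event": "workflow_started", "workflow_run_id": "run-123"}',
    'data: {"event": "node_started", "data": {"node_type": "llm"}}',
    'data: {"event": "workflow_finished", "data": {"status": "succeeded", '
    '"outputs": {"output1": "Translated markdown..."}}}',
]

output1, final_status = None, None
for line in stream:
    line = line.strip()
    if not line.startswith("data: "):
        continue
    event = json.loads(line[6:])  # drop the "data: " prefix
    if event.get("event") == "workflow_finished":
        final_status = event.get("data", {}).get("status")
        output1 = event.get("data", {}).get("outputs", {}).get("output1")

print(final_status, bool(output1))  # succeeded True
```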
+""" + +import json +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Tuple, Optional +import re + +class PRAnalyzer: + """Analyzes PR changes to categorize and validate translation workflow requirements.""" + + def __init__(self, base_sha: str, head_sha: str, repo_root: Optional[str] = None): + self.base_sha = base_sha + self.head_sha = head_sha + self.repo_root = Path(repo_root) if repo_root else Path(__file__).parent.parent.parent + self.docs_json_path = self.repo_root / "docs.json" + self.config = self._load_config() + + def _load_config(self) -> Dict: + """Load translation configuration.""" + config_path = Path(__file__).parent / "config.json" + if config_path.exists(): + with open(config_path, 'r', encoding='utf-8') as f: + return json.load(f) + return {} + + def get_changed_files(self) -> List[str]: + """Get list of changed files between base and head commits.""" + try: + result = subprocess.run([ + "git", "diff", "--name-only", self.base_sha, self.head_sha + ], capture_output=True, text=True, check=True, cwd=self.repo_root) + + files = [f.strip() for f in result.stdout.strip().split('\n') if f.strip()] + return files + except subprocess.CalledProcessError as e: + print(f"Error getting changed files: {e}") + return [] + + def get_docs_json_at_sha(self, sha: str) -> Optional[Dict]: + """Get docs.json content at a specific commit.""" + try: + result = subprocess.run([ + "git", "show", f"{sha}:docs.json" + ], capture_output=True, text=True, check=True, cwd=self.repo_root) + + return json.loads(result.stdout) + except (subprocess.CalledProcessError, json.JSONDecodeError) as e: + print(f"Error loading docs.json at {sha}: {e}") + return None + + def extract_language_navigation(self, docs_data: Dict, language: str) -> Optional[Dict]: + """Extract navigation structure for a specific language from docs.json.""" + if not docs_data or 'navigation' not in docs_data: + return None + + navigation = docs_data['navigation'] + + # Handle both direct languages and versions structure + if 'languages' in navigation: + languages = navigation['languages'] + elif 'versions' in navigation and len(navigation['versions']) > 0: + languages = navigation['versions'][0].get('languages', []) + else: + return None + + for lang_data in languages: + if lang_data.get('language') == language: + return lang_data + + return None + + def analyze_docs_json_changes(self) -> Dict[str, bool]: + """Analyze which language sections changed in docs.json.""" + base_docs = self.get_docs_json_at_sha(self.base_sha) + head_docs = self.get_docs_json_at_sha(self.head_sha) + + changes = { + 'english_section': False, + 'translation_sections': False, + 'any_docs_json_changes': False + } + + if not base_docs or not head_docs: + return changes + + # Check if docs.json changed at all + if base_docs != head_docs: + changes['any_docs_json_changes'] = True + + # Check source language navigation section + source_lang = self.config['source_language'] + base_en = self.extract_language_navigation(base_docs, source_lang) + head_en = self.extract_language_navigation(head_docs, source_lang) + if base_en != head_en: + changes['english_section'] = True + + # Check translation sections + for lang in self.config['target_languages']: + base_lang = self.extract_language_navigation(base_docs, lang) + head_lang = self.extract_language_navigation(head_docs, lang) + if base_lang != head_lang: + changes['translation_sections'] = True + break + + return changes + + def is_openapi_file(self, file_path: str) -> bool: + 
"""Check if file matches OpenAPI patterns from config.""" + openapi_config = self.config.get('openapi', {}) + + if not openapi_config.get('enabled', False): + return False + + patterns = openapi_config.get('file_patterns', []) + directories = openapi_config.get('directories', []) + + # Check if in allowed directory + path_parts = Path(file_path).parts + if len(path_parts) < 3: # e.g., en/api-reference/file.json + return False + + dir_name = path_parts[1] # Get directory after language code + if dir_name not in directories: + return False + + # Check if matches any pattern + file_name = Path(file_path).name + for pattern in patterns: + if self._match_pattern(file_name, pattern): + return True + + return False + + def _match_pattern(self, filename: str, pattern: str) -> bool: + """Simple glob-like pattern matching.""" + regex = pattern.replace('*', '.*').replace('?', '.') + return bool(re.match(f'^{regex}$', filename)) + + def categorize_files(self, files: List[str]) -> Dict[str, List[str]]: + """Categorize changed files by type.""" + categories = { + 'english': [], + 'english_openapi': [], # NEW category + 'translation': [], + 'translation_openapi': [], # NEW category + 'docs_json': [], + 'other': [] + } + + for file in files: + if file == 'docs.json': + categories['docs_json'].append(file) + elif file.startswith('en/'): + if file.endswith(('.md', '.mdx')): + categories['english'].append(file) + elif self.is_openapi_file(file): # NEW + categories['english_openapi'].append(file) + else: + categories['other'].append(file) + elif file.startswith(('jp/', 'cn/')): + if file.endswith(('.md', '.mdx')): + categories['translation'].append(file) + elif self.is_openapi_file(file): # NEW + categories['translation_openapi'].append(file) + else: + categories['other'].append(file) + else: + categories['other'].append(file) + + return categories + + def categorize_pr(self) -> Dict[str, any]: + """Categorize the PR based on changed files and docs.json sections.""" + changed_files = self.get_changed_files() + if not changed_files: + return { + 'type': 'none', + 'should_skip': True, + 'error': None, + 'files': {'english': [], 'translation': [], 'docs_json': [], 'other': []}, + 'docs_json_changes': {'english_section': False, 'translation_sections': False, 'any_docs_json_changes': False} + } + + file_categories = self.categorize_files(changed_files) + docs_json_changes = self.analyze_docs_json_changes() + + # Determine if there are English content changes (including OpenAPI) + has_english_files = len(file_categories['english']) > 0 or len(file_categories['english_openapi']) > 0 + has_english_docs_changes = docs_json_changes['english_section'] + + # Determine if there are translation changes (including OpenAPI) + has_translation_files = len(file_categories['translation']) > 0 or len(file_categories['translation_openapi']) > 0 + has_translation_docs_changes = docs_json_changes['translation_sections'] + + # Filter out non-documentation changes from consideration + relevant_english_changes = has_english_files or has_english_docs_changes + relevant_translation_changes = has_translation_files or has_translation_docs_changes + + # Categorize PR type + if relevant_english_changes and relevant_translation_changes: + pr_type = 'mixed' + should_skip = False + error = self.generate_mixed_pr_error(file_categories, docs_json_changes) + elif relevant_english_changes: + pr_type = 'english' + should_skip = False + error = None + elif relevant_translation_changes: + pr_type = 'translation' + should_skip = True + error = None + 
else: + pr_type = 'none' + should_skip = True + error = None + + return { + 'type': pr_type, + 'should_skip': should_skip, + 'error': error, + 'files': file_categories, + 'docs_json_changes': docs_json_changes + } + + def generate_mixed_pr_error(self, file_categories: Dict[str, List[str]], docs_json_changes: Dict[str, bool]) -> str: + """Generate comprehensive error message for mixed PRs.""" + + def format_file_list(files: List[str], max_files: int = 10) -> str: + if not files: + return " - (none)" + + formatted = [] + for file in files[:max_files]: + formatted.append(f" - `{file}`") + + if len(files) > max_files: + formatted.append(f" - ... and {len(files) - max_files} more") + + return '\n'.join(formatted) + + def format_docs_json_changes(changes: Dict[str, bool]) -> str: + parts = [] + if changes['english_section']: + parts.append(" - ✅ English navigation section") + if changes['translation_sections']: + parts.append(" - ✅ Translation navigation sections (jp, cn)") + if not parts: + parts.append(" - (no navigation changes)") + return '\n'.join(parts) + + error_msg = f"""❌ **Mixed Content PR Detected** + +This PR contains changes to both English content and translations, which violates our automated workflow requirements. + +**🔧 Required Action: Separate into Two PRs** + +Please create two separate pull requests: + +### 1️⃣ **English Content PR** +Create a PR containing only: +- Changes to `en/` files +- Changes to English navigation in `docs.json` +- This will trigger automatic translation generation + +### 2️⃣ **Translation Improvement PR** +Create a PR containing only: +- Changes to `jp/` and `cn/` files +- Changes to translation navigation sections in `docs.json` +- This will go through direct review (no automation) + +--- + +**📋 Files Detected in This PR:** + +**📝 English Content Files ({len(file_categories['english'])} files):** +{format_file_list(file_categories['english'])} + +**🌐 Translation Files ({len(file_categories['translation'])} files):** +{format_file_list(file_categories['translation'])} + +**📋 docs.json Navigation Changes:** +{format_docs_json_changes(docs_json_changes)} + +--- + +**💡 Why This Separation is Required:** + +- **Proper Review Process**: English content and translations have different review requirements +- **Automation Conflicts**: Mixed PRs break the automated translation workflow +- **Independent Merging**: Content and translations can be merged independently +- **Clear History**: Maintains clean git history for content vs translation changes + +**🤖 What Happens Next:** + +1. **English PR**: Will automatically generate translations and create a linked translation PR +2. **Translation PR**: Will go through standard review process +3. **Both PRs**: Can be reviewed and merged independently + +Please separate your changes and resubmit as two focused PRs. Thank you! 🙏""" + + return error_msg + + +class SyncPlanGenerator: + """ + Generates sync_plan.json with identical logic for both execute and update workflows. + + Extracts the sync plan generation logic from the analyze workflow to ensure + both workflows use the same file filtering and structure change detection. 
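The plan this class emits is what the execute and update workflows consume downstream. An illustrative `sync_plan` for a PR touching one guide and one OpenAPI spec (paths, sizes and SHAs are invented; the exact fields are assembled in `generate_sync_plan` below):

```python
# Illustrative sync_plan instance (invented values).
sync_plan = {
    "metadata": {
        "base_sha": "0a1b2c3d4e5f60718293a4b5c6d7e8f901234567",
        "head_sha": "76543210f9e8d7c6b5a49382716205f4e3d2c1b0",
        "comparison": "0a1b2c3d...76543210",
    },
    "files_to_sync": [
        {"path": "en/guides/workflow.mdx", "size": 4821, "type": "mdx", "status": "M"},
    ],
    "openapi_files_to_sync": [
        {"path": "en/api-reference/openapi_chat.json", "size": 91342,
         "type": "openapi_json", "status": "A"},
    ],
    "structure_changes": {
        "structure_changed": False,
        "navigation_modified": False,
        "languages_affected": [],
    },
    "target_languages": ["cn", "jp"],
    "sync_required": True,
}
```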
+ """ + + def __init__(self, base_sha: str, head_sha: str, repo_root: Optional[str] = None): + self.base_sha = base_sha + self.head_sha = head_sha + self.repo_root = Path(repo_root) if repo_root else Path(__file__).parent.parent.parent + self.analyzer = PRAnalyzer(base_sha, head_sha, repo_root) + self.config = self.analyzer.config + + def get_changed_files_with_status(self) -> List[Tuple[str, str]]: + """ + Get list of changed files with their status (A=added, M=modified, D=deleted, etc). + + Returns list of tuples: [(status, filepath), ...] + Only returns A (added) and M (modified) files for translation. + Filters out files that don't exist at head_sha (handles add-then-delete scenario). + """ + try: + result = subprocess.run([ + "git", "diff", "--name-status", "--diff-filter=AM", + self.base_sha, self.head_sha + ], capture_output=True, text=True, check=True, cwd=self.repo_root) + + files_with_status = [] + for line in result.stdout.strip().split('\n'): + if line.strip(): + parts = line.split('\t', 1) + if len(parts) == 2: + status, filepath = parts[0], parts[1] + + # Verify file exists at head_sha (handles add-then-delete scenario) + if self._file_exists_at_commit(filepath, self.head_sha): + files_with_status.append((status, filepath)) + else: + print(f"Skipping {filepath}: added then deleted in same PR") + + return files_with_status + except subprocess.CalledProcessError as e: + print(f"Error getting changed files with status: {e}") + return [] + + def _file_exists_at_commit(self, filepath: str, commit_sha: str) -> bool: + """Check if a file exists at a specific commit.""" + try: + subprocess.run([ + "git", "cat-file", "-e", f"{commit_sha}:{filepath}" + ], capture_output=True, check=True, cwd=self.repo_root) + return True + except subprocess.CalledProcessError: + return False + + def get_file_size(self, filepath: str) -> int: + """Get file size in bytes.""" + full_path = self.repo_root / filepath + try: + return full_path.stat().st_size if full_path.exists() else 0 + except: + return 0 + + def is_openapi_file(self, filepath: str) -> bool: + """Check if file matches OpenAPI JSON pattern.""" + openapi_config = self.config.get("openapi", {}) + if not openapi_config.get("enabled", False): + return False + + file_patterns = openapi_config.get("file_patterns", ["openapi*.json"]) + directories = openapi_config.get("directories", ["api-reference"]) + + # Check if file is in allowed directories + if not any(f"/{dir}/" in filepath or filepath.startswith(f"{dir}/") for dir in directories): + return False + + # Check if filename matches patterns + filename = Path(filepath).name + for pattern in file_patterns: + regex = pattern.replace('*', '.*').replace('?', '.') + if re.match(f'^{regex}$', filename): + return True + + return False + + def generate_sync_plan(self) -> Dict: + """ + Generate sync plan with identical logic to analyze workflow. 
+ + Returns sync_plan dict with: + - metadata: PR context and commit info + - files_to_sync: English markdown files (A/M only) + - openapi_files_to_sync: English OpenAPI JSON files (A/M only) + - structure_changes: docs.json change analysis + - target_languages: Languages to translate to + - sync_required: Whether any sync is needed + """ + # Get changed files with status + files_with_status = self.get_changed_files_with_status() + + # Categorize files for translation + files_to_sync = [] + openapi_files_to_sync = [] + docs_json_changed = False + + for status, filepath in files_with_status: + # Check for docs.json + if filepath == 'docs.json': + docs_json_changed = True + continue + + # Process English markdown files + if filepath.startswith('en/') and filepath.endswith(('.md', '.mdx')): + file_size = self.get_file_size(filepath) + file_type = 'mdx' if filepath.endswith('.mdx') else 'md' + files_to_sync.append({ + "path": filepath, + "size": file_size, + "type": file_type, + "status": status + }) + + # Process English OpenAPI JSON files + elif filepath.startswith('en/') and self.is_openapi_file(filepath): + file_size = self.get_file_size(filepath) + openapi_files_to_sync.append({ + "path": filepath, + "size": file_size, + "type": "openapi_json", + "status": status + }) + + # Analyze docs.json changes (if changed) + if docs_json_changed: + docs_changes = self.analyzer.analyze_docs_json_changes() + structure_changes = { + "structure_changed": docs_changes["any_docs_json_changes"], + "navigation_modified": docs_changes["english_section"], + "languages_affected": self.config["target_languages"] if docs_changes["english_section"] else [] + } + else: + structure_changes = { + "structure_changed": False, + "navigation_modified": False, + "languages_affected": [] + } + + # Create metadata + metadata = { + "base_sha": self.base_sha, + "head_sha": self.head_sha, + "comparison": f"{self.base_sha[:8]}...{self.head_sha[:8]}" + } + + # Build sync plan + sync_plan = { + "metadata": metadata, + "files_to_sync": files_to_sync, + "openapi_files_to_sync": openapi_files_to_sync, + "structure_changes": structure_changes, + "target_languages": self.config["target_languages"], + "sync_required": len(files_to_sync) > 0 or len(openapi_files_to_sync) > 0 or structure_changes.get("structure_changed", False) + } + + return sync_plan + + +def main(): + """Main entry point for command line usage.""" + if len(sys.argv) != 3: + print("Usage: python pr_analyzer.py ") + sys.exit(1) + + base_sha = sys.argv[1] + head_sha = sys.argv[2] + + analyzer = PRAnalyzer(base_sha, head_sha) + result = analyzer.categorize_pr() + + # Output results for GitHub Actions + print(f"pr_type={result['type']}") + print(f"should_skip={str(result['should_skip']).lower()}") + + if result['error']: + print(f"error_message={result['error']}") + sys.exit(1) + + # Output additional details + files = result['files'] + docs_changes = result['docs_json_changes'] + + print(f"english_files_count={len(files['english'])}") + print(f"translation_files_count={len(files['translation'])}") + print(f"docs_json_english_changes={str(docs_changes['english_section']).lower()}") + print(f"docs_json_translation_changes={str(docs_changes['translation_sections']).lower()}") + print(f"any_docs_json_changes={str(docs_changes['any_docs_json_changes']).lower()}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tools/translate/requirements.txt b/tools/translate/requirements.txt new file mode 100644 index 000000000..2bcd4dc95 --- /dev/null +++ 
b/tools/translate/requirements.txt
@@ -0,0 +1,3 @@
+python-dotenv>=1.0.0
+httpx>=0.25.0
+aiofiles>=23.0.0
\ No newline at end of file
diff --git a/tools/translate/security_validator.py b/tools/translate/security_validator.py
new file mode 100644
index 000000000..59d7affe1
--- /dev/null
+++ b/tools/translate/security_validator.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+"""
+Security validation utilities for documentation synchronization.
+Provides input validation, path sanitization, and security checks.
+"""
+
+import os
+import re
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Tuple
+import hashlib
+import hmac
+
+class SecurityValidator:
+    """Validates and sanitizes inputs for documentation synchronization"""
+
+    # Security constants
+    MAX_FILE_SIZE_MB = 10
+    MAX_FILES_PER_SYNC = 50
+    MAX_PATH_LENGTH = 255
+    MAX_CONTENT_LENGTH = 1024 * 1024 * 10  # 10MB
+
+    # Allowed file extensions
+    ALLOWED_EXTENSIONS = {'.md', '.mdx', '.json'}
+
+    # Allowed base directories
+    ALLOWED_BASE_DIRS = {'en', 'cn', 'jp'}
+
+    # Dangerous patterns to block
+    DANGEROUS_PATTERNS = [
+        r'\.\.',           # Directory traversal
+        r'^/',             # Absolute paths
+        r'^~',             # Home directory
+        r'\$\{',           # Variable expansion
+        r'`',              # Command substitution
+        r'<',              # HTML/script markup
+    ]
+
+    def __init__(self, base_dir: Path):
+        """Store the resolved repository root used to contain all validated paths."""
+        self.base_dir = Path(base_dir).resolve()
+
+    def validate_file_path(self, file_path: str) -> Tuple[bool, Optional[str]]:
+        """
+        Validate a file path for security issues.
+
+        Args:
+            file_path: The file path to validate
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        # Check path length
+        if len(file_path) > self.MAX_PATH_LENGTH:
+            return False, f"Path too long: {len(file_path)} > {self.MAX_PATH_LENGTH}"
+
+        # Check for dangerous patterns
+        for pattern in self.DANGEROUS_PATTERNS:
+            if re.search(pattern, file_path, re.IGNORECASE):
+                return False, f"Dangerous pattern detected: {pattern}"
+
+        # Parse path
+        path = Path(file_path)
+
+        # Check for absolute path
+        if path.is_absolute():
+            return False, "Absolute paths not allowed"
+
+        # Check file extension
+        if path.suffix not in self.ALLOWED_EXTENSIONS:
+            return False, f"File extension not allowed: {path.suffix}"
+
+        # Check if path starts with allowed directory
+        parts = path.parts
+        if not parts:
+            return False, "Empty path"
+
+        if parts[0] not in self.ALLOWED_BASE_DIRS and not file_path == 'docs.json':
+            return False, f"Path must start with allowed directory: {self.ALLOWED_BASE_DIRS}"
+
+        # Resolve and check if path stays within base directory
+        try:
+            full_path = (self.base_dir / path).resolve()
+            if not full_path.is_relative_to(self.base_dir):
+                return False, "Path escapes base directory"
+        except (ValueError, RuntimeError) as e:
+            return False, f"Invalid path: {e}"
+
+        return True, None
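Path validation is the first gate for every file in a sync plan; the module's own `__main__` block further down exercises it against a few representative paths. A minimal usage sketch, assuming the repository root is the current directory:

```python
from pathlib import Path
from security_validator import SecurityValidator  # as imported by sync_and_translate.py

validator = SecurityValidator(Path("."))  # repo root assumed to be the current directory

for candidate in ["en/guides/workflow.mdx", "../../etc/passwd", "en/tool.exe"]:
    ok, error = validator.validate_file_path(candidate)
    print(f"{candidate!r}: {'ok' if ok else error}")
# 'en/guides/workflow.mdx': ok
# '../../etc/passwd': Dangerous pattern detected: \.\.
# 'en/tool.exe': File extension not allowed: .exe
```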
+    def validate_file_content(self, content: str) -> Tuple[bool, Optional[str]]:
+        """
+        Validate file content for security issues.
+
+        Args:
+            content: The file content to validate
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        # Check content length
+        if len(content) > self.MAX_CONTENT_LENGTH:
+            return False, f"Content too large: {len(content)} > {self.MAX_CONTENT_LENGTH}"
+
+        # Check for script injections in content
+        dangerous_content_patterns = [
+            r'<script[^>]*>.*?</script>',   # Script tags
+            r'on\w+\s*=\s*["\']',           # Event handlers
+            r'javascript:',                 # JavaScript protocol
+            r'data:text/html',              # Data URLs with HTML
+        ]
+
+        for pattern in dangerous_content_patterns:
+            if re.search(pattern, content, re.IGNORECASE | re.DOTALL):
+                return False, f"Dangerous content pattern detected"
+
+        return True, None
+
+    def validate_json_structure(self, json_data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        Validate JSON structure for security issues.
+
+        Args:
+            json_data: The JSON data to validate
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        def check_value(value: Any, depth: int = 0) -> Optional[str]:
+            """Recursively check JSON values"""
+            if depth > 10:
+                return "JSON nesting too deep"
+
+            if isinstance(value, str):
+                # Check for dangerous patterns in string values
+                for pattern in self.DANGEROUS_PATTERNS:
+                    if re.search(pattern, value, re.IGNORECASE):
+                        return f"Dangerous pattern in JSON value: {pattern}"
+            elif isinstance(value, dict):
+                for k, v in value.items():
+                    if not isinstance(k, str):
+                        return "Non-string key in JSON"
+                    error = check_value(v, depth + 1)
+                    if error:
+                        return error
+            elif isinstance(value, list):
+                for item in value:
+                    error = check_value(item, depth + 1)
+                    if error:
+                        return error
+
+            return None
+
+        error = check_value(json_data)
+        if error:
+            return False, error
+
+        return True, None
+
+    def validate_sync_plan(self, sync_plan: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        Validate a synchronization plan.
+
+        Args:
+            sync_plan: The sync plan to validate
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        # Check required fields
+        required_fields = ['files_to_sync', 'target_languages', 'metadata']
+        for field in required_fields:
+            if field not in sync_plan:
+                return False, f"Missing required field: {field}"
+
+        # Validate file count
+        files = sync_plan.get('files_to_sync', [])
+        if len(files) > self.MAX_FILES_PER_SYNC:
+            return False, f"Too many files: {len(files)} > {self.MAX_FILES_PER_SYNC}"
+
+        # Validate each file
+        for file_info in files:
+            if not isinstance(file_info, dict):
+                return False, "Invalid file info structure"
+
+            file_path = file_info.get('path')
+            if not file_path:
+                return False, "File path missing in sync plan"
+
+            valid, error = self.validate_file_path(file_path)
+            if not valid:
+                return False, f"Invalid file path in sync plan: {error}"
+
+            # Validate file size if present
+            if 'size' in file_info:
+                max_size = self.MAX_FILE_SIZE_MB * 1024 * 1024
+                if file_info['size'] > max_size:
+                    return False, f"File too large: {file_path}"
+
+        # Validate target languages
+        valid_languages = {'cn', 'jp'}
+        target_langs = sync_plan.get('target_languages', [])
+        for lang in target_langs:
+            if lang not in valid_languages:
+                return False, f"Invalid target language: {lang}"
+
+        return True, None
+
+    def sanitize_path(self, file_path: str) -> Optional[str]:
+        """
+        Sanitize a file path by removing dangerous elements.
+ + Args: + file_path: The file path to sanitize + + Returns: + Sanitized path or None if path cannot be sanitized + """ + # Remove leading/trailing whitespace + file_path = file_path.strip() + + # Remove any null bytes + file_path = file_path.replace('\x00', '') + + # Normalize path separators + file_path = file_path.replace('\\', '/') + + # Remove double slashes + while '//' in file_path: + file_path = file_path.replace('//', '/') + + # Validate the sanitized path + valid, _ = self.validate_file_path(file_path) + if not valid: + return None + + return file_path + + def create_safe_temp_dir(self) -> Path: + """ + Create a safe temporary directory for operations. + + Returns: + Path to the temporary directory + """ + import tempfile + import secrets + + # Create temp dir with random suffix + suffix = secrets.token_hex(8) + temp_dir = Path(tempfile.mkdtemp(suffix=f'-sync-{suffix}')) + + # Set restrictive permissions (Unix only) + try: + os.chmod(temp_dir, 0o700) + except: + pass # Windows doesn't support chmod + + return temp_dir + + def calculate_file_hash(self, file_path: Path) -> str: + """ + Calculate SHA-256 hash of a file. + + Args: + file_path: Path to the file + + Returns: + Hex digest of the file hash + """ + sha256_hash = hashlib.sha256() + with open(file_path, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + + def verify_artifact_integrity(self, artifact_data: bytes, expected_hash: Optional[str] = None) -> bool: + """ + Verify the integrity of an artifact. + + Args: + artifact_data: The artifact data + expected_hash: Optional expected hash + + Returns: + True if artifact is valid + """ + if expected_hash: + actual_hash = hashlib.sha256(artifact_data).hexdigest() + return hmac.compare_digest(actual_hash, expected_hash) + + # Basic validation if no hash provided + return len(artifact_data) < self.MAX_CONTENT_LENGTH + + def is_trusted_contributor(self, username: str, trusted_list: List[str] = None) -> bool: + """ + Check if a user is a trusted contributor. + + Args: + username: GitHub username + trusted_list: Optional list of trusted usernames + + Returns: + True if user is trusted + """ + if not trusted_list: + # Default trusted contributors (should be configured) + trusted_list = [] + + return username in trusted_list + + def rate_limit_check(self, identifier: str, max_requests: int = 10, window_seconds: int = 60) -> bool: + """ + Simple rate limiting check (would need persistent storage in production). + + Args: + identifier: Unique identifier (e.g., PR number) + max_requests: Maximum requests allowed + window_seconds: Time window in seconds + + Returns: + True if within rate limit + """ + # This is a placeholder - in production, you'd use Redis or similar + # For now, always return True + return True + + +def create_validator(base_dir: Optional[Path] = None) -> SecurityValidator: + """ + Create a security validator instance. 
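The hash helpers above tie artifact verification together: a streamed SHA-256 digest compared with `hmac.compare_digest` to avoid timing side channels. A short sketch of verifying a downloaded artifact against a known digest (payload and digest are invented):

```python
import hashlib
import hmac

def verify(artifact_bytes: bytes, expected_hex_digest: str) -> bool:
    actual = hashlib.sha256(artifact_bytes).hexdigest()
    return hmac.compare_digest(actual, expected_hex_digest)

payload = b'{"files_to_sync": []}'
expected = hashlib.sha256(payload).hexdigest()  # normally recorded when the artifact is produced
print(verify(payload, expected))       # True
print(verify(payload + b" ", expected))  # False
```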
+
+    Args:
+        base_dir: Optional base directory (defaults to the repository root,
+            three levels above this script)
+
+    Returns:
+        SecurityValidator instance
+    """
+    if base_dir is None:
+        base_dir = Path(__file__).parent.parent.parent
+
+    return SecurityValidator(base_dir)
+
+
+# Example usage and tests
+if __name__ == "__main__":
+    validator = create_validator()
+
+    # Test path validation
+    test_paths = [
+        "en/docs/test.md",  # Valid
+        "../../../etc/passwd",  # Invalid - directory traversal
+        "/etc/passwd",  # Invalid - absolute path
+        "en/test.exe",  # Invalid - wrong extension
+        "cn/docs/test.mdx",  # Valid
+        "docs.json",  # Valid - special case
+    ]
+
+    print("Path Validation Tests:")
+    for path in test_paths:
+        valid, error = validator.validate_file_path(path)
+        status = "✓" if valid else "✗"
+        print(f"  {status} {path}: {error if error else 'Valid'}")
+
+    print("\nContent Validation Tests:")
+    test_contents = [
+        "# Normal markdown content",  # Valid
+        "<script>alert('xss')</script>",  # Invalid - script tag
+        "Normal text with onclick='alert()'",  # Invalid - event handler
+    ]
+
+    for content in test_contents:
+        valid, error = validator.validate_file_content(content)
+        status = "✓" if valid else "✗"
+        preview = content[:30] + "..." if len(content) > 30 else content
+        print(f"  {status} {preview}: {error if error else 'Valid'}")
\ No newline at end of file
diff --git a/tools/translate/sync_and_translate.py b/tools/translate/sync_and_translate.py
new file mode 100644
index 000000000..f17c55e6f
--- /dev/null
+++ b/tools/translate/sync_and_translate.py
@@ -0,0 +1,2025 @@
+#!/usr/bin/env python3
+"""
+Documentation Auto-Sync System
+Synchronizes English documentation structure and content to Chinese and Japanese versions.
+With enhanced security for handling external PRs.
+"""
+
+import json
+import os
+import sys
+import asyncio
+import shutil
+import re
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Optional, Any
+import subprocess
+import tempfile
+
+# Import the existing translation function
+from main import translate_text, load_md_mdx
+
+# Import format-preserving JSON serialization
+from json_formatter import save_json_with_preserved_format
+
+# Import OpenAPI translation pipeline
+from openapi import translate_openapi_file, translate_openapi_file_async
+
+# Import security validator
+try:
+    from security_validator import SecurityValidator, create_validator
+except ImportError:
+    # Fallback if security module not available
+    SecurityValidator = None
+    create_validator = None
+
+# --- Configuration ---
+SCRIPT_DIR = Path(__file__).resolve().parent
+BASE_DIR = SCRIPT_DIR.parent.parent
+DOCS_JSON_PATH = BASE_DIR / "docs.json"
+
+class DocsSynchronizer:
+    def __init__(self, dify_api_key: str, enable_security: bool = False):
+        self.dify_api_key = dify_api_key
+        self.base_dir = BASE_DIR
+        self.docs_json_path = DOCS_JSON_PATH
+        self.enable_security = enable_security
+
+        # Initialize security validator if enabled
+        self.validator = None
+        if enable_security and create_validator:
+            self.validator = create_validator(self.base_dir)
+        self.config = self.load_config()
+
+    def validate_file_path(self, file_path: str) -> Tuple[bool, Optional[str]]:
+        """Validate file path for security if security is enabled"""
+        if not self.enable_security or not self.validator:
+            return True, None
+
+        return self.validator.validate_file_path(file_path)
+
+    def validate_sync_plan(self, sync_plan: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """Validate synchronization plan for security if security is enabled"""
+        if not self.enable_security or not self.validator:
+
return True, None + + return self.validator.validate_sync_plan(sync_plan) + + def sanitize_path(self, file_path: str) -> Optional[str]: + """Sanitize file path if security is enabled""" + if not self.enable_security or not self.validator: + return file_path + + return self.validator.sanitize_path(file_path) + + def load_config(self) -> Dict[str, Any]: + """Load configuration file with language mappings""" + config_path = SCRIPT_DIR / "config.json" + if config_path.exists(): + with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) + + # Validate required fields + required = ["source_language", "target_languages", "languages"] + for field in required: + if field not in config: + raise ValueError(f"Missing required field in config.json: {field}") + + # Validate all referenced languages exist + all_langs = [config["source_language"]] + config["target_languages"] + for lang in all_langs: + if lang not in config["languages"]: + raise ValueError(f"Language '{lang}' referenced but not defined in languages") + + # Validate target languages have translation_notice + for lang in config["target_languages"]: + if "translation_notice" not in config["languages"][lang]: + raise ValueError(f"Target language '{lang}' missing translation_notice") + + return config + + raise FileNotFoundError(f"Config file not found: {config_path}") + + @property + def source_language(self) -> str: + """Get source language code from config""" + return self.config["source_language"] + + @property + def target_languages(self) -> List[str]: + """Get list of target language codes from config""" + return self.config["target_languages"] + + def get_language_info(self, lang_code: str) -> Dict[str, Any]: + """Get full language information for a language code""" + return self.config["languages"].get(lang_code, {}) + + def get_language_name(self, lang_code: str) -> str: + """Get human-readable language name (e.g., 'English', 'Chinese')""" + return self.get_language_info(lang_code).get("name", "") + + def get_language_directory(self, lang_code: str) -> str: + """Get directory path for a language (e.g., 'en', 'cn')""" + return self.get_language_info(lang_code).get("directory", lang_code) + + def get_translation_notice(self, lang_code: str) -> str: + """Get AI translation notice template for a target language""" + return self.get_language_info(lang_code).get("translation_notice", "") + + def get_changed_files(self, since_commit: str = "HEAD~1") -> Dict[str, List[str]]: + """Get changed files using git diff""" + try: + # Get file changes + result = subprocess.run([ + "git", "diff", "--name-status", since_commit, "HEAD" + ], capture_output=True, text=True, cwd=self.base_dir) + + changes = { + "added": [], + "modified": [], + "deleted": [], + "renamed": [] + } + + for line in result.stdout.strip().split('\n'): + if not line: + continue + + parts = line.split('\t') + status = parts[0] + + if status == 'A': + changes["added"].append(parts[1]) + elif status == 'M': + changes["modified"].append(parts[1]) + elif status == 'D': + changes["deleted"].append(parts[1]) + elif status.startswith('R'): + changes["renamed"].append((parts[1], parts[2])) + + return changes + except subprocess.CalledProcessError as e: + print(f"Error getting git changes: {e}") + return {"added": [], "modified": [], "deleted": [], "renamed": []} + + def get_file_diff(self, file_path: str, since_commit: str = "HEAD~1") -> Optional[str]: + """Get git diff for a specific file""" + try: + result = subprocess.run([ + "git", "diff", since_commit, "HEAD", "--", 
file_path + ], capture_output=True, text=True, cwd=self.base_dir) + + if result.returncode == 0: + return result.stdout + else: + print(f"Warning: Could not get diff for {file_path}") + return None + except subprocess.CalledProcessError as e: + print(f"Error getting diff for {file_path}: {e}") + return None + + def is_english_doc_file(self, file_path: str) -> bool: + """Check if file is a source language documentation file that should be synced""" + source_dir = self.get_language_directory(self.source_language) + return (file_path.startswith(f"{source_dir}/") and + (file_path.endswith('.md') or file_path.endswith('.mdx')) and + not file_path.startswith(f"{source_dir}/api-reference/")) + + def convert_path_to_target_language(self, source_path: str, target_lang: str) -> str: + """Convert source language path to target language path""" + source_dir = self.get_language_directory(self.source_language) + target_dir = self.get_language_directory(target_lang) + if source_path.startswith(f"{source_dir}/"): + return source_path.replace(f"{source_dir}/", f"{target_dir}/", 1) + return source_path + + def get_relative_source_path_for_notice(self, target_path: str) -> str: + """Get absolute path to source language version for AI notice (without file extension)""" + source_dir = self.get_language_directory(self.source_language) + + # Find which target language directory this path is in + for target_lang in self.target_languages: + target_dir = self.get_language_directory(target_lang) + if target_path.startswith(f"{target_dir}/"): + # Replace target dir with source dir + source_path = target_path.replace(f"{target_dir}/", f"{source_dir}/", 1) + # Remove file extension (.md or .mdx) + source_path = source_path.rsplit('.', 1)[0] if '.' in source_path else source_path + # Return absolute path starting with / + return f"/{source_path}" + + return "" + + def _build_notice_removal_pattern(self) -> str: + """Build regex pattern to match any translation notice from config templates.""" + # Collect all translation notice templates from target languages + notice_templates = [] + for lang in self.target_languages: + template = self.get_translation_notice(lang) + if template: + # Escape regex special chars, but replace {source_path} with a wildcard + # First, escape the template for regex + escaped = re.escape(template.strip()) + # Replace escaped placeholder with pattern that matches any path + escaped = escaped.replace(r'\{source_path\}', r'[^\]]+') + notice_templates.append(escaped) + + # Build pattern that matches any of the templates, followed by optional whitespace/newlines + if notice_templates: + return '(?:' + '|'.join(notice_templates) + r')\s*\n*' + return '' + + def insert_notice_under_title(self, content: str, notice: str) -> str: + """Insert notice after frontmatter or first heading to keep it under the doc title.""" + if not notice.strip(): + return content + + if not content: + return notice + + bom_prefix = "" + if content.startswith("\ufeff"): + bom_prefix = "\ufeff" + content = content[len("\ufeff"):] + + # Remove any existing translation notice to prevent duplicates + # Pattern dynamically built from config templates + existing_notice_pattern = self._build_notice_removal_pattern() + if existing_notice_pattern: + content = re.sub(existing_notice_pattern, '', content, flags=re.DOTALL) + + notice_block = notice if notice.endswith("\n") else f"{notice}\n" + + frontmatter_match = re.match(r"^(---\s*\n.*?\n---\s*\n?)", content, flags=re.DOTALL) + if frontmatter_match: + frontmatter = 
frontmatter_match.group(1) + remainder = content[frontmatter_match.end():].lstrip("\n") + + final = frontmatter + if not final.endswith("\n"): + final += "\n" + final += notice_block + if remainder: + final += remainder + return bom_prefix + final + + heading_match = re.search(r"(?m)^(#{1,6}\s+.+)$", content) + if heading_match: + line_start = heading_match.start() + line_end = content.find("\n", line_start) + if line_end == -1: + line_end = len(content) + else: + line_end += 1 + + heading_section = content[:line_end] + remainder = content[line_end:].lstrip("\n") + + final = heading_section + if not final.endswith("\n"): + final += "\n" + final += notice_block + if remainder: + final += remainder + return bom_prefix + final + + return bom_prefix + notice_block + content.lstrip("\n") + + async def translate_file_with_notice(self, en_file_path: str, target_file_path: str, target_lang: str, + the_doc_exist: Optional[str] = None, diff_original: Optional[str] = None) -> bool: + """Translate a file and add AI notice at the top + + Args: + en_file_path: English source file path + target_file_path: Target translation file path + target_lang: Target language code (cn, jp) + the_doc_exist: Optional existing translation content (for modified files) + diff_original: Optional git diff of original file (for modified files) + """ + try: + # Security validation + if self.enable_security: + # Validate source path + valid, error = self.validate_file_path(en_file_path) + if not valid: + print(f"Security error - invalid source path {en_file_path}: {error}") + return False + + # Validate target path + valid, error = self.validate_file_path(target_file_path) + if not valid: + print(f"Security error - invalid target path {target_file_path}: {error}") + return False + + # Sanitize paths + en_file_path = self.sanitize_path(en_file_path) or en_file_path + target_file_path = self.sanitize_path(target_file_path) or target_file_path + + print(f"Translating {en_file_path} to {target_file_path}") + + # Ensure target directory exists + target_dir = Path(self.base_dir / target_file_path).parent + target_dir.mkdir(parents=True, exist_ok=True) + + # Get language names for translation API + source_lang_name = self.get_language_name(self.source_language) + target_lang_name = self.get_language_name(target_lang) + + # Translate content + translated_content = await translate_text( + str(self.base_dir / en_file_path), + self.dify_api_key, + source_lang_name, + target_lang_name, + the_doc_exist=the_doc_exist, + diff_original=diff_original + ) + + if not translated_content or not translated_content.strip(): + print(f"Warning: No translated content received for {en_file_path}") + return False + + # Prepare AI notice + source_relative_path = self.get_relative_source_path_for_notice(target_file_path) + notice = self.get_translation_notice(target_lang).format(source_path=source_relative_path) + + # Combine notice and translated content + final_content = self.insert_notice_under_title(translated_content, notice) + + # Write to target file + with open(self.base_dir / target_file_path, 'w', encoding='utf-8') as f: + f.write(final_content) + + print(f"✓ Successfully created {target_file_path}") + return True + + except Exception as e: + print(f"Error translating {en_file_path} to {target_file_path}: {e}") + return False + + def sync_file_operations(self, changes: Dict[str, List[str]]) -> List[str]: + """Sync file operations to target languages""" + operations_log = [] + + # Handle added files + for file_path in changes["added"]: + if 
self.is_english_doc_file(file_path): + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(file_path, target_lang) + # We'll translate these in the async part + operations_log.append(f"WILL_TRANSLATE: {file_path} -> {target_path}") + + # Handle deleted files + for file_path in changes["deleted"]: + if self.is_english_doc_file(file_path): + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(file_path, target_lang) + target_full_path = self.base_dir / target_path + if target_full_path.exists(): + target_full_path.unlink() + operations_log.append(f"DELETED: {target_path}") + + # Handle renamed files + for old_path, new_path in changes["renamed"]: + if self.is_english_doc_file(old_path) or self.is_english_doc_file(new_path): + for target_lang in self.target_languages: + old_target = self.convert_path_to_target_language(old_path, target_lang) + new_target = self.convert_path_to_target_language(new_path, target_lang) + + old_full_path = self.base_dir / old_target + new_full_path = self.base_dir / new_target + + if old_full_path.exists(): + # Ensure target directory exists + new_full_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(old_full_path), str(new_full_path)) + operations_log.append(f"RENAMED: {old_target} -> {new_target}") + + return operations_log + + async def translate_new_and_modified_files(self, changes: Dict[str, List[str]], since_commit: str = "HEAD~1") -> List[str]: + """Translate new and modified files + + Args: + changes: Dictionary with 'added', 'modified', 'deleted', 'renamed' file lists + since_commit: Git commit to compare against for diffs + """ + translation_log = [] + tasks = [] + + # Handle added files (no existing translation) + for file_path in changes["added"]: + if self.is_english_doc_file(file_path): + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(file_path, target_lang) + # New files - no existing translation or diff needed + task = self.translate_file_with_notice(file_path, target_path, target_lang) + tasks.append((task, file_path, target_path, "added")) + + # Handle modified files (may have existing translation) + for file_path in changes["modified"]: + if self.is_english_doc_file(file_path): + # Get diff for this file + diff_original = self.get_file_diff(file_path, since_commit) + + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(file_path, target_lang) + target_full_path = self.base_dir / target_path + + # Check if target translation exists + the_doc_exist = None + if target_full_path.exists(): + try: + with open(target_full_path, 'r', encoding='utf-8') as f: + the_doc_exist = f.read() + print(f"Found existing translation for {target_path} ({len(the_doc_exist)} chars)") + except Exception as e: + print(f"Warning: Could not read existing translation {target_path}: {e}") + the_doc_exist = None + + # Modified files - pass existing translation and diff if available + task = self.translate_file_with_notice( + file_path, + target_path, + target_lang, + the_doc_exist=the_doc_exist, + diff_original=diff_original + ) + tasks.append((task, file_path, target_path, "modified")) + + # Handle renamed files that need translation + for old_path, new_path in changes["renamed"]: + if self.is_english_doc_file(new_path): + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(new_path, target_lang) + # Renamed files treated as new + task = 
self.translate_file_with_notice(new_path, target_path, target_lang) + tasks.append((task, new_path, target_path, "renamed")) + + # Execute translations with concurrency control + semaphore = asyncio.Semaphore(2) # Limit concurrent translations + + async def bounded_translate(task, en_path, target_path, change_type): + async with semaphore: + success = await task + return success, en_path, target_path, change_type + + # Run translations + if tasks: + print(f"Starting {len(tasks)} translation tasks...") + results = await asyncio.gather(*[ + bounded_translate(task, en_path, target_path, change_type) + for task, en_path, target_path, change_type in tasks + ], return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + translation_log.append(f"ERROR: {result}") + else: + success, en_path, target_path, change_type = result + if success: + translation_log.append(f"TRANSLATED ({change_type}): {en_path} -> {target_path}") + else: + translation_log.append(f"FAILED ({change_type}): {en_path} -> {target_path}") + + return translation_log + + def load_docs_json(self) -> Dict[str, Any]: + """Load docs.json file""" + try: + with open(self.docs_json_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + print(f"Error loading docs.json: {e}") + return {} + + def save_docs_json(self, data: Dict[str, Any]) -> bool: + """Save docs.json file while preserving original formatting""" + try: + # Use format-preserving serialization + # Pass the same file as both target and reference since we're overwriting + success = save_json_with_preserved_format( + self.docs_json_path, + data, + reference_file=self.docs_json_path + ) + if success: + print("✓ Saved docs.json with preserved formatting") + return success + except Exception as e: + print(f"Error saving docs.json: {e}") + import traceback + traceback.print_exc() + return False + + def extract_english_structure_changes(self, changes: Dict[str, List[str]]) -> bool: + """Check if docs.json was modified""" + return "docs.json" in changes["modified"] or "docs.json" in changes["added"] + + def get_dropdown_translation(self, en_dropdown_name: str, target_lang: str) -> str: + """ + Get translated dropdown name from config.json label_translations. + Falls back to English name if not found. 
+ """ + label_translations = self.config.get("label_translations", {}) + if en_dropdown_name in label_translations: + translation = label_translations[en_dropdown_name].get(target_lang) + if translation: + return translation + # Fallback to English name + return en_dropdown_name + + def get_basic_label_translation(self, en_label: str, target_lang: str) -> str: + """Get basic translation for common labels""" + basic_translations = { + "cn": { + "Getting Started": "快速开始", + "Documentation": "文档", + "Build": "构建", + "Debug": "调试", + "Publish": "发布", + "Monitor": "监控", + "Knowledge": "知识库", + "Workspace": "工作区", + "Tutorials": "教程", + "FAQ": "常见问题", + "Introduction": "介绍", + "Quick Start": "快速开始", + "Key Concepts": "核心概念" + }, + "jp": { + "Getting Started": "はじめに", + "Documentation": "ドキュメント", + "Build": "ビルド", + "Debug": "デバッグ", + "Publish": "公開", + "Monitor": "モニタリング", + "Knowledge": "ナレッジベース", + "Workspace": "ワークスペース", + "Tutorials": "チュートリアル", + "FAQ": "よくある質問", + "Introduction": "紹介", + "Quick Start": "クイックスタート", + "Key Concepts": "主要概念" + } + } + + return basic_translations.get(target_lang, {}).get(en_label, en_label) + + def find_file_in_dropdown_structure(self, file_path: str, dropdown: Dict) -> Optional[List[str]]: + """ + Find a file path in a dropdown's pages structure or groups with openapi fields. + Returns the path to the item as a list of keys/indices, or None if not found. + Example: ["pages", 0, "pages", 2] means dropdown["pages"][0]["pages"][2] == file_path + Example: ["groups", 1, "openapi"] means dropdown["groups"][1]["openapi"] == file_path + + Note: docs.json stores paths without file extensions, so we strip them for comparison. + """ + # Strip file extension for comparison (docs.json doesn't include .md/.mdx extensions) + file_path_no_ext = re.sub(r'\.(mdx?|md)$', '', file_path) + + def search_pages(pages: List, current_path: List) -> Optional[List[str]]: + for i, item in enumerate(pages): + if isinstance(item, str): + # Compare without extensions + item_no_ext = re.sub(r'\.(mdx?|md)$', '', item) + if item_no_ext == file_path_no_ext: + return current_path + [i] + elif isinstance(item, dict) and "pages" in item: + result = search_pages(item["pages"], current_path + [i, "pages"]) + if result: + return result + return None + + # Search in pages array (markdown files) + if "pages" in dropdown: + result = search_pages(dropdown["pages"], ["pages"]) + if result: + return result + + # Search in groups array (OpenAPI files) + if "groups" in dropdown: + groups = dropdown["groups"] + for i, group in enumerate(groups): + if isinstance(group, dict) and "openapi" in group: + # Compare OpenAPI file paths (no extension stripping needed for .json) + if group["openapi"] == file_path: + return ["groups", i, "openapi"] + + return None + + def find_dropdown_containing_file(self, file_path: str, lang_section: Dict) -> Optional[Tuple[str, List[str]]]: + """ + Find which dropdown contains a specific file path. + Returns (dropdown_name, path_to_file) or None if not found. + """ + dropdowns = lang_section.get("dropdowns", []) + for dropdown in dropdowns: + dropdown_name = dropdown.get("dropdown", "") + file_location = self.find_file_in_dropdown_structure(file_path, dropdown) + if file_location: + return (dropdown_name, file_location) + return None + + def add_page_to_structure(self, pages: List, page_path: str, reference_structure: List = None) -> bool: + """ + Add a page to a pages array, attempting to maintain position relative to reference structure. 
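+
+        Illustrative example: adding "en/docs/new-page.mdx" to ["en/docs/intro"]
+        appends "en/docs/new-page", since docs.json entries omit file extensions.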
+ Returns True if added, False if already exists. + + Note: Strips file extensions before adding (docs.json doesn't include .md/.mdx extensions). + """ + # Strip file extension (docs.json doesn't include extensions) + page_path_no_ext = re.sub(r'\.(mdx?|md)$', '', page_path) + + # First pass: check if page already exists anywhere in the structure + def page_exists(pages_to_check): + for item in pages_to_check: + if isinstance(item, str): + item_no_ext = re.sub(r'\.(mdx?|md)$', '', item) + if item_no_ext == page_path_no_ext: + return True + elif isinstance(item, dict) and "pages" in item: + if page_exists(item["pages"]): + return True + return False + + if page_exists(pages): + return False + + # Page doesn't exist - add it to the top level (without extension) + pages.append(page_path_no_ext) + return True + + def _add_openapi_group(self, target_dropdown: Dict, openapi_path: str, file_location: List, en_dropdown: Dict) -> bool: + """ + Add an OpenAPI group to target dropdown. + file_location is like ["groups", 1, "openapi"] + + Args: + target_dropdown: Target language dropdown structure + openapi_path: Path to OpenAPI file (e.g., "cn/api-reference/openapi_test.json") + file_location: Location path like ["groups", 1, "openapi"] + en_dropdown: English dropdown structure for reference + + Returns: + True if added, False if already exists + """ + if len(file_location) < 3 or file_location[0] != "groups" or file_location[2] != "openapi": + return False + + group_index = file_location[1] + + # Ensure groups array exists + if "groups" not in target_dropdown: + target_dropdown["groups"] = [] + + # Check if this OpenAPI file already exists in target + for group in target_dropdown.get("groups", []): + if isinstance(group, dict) and group.get("openapi") == openapi_path: + return False # Already exists + + # Get the English group structure + en_groups = en_dropdown.get("groups", []) + if group_index >= len(en_groups): + return False + + en_group = en_groups[group_index] + + # Create the target group with the same structure but translated path + target_group = { + "group": en_group.get("group", ""), # Keep English group name for now (could translate later) + "openapi": openapi_path + } + + # Ensure we have enough slots in target groups + while len(target_dropdown["groups"]) <= group_index: + target_dropdown["groups"].append(None) + + # Insert at the same index position + if target_dropdown["groups"][group_index] is None: + target_dropdown["groups"][group_index] = target_group + else: + # Index already occupied, append instead + target_dropdown["groups"].append(target_group) + + return True + + def add_page_at_location(self, target_dropdown: Dict, page_path: str, file_location: List, en_dropdown: Dict) -> bool: + """ + Add a page to target dropdown at the same nested location as in English dropdown. + Uses the file_location path to navigate to the correct nested group. + + Args: + target_dropdown: Target language dropdown structure + page_path: Path of the file to add (e.g., "cn/documentation/pages/..." 
or "cn/api-reference/openapi_test.json") + file_location: Location path from find_file_in_dropdown_structure + (e.g., ["pages", 0, "pages", 0, "pages", 3] or ["groups", 1, "openapi"]) + en_dropdown: English dropdown structure for reference + + Returns: + True if added, False if already exists + """ + # Handle OpenAPI groups structure (e.g., ["groups", 1, "openapi"]) + if file_location and file_location[0] == "groups": + return self._add_openapi_group(target_dropdown, page_path, file_location, en_dropdown) + + # Strip file extension (docs.json doesn't include extensions) + page_path_no_ext = re.sub(r'\.(mdx?|md)$', '', page_path) + + # Check if page already exists anywhere in target + def page_exists(pages_to_check): + if not pages_to_check: + return False + for item in pages_to_check: + if isinstance(item, str): + item_no_ext = re.sub(r'\.(mdx?|md)$', '', item) + if item_no_ext == page_path_no_ext: + return True + elif isinstance(item, dict) and "pages" in item: + if page_exists(item["pages"]): + return True + return False + + if "pages" in target_dropdown and page_exists(target_dropdown["pages"]): + return False + + # Navigate to the correct nested location + # file_location is like ["pages", 0, "pages", 0, "pages", 3] + # We navigate through the path, creating groups as needed + + current_target = target_dropdown + current_en = en_dropdown + + # Process path in pairs: "pages" key, then index + i = 0 + while i < len(file_location) - 1: # Stop before final element (insertion point) + key = file_location[i] + + if key == "pages": + # Ensure pages array exists + if "pages" not in current_target: + current_target["pages"] = [] + + # Check if next element is an index + if i + 1 < len(file_location): + next_elem = file_location[i + 1] + + if isinstance(next_elem, int): + # Navigate to group at this index + idx = next_elem + + # Get corresponding English item + en_pages = current_en.get("pages", []) + if idx < len(en_pages): + en_item = en_pages[idx] + + # If English item is a group, ensure target has matching group + if isinstance(en_item, dict) and "pages" in en_item: + # Ensure target has items up to this index (only for groups we'll navigate through) + while len(current_target["pages"]) <= idx: + current_target["pages"].append(None) + target_item = current_target["pages"][idx] + + if not isinstance(target_item, dict) or "pages" not in target_item: + # Create group structure (preserve existing group name if present) + if isinstance(target_item, dict) and "group" in target_item: + existing_group = target_item["group"] + else: + existing_group = en_item.get("group", "") + + current_target["pages"][idx] = { + "group": existing_group, + "pages": target_item.get("pages", []) if isinstance(target_item, dict) else [] + } + if "icon" in en_item: + current_target["pages"][idx]["icon"] = en_item["icon"] + + # Navigate into this group + current_target = current_target["pages"][idx] + current_en = en_item + i += 2 # Skip "pages" and index + continue + + i += 1 + + # Add the page at the final location + if "pages" not in current_target: + current_target["pages"] = [] + + # Get the insertion index from file_location (last element) + # file_location is like ["pages", 1] or ["pages", 0, "pages", 3] + # The last element is the index where the file should be inserted + if file_location and isinstance(file_location[-1], int): + insert_index = file_location[-1] + # Insert at the same index position as in English structure + # If index is beyond current length, append to end + if insert_index <= 
len(current_target["pages"]): + current_target["pages"].insert(insert_index, page_path_no_ext) + else: + current_target["pages"].append(page_path_no_ext) + else: + # Fallback: append if we can't determine index + current_target["pages"].append(page_path_no_ext) + + return True + + def remove_page_from_structure(self, pages: List, page_path: str) -> bool: + """ + Remove a page from a pages array recursively. + Returns True if removed, False if not found. + + Note: Strips file extensions for comparison (docs.json doesn't include .md/.mdx extensions). + """ + # Strip file extension for comparison + page_path_no_ext = re.sub(r'\.(mdx?|md)$', '', page_path) + + for i, item in enumerate(pages): + if isinstance(item, str): + item_no_ext = re.sub(r'\.(mdx?|md)$', '', item) + if item_no_ext == page_path_no_ext: + pages.pop(i) + return True + elif isinstance(item, dict) and "pages" in item: + if self.remove_page_from_structure(item["pages"], page_path): + # Clean up empty groups + if not item["pages"]: + pages.pop(i) + return True + return False + + def extract_file_locations(self, section_data) -> Dict[str, Dict]: + """ + Extract all file paths and their locations in the navigation structure. + Returns dict mapping file path to location metadata including group indices for language-independent navigation. + """ + locations = {} + + if not section_data or "dropdowns" not in section_data: + return locations + + def traverse_structure(pages, dropdown_name, dropdown_idx, group_path, group_indices, path_prefix=""): + """Recursively traverse pages structure to extract file locations.""" + for idx, item in enumerate(pages): + if isinstance(item, str): + # Direct page reference + locations[item] = { + "dropdown": dropdown_name, + "dropdown_idx": dropdown_idx, + "group_path": group_path, # Full group path for logging/debugging + "group_indices": group_indices.copy(), # Index-based path for language-independent navigation + "page_index": idx, # Position within parent pages array + "path": f"{path_prefix}[{idx}]", + "type": "page" + } + elif isinstance(item, dict): + if "pages" in item: + # Nested group + group_name = item.get("group", item.get("label", "")) + new_group_path = f"{group_path} > {group_name}" if group_path else group_name + new_group_indices = group_indices + [idx] # Track the index of this group + traverse_structure( + item["pages"], + dropdown_name, + dropdown_idx, + new_group_path, + new_group_indices, + f"{path_prefix}[{idx}].pages" + ) + + for dropdown_idx, dropdown in enumerate(section_data.get("dropdowns", [])): + dropdown_name = dropdown.get("dropdown", "") + + # Check pages array + if "pages" in dropdown: + traverse_structure(dropdown["pages"], dropdown_name, dropdown_idx, dropdown_name, [], "pages") + + return locations + + def reconcile_docs_json_structural_changes( + self, + base_sha: str, + head_sha: str, + skip_rename_detection: bool = False + ) -> List[str]: + """ + Detect and apply specific structural changes (moves) from English section. + Compares base vs head English sections and applies only those changes to cn/jp. 
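+
+        Illustrative example (hypothetical paths and group names): if "en/docs/guide"
+        moved from the "Build" group to the "Publish" group between base_sha and
+        head_sha, the same move is replayed for "cn/docs/guide" and "jp/docs/guide".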
+ + Args: + base_sha: Base commit SHA + head_sha: Head commit SHA + skip_rename_detection: If True, skip the broken rename detection logic + (renames should be handled by git-based detection instead) + """ + reconcile_log = [] + + try: + # Get docs.json from both commits + import subprocess + + base_docs_result = subprocess.run( + ["git", "show", f"{base_sha}:docs.json"], + capture_output=True, + text=True, + check=True, + cwd=self.base_dir + ) + base_docs = json.loads(base_docs_result.stdout) + + head_docs_result = subprocess.run( + ["git", "show", f"{head_sha}:docs.json"], + capture_output=True, + text=True, + check=True, + cwd=self.base_dir + ) + head_docs = json.loads(head_docs_result.stdout) + + # Extract English sections + def get_english_section(docs_data): + nav = docs_data.get("navigation", {}) + if "versions" in nav and nav["versions"]: + languages = nav["versions"][0].get("languages", []) + else: + languages = nav.get("languages", []) + + for lang in languages: + if lang.get("language") == self.source_language: + return lang + return None + + base_en = get_english_section(base_docs) + head_en = get_english_section(head_docs) + + if not base_en or not head_en: + reconcile_log.append("ERROR: Could not find English sections for comparison") + return reconcile_log + + # Extract file locations from both versions + base_locations = self.extract_file_locations(base_en) + head_locations = self.extract_file_locations(head_en) + + base_files = set(base_locations.keys()) + head_files = set(head_locations.keys()) + + # Detect operations + added = head_files - base_files + deleted = base_files - head_files + possibly_moved = base_files & head_files + + # Check for actual moves (same file, different location) + moved_files = [] + for file_path in possibly_moved: + base_loc = base_locations[file_path] + head_loc = head_locations[file_path] + + # Check if location changed (use group_path for accurate comparison) + if base_loc["group_path"] != head_loc["group_path"]: + moved_files.append({ + "file": file_path, + "from": base_loc, + "to": head_loc + }) + + # Detect renames: files that were deleted and added might be renames + # NOTE: This heuristic-based rename detection is BROKEN and causes false positives. + # It can incorrectly treat "delete file A + add unrelated file B" as a rename. + # Use git-based rename detection (--find-renames=100%) instead. 
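+            # Illustrative sketch of that git-based approach (not executed here):
+            #     git diff --find-renames=100% --name-status <base_sha> <head_sha>
+            # emits lines such as "R100  en/docs/old.mdx  en/docs/new.mdx", which map
+            # directly onto the (old_path, new_path) tuples consumed by _handle_rename().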
+ renamed_files = [] + + if not skip_rename_detection: + # DEPRECATED: This logic is kept for backward compatibility but should not be used + reconcile_log.append("WARNING: Using deprecated heuristic-based rename detection") + deleted_normalized = {} + added_normalized = {} + + source_dir = self.get_language_directory(self.source_language) + source_prefix = f"^{source_dir}/" + + for deleted_file in deleted: + # Normalize: {source_dir}/foo/bar.md -> foo/bar.md + normalized = re.sub(source_prefix, '', deleted_file) + deleted_normalized[normalized] = deleted_file + + for added_file in added: + # Normalize: {source_dir}/foo/baz.md -> foo/baz.md + normalized = re.sub(source_prefix, '', added_file) + added_normalized[normalized] = added_file + + # Check for renames: different paths but same location + # This is a simple heuristic - if added and deleted have different normalized paths + # but appear in similar locations, treat as rename + for del_norm, del_file in deleted_normalized.items(): + for add_norm, add_file in added_normalized.items(): + if del_norm != add_norm: + # Different paths - potential rename + del_loc = base_locations[del_file] + add_loc = head_locations[add_file] + + # If they're in the same location group, it's likely a rename + if del_loc["group_path"] == add_loc["group_path"]: + renamed_files.append({ + "from_file": del_file, + "to_file": add_file, + "location": add_loc + }) + # Remove from added/deleted to avoid processing twice + added.discard(add_file) + deleted.discard(del_file) + break + + if not moved_files and not added and not deleted and not renamed_files: + reconcile_log.append("INFO: No structural changes detected") + return reconcile_log + + reconcile_log.append(f"INFO: Detected {len(moved_files)} moves, {len(renamed_files)} renames, {len(added)} adds, {len(deleted)} deletes") + + # Load current docs.json + docs_data = self.load_docs_json() + if not docs_data: + reconcile_log.append("ERROR: Could not load docs.json") + return reconcile_log + + changes_made = False + + # Apply moves to cn/jp sections + for move_op in moved_files: + en_file = move_op["file"] + from_loc = move_op["from"] + to_loc = move_op["to"] + + reconcile_log.append(f"INFO: Moving {en_file} from '{from_loc['group_path']}' to '{to_loc['group_path']}'") + + # Apply to each target language + for target_lang in self.target_languages: + target_file = self.convert_path_to_target_language(en_file, target_lang) + + # Remove from old location + removed = self.remove_file_from_navigation(docs_data, target_file, target_lang) + + if removed: + # Add to new location + added = self.add_file_to_navigation(docs_data, target_file, target_lang, to_loc) + + if added: + reconcile_log.append(f"SUCCESS: Moved {target_file} to new location") + changes_made = True + else: + reconcile_log.append(f"WARNING: Could not add {target_file} to new location") + else: + reconcile_log.append(f"WARNING: Could not remove {target_file} from old location") + + # Apply renames to cn/jp sections + for rename_op in renamed_files: + from_file = rename_op["from_file"] + to_file = rename_op["to_file"] + location = rename_op["location"] + + reconcile_log.append(f"INFO: Renaming {from_file} to {to_file}") + + # Apply to each target language + for target_lang in self.target_languages: + old_target_file = self.convert_path_to_target_language(from_file, target_lang) + new_target_file = self.convert_path_to_target_language(to_file, target_lang) + + # Find the actual file with extension (docs.json entries don't have extensions) + old_file_path = 
None + file_extension = None + + # Try common extensions + for ext in ['.md', '.mdx', '']: + test_path = self.base_dir / f"{old_target_file}{ext}" + if test_path.exists(): + old_file_path = test_path + file_extension = ext + break + + if old_file_path and old_file_path.exists(): + # Create new file path with same extension + new_file_path = self.base_dir / f"{new_target_file}{file_extension}" + + # Create parent directories if needed + new_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Rename the file + old_file_path.rename(new_file_path) + reconcile_log.append(f"SUCCESS: Renamed file {old_target_file}{file_extension} to {new_target_file}{file_extension}") + + # Update docs.json: remove old entry, add new entry + removed = self.remove_file_from_navigation(docs_data, old_target_file, target_lang) + if removed: + added = self.add_file_to_navigation(docs_data, new_target_file, target_lang, location) + if added: + reconcile_log.append(f"SUCCESS: Updated docs.json for {target_lang} rename") + changes_made = True + else: + reconcile_log.append(f"WARNING: Could not add {new_target_file} to docs.json") + else: + reconcile_log.append(f"WARNING: Could not remove {old_target_file} from docs.json") + else: + reconcile_log.append(f"WARNING: File {old_target_file} not found for rename (tried .md, .mdx, and no extension)") + + # Apply deletes to cn/jp sections + for en_file in deleted: + reconcile_log.append(f"INFO: Deleting {en_file}") + + # Apply to each target language + for target_lang in self.target_languages: + target_file = self.convert_path_to_target_language(en_file, target_lang) + + # Remove from docs.json navigation + removed = self.remove_file_from_navigation(docs_data, target_file, target_lang) + + if removed: + reconcile_log.append(f"SUCCESS: Removed {target_file} from docs.json") + changes_made = True + + # Delete physical file + for ext in ['.md', '.mdx', '']: + file_path = self.base_dir / f"{target_file}{ext}" + if file_path.exists(): + file_path.unlink() + reconcile_log.append(f"SUCCESS: Deleted physical file {target_file}{ext}") + break + else: + reconcile_log.append(f"WARNING: Could not remove {target_file} from docs.json") + + # Save updated docs.json + if changes_made: + self.save_docs_json(docs_data) + reconcile_log.append("SUCCESS: Applied structural changes to docs.json") + + return reconcile_log + + except Exception as e: + reconcile_log.append(f"ERROR: Failed to reconcile structural changes: {e}") + return reconcile_log + + def remove_file_from_navigation(self, docs_data: Dict, file_path: str, target_lang: str) -> bool: + """Remove a file from target language navigation structure.""" + nav = docs_data.get("navigation", {}) + + # Find target language section + if "versions" in nav and nav["versions"]: + languages = nav["versions"][0].get("languages", []) + else: + languages = nav.get("languages", []) + + target_section = None + for lang in languages: + if lang.get("language") == target_lang: + target_section = lang + break + + if not target_section: + return False + + # Remove from dropdowns + for dropdown in target_section.get("dropdowns", []): + if "pages" in dropdown: + if self.remove_page_from_structure(dropdown["pages"], file_path): + return True + + return False + + def add_file_to_navigation(self, docs_data: Dict, file_path: str, target_lang: str, location_info: Dict) -> bool: + """Add a file to target language navigation at specified location using index-based navigation.""" + nav = docs_data.get("navigation", {}) + + # Find target language section + if 
"versions" in nav and nav["versions"]: + languages = nav["versions"][0].get("languages", []) + else: + languages = nav.get("languages", []) + + target_section = None + for lang in languages: + if lang.get("language") == target_lang: + target_section = lang + break + + if not target_section: + return False + + # Find target dropdown by index + dropdown_idx = location_info["dropdown_idx"] + dropdowns = target_section.get("dropdowns", []) + + if dropdown_idx >= len(dropdowns): + return False + + target_dropdown = dropdowns[dropdown_idx] + + # Start from dropdown's pages + if "pages" not in target_dropdown: + target_dropdown["pages"] = [] + + current_pages = target_dropdown["pages"] + + # Navigate through nested groups using indices (language-independent) + group_indices = location_info.get("group_indices", []) + + for group_idx in group_indices: + # Navigate to the group at this index + if group_idx >= len(current_pages): + # Index out of bounds - structure mismatch between languages + return False + + item = current_pages[group_idx] + + if isinstance(item, dict) and "pages" in item: + # Navigate into this group + if "pages" not in item: + item["pages"] = [] + current_pages = item["pages"] + else: + # Expected a group but found something else - structure mismatch + return False + + # Add file to the target location if not already present + if file_path not in str(current_pages): + # Insert at the same position as in the source language + page_index = location_info.get("page_index", len(current_pages)) + + # Ensure index is within bounds (append if beyond end) + if page_index > len(current_pages): + page_index = len(current_pages) + + current_pages.insert(page_index, file_path) + return True + + return False + + def _handle_rename(self, old_en_path: str, new_en_path: str) -> Tuple[List[str], List[str]]: + """ + Handle file rename operation for target languages. + + If the old translation file exists, rename it and update docs.json. + If it doesn't exist, return the new path for fresh translation. 
+ + Args: + old_en_path: Old English file path (e.g., "en/docs/old.mdx") + new_en_path: New English file path (e.g., "en/docs/new.mdx") + + Returns: + Tuple of (log_messages, files_needing_translation) + files_needing_translation contains new paths that need fresh translation + """ + log = [] + files_needing_translation = [] + + log.append(f"INFO: Processing rename {old_en_path} -> {new_en_path}") + + # Load docs.json to get location info and update entries + docs_data = self.load_docs_json() + if not docs_data: + log.append("ERROR: Could not load docs.json for rename operation") + return log, files_needing_translation + + # Get English section to find the location of the new file + nav = docs_data.get("navigation", {}) + if "versions" in nav and nav["versions"]: + languages = nav["versions"][0].get("languages", []) + else: + languages = nav.get("languages", []) + + en_section = None + for lang in languages: + if lang.get("language") == self.source_language: + en_section = lang + break + + if not en_section: + log.append("ERROR: Could not find English section in docs.json") + return log, files_needing_translation + + # Extract file location from English section (use new path since English already renamed) + file_locations = self.extract_file_locations(en_section) + + # Strip file extension from path since docs.json entries don't include extensions + new_en_path_no_ext = new_en_path + if new_en_path.endswith('.mdx'): + new_en_path_no_ext = new_en_path[:-4] + elif new_en_path.endswith('.md'): + new_en_path_no_ext = new_en_path[:-3] + + location = file_locations.get(new_en_path_no_ext) + + if not location: + log.append(f"WARNING: Could not find location for {new_en_path_no_ext} in English section") + # Continue without updating docs.json entries + location = None + + docs_changed = False + + for target_lang in self.target_languages: + old_target = self.convert_path_to_target_language(old_en_path, target_lang) + new_target = self.convert_path_to_target_language(new_en_path, target_lang) + + # Find old file with extension (.md, .mdx, or no extension) + old_file_path = None + file_extension = None + for ext in ['.md', '.mdx', '']: + test_path = self.base_dir / f"{old_target}{ext}" + if test_path.exists(): + old_file_path = test_path + file_extension = ext + break + + if old_file_path and old_file_path.exists(): + # Old file exists - rename it + new_file_path = self.base_dir / f"{new_target}{file_extension}" + + # Create parent directories if needed + new_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Rename the physical file + old_file_path.rename(new_file_path) + log.append(f"SUCCESS: Renamed {old_target}{file_extension} -> {new_target}{file_extension}") + + # Update docs.json entry if we have location info + if location: + # Strip extension from new_target for docs.json (docs.json doesn't store extensions) + new_target_no_ext = new_target + if new_target.endswith('.mdx'): + new_target_no_ext = new_target[:-4] + elif new_target.endswith('.md'): + new_target_no_ext = new_target[:-3] + + # Remove old entry + removed = self.remove_file_from_navigation(docs_data, old_target, target_lang) + if removed: + # Add new entry at same location (without extension) + added = self.add_file_to_navigation(docs_data, new_target_no_ext, target_lang, location) + if added: + log.append(f"SUCCESS: Updated docs.json entry {old_target} -> {new_target_no_ext} for {target_lang}") + docs_changed = True + else: + log.append(f"WARNING: Could not add {new_target_no_ext} to docs.json for {target_lang}") + else: + 
log.append(f"WARNING: Could not remove {old_target} from docs.json for {target_lang}") + else: + # Old file not found - need fresh translation + log.append(f"INFO: Old file {old_target} not found, will generate new translation") + files_needing_translation.append(new_en_path) + + # Save docs.json if we made changes + if docs_changed: + self.save_docs_json(docs_data) + log.append("SUCCESS: Saved updated docs.json with rename entries") + + return log, files_needing_translation + + def sync_docs_json_incremental( + self, + added_files: List[str] = None, + deleted_files: List[str] = None, + renamed_files: List[Tuple[str, str]] = None, + base_sha: str = None, + head_sha: str = None + ) -> List[str]: + """ + Incrementally sync docs.json structure - only processes changed files. + Preserves existing dropdown names and only updates affected pages. + """ + sync_log = [] + added_files = added_files or [] + deleted_files = deleted_files or [] + renamed_files = renamed_files or [] + + # Process renames first (before adds/deletes) + # Renamed files that couldn't be renamed will be added to added_files for fresh translation + for old_path, new_path in renamed_files: + if old_path.startswith(f"{self.get_language_directory(self.source_language)}/"): + rename_log, files_to_translate = self._handle_rename(old_path, new_path) + sync_log.extend(rename_log) + + # If any translations need to be generated (old file didn't exist), add to added_files + if new_path in files_to_translate: + if new_path not in added_files: + added_files.append(new_path) + sync_log.append(f"INFO: Added {new_path} to translation queue (old translation not found)") + + # Check for structural changes (moves and possibly renames) + if base_sha and head_sha: + # Only skip rename detection if we actually processed renames via git + skip_renames = len(renamed_files) > 0 + if skip_renames: + sync_log.append("INFO: Checking for structural changes (moves only, renames already handled)...") + else: + sync_log.append("INFO: Checking for structural changes (moves and renames)...") + + reconcile_log = self.reconcile_docs_json_structural_changes( + base_sha, head_sha, + skip_rename_detection=skip_renames + ) + sync_log.extend(reconcile_log) + + # If no file adds/deletes after rename processing, we're done + if not added_files and not deleted_files: + sync_log.append("INFO: No file adds/deletes to sync") + return sync_log + + try: + docs_data = self.load_docs_json() + if not docs_data or "navigation" not in docs_data: + sync_log.append("ERROR: Invalid docs.json structure") + return sync_log + + navigation = docs_data["navigation"] + + # Handle both direct languages and versions structure + languages_array = None + if "languages" in navigation and isinstance(navigation["languages"], list): + languages_array = navigation["languages"] + elif "versions" in navigation and len(navigation["versions"]) > 0: + if "languages" in navigation["versions"][0]: + languages_array = navigation["versions"][0]["languages"] + + if not languages_array: + sync_log.append("ERROR: No languages found in navigation") + return sync_log + + # Find language sections + en_section = None + target_sections = {} + + for lang_data in languages_array: + if lang_data.get("language") == self.source_language: + en_section = lang_data + elif lang_data.get("language") in self.target_languages: + target_sections[lang_data.get("language")] = lang_data + + if not en_section: + sync_log.append("ERROR: English section not found") + return sync_log + + sync_log.append(f"INFO: Processing 
{len(added_files)} added, {len(deleted_files)} deleted files") + + # Process added files + for en_file in added_files: + if not en_file.startswith("en/"): + continue + + # Find which dropdown contains this file in English section + result = self.find_dropdown_containing_file(en_file, en_section) + if not result: + sync_log.append(f"WARNING: Could not find {en_file} in English navigation") + continue + + en_dropdown_name, file_location = result + sync_log.append(f"INFO: Found {en_file} in '{en_dropdown_name}' dropdown at location {file_location}") + + # Get the English dropdown for reference + en_dropdown = None + en_dropdown_index = -1 + for i, dropdown in enumerate(en_section.get("dropdowns", [])): + if dropdown.get("dropdown") == en_dropdown_name: + en_dropdown = dropdown + en_dropdown_index = i + break + + if not en_dropdown: + sync_log.append(f"WARNING: Could not find English dropdown '{en_dropdown_name}'") + continue + + # Add to each target language + for target_lang, target_section in target_sections.items(): + target_file = self.convert_path_to_target_language(en_file, target_lang) + + # Find or create corresponding dropdown + target_dropdown = None + target_dropdown_name = None + + # Strategy: Try to find the dropdown by matching index position first, + # then by translated name. This preserves correct dropdown associations. + target_dropdowns = target_section.get("dropdowns", []) + + # Try to use same index in target language (assuming dropdowns are in same order) + if en_dropdown_index >= 0 and en_dropdown_index < len(target_dropdowns): + target_dropdown = target_dropdowns[en_dropdown_index] + target_dropdown_name = target_dropdown.get("dropdown", "") + + # If index-based match failed, try matching by translated name + if not target_dropdown: + translated_name = self.get_dropdown_translation(en_dropdown_name, target_lang) + for dropdown in target_dropdowns: + if dropdown.get("dropdown") == translated_name: + target_dropdown = dropdown + target_dropdown_name = translated_name + break + + # If still not found, create new dropdown + if not target_dropdown: + translated_name = self.get_dropdown_translation(en_dropdown_name, target_lang) + + target_dropdown = { + "dropdown": translated_name, + "icon": en_dropdown.get("icon", "book-open"), + "pages": [] + } + target_section.setdefault("dropdowns", []) + target_section["dropdowns"].append(target_dropdown) + target_dropdown_name = translated_name + sync_log.append(f"INFO: Created new dropdown '{translated_name}' for {target_lang}") + + # Add the page to the dropdown at the correct nested location + if "pages" not in target_dropdown: + target_dropdown["pages"] = [] + + # Use the new method that preserves group structure + added = self.add_page_at_location(target_dropdown, target_file, file_location, en_dropdown) + if added: + sync_log.append(f"INFO: Added {target_file} to '{target_dropdown_name}' at nested location ({target_lang})") + else: + sync_log.append(f"INFO: {target_file} already exists in '{target_dropdown_name}' ({target_lang})") + + # Process deleted files + for en_file in deleted_files: + if not en_file.startswith("en/"): + continue + + sync_log.append(f"INFO: Processing deletion of {en_file}") + + # Remove from each target language (cn, jp) + for target_lang, target_section in target_sections.items(): + target_file = self.convert_path_to_target_language(en_file, target_lang) + sync_log.append(f"INFO: Attempting to remove {target_file} from {target_lang} section") + + # Find and remove from all dropdowns + removed = False + 
dropdowns = target_section.get("dropdowns", []) + sync_log.append(f"INFO: Searching through {len(dropdowns)} dropdowns in {target_lang} section") + + for idx, dropdown in enumerate(dropdowns): + dropdown_name = dropdown.get("dropdown", "") + sync_log.append(f"INFO: Checking dropdown {idx + 1}/{len(dropdowns)}: '{dropdown_name}'") + + # Check pages array for markdown files + if "pages" in dropdown: + if self.remove_page_from_structure(dropdown["pages"], target_file): + sync_log.append(f"SUCCESS: Removed {target_file} from '{dropdown_name}' ({target_lang})") + removed = True + break + + # Check groups array for OpenAPI files + if "groups" in dropdown: + groups = dropdown["groups"] + for i, group in enumerate(groups): + if isinstance(group, dict) and group.get("openapi") == target_file: + groups.pop(i) + sync_log.append(f"SUCCESS: Removed OpenAPI {target_file} from '{dropdown_name}' ({target_lang})") + removed = True + break + if removed: + break + + if "pages" not in dropdown and "groups" not in dropdown: + sync_log.append(f"INFO: Dropdown '{dropdown_name}' has no pages or groups array") + + if not removed: + sync_log.append(f"WARNING: Could not find {target_file} in {target_lang} navigation - file may not exist in navigation") + + # Save the updated docs.json + if self.save_docs_json(docs_data): + sync_log.append("INFO: Updated docs.json with incremental changes") + else: + sync_log.append("ERROR: Failed to save updated docs.json") + + except Exception as e: + sync_log.append(f"ERROR: Exception in incremental sync: {e}") + import traceback + sync_log.append(f"TRACE: {traceback.format_exc()}") + + return sync_log + + def sync_docs_json_structure(self) -> List[str]: + """ + DEPRECATED: Full sync of docs.json structure across languages. + This method syncs ALL dropdowns and is only kept for backward compatibility. + Use sync_docs_json_incremental() for new code. 
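+
+        Preferred replacement (illustrative arguments):
+            synchronizer.sync_docs_json_incremental(
+                added_files=["en/docs/new-page.mdx"],
+                base_sha="<base-sha>",
+                head_sha="<head-sha>",
+            )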
+ """ + sync_log = [] + sync_log.append("WARNING: Using deprecated full sync method") + + try: + docs_data = self.load_docs_json() + if not docs_data or "navigation" not in docs_data: + sync_log.append("ERROR: Invalid docs.json structure") + return sync_log + + navigation = docs_data["navigation"] + + # Handle both direct languages and versions structure + languages_array = None + if "languages" in navigation and isinstance(navigation["languages"], list): + languages_array = navigation["languages"] + elif "versions" in navigation and len(navigation["versions"]) > 0: + if "languages" in navigation["versions"][0]: + languages_array = navigation["versions"][0]["languages"] + + if not languages_array: + sync_log.append("ERROR: No languages found in navigation") + return sync_log + + # Find language sections + en_section = None + target_sections = {} + + for lang_data in languages_array: + if lang_data.get("language") == self.source_language: + en_section = lang_data + elif lang_data.get("language") in self.target_languages: + target_sections[lang_data.get("language")] = lang_data + + if not en_section: + sync_log.append("ERROR: English section not found") + return sync_log + + # Get all English dropdowns + en_dropdowns = en_section.get("dropdowns", []) + if not en_dropdowns: + sync_log.append("INFO: No dropdowns found in English section") + return sync_log + + sync_log.append(f"INFO: Found {len(en_dropdowns)} English dropdowns to sync") + + # Sync each English dropdown to target languages + for target_lang, target_section in target_sections.items(): + if not target_section: + sync_log.append(f"WARNING: {target_lang} section not found") + continue + + # Ensure dropdowns array exists + target_section.setdefault("dropdowns", []) + + # Process each English dropdown + for en_dropdown in en_dropdowns: + en_dropdown_name = en_dropdown.get("dropdown", "") + if not en_dropdown_name: + continue + + # Get translated dropdown name from config.json + target_dropdown_name = self.get_dropdown_translation(en_dropdown_name, target_lang) + + # Find existing dropdown in target language by translated name + target_dropdown = None + dropdown_index = -1 + for i, dropdown in enumerate(target_section["dropdowns"]): + if dropdown.get("dropdown") == target_dropdown_name: + target_dropdown = dropdown + dropdown_index = i + break + + if not target_dropdown: + # Create new dropdown - SET translated name + target_dropdown = { + "dropdown": target_dropdown_name, + "icon": en_dropdown.get("icon", "book-open"), + "pages": [] + } + target_section["dropdowns"].append(target_dropdown) + sync_log.append(f"INFO: Created new '{target_dropdown_name}' dropdown for {target_lang}") + else: + # Update existing dropdown - PRESERVE existing name, only update icon + # Do NOT overwrite target_dropdown["dropdown"] to preserve existing translations + if "icon" in en_dropdown: + target_dropdown["icon"] = en_dropdown["icon"] + # Remove old structure fields if they exist + if "groups" in target_dropdown: + del target_dropdown["groups"] + sync_log.append(f"INFO: Updated existing '{target_dropdown.get('dropdown')}' dropdown for {target_lang}") + + # Sync the pages structure + if "pages" in en_dropdown: + existing_pages = target_dropdown.get("pages", []) + synced_pages = self.convert_pages_structure( + en_dropdown["pages"], + target_lang, + existing_pages + ) + target_dropdown["pages"] = synced_pages + sync_log.append(f"INFO: Synced pages structure for '{target_dropdown.get('dropdown')}' ({target_lang})") + + # Save the updated docs.json + if 
self.save_docs_json(docs_data): + sync_log.append("INFO: Updated docs.json with synced structure") + else: + sync_log.append("ERROR: Failed to save updated docs.json") + + except Exception as e: + sync_log.append(f"ERROR: Exception in docs.json sync: {e}") + import traceback + sync_log.append(f"TRACE: {traceback.format_exc()}") + + return sync_log + + def extract_page_paths(self, structure, normalize_lang=True): + """ + Extract all page paths from a structure recursively. + Returns a set of normalized paths (without language prefix) for comparison. + """ + paths = set() + + if not structure: + return paths + + for item in structure: + if isinstance(item, str): + # Normalize path by removing language prefix + if normalize_lang: + normalized = re.sub(r'^(en|cn|jp)/', '', item) + paths.add(normalized) + else: + paths.add(item) + elif isinstance(item, dict) and "pages" in item: + # Recursively extract from nested pages + nested_paths = self.extract_page_paths(item["pages"], normalize_lang) + paths.update(nested_paths) + + return paths + + def find_matching_group(self, en_group_item, existing_structure, target_lang): + """ + Find a matching group in existing structure based on page content. + Groups match if they contain the same normalized page paths. + """ + if not existing_structure or not isinstance(en_group_item, dict): + return None + + if "pages" not in en_group_item: + return None + + # Extract normalized paths from English group + en_paths = self.extract_page_paths(en_group_item["pages"], normalize_lang=True) + + if not en_paths: + return None + + # Search through existing structure for matching group + for existing_item in existing_structure: + if isinstance(existing_item, dict) and "pages" in existing_item: + existing_paths = self.extract_page_paths(existing_item["pages"], normalize_lang=True) + + # Groups match if they have identical page sets + if en_paths == existing_paths: + return existing_item + + return None + + def convert_pages_structure(self, pages_structure, target_lang: str, existing_structure=None): + """ + Recursively convert English page paths to target language paths. + Uses content-based matching to preserve existing group translations. + Groups are matched by their page content, not by position. + """ + if not pages_structure: + return [] + + converted = [] + for item in pages_structure: + if isinstance(item, str): + # Convert path: en/documentation/pages/... -> target_lang/documentation/pages/... 
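# Worked example of the content-based group matching above: two groups are
# considered the same when their page paths, with the language prefix removed,
# form identical sets, which lets an existing human-translated group label be
# reused instead of re-translated. Sample data is illustrative.
import re

def normalized_paths(pages):
    out = set()
    for item in pages:
        if isinstance(item, str):
            out.add(re.sub(r'^(en|cn|jp)/', '', item))
        elif isinstance(item, dict):
            out |= normalized_paths(item.get("pages", []))
    return out

en_group = {"group": "Getting Started", "pages": ["en/start/install", "en/start/quickstart"]}
cn_group = {"group": "快速开始", "pages": ["cn/start/install", "cn/start/quickstart"]}
print(normalized_paths(en_group["pages"]) == normalized_paths(cn_group["pages"]))  # True -> reuse "快速开始"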
+ if item.startswith("en/"): + converted_path = item.replace("en/", f"{target_lang}/", 1) + converted.append(converted_path) + else: + converted.append(item) + elif isinstance(item, dict): + converted_item = {} + + # For groups, use content-based matching instead of index-based + existing_match = None + if "group" in item and existing_structure: + existing_match = self.find_matching_group(item, existing_structure, target_lang) + + for key, value in item.items(): + if key == "pages" and isinstance(value, list): + # Recursively convert nested pages + # Pass existing nested structure if we found a matching group + existing_nested = None + if existing_match and "pages" in existing_match: + existing_nested = existing_match["pages"] + + converted_item[key] = self.convert_pages_structure( + value, + target_lang, + existing_nested + ) + elif key == "group": + # Preserve existing human-translated group name if we found a match + if existing_match and "group" in existing_match: + # Use existing translated group name from matched group + converted_item[key] = existing_match["group"] + else: + # New group or no match - use basic translation + translated_group = self.get_basic_label_translation(value, target_lang) + converted_item[key] = translated_group + else: + converted_item[key] = value + converted.append(converted_item) + else: + converted.append(item) + + return converted + + async def run_sync(self, since_commit: str = "HEAD~1") -> Dict[str, List[str]]: + """Run the complete synchronization process""" + print("=== Starting Documentation Synchronization ===") + + # Get file changes + changes = self.get_changed_files(since_commit) + print(f"Detected changes: {changes}") + + results = { + "file_operations": [], + "translations": [], + "structure_sync": [], + "errors": [] + } + + try: + # 1. Sync file operations (delete, rename) + results["file_operations"] = self.sync_file_operations(changes) + + # 2. Translate new and modified files (pass since_commit for diffs) + results["translations"] = await self.translate_new_and_modified_files(changes, since_commit) + + # 3. 
Sync docs.json structure if needed + if self.extract_english_structure_changes(changes): + results["structure_sync"] = self.sync_docs_json_structure() + + except Exception as e: + results["errors"].append(f"CRITICAL: {e}") + print(f"Critical error during sync: {e}") + + print("=== Synchronization Complete ===") + return results + + async def secure_sync_from_plan(self, sync_plan: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute synchronization from a validated sync plan (for external PRs) + """ + print("=== Starting Secure Documentation Synchronization ===") + + # Validate sync plan + if self.enable_security: + valid, error = self.validate_sync_plan(sync_plan) + if not valid: + return {"errors": [f"Invalid sync plan: {error}"]} + + results = { + "translated": [], + "failed": [], + "skipped": [], + "structure_synced": False, + "errors": [] + } + + try: + # Process files from sync plan + files_to_sync = sync_plan.get("files_to_sync", []) + + # Limit number of files for security + max_files = 10 if self.enable_security else len(files_to_sync) + files_to_process = files_to_sync[:max_files] + + for file_info in files_to_process: + file_path = file_info.get("path") + if not file_path: + continue + + # Additional security validation per file + if self.enable_security: + valid, error = self.validate_file_path(file_path) + if not valid: + results["errors"].append(f"Invalid file path {file_path}: {error}") + continue + + print(f"Processing: {file_path}") + + # Check if source file exists + if not (self.base_dir / file_path).exists(): + results["skipped"].append(file_path) + continue + + # Translate to target languages + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(file_path, target_lang) + try: + success = await self.translate_file_with_notice( + file_path, target_path, target_lang + ) + if success: + results["translated"].append(target_path) + else: + results["failed"].append(target_path) + except Exception as e: + print(f"Error translating {file_path} to {target_lang}: {e}") + results["failed"].append(target_path) + + # Process OpenAPI JSON files + openapi_files_to_sync = sync_plan.get("openapi_files_to_sync", []) + + # Limit number of OpenAPI files for security + max_openapi_files = 5 if self.enable_security else len(openapi_files_to_sync) + openapi_files_to_process = openapi_files_to_sync[:max_openapi_files] + + for file_info in openapi_files_to_process: + file_path = file_info.get("path") + if not file_path: + continue + + # Additional security validation per file + if self.enable_security: + valid, error = self.validate_file_path(file_path) + if not valid: + results["errors"].append(f"Invalid OpenAPI file path {file_path}: {error}") + continue + + print(f"Processing OpenAPI: {file_path}") + + # Check if source file exists + source_full_path = self.base_dir / file_path + if not source_full_path.exists(): + results["skipped"].append(file_path) + continue + + # Translate to target languages + for target_lang in self.target_languages: + target_path = self.convert_path_to_target_language(file_path, target_lang) + target_full_path = self.base_dir / target_path + + try: + # Ensure target directory exists + target_full_path.parent.mkdir(parents=True, exist_ok=True) + + # Run OpenAPI translation pipeline (use async version) + success = await translate_openapi_file_async( + source_file=str(source_full_path), + target_lang=target_lang, + output_file=str(target_full_path), + dify_api_key=self.dify_api_key + ) + + if success: + 
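# A minimal sync-plan payload covering the keys that secure_sync_from_plan()
# above actually reads; the file paths are illustrative.
example_sync_plan = {
    "files_to_sync": [{"path": "en/guides/agent.mdx"}],
    "openapi_files_to_sync": [{"path": "en/api/openapi_chat.json"}],
    "structure_changes": {"structure_changed": True},
}
# e.g. results = await synchronizer.secure_sync_from_plan(example_sync_plan)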
results["translated"].append(target_path) + print(f"✅ Successfully translated OpenAPI: {file_path} → {target_path}") + else: + results["failed"].append(target_path) + print(f"❌ Failed to translate OpenAPI: {file_path} → {target_path}") + + except Exception as e: + print(f"Error translating OpenAPI {file_path} to {target_lang}: {e}") + results["failed"].append(target_path) + + # Handle structure changes + structure_changes = sync_plan.get("structure_changes", {}) + if structure_changes.get("structure_changed"): + print("Syncing documentation structure...") + try: + sync_log = self.sync_docs_json_structure() + results["structure_synced"] = True + print("Structure sync completed") + except Exception as e: + results["errors"].append(f"Structure sync failed: {e}") + + except Exception as e: + results["errors"].append(f"Critical error: {e}") + + print("=== Secure Synchronization Complete ===") + return results + +async def main(): + """Main entry point""" + if len(sys.argv) < 2: + print("Usage: python sync_and_translate.py [since_commit]") + print(" since_commit: Git commit to compare against (default: HEAD~1)") + sys.exit(1) + + dify_api_key = sys.argv[1] + since_commit = sys.argv[2] if len(sys.argv) > 2 else "HEAD~1" + + # Initialize synchronizer + synchronizer = DocsSynchronizer(dify_api_key) + + # Run synchronization + results = await synchronizer.run_sync(since_commit) + + # Print results + print("\n=== SYNCHRONIZATION RESULTS ===") + for category, logs in results.items(): + if logs: + print(f"\n{category.upper()}:") + for log in logs: + print(f" {log}") + + # Return appropriate exit code + if results["errors"]: + sys.exit(1) + else: + print("\n✓ Synchronization completed successfully") + sys.exit(0) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tools/translate/termbase_i18n.md b/tools/translate/termbase_i18n.md new file mode 100644 index 000000000..800a71b8b --- /dev/null +++ b/tools/translate/termbase_i18n.md @@ -0,0 +1,23 @@ +# Translation Termbase for i18n + +This termbase provides consistent terminology for translating Dify documentation. + +## Technical Terms + +- **Workflow** → 工作流 (CN) / ワークフロー (JP) +- **Agent** → 智能体 (CN) / エージェント (JP) +- **Knowledge Base** → 知识库 (CN) / ナレッジベース (JP) +- **Model** → 模型 (CN) / モデル (JP) +- **Node** → 节点 (CN) / ノード (JP) +- **Variable** → 变量 (CN) / 変数 (JP) +- **Parameter** → 参数 (CN) / パラメータ (JP) +- **API** → API (CN) / API (JP) +- **Token** → 令牌 (CN) / トークン (JP) +- **Prompt** → 提示词 (CN) / プロンプト (JP) + +## General Guidelines + +- Maintain technical accuracy while adapting to local conventions +- Keep code examples and technical identifiers in English +- Preserve markdown formatting and structure +- Maintain a professional and clear tone diff --git a/tools/translate/translate_pr.py b/tools/translate/translate_pr.py new file mode 100755 index 000000000..438e1cd7f --- /dev/null +++ b/tools/translate/translate_pr.py @@ -0,0 +1,773 @@ +#!/usr/bin/env python3 +""" +Translate and commit documentation changes to a translation PR. + +This script consolidates the core translation logic used by both the +execute and update workflows. 
It handles: +- Branch setup (create new or checkout existing) +- Translation of documentation files +- English file removal +- Committing and pushing changes +- Creating/updating translation PRs +""" + +import argparse +import asyncio +import json +import os +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +# Add current directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from sync_and_translate import DocsSynchronizer +from pr_analyzer import PRAnalyzer +from json_formatter import save_json_with_preserved_format + + +class TranslationPRManager: + """Manages the translation PR workflow.""" + + def __init__( + self, + pr_number: int, + head_sha: str, + base_sha: str, + is_incremental: bool, + pr_title: Optional[str] = None, + work_dir: Optional[str] = None, + api_key: Optional[str] = None + ): + self.pr_number = pr_number + self.head_sha = head_sha + self.base_sha = base_sha + self.is_incremental = is_incremental + self.pr_title = pr_title or "Documentation changes" + self.work_dir = work_dir or "/tmp" + self.api_key = api_key or os.environ.get("DIFY_API_KEY") + + self.sync_branch = f"docs-sync-pr-{pr_number}" + self.repo_root = Path(__file__).parent.parent.parent + + # Load translation config + config_path = self.repo_root / "tools/translate/config.json" + with open(config_path, 'r', encoding='utf-8') as f: + self.translation_config = json.load(f) + + self.source_language = self.translation_config["source_language"] + self.target_languages = self.translation_config["target_languages"] + self.source_dir = self.translation_config["languages"][self.source_language]["directory"] + + # Load processing limits + processing_limits = self.translation_config.get("processing_limits", {}) + self.max_files_per_run = processing_limits.get("max_files_per_run", 10) + self.max_openapi_files_per_run = processing_limits.get("max_openapi_files_per_run", 5) + + # Get repository name dynamically + self.repo_name = self.get_repository_name() + + def run_git(self, *args: str, check: bool = True, capture_output: bool = True) -> subprocess.CompletedProcess: + """Run a git command.""" + cmd = ["git", *args] + return subprocess.run( + cmd, + cwd=self.repo_root, + capture_output=capture_output, + text=True, + check=check + ) + + def run_gh(self, *args: str, check: bool = True) -> subprocess.CompletedProcess: + """Run a gh CLI command.""" + cmd = ["gh", *args] + return subprocess.run( + cmd, + cwd=self.repo_root, + capture_output=True, + text=True, + check=check + ) + + def get_repository_name(self) -> str: + """Get the repository name dynamically from environment or git remote.""" + # Try GitHub Actions environment variable first + repo_name = os.environ.get("GITHUB_REPOSITORY") + if repo_name: + return repo_name + + # Fall back to parsing git remote + try: + result = self.run_git("remote", "get-url", "origin", check=False) + if result.returncode == 0 and result.stdout: + remote_url = result.stdout.strip() + # Parse formats: git@github.com:owner/repo.git or https://github.com/owner/repo.git + if "github.com" in remote_url: + if remote_url.startswith("git@"): + # git@github.com:owner/repo.git + repo_part = remote_url.split(":", 1)[1] + else: + # https://github.com/owner/repo.git + repo_part = "/".join(remote_url.split("/")[-2:]) + # Remove .git suffix if present + repo_name = repo_part.rstrip(".git") + return repo_name + except Exception as e: + print(f"⚠️ Warning: Could not detect repository name from git remote: {e}") + + # Final 
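# Caveat on the ".git" handling above: str.rstrip(".git") strips any trailing
# run of the characters '.', 'g', 'i', 't' rather than the literal suffix, so
# e.g. "owner/docs-kit" would become "owner/docs-k". A suffix-safe variant
# (illustrative sketch; on Python 3.9+ repo_part.removesuffix(".git") is equivalent):
def strip_git_suffix(repo_part: str) -> str:
    return repo_part[:-4] if repo_part.endswith(".git") else repo_part

print(strip_git_suffix("owner/repo.git"))  # owner/repo
print(strip_git_suffix("owner/docs-kit"))  # owner/docs-kit (unchanged)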
fallback + return "unknown/repository" + + def check_branch_exists(self) -> bool: + """Check if translation branch exists on remote.""" + result = self.run_git( + "ls-remote", "--exit-code", "--heads", "origin", self.sync_branch, + check=False + ) + return result.returncode == 0 + + def merge_docs_json_for_incremental_update(self) -> None: + """ + Merge docs.json for incremental updates: + - English section from PR HEAD (latest structure) + - cn/jp sections from translation branch (preserve existing translations) + """ + print("Merging docs.json: English from PR, cn/jp from translation branch...") + + # Get docs.json from PR HEAD (has latest English structure) + result = self.run_git("show", f"{self.head_sha}:docs.json") + pr_docs = json.loads(result.stdout) + + # Get docs.json from translation branch (has cn/jp translations) + docs_json_path = self.repo_root / "docs.json" + with open(docs_json_path, 'r', encoding='utf-8') as f: + translation_docs = json.load(f) + + # Merge strategy: Replace English section from PR, keep cn/jp from translation branch + # Navigate to language sections + pr_navigation = pr_docs.get("navigation", {}) + translation_navigation = translation_docs.get("navigation", {}) + + # Handle both direct languages and versions structure + if "versions" in pr_navigation: + pr_languages = pr_navigation["versions"][0].get("languages", []) + translation_languages = translation_navigation.get("versions", [{}])[0].get("languages", []) + else: + pr_languages = pr_navigation.get("languages", []) + translation_languages = translation_navigation.get("languages", []) + + # Build language lookup from translation branch + translation_langs_by_code = {} + for lang_data in translation_languages: + lang_code = lang_data.get("language") + if lang_code: + translation_langs_by_code[lang_code] = lang_data + + # Merge: Use English from PR, cn/jp from translation branch + merged_languages = [] + for pr_lang in pr_languages: + lang_code = pr_lang.get("language") + + if lang_code == self.source_language: + # Use English section from PR (latest structure) + merged_languages.append(pr_lang) + elif lang_code in translation_langs_by_code: + # Use cn/jp from translation branch (preserve existing translations) + merged_languages.append(translation_langs_by_code[lang_code]) + else: + # Fallback: use from PR + merged_languages.append(pr_lang) + + # Update the merged docs.json + merged_docs = pr_docs.copy() + if "versions" in pr_navigation: + merged_docs["navigation"]["versions"][0]["languages"] = merged_languages + else: + merged_docs["navigation"]["languages"] = merged_languages + + # Write merged docs.json preserving original formatting + # Use translation branch's docs.json as reference for format detection + success = save_json_with_preserved_format( + str(docs_json_path), + merged_docs, + reference_file=str(docs_json_path) # Current file from translation branch + ) + + if success: + print(f"✓ Merged docs.json: English from PR {self.head_sha[:8]}, cn/jp from {self.sync_branch}") + else: + print(f"⚠️ Warning: Could not preserve formatting, using default") + # Fallback to standard json.dump if format preservation fails + with open(docs_json_path, 'w', encoding='utf-8') as f: + json.dump(merged_docs, f, indent=2, ensure_ascii=False) + + def setup_translation_branch(self, branch_exists: bool) -> None: + """Setup the translation branch (create or checkout existing).""" + if branch_exists: + print(f"✅ Fetching existing translation branch for incremental update: {self.sync_branch}") + self.run_git("fetch", 
"origin", f"{self.sync_branch}:{self.sync_branch}") + self.run_git("checkout", self.sync_branch) + + # For incremental updates, checkout English files only (not docs.json) + print(f"Checking out English files from {self.head_sha[:8]}...") + self.run_git("checkout", self.head_sha, "--", f"{self.source_dir}/", check=False) + + # Merge docs.json: English from PR HEAD, cn/jp from translation branch + self.merge_docs_json_for_incremental_update() + else: + print(f"🆕 Creating new translation branch: {self.sync_branch}") + self.run_git("checkout", "-b", self.sync_branch) + + # Reset branch to main to avoid including English file changes from PR + # Use --soft to keep working directory with PR files (needed for translation) + self.run_git("reset", "--soft", "origin/main") + # Unstage everything + self.run_git("reset") + + async def run_translation(self) -> Dict: + """Run the translation process using sync_and_translate logic.""" + if not self.api_key: + print("❌ Error: DIFY_API_KEY not set") + return {"translated": [], "failed": ["NO_API_KEY"], "skipped": []} + + # Load sync plan if available (from artifacts) + sync_plan_path = Path(self.work_dir) / "sync_plan.json" + if not sync_plan_path.exists(): + print(f"⚠️ Warning: No sync plan found at {sync_plan_path}") + print("This is expected for update workflow - will analyze PR changes directly") + return await self.run_translation_from_pr_analysis() + + with open(sync_plan_path) as f: + sync_plan = json.load(f) + + return await self.run_translation_from_sync_plan(sync_plan) + + async def run_translation_from_pr_analysis(self) -> Dict: + """Run translation by generating sync plan on-the-fly (used by update workflow).""" + print(f"Generating sync plan for PR changes: {self.base_sha[:8]}...{self.head_sha[:8]}") + + # Import here to avoid circular dependency + from pr_analyzer import SyncPlanGenerator + + # Generate sync plan with identical logic to analyze workflow + generator = SyncPlanGenerator(self.base_sha, self.head_sha) + sync_plan = generator.generate_sync_plan() + + # Log what we're syncing + files_count = len(sync_plan.get("files_to_sync", [])) + openapi_count = len(sync_plan.get("openapi_files_to_sync", [])) + structure_changed = sync_plan.get("structure_changes", {}).get("structure_changed", False) + + print(f"Sync plan generated:") + print(f" - {files_count} markdown files to translate") + print(f" - {openapi_count} OpenAPI files to translate") + print(f" - Structure changed: {structure_changed}") + + if not sync_plan.get("sync_required", False): + print("No sync required - no changes to translate") + return {"translated": [], "failed": [], "skipped": ["no_changes"]} + + return await self.run_translation_from_sync_plan(sync_plan) + + async def run_translation_from_sync_plan(self, sync_plan: Dict) -> Dict: + """Run translation from a sync plan.""" + synchronizer = DocsSynchronizer(self.api_key) + + results = { + "translated": [], + "failed": [], + "skipped": [] + } + + files_to_sync = sync_plan.get("files_to_sync", []) + metadata = sync_plan.get("metadata", {}) + base_sha = metadata.get("base_sha", self.base_sha) + head_sha = metadata.get("head_sha", self.head_sha) + + # Detect added vs modified files and renames + added_files, modified_files, renamed_files = self.detect_file_changes(base_sha, head_sha) + + print(f"Detected {len(added_files)} added files, {len(modified_files)} modified files, {len(renamed_files)} renamed files") + + # Translate each file with configurable limit + if len(files_to_sync) > self.max_files_per_run: + print(f"⚠️ 
Warning: PR has {len(files_to_sync)} files to sync, limiting to {self.max_files_per_run} for safety") + print(f" (Adjust 'processing_limits.max_files_per_run' in config.json to change this limit)") + + for file_info in files_to_sync[:self.max_files_per_run]: + file_path = file_info.get("path") if isinstance(file_info, dict) else file_info + + if file_path == "docs.json": + results["skipped"].append(f"{file_path} (structure file)") + continue + + if file_path.startswith("versions/"): + results["skipped"].append(f"{file_path} (versioned docs)") + continue + + if not (self.repo_root / file_path).exists(): + results["skipped"].append(f"{file_path} (not found)") + continue + + is_modified = file_path in modified_files + + # Get diff for modified files + diff_original = None + if is_modified: + diff_original = self.get_file_diff(base_sha, head_sha, file_path) + + # Translate to all target languages + for target_lang in self.target_languages: + target_dir = self.translation_config["languages"][target_lang]["directory"] + target_path = file_path.replace(f"{self.source_dir}/", f"{target_dir}/") + + # Load existing translation for modified files + the_doc_exist = None + if is_modified: + target_full_path = self.repo_root / target_path + if target_full_path.exists(): + with open(target_full_path, 'r', encoding='utf-8') as f: + the_doc_exist = f.read() + + try: + success = await synchronizer.translate_file_with_notice( + file_path, + target_path, + target_lang, + the_doc_exist=the_doc_exist, + diff_original=diff_original + ) + + if success: + change_type = "modified" if is_modified else "added" + results["translated"].append(f"{target_path} ({change_type})") + else: + results["failed"].append(target_path) + except Exception as e: + print(f"❌ Error translating {file_path} to {target_lang}: {e}") + results["failed"].append(target_path) + + # Handle OpenAPI files if present + openapi_files = sync_plan.get("openapi_files_to_sync", []) + if openapi_files: + await self.translate_openapi_files(openapi_files, results) + + # Sync docs.json structure + if sync_plan.get("structure_changes", {}).get("structure_changed"): + self.sync_docs_json_structure(synchronizer, added_files, renamed_files, base_sha, head_sha) + + return results + + def detect_file_changes(self, base_sha: str, head_sha: str) -> Tuple[List[str], List[str], List[Tuple[str, str]]]: + """Detect added, modified, and renamed files between two commits. 
+ + Returns: + Tuple of (added_files, modified_files, renamed_files) + renamed_files is a list of (old_path, new_path) tuples for exact renames (100% content match) + """ + added_files = [] + modified_files = [] + renamed_files = [] + + try: + result = self.run_git( + "diff", "--name-status", "--find-renames=100%", + base_sha, head_sha + ) + + for line in result.stdout.strip().split('\n'): + if line and '\t' in line: + parts = line.split('\t') + status = parts[0] + + if status == 'A': + added_files.append(parts[1]) + elif status == 'M': + modified_files.append(parts[1]) + elif status.startswith('R'): # R100 = 100% identical content + old_path = parts[1] + new_path = parts[2] + renamed_files.append((old_path, new_path)) + # Note: 'D' (deleted) is handled separately via docs.json comparison + + except subprocess.CalledProcessError as e: + print(f"⚠️ Warning: Could not detect file status: {e}") + # Fallback: treat all as added + + return added_files, modified_files, renamed_files + + def get_file_diff(self, base_sha: str, head_sha: str, file_path: str) -> Optional[str]: + """Get the diff for a specific file between two commits.""" + try: + result = self.run_git("diff", base_sha, head_sha, "--", file_path) + return result.stdout if result.stdout else None + except subprocess.CalledProcessError: + return None + + async def translate_openapi_files(self, openapi_files: List, results: Dict) -> None: + """Translate OpenAPI JSON files.""" + from openapi import translate_openapi_file_async + + # Apply configurable limit with warning + if len(openapi_files) > self.max_openapi_files_per_run: + print(f"⚠️ Warning: PR has {len(openapi_files)} OpenAPI files, limiting to {self.max_openapi_files_per_run} for safety") + print(f" (Adjust 'processing_limits.max_openapi_files_per_run' in config.json to change this limit)") + + for file_info in openapi_files[:self.max_openapi_files_per_run]: + file_path = file_info.get("path") if isinstance(file_info, dict) else file_info + source_full_path = self.repo_root / file_path + + if not source_full_path.exists(): + results["skipped"].append(f"{file_path} (openapi not found)") + continue + + for target_lang in self.target_languages: + target_dir = self.translation_config["languages"][target_lang]["directory"] + target_path = file_path.replace(f"{self.source_dir}/", f"{target_dir}/") + target_full_path = self.repo_root / target_path + + target_full_path.parent.mkdir(parents=True, exist_ok=True) + + try: + success = await translate_openapi_file_async( + source_file=str(source_full_path), + target_lang=target_lang, + output_file=str(target_full_path), + dify_api_key=self.api_key + ) + + if success: + results["translated"].append(f"{target_path} (openapi)") + else: + results["failed"].append(target_path) + except Exception as e: + print(f"❌ Error translating OpenAPI {file_path}: {e}") + results["failed"].append(target_path) + + def sync_docs_json_structure( + self, + synchronizer: DocsSynchronizer, + added_files: List[str], + renamed_files: List[Tuple[str, str]], + base_sha: str, + head_sha: str + ) -> None: + """Sync docs.json navigation structure.""" + print("Syncing docs.json structure...") + + # Get deleted files + deleted_files = [] + try: + result = self.run_git( + "diff", "--name-status", "--diff-filter=D", + base_sha, head_sha + ) + + for line in result.stdout.strip().split('\n'): + if line and line.startswith('D\t'): + file_path = line.split('\t')[1] + if file_path.startswith(f"{self.source_dir}/"): + deleted_files.append(file_path) + except 
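# Sample of the `git diff --name-status --find-renames=100%` output that
# detect_file_changes() above parses (tab-separated; R100 marks an exact rename).
# The paths are illustrative.
sample = "A\ten/guides/new-page.mdx\nM\ten/guides/agent.mdx\nR100\ten/old.mdx\ten/new.mdx"
for line in sample.splitlines():
    status, *paths = line.split("\t")
    print(status, paths)
# A ['en/guides/new-page.mdx']
# M ['en/guides/agent.mdx']
# R100 ['en/old.mdx', 'en/new.mdx']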
subprocess.CalledProcessError as e: + print(f"⚠️ Warning: Could not get deleted files: {e}") + + # Delete corresponding translation files + if deleted_files: + print(f"Deleting {len(deleted_files)} translation files...") + for source_file in deleted_files: + for target_lang in self.target_languages: + target_dir = self.translation_config["languages"][target_lang]["directory"] + target_file = source_file.replace(f"{self.source_dir}/", f"{target_dir}/") + target_path = self.repo_root / target_file + + if target_path.exists(): + target_path.unlink() + print(f"✓ Deleted {target_file}") + + # Remove empty parent directories + parent = target_path.parent + while parent != self.repo_root: + try: + if not any(parent.iterdir()): + parent.rmdir() + print(f"✓ Removed empty directory {parent.relative_to(self.repo_root)}") + parent = parent.parent + else: + break + except (OSError, ValueError): + break + + # Sync docs.json incrementally + sync_log = synchronizer.sync_docs_json_incremental( + added_files=added_files, + deleted_files=deleted_files, + renamed_files=renamed_files, + base_sha=base_sha, + head_sha=head_sha + ) + print("\n".join(sync_log)) + + def remove_english_files(self) -> None: + """Remove English source files from working directory before commit.""" + print("Removing English source files from working directory...") + + # Remove markdown and MDX files from English directory + en_dir = self.repo_root / self.source_dir + for pattern in ["*.md", "*.mdx"]: + for file_path in en_dir.glob(f"**/{pattern}"): + try: + file_path.unlink() + print(f" Removed {file_path.relative_to(self.repo_root)}") + except Exception as e: + print(f" Warning: Could not remove {file_path}: {e}") + + # Unstage any English files that might have been staged + self.run_git("reset", "HEAD", "--", f"{self.source_dir}/", check=False) + + print("✓ English source files removed") + + def commit_changes(self, branch_exists: bool) -> bool: + """Commit translation changes.""" + # Setup git identity + self.run_git("config", "user.name", "github-actions[bot]") + self.run_git("config", "user.email", "github-actions[bot]@users.noreply.github.com") + + # Checkout translation branch again (in case we're in detached state) + if branch_exists: + self.run_git("fetch", "origin", self.sync_branch) + # Try to checkout and merge remote changes instead of discarding them + try: + self.run_git("checkout", self.sync_branch) + # Attempt fast-forward merge with remote + merge_result = self.run_git("merge", f"origin/{self.sync_branch}", "--ff-only", check=False) + if merge_result.returncode != 0: + print("⚠️ Cannot fast-forward merge. 
Translation branch has diverged.") + print(" This may indicate concurrent workflow runs or manual modifications.") + raise RuntimeError("Translation branch has diverged - concurrent modification detected") + except subprocess.CalledProcessError as e: + print(f"❌ Error checking out translation branch: {e}") + raise + else: + # Branch was already created in setup_translation_branch(), just checkout + self.run_git("checkout", self.sync_branch) + + # Remove English files before staging + self.remove_english_files() + + # Stage only translation files + target_dirs = [self.translation_config["languages"][lang]["directory"] + for lang in self.target_languages] + stage_paths = target_dirs + ["docs.json"] + + for path in stage_paths: + self.run_git("add", path, check=False) + + # Check if there are changes to commit + status_result = self.run_git("status", "--porcelain") + if not status_result.stdout.strip(): + print("ℹ️ No changes to commit") + return False + + # Create commit message + if branch_exists: + commit_msg = f"""🔄 Update translations for commit {self.head_sha[:8]} + +Auto-generated translations for changes in commit {self.head_sha}. + +Last-Processed-Commit: {self.head_sha} +Original-PR: #{self.pr_number} +Languages: Chinese (cn), Japanese (jp) + +🤖 Generated with GitHub Actions""" + else: + commit_msg = f"""🌐 Initial translations for PR #{self.pr_number} + +Auto-generated translations for documentation changes in PR #{self.pr_number}. + +Last-Processed-Commit: {self.head_sha} +Original-PR: #{self.pr_number} +Languages: Chinese (cn), Japanese (jp) + +🤖 Generated with GitHub Actions""" + + self.run_git("commit", "-m", commit_msg) + print(f"✓ Committed changes to {self.sync_branch}") + + return True + + def push_changes(self) -> None: + """Push changes to remote translation branch.""" + # Use --force-with-lease for safety - allows push only if remote hasn't changed + # since we last fetched. This prevents accidental overwrites while being safer than --force. + self.run_git("push", "--force-with-lease", "origin", self.sync_branch) + print(f"✓ Pushed changes to origin/{self.sync_branch}") + + def create_or_update_pr(self, branch_exists: bool) -> Dict: + """Create new translation PR or update existing one.""" + if not branch_exists: + # Create new PR + print("Creating new translation PR...") + + pr_body = f"""Syncs PR #{self.pr_number} to other languages. + +**Original:** {self.pr_title} + +### What's synced +- 🇨🇳 Chinese (cn) +- 🇯🇵 Japanese (jp) +- 📋 Navigation (docs.json) + +Review translations and merge when ready. Both PRs can merge independently. 
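# The commit messages above embed a machine-readable "Last-Processed-Commit:"
# trailer; a sketch of how a later run could recover that commit from such a
# message body (the message text below is illustrative).
import re

body = """🔄 Update translations for commit 1a2b3c4d

Last-Processed-Commit: 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b
Original-PR: #123"""
match = re.search(r"Last-Processed-Commit: ([a-f0-9]+)", body)
print(match.group(1) if match else None)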
+ +--- +🤖 Auto-synced from PR #{self.pr_number}""" + + result = self.run_gh( + "pr", "create", + "--base", "main", + "--head", self.sync_branch, + "--title", f"🌐 Sync PR #{self.pr_number} to cn/jp: {self.pr_title}", + "--body", pr_body + ) + + pr_url = result.stdout.strip() + pr_number = pr_url.split('/')[-1] if pr_url else None + + print(f"✅ Created translation PR: {pr_url}") + + return { + "translation_pr_number": pr_number, + "translation_pr_url": pr_url, + "created": True + } + else: + # Update existing PR with comment + print("Finding existing translation PR...") + + result = self.run_gh( + "pr", "list", + "--search", f"head:{self.sync_branch}", + "--json", "number", + "--jq", ".[0].number", + check=False + ) + + pr_number = result.stdout.strip() + if not pr_number: + print("⚠️ Could not find existing translation PR") + return { + "created": False, + "translation_pr_number": None, + "translation_pr_url": None + } + + # Add tracking comment + comment = f""" +🔄 **Updated for commit `{self.head_sha[:8]}`** + +Latest source changes from PR #{self.pr_number} have been translated and committed. + +**Source commit:** [`{self.head_sha[:8]}`](https://github.com/{self.repo_name}/commit/{self.head_sha}) +**Original PR:** #{self.pr_number}""" + + self.run_gh("pr", "comment", pr_number, "--body", comment, check=False) + + pr_url = f"https://github.com/{self.repo_name}/pull/{pr_number}" + + print(f"✅ Updated translation PR #{pr_number}") + + return { + "translation_pr_number": pr_number, + "translation_pr_url": pr_url, + "created": False + } + + async def run(self) -> Dict: + """Run the complete translation PR workflow.""" + try: + # Check if branch exists + branch_exists = self.check_branch_exists() + print(f"Translation branch exists: {branch_exists}") + + # Setup translation branch + self.setup_translation_branch(branch_exists) + + # Run translation + translation_results = await self.run_translation() + + if translation_results["failed"]: + print(f"⚠️ Some translations failed: {translation_results['failed']}") + + # Commit changes + has_changes = self.commit_changes(branch_exists) + + if not has_changes: + return { + "success": True, + "has_changes": False, + "translation_results": translation_results + } + + # Push changes + self.push_changes() + + # Create or update PR + pr_info = self.create_or_update_pr(branch_exists) + + return { + "success": True, + "has_changes": True, + "translation_results": translation_results, + **pr_info + } + + except Exception as e: + print(f"❌ Error: {e}") + import traceback + traceback.print_exc() + return { + "success": False, + "error": str(e) + } + + +def main(): + parser = argparse.ArgumentParser( + description="Translate and commit documentation changes to a translation PR" + ) + parser.add_argument("--pr-number", type=int, required=True, help="Source PR number") + parser.add_argument("--head-sha", required=True, help="HEAD commit SHA") + parser.add_argument("--base-sha", required=True, help="Base commit SHA for comparison") + parser.add_argument("--is-incremental", action="store_true", help="Whether this is an incremental update") + parser.add_argument("--pr-title", help="Source PR title") + parser.add_argument("--work-dir", default="/tmp", help="Working directory for artifacts") + parser.add_argument("--api-key", help="Dify API key (defaults to DIFY_API_KEY env var)") + + args = parser.parse_args() + + manager = TranslationPRManager( + pr_number=args.pr_number, + head_sha=args.head_sha, + base_sha=args.base_sha, + is_incremental=args.is_incremental, + 
pr_title=args.pr_title,
+        work_dir=args.work_dir,
+        api_key=args.api_key
+    )
+
+    result = asyncio.run(manager.run())
+
+    # Output result as JSON for workflow parsing
+    print("\n" + "="*80)
+    print("RESULT_JSON:")
+    print(json.dumps(result, indent=2))
+    print("="*80)
+
+    # Exit with appropriate code
+    sys.exit(0 if result.get("success") else 1)
+
+
+if __name__ == "__main__":
+    main()
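# Sketch of how a calling workflow step might recover the result emitted after
# the "RESULT_JSON:" marker above; the consuming step is not shown in this diff,
# so the captured output below is an illustrative assumption.
import json

sep = "=" * 80
captured_stdout = (
    "...translation logs...\n"
    + sep + "\n"
    + "RESULT_JSON:\n"
    + '{"success": true, "has_changes": true}\n'
    + sep
)
payload = captured_stdout.split("RESULT_JSON:", 1)[1].rsplit(sep, 1)[0]
result = json.loads(payload)
print(result["success"])  # True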