feat(merge): promote develop to test (#5)

mckeea · Matteo Mattiuzzi · web-flow · commit 99cf4bda51da · 2025-06-11T17:20:40.000+02:00
* update: editor's manual

* Update deploy-docs.yml

* Small update to editor's manual to check git workflow

* LLM annotator for intros and keywords

* corrected to the new url

* Add date,version fields to listings

* Fix: generate a correct sitemap.xml

---------

Co-authored-by: Matteo Mattiuzzi <matteo.mattiuzzi@eea.europa.eu>
diff --git a/.github/scripts/build-docs.sh b/.github/scripts/build-docs.sh
@@ -1,42 +1,15 @@
 #!/bin/bash
 set -e
 
-# echo "🐍 Setting up Python environment..."
-# apt-get update
-# apt-get install -y python3 python3-venv python3-pip
-
-# echo "📦 Creating virtual environment..."
-# python3 -m venv venv
-# source venv/bin/activate
-
-# echo "⬆️ Upgrading pip inside virtual environment..."
-# pip install --upgrade pip
-
-# echo "📦 Installing Python dependencies..."
-# pip install \
-#     keybert \
-#     ruamel.yaml \
-#     pyyaml \
-#     transformers==4.37.2 \
-#     accelerate==0.27.2
-
-# source venv/bin/activate
-
-# echo "🛠 Setting up default Quarto configuration..."
-# mv _quarto_not_used.yaml _quarto.yaml
-
-# echo "🏷 Generating keywords..."
-# python scripts/render/generate_keywords.py
-
-#echo "🧹 Cleaning up cached _site directory..."
-#rm -rf _site
-
-
 echo "🖼 Render all documents into to HTML/DOCX"
 sudo cp /usr/bin/chromium /usr/bin/chromium-browser
-QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html
-QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx --no-clean
+# DOCX is rendered first; the HTML pass below runs with --no-clean so the
+# DOCX output already present in _site is kept.
+QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx
 find _site -type f -name 'index.docx' -delete
+QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html --no-clean
+
+# Backup the correct sitemap as it may be overwritten by next operations
+# NOTE(review): the reason for the 5 s pause is not visible here; confirm it
+# is still required. With `set -e`, a missing sitemap.xml aborts the build.
+sleep 5
+mv _site/sitemap.xml _site/sitemap.xml.bkp
 
-echo "🛠 Generate index.qmd files for all DOCS/* folders"e
+echo "🛠 Generate index.qmd files for all DOCS/* folders"
 node .github/scripts/generate_index_all.mjs
@@ -63,6 +36,9 @@ echo '<!DOCTYPE html>
   </body>
 </html>' > _site/index.html
 
+# Restore the sitemap that was set aside as sitemap.xml.bkp, replacing any
+# sitemap.xml written in the meantime (single rename instead of copy + delete).
+mv -f _site/sitemap.xml.bkp _site/sitemap.xml
 
 echo "📄 Converting .docx files to .pdf..."
 #chmod +x ./convert_docx_to_pdf.sh
diff --git a/.github/scripts/generate_index_all.mjs b/.github/scripts/generate_index_all.mjs
@@ -19,7 +19,7 @@ listing:
   type: table
   contents: .
   sort: title
-  fields: [title]
+  fields: [title, date, version]
 ---
 `;
 
diff --git a/.github/scripts/generate_intros_and_keywords.py b/.github/scripts/generate_intros_and_keywords.py
@@ -0,0 +1,132 @@
+from pathlib import Path
+import json
+import time
+import re
+import google.generativeai as genai
+import tiktoken
+import yaml
+from io import StringIO
+import os
+from pathlib import Path
+
+# Configuration
+# The Gemini API key is read from the environment; the script aborts right
+# here, before doing any work, when the variable is missing or empty.
+API_KEY = os.getenv("GEMINI_API_KEY")
+if not API_KEY:
+    raise EnvironmentError("GEMINI_API_KEY environment variable not set")
+MODEL_NAME = "gemini-2.0-flash"
+TOKEN_LIMIT_PER_MINUTE = 950_000  # Keep a safe margin below 1M
+
+# Documents are read from the DOCS folder two levels above this script's
+# own directory.
+SCRIPT_DIR = Path(__file__).resolve().parent
+INPUT_DIR = (SCRIPT_DIR / "../../DOCS").resolve()
+
+# Instruction text sent along with every document. The model is asked to
+# reply with a raw JSON object holding an "introduction" string and a
+# "keywords" list.
+PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA).
+
+Your tasks:
+1. Read and understand the entire attached document.
+2. Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus.
+3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document.
+
+Keyword guidance:
+- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\".
+- Focus on **specific concepts, methods, environmental indicators, technical systems, data processing strategies**, or **analytical results** that are central to the document.
+- Use **multi-word phrases** when needed for clarity and specificity.
+- Think like an expert indexing the document for scientific search or semantic web use.
+
+Return only the result as a raw JSON object (no code block, no explanation):
+
+{
+  \"introduction\": \"...\",
+  \"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"]
+}
+"""
+
+# Setup Gemini
+genai.configure(api_key=API_KEY)
+model = genai.GenerativeModel(MODEL_NAME)
+# Encoding used to estimate how many tokens a document contains.
+# NOTE(review): cl100k_base is presumably not the tokenizer Gemini itself
+# uses, so these counts should be treated as approximations - confirm.
+encoding = tiktoken.get_encoding("cl100k_base")
+# Running total of estimated input tokens sent to the model so far.
+total_tokens_sent = 0
+
+
+# Function to update YAML frontmatter using PyYAML
+def update_yaml_header(content: str, description: str, keywords_list: list) -> str:
+    """Return *content* with ``description`` and ``keywords`` set in its YAML front matter.
+
+    The front matter is the block between a leading ``---`` line and the next
+    ``---`` line. Existing keys are kept in their original order; the two
+    keys are overwritten if present and appended otherwise.
+
+    Args:
+        content: Full text of the document.
+        description: Text stored under ``description``; newlines are replaced
+            by spaces and surrounding whitespace is removed.
+        keywords_list: Value stored under ``keywords``.
+
+    Returns:
+        The updated document text. *content* is returned unchanged when it is
+        empty, has no complete front matter block, or the front matter is not
+        a valid YAML mapping.
+    """
+    lines = content.splitlines()
+    # An empty document has no first line to inspect.
+    if not lines or lines[0].strip() != "---":
+        return content
+
+    # Tolerate trailing whitespace on the closing fence so that a later
+    # horizontal rule in the body is not mistaken for the end of the header.
+    end_idx = next(
+        (i for i, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
+        None,
+    )
+    if end_idx is None:
+        return content
+
+    try:
+        yaml_data = yaml.safe_load("\n".join(lines[1:end_idx])) or {}
+    except yaml.YAMLError:
+        return content
+    # Only a mapping can receive the two keys (a bare scalar or list cannot).
+    if not isinstance(yaml_data, dict):
+        return content
+
+    yaml_data["description"] = description.replace("\n", " ").strip()
+    yaml_data["keywords"] = keywords_list
+
+    new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip()
+    new_lines = ["---", *new_yaml_block.splitlines(), "---", *lines[end_idx + 1 :]]
+    updated = "\n".join(new_lines)
+    # splitlines() drops the final newline; put it back so the file keeps
+    # ending with one.
+    return updated + "\n" if content.endswith("\n") else updated
+
+
+# Fixed one-minute window used to stay below TOKEN_LIMIT_PER_MINUTE.
+_window_started_at = time.monotonic()
+_window_tokens = 0
+
+
+# Function to process one document with Gemini
+def process_document_with_llm(doc_path: Path):
+    """Annotate one document with an LLM-generated introduction and keywords.
+
+    The file is sent to the Gemini model together with PROMPT. The JSON reply
+    is validated and its content is written into the YAML front matter of the
+    ``.qmd`` file next to *doc_path*. An unusable or malformed reply is
+    reported on stdout and the document is left untouched.
+
+    Args:
+        doc_path: Path of the document to read and annotate.
+    """
+    global total_tokens_sent, _window_started_at, _window_tokens
+
+    print("Processing ", doc_path)
+    file_contents = doc_path.read_text(encoding="utf-8")
+    input_tokens = len(encoding.encode(file_contents))
+
+    # A document larger than the whole per-minute budget can never be sent.
+    if input_tokens > TOKEN_LIMIT_PER_MINUTE:
+        print(
+            f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens."
+        )
+        return
+
+    # The budget is per minute: wait for a fresh window rather than skipping
+    # every remaining document once the running total reaches the limit.
+    elapsed = time.monotonic() - _window_started_at
+    if elapsed >= 60:
+        _window_started_at, _window_tokens = time.monotonic(), 0
+    elif _window_tokens + input_tokens > TOKEN_LIMIT_PER_MINUTE:
+        time.sleep(60 - elapsed)
+        _window_started_at, _window_tokens = time.monotonic(), 0
+
+    response = model.generate_content(
+        contents=[
+            {
+                "role": "user",
+                "parts": [
+                    {"text": PROMPT},
+                    {
+                        "inline_data": {
+                            "mime_type": "text/plain",
+                            "data": file_contents.encode("utf-8"),
+                        }
+                    },
+                ],
+            }
+        ]
+    )
+
+    total_tokens_sent += input_tokens
+    _window_tokens += input_tokens
+
+    # The .text accessor raises ValueError when the reply carries no text
+    # part (for example an empty or blocked response).
+    try:
+        raw_text = response.text.strip()
+    except ValueError as exc:
+        print(f"[ERROR] No usable response for {doc_path}: {exc}")
+        return
+
+    # The model sometimes wraps the JSON in a Markdown code fence despite the
+    # prompt asking for raw JSON.
+    if raw_text.startswith("```"):
+        raw_text = re.sub(r"^```(?:json)?\s*", "", raw_text)
+        raw_text = re.sub(r"\s*```$", "", raw_text)
+
+    try:
+        parsed_output = json.loads(raw_text)
+        introduction = parsed_output["introduction"]
+        keywords_list = parsed_output["keywords"]
+        if not isinstance(introduction, str):
+            raise TypeError("introduction must be a string")
+        if not isinstance(keywords_list, list) or not all(
+            isinstance(keyword, str) for keyword in keywords_list
+        ):
+            raise TypeError("keywords must be a list of strings")
+    except (json.JSONDecodeError, KeyError, TypeError):
+        # TypeError also covers a reply that is valid JSON but not an object.
+        print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
+        return
+
+    updated_content = update_yaml_header(file_contents, introduction, keywords_list)
+    output_file = doc_path.with_name(doc_path.stem + ".qmd")
+    output_file.write_text(updated_content, encoding="utf-8")
+
+    print("Estimated input tokens:", input_tokens)
+
+
+# Walk every .qmd file below INPUT_DIR and annotate it, leaving out any path
+# that has a blacklisted name among its components.
+BLACKLISTED_DIRS = {"templates", "includes", "theme"}
+
+for doc_path in INPUT_DIR.rglob("*.qmd"):
+    if BLACKLISTED_DIRS.isdisjoint(doc_path.parts):
+        process_document_with_llm(doc_path)
+
+print("Total tokens sent:", total_tokens_sent)
diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
@@ -21,6 +21,13 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Generate intros and keywords
+        uses: addnab/docker-run-action@v3
+        with:
+          image: mckeea/llm-doc-annotator:latest
+          options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ github.workspace }}:/app
+          run: python .github/scripts/generate_intros_and_keywords.py
+
       - name: Build Docs
         run: .github/scripts/build-docs.sh
 
diff --git a/DOCS/guidelines/editor-manual.qmd b/DOCS/guidelines/editor-manual.qmd
@@ -2,7 +2,7 @@
 title: "Guide for Writing Technical Documentation"
 subtitle: "Copernicus Land Monitoring Service"
 author: "European Environment Agency (EEA)"
-version: 0.5
+version: 0.6
 description: "A comprehensive guide for creating technical documentation for the Copernicus
   Land Monitoring Service using Quarto. It covers Markdown basics, document rendering,
   and the review process, ensuring consistency and clarity in documentation."
diff --git a/README.md b/README.md
@@ -2,4 +2,4 @@
 
 This repository contains technical documents for the CLMS, such as ATBD's, PUM's, or nomenclature guidelines.
 
-The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/)
+The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/)

Original file line number	Diff line number	Diff line change
`@@ -2,4 +2,4 @@`
`2`	`2`
`3`	`3`	`This repository contains technical documents for the CLMS, such as ATBD's, PUM's, or nomenclature guidelines.`
`4`	`4`
`5`		`-The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/)`
	`5`	`+The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/)`