Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/runners/Dockerfile.llm-doc-annotator
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Runner image for the LLM document annotator: ships the Gemini client,
# tokeniser, and YAML/JSON5 parsers used by the annotation script that the
# workflow runs inside this container.
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install required system packages
# (build-essential for any wheels that need compiling; curl/git for fetches)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
curl \
git \
&& rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# NOTE(review): json5 is unpinned while every other package is pinned —
# consider pinning it too for reproducible builds.
RUN pip install --no-cache-dir \
google-generativeai==0.8.5 \
tiktoken==0.9.0 \
PyYAML==6.0.2 \
json5
55 changes: 55 additions & 0 deletions .github/runners/Dockerfile.quarto-doc-builder
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Runner image for rendering CLMS documentation with Quarto.
FROM debian:bookworm-slim

# Suppress interactive prompts (tzdata etc.) during apt installs.
ENV DEBIAN_FRONTEND=noninteractive

# System dependencies: download/VCS tooling, Python, fonts, and the headless
# renderers Quarto relies on (Chromium; LibreOffice + a JRE for docx output).
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    curl \
    sudo \
    git \
    bash \
    rsync \
    ca-certificates \
    python3 \
    python3-pip \
    fonts-dejavu \
    coreutils \
    procps \
    chromium \
    libreoffice-core \
    libreoffice-writer \
    libreoffice-java-common \
    default-jre-headless


# Install the Quarto CLI, picking the .deb matching the build architecture;
# fail fast on anything other than amd64/arm64.
RUN arch=$(dpkg --print-architecture) && \
    case "$arch" in \
        amd64) deb=quarto-1.7.30-linux-amd64.deb ;; \
        arm64) deb=quarto-1.7.30-linux-arm64.deb ;; \
        *) echo "Unsupported architecture: $arch" && exit 1 ;; \
    esac && \
    wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.7.30/$deb && \
    dpkg -i $deb && \
    rm $deb

# Install Node.js (for GitHub Actions written in Node)
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
    apt-get install -y nodejs

# Clean up apt, pip, Python bytecode, and docs to shrink the image.
# Fix: dropped the `rm -rf $HF_HOME/hub/...` line from the original —
# HF_HOME is never defined in this image (leftover from an unrelated
# Dockerfile), so it expanded empty and targeted root-level /hub/* paths.
# NOTE(review): the /usr/local/lib/python3.*/ glob assumes that directory
# exists in the base layout — confirm before tightening further.
RUN apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    python3 -m pip cache purge && \
    find /usr/local/lib/python3.*/ -name '__pycache__' -exec rm -r {} + && \
    find /usr/local/lib/python3.*/ -name '*.pyc' -delete && \
    rm -rf /root/.cache/pip /root/.cache/fontconfig && \
    rm -rf /usr/share/doc /usr/share/man /usr/share/locale /var/cache/* /tmp/* /var/tmp/*


# Entrypoint script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
105 changes: 79 additions & 26 deletions .github/scripts/generate_intros_and_keywords.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from pathlib import Path
import json
import time
import re
import google.generativeai as genai
import tiktoken
import yaml
from io import StringIO
import os
from pathlib import Path
import sys
import json5
import hashlib

# Configuration
API_KEY = os.getenv("GEMINI_API_KEY")
Expand All @@ -18,13 +19,18 @@

SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = (SCRIPT_DIR / "../../DOCS").resolve()
ROOT_DIR = (SCRIPT_DIR / "../../").resolve()
CACHE_DIR = (SCRIPT_DIR / "../../.llm_cache").resolve()
CACHE_DIR.mkdir(exist_ok=True)
BLACKLISTED_DIRS = {"templates", "includes", "theme"}

PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA).
PROMPT = """You are an AI assistant helping to enrich technical documents for the Copernicus Land Monitoring Service (CLMS).

Your tasks:
1. Read and understand the entire attached document.
1. Read and understand the entire attached document. Ignore yml metadata and focus on the main content.
2. Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus.
3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document.
4. Use British English spelling and terminology.

Keyword guidance:
- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\".
Expand All @@ -38,6 +44,8 @@
\"introduction\": \"...\",
\"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"]
}

Avoid trailing commas in the JSON output.
"""

# Setup Gemini
Expand All @@ -48,15 +56,15 @@


# Function to update YAML frontmatter using PyYAML
def update_yaml_header(content: str, description: str, keywords_list: list):
lines = content.splitlines()
def update_qmd_file(doc_path, description: str, keywords_list: list):
lines = doc_path.read_text(encoding="utf-8").splitlines()
if lines[0].strip() != "---":
return content
return

try:
end_idx = lines[1:].index("---") + 1
except ValueError:
return content
return

yaml_block = "\n".join(lines[1:end_idx])
yaml_data = yaml.safe_load(yaml_block) or {}
Expand All @@ -65,16 +73,17 @@ def update_yaml_header(content: str, description: str, keywords_list: list):

new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip()
new_lines = ["---"] + new_yaml_block.splitlines() + ["---"] + lines[end_idx + 1 :]
return "\n".join(new_lines)

doc_path.write_text("\n".join(new_lines), encoding="utf-8")


# Function to process one document with Gemini
def process_document_with_llm(doc_path: Path):
print("Processing ", doc_path)
global total_tokens_sent

file_contents = doc_path.read_text(encoding="utf-8")
input_tokens = len(encoding.encode(file_contents))
print(f"[LLM] Processing: {doc_path} ({input_tokens} input tokens)")
if total_tokens_sent + input_tokens > TOKEN_LIMIT_PER_MINUTE:
print(
f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens."
Expand Down Expand Up @@ -106,27 +115,71 @@ def process_document_with_llm(doc_path: Path):
raw_text = re.sub(r"\s*```$", "", raw_text)

try:
parsed_output = json.loads(raw_text)
introduction = parsed_output["introduction"]
keywords_list = parsed_output["keywords"]
keywords = ", ".join(keywords_list)
parsed_output = json5.loads(raw_text)
return {
"introduction": parsed_output["introduction"],
"keywords": parsed_output["keywords"],
}
except (json.JSONDecodeError, KeyError) as e:
print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
return
return None

updated_content = update_yaml_header(file_contents, introduction, keywords_list)
output_file = doc_path.with_name(doc_path.stem + ".qmd")
output_file.write_text(updated_content, encoding="utf-8")

print("Estimated input tokens:", input_tokens)
# Function to read paths of modified documents (.qmd) from a file. The file is
# provided by github actions as an input (first CLI argument to this script).
def read_paths_from_filename(input_file=None):
    """Return the non-empty, stripped lines of *input_file* as a list.

    Parameters
    ----------
    input_file : str | None
        Path to the list file. Defaults to ``sys.argv[1]`` (the workflow
        input) when omitted, preserving the original zero-argument call.

    Raises
    ------
    FileNotFoundError
        If the list file does not exist.
    RuntimeError
        For any other failure while reading, chained to the original error.
    """
    if input_file is None:
        input_file = sys.argv[1]
    try:
        with open(input_file, "r") as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError as e:
        # Re-raise with a clearer message, keeping the original as the cause.
        raise FileNotFoundError(f"File not found: {input_file}") from e
    except Exception as e:
        # Chain with `from e` so the underlying traceback is preserved.
        raise RuntimeError(f"Error reading file: {e}") from e


# Process all .qmd files
BLACKLISTED_DIRS = {"templates", "includes", "theme"}
# Cache-related functions
def file_hash(path):
    """Return the SHA-256 hex digest of the file at *path* (a Path)."""
    digest = hashlib.sha256()
    digest.update(path.read_bytes())
    return digest.hexdigest()


def get_cache_path(qmd_path):
    """Map a document path to its JSON cache file under CACHE_DIR.

    Path components are flattened with ``__`` so every document gets a
    unique, filesystem-safe cache filename in the flat cache directory.
    """
    flattened = "__".join(qmd_path.parts)
    return CACHE_DIR / (flattened + ".json")


def load_cached_result(cache_path):
    """Load the cached JSON entry at *cache_path*; empty dict if absent."""
    if not cache_path.exists():
        return {}
    with cache_path.open() as f:
        return json.load(f)


def save_cached_result(cache_path, data):
    """Write *data* to *cache_path* as pretty-printed (indent=2) JSON."""
    serialized = json.dumps(data, indent=2)
    with cache_path.open("w") as f:
        f.write(serialized)


if __name__ == "__main__":
modified_paths = set(Path(p) for p in read_paths_from_filename())

for full_doc_path in INPUT_DIR.rglob("*.qmd"):
doc_path = full_doc_path.relative_to(ROOT_DIR)
if any(part in BLACKLISTED_DIRS for part in doc_path.parts):
continue

cache_path = get_cache_path(doc_path)
current_hash = file_hash(doc_path)
cache = load_cached_result(cache_path)

if doc_path in modified_paths or cache.get("hash") != current_hash:
result = process_document_with_llm(doc_path)
cache = {
"hash": current_hash,
"intro": result["introduction"],
"keywords": result["keywords"],
}
save_cached_result(cache_path, cache)

for doc_path in INPUT_DIR.rglob("*.qmd"):
if any(part in BLACKLISTED_DIRS for part in doc_path.parts):
continue
process_document_with_llm(doc_path)
update_qmd_file(doc_path, cache["intro"], cache["keywords"])

print("Total tokens sent:", total_tokens_sent)
print("Total tokens sent:", total_tokens_sent)
75 changes: 0 additions & 75 deletions .github/scripts/generate_keywords.py

This file was deleted.

35 changes: 34 additions & 1 deletion .github/workflows/deploy-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,40 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-depth: 2

- name: Mark repo as safe for Git
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"

- name: Show commit history
run: git log --oneline -n 5

- name: Get changed .qmd files on current branch
run: |
(git diff --name-only HEAD^ HEAD | grep -E '\.qmd$' || true) > modified_docs_list.txt
echo "Modified .qmd files:"
cat modified_docs_list.txt

- name: Generate intros and keywords
uses: addnab/docker-run-action@v3
with:
image: mckeea/llm-doc-annotator:latest
options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ github.workspace }}:/app
run: python .github/scripts/generate_intros_and_keywords.py modified_docs_list.txt

- name: Commit updated LLM cache
run: |
git config user.name "GitHub Actions"
git config user.email "[email protected]"

git add .llm_cache

if git diff --cached --quiet; then
echo "No changes to commit."
else
git commit -m "Update LLM cache [skip ci]"
git push origin HEAD
fi

- name: Generate intros and keywords
uses: addnab/docker-run-action@v3
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"hash": "6c2405be4e8491221a1444f66040b7dfa982e079673ff790b6d64a756a545499",
"intro": "This document outlines the IT architecture principles and implementation guidelines for the Copernicus Land Monitoring Service (CLMS), managed by the European Environment Agency (EEA). It establishes a framework for developing consistent, scalable, and secure IT solutions within the CLMS programme. Emphasising key aspects like modularity, reproducibility, transparency, maintainability, observability, security and resilience, these guidelines ensure that all IT solutions are coherent, adaptable, and efficiently operated, contributing to the overarching goals of the CLMS.",
"keywords": [
"IT solution reproducibility",
"REST API service",
"Continuous Integration and Continuous Deployment",
"Client specific software modularity",
"Infrastructure-as-Code",
"Role-based access control",
"Disaster recovery plan",
"IT solution cloud agnosticism",
"Source code inline documentation",
"Automated metric monitoring"
]
}
Loading