Skip to content

Commit dccdc3b

Browse files
committed
Merge branch 'test'
2 parents c4f9660 + 10272ff commit dccdc3b

22 files changed

+510
-524
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
FROM python:3.11-slim

# Work out of /app; the repository is expected to be mounted/copied here.
WORKDIR /app

# System packages needed to build/install the Python dependencies below.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies for the annotation script.
# NOTE(review): json5 is unpinned while every other package is pinned —
# consider pinning it too for reproducible builds.
RUN pip install --no-cache-dir \
        google-generativeai==0.8.5 \
        tiktoken==0.9.0 \
        PyYAML==6.0.2 \
        json5
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
FROM debian:bookworm-slim

# Suppress interactive prompts from apt during the build.
ENV DEBIAN_FRONTEND=noninteractive

# System dependencies: shell tooling, Python, Chromium (for Quarto's
# screenshot/render paths), LibreOffice + a headless JRE for document
# conversion, and fonts.
RUN apt-get update && apt-get install -y --no-install-recommends \
        wget \
        curl \
        sudo \
        git \
        bash \
        rsync \
        ca-certificates \
        python3 \
        python3-pip \
        fonts-dejavu \
        coreutils \
        procps \
        chromium \
        libreoffice-core \
        libreoffice-writer \
        libreoffice-java-common \
        default-jre-headless

# Install Quarto CLI, pinned to v1.7.30, selecting the .deb matching
# the build architecture; fail fast on anything but amd64/arm64.
RUN arch=$(dpkg --print-architecture) && \
    case "$arch" in \
        amd64) deb=quarto-1.7.30-linux-amd64.deb ;; \
        arm64) deb=quarto-1.7.30-linux-arm64.deb ;; \
        *) echo "Unsupported architecture: $arch" && exit 1 ;; \
    esac && \
    wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.7.30/$deb && \
    dpkg -i $deb && \
    rm $deb

# Install Node.js (for GitHub Actions written in Node)
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
    apt-get install -y nodejs

# Clean up apt, pip, Python bytecode caches, and docs.
# FIX: the original also ran `rm -rf $HF_HOME/hub/...`, but HF_HOME is never
# set anywhere in this image, so the pattern expanded against "/hub/..." —
# a leftover from a different image; removed here.
# NOTE(review): this cleanup runs in its own layer, so it cannot shrink the
# layers created by the RUNs above — consider folding it into them.
RUN apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    python3 -m pip cache purge && \
    find /usr/local/lib/python3.*/ -name '__pycache__' -exec rm -r {} + && \
    find /usr/local/lib/python3.*/ -name '*.pyc' -delete && \
    rm -rf /root/.cache/pip /root/.cache/fontconfig && \
    rm -rf /usr/share/doc /usr/share/man /usr/share/locale /var/cache/* /tmp/* /var/tmp/*

# Entrypoint script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

.github/scripts/generate_intros_and_keywords.py

Lines changed: 79 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from pathlib import Path
22
import json
3-
import time
43
import re
54
import google.generativeai as genai
65
import tiktoken
76
import yaml
8-
from io import StringIO
97
import os
108
from pathlib import Path
9+
import sys
10+
import json5
11+
import hashlib
1112

1213
# Configuration
1314
API_KEY = os.getenv("GEMINI_API_KEY")
@@ -18,13 +19,18 @@
1819

1920
SCRIPT_DIR = Path(__file__).resolve().parent
2021
INPUT_DIR = (SCRIPT_DIR / "../../DOCS").resolve()
22+
ROOT_DIR = (SCRIPT_DIR / "../../").resolve()
23+
CACHE_DIR = (SCRIPT_DIR / "../../.llm_cache").resolve()
24+
CACHE_DIR.mkdir(exist_ok=True)
25+
BLACKLISTED_DIRS = {"templates", "includes", "theme"}
2126

22-
PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA).
27+
PROMPT = """You are an AI assistant helping to enrich technical documents for the Copernicus Land Monitoring Service (CLMS).
2328
2429
Your tasks:
25-
1. Read and understand the entire attached document.
30+
1. Read and understand the entire attached document. Ignore yml metadata and focus on the main content.
2631
2. Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus.
2732
3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document.
33+
4. Use British English spelling and terminology.
2834
2935
Keyword guidance:
3036
- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\".
@@ -38,6 +44,8 @@
3844
\"introduction\": \"...\",
3945
\"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"]
4046
}
47+
48+
Avoid trailing commas in the JSON output.
4149
"""
4250

4351
# Setup Gemini
@@ -48,15 +56,15 @@
4856

4957

5058
# Function to update YAML frontmatter using PyYAML
51-
def update_yaml_header(content: str, description: str, keywords_list: list):
52-
lines = content.splitlines()
59+
def update_qmd_file(doc_path, description: str, keywords_list: list):
60+
lines = doc_path.read_text(encoding="utf-8").splitlines()
5361
if lines[0].strip() != "---":
54-
return content
62+
return
5563

5664
try:
5765
end_idx = lines[1:].index("---") + 1
5866
except ValueError:
59-
return content
67+
return
6068

6169
yaml_block = "\n".join(lines[1:end_idx])
6270
yaml_data = yaml.safe_load(yaml_block) or {}
@@ -65,16 +73,17 @@ def update_yaml_header(content: str, description: str, keywords_list: list):
6573

6674
new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip()
6775
new_lines = ["---"] + new_yaml_block.splitlines() + ["---"] + lines[end_idx + 1 :]
68-
return "\n".join(new_lines)
76+
77+
doc_path.write_text("\n".join(new_lines), encoding="utf-8")
6978

7079

7180
# Function to process one document with Gemini
7281
def process_document_with_llm(doc_path: Path):
73-
print("Processing ", doc_path)
7482
global total_tokens_sent
7583

7684
file_contents = doc_path.read_text(encoding="utf-8")
7785
input_tokens = len(encoding.encode(file_contents))
86+
print(f"[LLM] Processing: {doc_path} ({input_tokens} input tokens)")
7887
if total_tokens_sent + input_tokens > TOKEN_LIMIT_PER_MINUTE:
7988
print(
8089
f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens."
@@ -106,27 +115,71 @@ def process_document_with_llm(doc_path: Path):
106115
raw_text = re.sub(r"\s*```$", "", raw_text)
107116

108117
try:
109-
parsed_output = json.loads(raw_text)
110-
introduction = parsed_output["introduction"]
111-
keywords_list = parsed_output["keywords"]
112-
keywords = ", ".join(keywords_list)
118+
parsed_output = json5.loads(raw_text)
119+
return {
120+
"introduction": parsed_output["introduction"],
121+
"keywords": parsed_output["keywords"],
122+
}
113123
except (json.JSONDecodeError, KeyError) as e:
114124
print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
115-
return
125+
return None
116126

117-
updated_content = update_yaml_header(file_contents, introduction, keywords_list)
118-
output_file = doc_path.with_name(doc_path.stem + ".qmd")
119-
output_file.write_text(updated_content, encoding="utf-8")
120127

121-
print("Estimated input tokens:", input_tokens)
# Read paths of modified documents (.qmd) from a file. In CI the file name is
# provided by GitHub Actions as the first command-line argument.
def read_paths_from_filename(input_file=None):
    """Return the non-empty, stripped lines of *input_file* as a list.

    Args:
        input_file: Path of the list file to read. Defaults to
            ``sys.argv[1]`` (backward compatible with the original
            zero-argument call used by the CI workflow).

    Raises:
        FileNotFoundError: If the file does not exist.
        RuntimeError: For any other error while reading, chained to the
            original exception (the original re-raise dropped the cause).
    """
    if input_file is None:
        input_file = sys.argv[1]
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        raise FileNotFoundError(f"File not found: {input_file}")
    except Exception as e:
        # `from e` preserves the traceback of the underlying failure.
        raise RuntimeError(f"Error reading file: {e}") from e
122138

123139

124-
# Process all .qmd files
125-
BLACKLISTED_DIRS = {"templates", "includes", "theme"}
140+
# Cache-related functions
def file_hash(path):
    """Return the SHA-256 hex digest of the file at *path*."""
    digest = hashlib.sha256()
    digest.update(path.read_bytes())
    return digest.hexdigest()
143+
144+
145+
def get_cache_path(qmd_path):
    """Map a document path to its cache file inside CACHE_DIR.

    Path separators are flattened to ``__`` so each document gets a
    single .json file directly under the cache directory.
    """
    flattened = "__".join(qmd_path.parts)
    return CACHE_DIR / f"{flattened}.json"
148+
149+
150+
def load_cached_result(cache_path):
151+
if cache_path.exists():
152+
with cache_path.open() as f:
153+
return json.load(f)
154+
return {}
155+
156+
157+
def save_cached_result(cache_path, data):
158+
with cache_path.open("w") as f:
159+
json.dump(data, f, indent=2)
160+
161+
162+
if __name__ == "__main__":
163+
modified_paths = set(Path(p) for p in read_paths_from_filename())
164+
165+
for full_doc_path in INPUT_DIR.rglob("*.qmd"):
166+
doc_path = full_doc_path.relative_to(ROOT_DIR)
167+
if any(part in BLACKLISTED_DIRS for part in doc_path.parts):
168+
continue
169+
170+
cache_path = get_cache_path(doc_path)
171+
current_hash = file_hash(doc_path)
172+
cache = load_cached_result(cache_path)
173+
174+
if doc_path in modified_paths or cache.get("hash") != current_hash:
175+
result = process_document_with_llm(doc_path)
176+
cache = {
177+
"hash": current_hash,
178+
"intro": result["introduction"],
179+
"keywords": result["keywords"],
180+
}
181+
save_cached_result(cache_path, cache)
126182

127-
for doc_path in INPUT_DIR.rglob("*.qmd"):
128-
if any(part in BLACKLISTED_DIRS for part in doc_path.parts):
129-
continue
130-
process_document_with_llm(doc_path)
183+
update_qmd_file(doc_path, cache["intro"], cache["keywords"])
131184

132-
print("Total tokens sent:", total_tokens_sent)
185+
print("Total tokens sent:", total_tokens_sent)

.github/scripts/generate_keywords.py

Lines changed: 0 additions & 75 deletions
This file was deleted.

.github/workflows/deploy-docs.yml

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,40 @@ jobs:
1919
- name: Checkout code
2020
uses: actions/checkout@v4
2121
with:
22-
fetch-depth: 0
22+
fetch-depth: 2
23+
24+
- name: Mark repo as safe for Git
25+
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
26+
27+
- name: Show commit history
28+
run: git log --oneline -n 5
29+
30+
- name: Get changed .qmd files on current branch
31+
run: |
32+
(git diff --name-only HEAD^ HEAD | grep -E '\.qmd$' || true) > modified_docs_list.txt
33+
echo "Modified .qmd files:"
34+
cat modified_docs_list.txt
35+
36+
- name: Generate intros and keywords
37+
uses: addnab/docker-run-action@v3
38+
with:
39+
image: mckeea/llm-doc-annotator:latest
40+
options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ github.workspace }}:/app
41+
run: python .github/scripts/generate_intros_and_keywords.py modified_docs_list.txt
42+
43+
- name: Commit updated LLM cache
44+
run: |
45+
git config user.name "GitHub Actions"
46+
git config user.email "actions@github.com"
47+
48+
git add .llm_cache
49+
50+
if git diff --cached --quiet; then
51+
echo "No changes to commit."
52+
else
53+
git commit -m "Update LLM cache [skip ci]"
54+
git push origin HEAD
55+
fi
2356
2457
- name: Generate intros and keywords
2558
uses: addnab/docker-run-action@v3
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"hash": "6c2405be4e8491221a1444f66040b7dfa982e079673ff790b6d64a756a545499",
3+
"intro": "This document outlines the IT architecture principles and implementation guidelines for the Copernicus Land Monitoring Service (CLMS), managed by the European Environment Agency (EEA). It establishes a framework for developing consistent, scalable, and secure IT solutions within the CLMS programme. Emphasising key aspects like modularity, reproducibility, transparency, maintainability, observability, security and resilience, these guidelines ensure that all IT solutions are coherent, adaptable, and efficiently operated, contributing to the overarching goals of the CLMS.",
4+
"keywords": [
5+
"IT solution reproducibility",
6+
"REST API service",
7+
"Continuous Integration and Continuous Deployment",
8+
"Client specific software modularity",
9+
"Infrastructure-as-a-code",
10+
"Role-based access control",
11+
"Disaster recovery plan",
12+
"IT solution cloud agnosticism",
13+
"Source code inline documentation",
14+
"Automated metric monitoring"
15+
]
16+
}

0 commit comments

Comments
 (0)