40 changes: 40 additions & 0 deletions .github/workflows/generate-llms-txt.yml
@@ -0,0 +1,40 @@
name: Generate llms.txt and llms-full.txt
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ '**' ]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

permissions:
  models: read

jobs:
  generate-llms:
    permissions:
      contents: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Setup Hugo
        uses: peaceiris/actions-hugo@v2
        with:
          hugo-version: "0.148.2"
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install requests openai
      - name: Run the generation script
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: python automation/generate-llms-txt.py
      - uses: stefanzweifel/git-auto-commit-action@v5
        with:
          commit_message: "Update llms.txt and llms-full.txt"
          commit_user_name: "GitHub Actions"
          commit_user_email: "[email protected]"
208 changes: 208 additions & 0 deletions automation/generate-llms-txt.py
@@ -0,0 +1,208 @@
import csv
import glob
import subprocess
import os
import re
import openai
from typing import Iterable
from dataclasses import dataclass


BASE_DIR = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/../qdrant-landing")
OUTPUT_DIR = BASE_DIR + "/static"
GENERAL_DESCRIPTION = (
"Qdrant is a cutting-edge platform focused on delivering exceptional performance and efficiency in vector "
"similarity search. As a robust vector database, it specializes in managing, searching, and retrieving "
"high-dimensional vector data, essential for enhancing AI applications, machine learning, and modern search "
"engines. With a suite of powerful features such as state-of-the-art hybrid search capabilities, "
"retrieval-augmented generation (RAG) applications, and dense and sparse vector support, Qdrant stands out as an "
"industry leader. Its offerings include managed cloud services, enabling users to harness the robust functionality "
"of Qdrant without the burden of maintaining infrastructure. The platform supports advanced data security measures "
"and seamless integrations with popular platforms and frameworks, catering to diverse data handling and analytic "
"needs. Additionally, Qdrant offers comprehensive solutions for complex searching requirements through its "
"innovative Query API and multivector representations, allowing for precise matching and enhanced retrieval "
"quality. With its commitment to open-source principles and continuous innovation, Qdrant tailors solutions to "
"meet both small-scale projects and enterprise-level demands efficiently, helping organizations unlock profound "
"insights from their unstructured data and optimize their AI capabilities."
)

@dataclass
class HugoContent:
    path: str
    absolute_url: str
    title: str | None
    content: str | None


def sort_key(line: dict) -> int:
"""
Calculate a score for the hugo content entry based on its path importance.
The more important the path, the higher the score.
:param line: A dictionary representing a line from the CSV output of `hugo list published`.
:return:
"""
path_boosts = {
"documentation/concepts": 10,
"documentation/quickstart": 9,
"articles": 7,
"documentation": 5,
"blog": 3,
}
path = line.get("path", "")
score = sum(boost for key, boost in path_boosts.items() if key in path)
return score
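

# Illustrative only (not part of the submitted script): the boosts are cumulative when
# several keys match the same path, so concept pages end up ranked highest, e.g.:
#
#   sort_key({"path": "documentation/concepts/filtering.md"})   # -> 15 (10 + 5)
#   sort_key({"path": "blog/some-announcement.md"})             # -> 3
#   sort_key({"path": "about-us.md"})                           # -> 0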


def load_frontmatter_and_content(raw_content: str) -> (dict, str):
"""
Load the front matter and content from the raw content string.
The front matter is expected to be in YAML format and enclosed in `---` at the beginning of the content.
The content is everything after the front matter.
:param raw_content:
:return:
"""
frontmatter = dict()
if raw_content.startswith("---"):
end_index = raw_content.find("---", 3) + 3
raw_frontmatter = raw_content[:end_index].strip()
# Parse the front matter as a dictionary
for line in raw_frontmatter.splitlines()[1:-1]:
try:
key, value = line.split(":", 1)
frontmatter[key.strip()] = value.strip("\"' ") # Remove quotes and whitespace
except ValueError:
# If the line doesn't contain a key-value pair, skip it
continue
# Remove the front matter from the content
content = raw_content[end_index:].strip()
else:
content = raw_content.strip()
return frontmatter, content
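

# A minimal illustration (not part of the submitted script) of the ad-hoc parser above;
# it only handles flat `key: value` pairs and always returns string values:
#
#   fm, body = load_frontmatter_and_content('---\ntitle: "Quickstart"\nweight: 10\n---\n# Hello')
#   # fm == {"title": "Quickstart", "weight": "10"}, body == "# Hello"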

def iter_hugo_content() -> Iterable[HugoContent]:
"""
List the published content in Hugo.
:return:
"""
# Run os `hugo list published` command and capture the output.
cmd = ["hugo", "list", "published"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Command failed with error: {result.stderr}")

# Parse the output to extract the paths. Output is expected to be a CSV format
# with the first line being a header. The first column contains the path.
csv_reader = csv.DictReader(result.stdout.splitlines())
lines = list(csv_reader)
for line in sorted(lines, key=sort_key, reverse=True):
path = line.get("path")
if not path:
continue

# Load the content of the file at the given path.
with open(os.path.join(BASE_DIR, path), "r", encoding="utf-8") as file:
frontmatter, content = load_frontmatter_and_content(file.read())
if not content:
continue

# Render the code snippets in the content.
# Example: {{< code-snippet path="/documentation/headless/snippets/create-collection/simple/" >}}
snippets_iter = re.finditer(r"{{<\s*code-snippet\s+path=\"([^\"]+)\"\s*>}}", content)
for snippet in snippets_iter:
snippet_dir = snippet.group(1)
snippet_files = glob.glob("content/" + snippet_dir.strip("/") + "/[a-z]*.md")
snippet_content = ""
for snippet_file in snippet_files:
with open(snippet_file, "r", encoding="utf-8") as f:
snippet_content += f.read()
snippet_content += "\n" # Add a newline between snippets
# Replace the code snippet placeholder with the actual content
content = content.replace(snippet.group(0), snippet_content.strip())

yield HugoContent(
path=path,
absolute_url=line.get("permalink"),
title=frontmatter.get("title"),
content=content,
)


def summarize_content(content: str) -> str:
"""
Generate a summary for the given content using an LLM.
Use GitHub Models as a provider for the LLM.
:param content:
:return:
"""
# Truncate the content to a maximum of 8192 characters
content = content[:8192]

# Call the GitHub Models API to generate a summary
client = openai.OpenAI(
api_key=os.environ.get("GITHUB_TOKEN"),
base_url="https://models.github.ai/inference",
Member
Do you propose to call a language model as part of the CI process?

Contributor Author
Yes, but it is supposed to work only on the newly added Hugo content, except for the first run. I assumed the overall meaning of a doc should not change much over time, so the summary should only be created for the new subpages.
    )
    completions = client.chat.completions.create(
        model="openai/gpt-4o",
        messages=[
            {
                "role": "user",
                "content": (
                    "Please summarize the following content in a concise manner, "
                    "focusing on the main points and key information. The summary should "
                    "not be longer than 2 sentences:\n\n"
                    f"{content}"
                )
            }
        ]
    )
    summary = completions.choices[0].message.content.strip()
    return summary
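

# Illustrative usage only (not part of the submitted script): running the function
# locally assumes a GITHUB_TOKEN with access to GitHub Models is set in the environment:
#
#   print(summarize_content("Qdrant is an open-source vector database ..."))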


def main():
"""
List all the content in Hugo and generate both llms.txt and llms-full.txt files.
Overwrite existing files if they exist.
:return:
"""
# Change the current working directory to the Hugo content directory
os.chdir(BASE_DIR)

# Load the current state of the llms.txt file to avoid duplicates
with open(os.path.join(OUTPUT_DIR, "llms.txt"), "r", encoding="utf-8") as llms_file:
existing_urls = {line.split("](")[1].split(")")[0] for line in llms_file if line.startswith("- [")}

Copilot AI Aug 14, 2025
The code attempts to read from llms.txt before checking if it exists. If the file doesn't exist on first run, this will raise a FileNotFoundError. Consider using a try-except block or checking file existence first.

Suggested change
    try:
        with open(os.path.join(OUTPUT_DIR, "llms.txt"), "r", encoding="utf-8") as llms_file:
            existing_urls = {line.split("](")[1].split(")")[0] for line in llms_file if line.startswith("- [")}
    except FileNotFoundError:
        existing_urls = set()

    # Load the paths to all the published content in Hugo and process them sequentially
    # to generate the llms.txt and llms-full.txt files.
    with (open(os.path.join(OUTPUT_DIR, "llms.txt"), "a+", encoding="utf-8") as llms_file, \
Copilot AI Aug 14, 2025
Opening llms.txt in append mode ('a+') after reading existing URLs will result in duplicating content since the file pointer is at the end. Consider opening in write mode ('w') and rewriting the entire file, or handle the file pointer position correctly.

Suggested change
    with (open(os.path.join(OUTPUT_DIR, "llms.txt"), "a+", encoding="utf-8") as llms_file, \
    with (open(os.path.join(OUTPUT_DIR, "llms.txt"), "w", encoding="utf-8") as llms_file, \

          open(os.path.join(OUTPUT_DIR, "llms-full.txt"), "w", encoding="utf-8") as llms_full_file):

        # Write the header for the full file
        llms_full_file.write("# https://qdrant.tech/ llms-full.txt\n")
        llms_full_file.write("## Overall Summary\n")
        llms_full_file.write(f"> {GENERAL_DESCRIPTION}\n\n")

        for page_counter, content in enumerate(iter_hugo_content(), start=1):
            # Write the content to the full file.
            # Honestly, I don't know why we need this kind of <|page-{page_counter}-lllmstxt|> marker,
            # but it is used in the original llms-full.txt file, so I keep it for consistency.
            llms_full_file.write(f"<|page-{page_counter}-lllmstxt|>\n")
            llms_full_file.write(content.content + "\n\n")

            # Skip if there is no title, as we cannot generate a link without it
            if not content.title:
                continue

            # Only append to the llms.txt file if the URL does not already exist
            if content.absolute_url in existing_urls:
                print(f"Skipping {content.title} ({content.absolute_url}) - already exists in llms.txt")
                continue

            content_summary = summarize_content(content.content)
            llms_file.write(f"- [{content.title}]({content.absolute_url}): {content_summary}\n")
            print(f"Processed {content.title} ({content.absolute_url})")


if __name__ == "__main__":
    main()