diff --git a/.beads/daemon-error b/.beads/daemon-error
new file mode 100644
index 000000000..5d7768f8d
--- /dev/null
+++ b/.beads/daemon-error
@@ -0,0 +1,16 @@
+
+LEGACY DATABASE DETECTED!
+
+This database was created before version 0.17.5 and lacks a repository fingerprint.
+To continue using this database, you must explicitly set its repository ID:
+
+ bd migrate --update-repo-id
+
+This ensures the database is bound to this repository and prevents accidental
+database sharing between different repositories.
+
+If this is a fresh clone, run:
+ rm -rf .beads && bd init
+
+Note: Auto-claiming legacy databases is intentionally disabled to prevent
+silent corruption when databases are copied between repositories.
diff --git a/.github/workflows/ci-a2a-rag.yml b/.github/workflows/ci-a2a-rag.yml
index a89c15aed..6de7338ca 100644
--- a/.github/workflows/ci-a2a-rag.yml
+++ b/.github/workflows/ci-a2a-rag.yml
@@ -135,10 +135,19 @@ jobs:
strategy:
matrix:
component: ${{ fromJson(needs.load-config.outputs.rag_components) }}
+ variant: [default]
+ include:
+ # Add slim variant for ingestors (no Playwright, ~1.5GB smaller)
+ - component: ingestors
+ variant: slim
+ # Add HuggingFace variant for server (with PyTorch, ~900MB larger)
+ - component: server
+ variant: huggingface
fail-fast: false
env:
REGISTRY: ghcr.io
+      # For non-default variants, a "-<variant>" suffix is appended to every image tag (the image name itself is unchanged)
IMAGE_NAME: ${{ github.repository_owner }}/caipe-rag-${{ matrix.component }}
DOCKERFILE: ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.${{ matrix.component }}
@@ -194,6 +203,9 @@ jobs:
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+ # For non-default variants, append variant name as suffix to all tags
+ flavor: |
+ suffix=${{ matrix.variant != 'default' && format('-{0}', matrix.variant) || '' }}
tags: |
type=raw,value=latest,enable=${{ github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' }}
type=raw,value=${{ needs.determine-changes.outputs.tag_version }},enable=${{ needs.determine-changes.outputs.tag_version != '' }}
@@ -223,6 +235,7 @@ jobs:
cache-from: type=gha
cache-to: ${{ matrix.component != 'server' && 'type=gha,mode=min' || '' }}
build-args: |
+ VARIANT=${{ matrix.variant }}
BUILDKIT_INLINE_CACHE=1
provenance: false
sbom: false
@@ -240,6 +253,14 @@ jobs:
strategy:
matrix:
component: ${{ fromJson(needs.load-config.outputs.rag_components) }}
+ variant: [default]
+ include:
+ # Add slim variant for ingestors
+ - component: ingestors
+ variant: slim
+ # Add HuggingFace variant for server
+ - component: server
+ variant: huggingface
fail-fast: false
env:
@@ -269,9 +290,18 @@ jobs:
id: retag-or-build
env:
TAG_VERSION: ${{ needs.determine-changes.outputs.tag_version }}
+ VARIANT: ${{ matrix.variant }}
run: |
FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}"
- echo "🏷️ Processing rag-${{ matrix.component }}..."
+
+ # Determine variant suffix for tags
+ if [[ "$VARIANT" != "default" ]]; then
+ VARIANT_SUFFIX="-${VARIANT}"
+ else
+ VARIANT_SUFFIX=""
+ fi
+
+ echo "🏷️ Processing rag-${{ matrix.component }}${VARIANT_SUFFIX}..."
# Determine source tag (previous version)
if [[ "$TAG_VERSION" =~ ^(.+)-rc\.([0-9]+)$ ]]; then
@@ -280,21 +310,23 @@ jobs:
if [[ "$RC_NUM" -gt 1 ]]; then
PREV_RC=$((RC_NUM - 1))
- SOURCE_TAG="${BASE_VERSION}-rc.${PREV_RC}"
+ SOURCE_TAG="${BASE_VERSION}-rc.${PREV_RC}${VARIANT_SUFFIX}"
else
- SOURCE_TAG="${BASE_VERSION}"
+ SOURCE_TAG="${BASE_VERSION}${VARIANT_SUFFIX}"
fi
else
- SOURCE_TAG="latest"
+ SOURCE_TAG="latest${VARIANT_SUFFIX}"
fi
+
+ TARGET_TAG="${TAG_VERSION}${VARIANT_SUFFIX}"
echo " Source: ${SOURCE_TAG}"
- echo " Target: ${TAG_VERSION}"
+ echo " Target: ${TARGET_TAG}"
# Check if source image exists
if crane manifest "${FULL_IMAGE}:${SOURCE_TAG}" >/dev/null 2>&1; then
echo " ✅ Source image exists, retagging..."
- if crane tag "${FULL_IMAGE}:${SOURCE_TAG}" "${TAG_VERSION}"; then
+ if crane tag "${FULL_IMAGE}:${SOURCE_TAG}" "${TARGET_TAG}"; then
echo " ✅ Successfully retagged from ${SOURCE_TAG}"
echo "needs_build=false" >> $GITHUB_OUTPUT
else
@@ -373,11 +405,12 @@ jobs:
file: ${{ env.DOCKERFILE }}
push: true
tags: |
- ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ needs.determine-changes.outputs.tag_version }}
+ ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ needs.determine-changes.outputs.tag_version }}${{ matrix.variant != 'default' && format('-{0}', matrix.variant) || '' }}
platforms: linux/amd64,linux/arm64
cache-from: type=gha
cache-to: ${{ matrix.component != 'server' && 'type=gha,mode=min' || '' }}
build-args: |
+ VARIANT=${{ matrix.variant }}
BUILDKIT_INLINE_CACHE=1
provenance: false
sbom: false
diff --git a/PR_DESCRIPTION.md b/PR_DESCRIPTION.md
new file mode 100644
index 000000000..f2d3792ea
--- /dev/null
+++ b/PR_DESCRIPTION.md
@@ -0,0 +1,59 @@
+## Helm Chart Simplification
+
+Replaced individual `values.yaml` keys with a generic `env:` map pattern. This reduces template complexity and makes adding new environment variables easier without chart changes.
+
+### What Changed
+
+**Kept as computed values** (from global config):
+- `REDIS_URL`, `NEO4J_*`, `MILVUS_URI`, `ONTOLOGY_AGENT_RESTAPI_ADDR`
+- `enableGraphRag` (has global fallback)
+
+**Everything else** now uses `env:` map with string values.
+
+### Migration Table
+
+#### RAG Server
+
+| Removed Key | Use Instead |
+|-------------|-------------|
+| `enableMcp` | `env.ENABLE_MCP` |
+| `skipInitTests` | `env.SKIP_INIT_TESTS` |
+| `embeddingsProvider` | `env.EMBEDDINGS_PROVIDER` |
+| `embeddingsModel` | `env.EMBEDDINGS_MODEL` |
+| `maxDocumentsPerIngest` | `env.MAX_DOCUMENTS_PER_INGEST` |
+| `maxResultsPerQuery` | `env.MAX_RESULTS_PER_QUERY` |
+| `maxIngestionConcurrency` | `env.MAX_INGESTION_CONCURRENCY` |
+| `logLevel` | `env.LOG_LEVEL` |
+| `rbac.allowUnauthenticated` | `env.ALLOW_UNAUTHENTICATED` |
+| `rbac.adminGroups` | `env.RBAC_ADMIN_GROUPS` |
+| `rbac.readonlyGroups` | `env.RBAC_READONLY_GROUPS` |
+| `rbac.defaultRole` | `env.RBAC_DEFAULT_ROLE` |
+
+#### Web Ingestor
+
+| Removed Key | Use Instead |
+|-------------|-------------|
+| `webIngestor.logLevel` | `webIngestor.env.LOG_LEVEL` |
+| `webIngestor.maxConcurrency` | `webIngestor.env.WEBLOADER_MAX_CONCURRENCY` |
+| `webIngestor.maxIngestionTasks` | `webIngestor.env.WEBLOADER_MAX_INGESTION_TASKS` |
+| `webIngestor.reloadInterval` | `webIngestor.env.WEBLOADER_RELOAD_INTERVAL` |
+
+### Example
+
+**Before:**
+```yaml
+rag-server:
+ enableMcp: true
+ logLevel: INFO
+ rbac:
+ adminGroups: "admins"
+```
+
+**After:**
+```yaml
+rag-server:
+ env:
+ ENABLE_MCP: "true"
+ LOG_LEVEL: "INFO"
+ RBAC_ADMIN_GROUPS: "admins"
+```
diff --git a/ai_platform_engineering/knowledge_bases/rag/agent_ontology/uv.lock b/ai_platform_engineering/knowledge_bases/rag/agent_ontology/uv.lock
index 3abec1a01..f6a5a7b30 100644
--- a/ai_platform_engineering/knowledge_bases/rag/agent_ontology/uv.lock
+++ b/ai_platform_engineering/knowledge_bases/rag/agent_ontology/uv.lock
@@ -443,7 +443,6 @@ dependencies = [
{ name = "cymple" },
{ name = "langchain-aws" },
{ name = "langchain-cohere" },
- { name = "langchain-huggingface" },
{ name = "langchain-ollama" },
{ name = "langchain-openai" },
{ name = "neo4j" },
@@ -456,13 +455,14 @@ requires-dist = [
{ name = "cymple", specifier = ">=0.12.0" },
{ name = "langchain-aws", specifier = ">=0.2.24" },
{ name = "langchain-cohere", specifier = ">=0.3.0" },
- { name = "langchain-huggingface", specifier = ">=0.3.0" },
+ { name = "langchain-huggingface", marker = "extra == 'huggingface'", specifier = ">=0.3.0" },
{ name = "langchain-ollama", specifier = ">=0.3.0" },
{ name = "langchain-openai", specifier = ">=0.3.18" },
{ name = "neo4j", specifier = ">=5.28.1" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "redis", specifier = ">=6.2.0" },
]
+provides-extras = ["huggingface"]
[[package]]
name = "cymple"
@@ -1506,20 +1506,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/cb/713897071ffb89b3085e91330b48b59629826e5ed64be136fb9d34459be5/langchain_groq-1.1.0-py3-none-any.whl", hash = "sha256:f6c9a7bfe46a3d6e7e0cdc3888ba9c5443b6e1ed674894a090c4b20b36465a3b", size = 19038, upload-time = "2025-11-24T14:21:27.35Z" },
]
-[[package]]
-name = "langchain-huggingface"
-version = "1.1.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "huggingface-hub" },
- { name = "langchain-core" },
- { name = "tokenizers" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/9f/d7/ffcf97cd977c535df2c621c59eafa82df73f801323f670d88819c23fc304/langchain_huggingface-1.1.0.tar.gz", hash = "sha256:43c3b06413158b0cd1edcdbadf545c24d5f64f180bb71c80dc960959a728c1fd", size = 252295, upload-time = "2025-11-24T14:18:30.366Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/b1/4b/2bdd63464a7bb3aa7911777636cff8e54a2a1edc7b7a85a4acb7decebb23/langchain_huggingface-1.1.0-py3-none-any.whl", hash = "sha256:a3a5218a839062941cb616992bcbc4fe73352454727bafc351a452e76aead1a8", size = 29925, upload-time = "2025-11-24T14:18:29.036Z" },
-]
-
[[package]]
name = "langchain-ollama"
version = "1.0.0"
diff --git a/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors b/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors
index fdcb537ef..a4c695f49 100644
--- a/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors
+++ b/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors
@@ -9,18 +9,40 @@ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
# for an example.
ENV UV_PYTHON_DOWNLOADS=0
-# Copy over the local dependencies
+# Build variant: "default" (with Playwright/Chromium for JS rendering) or "slim" (no Playwright, smaller image)
+ARG VARIANT=default
+
+# Copy over the local dependencies (excluding .venv directories)
COPY common /app/common
+# Remove any .venv from common that shouldn't be in the image
+RUN rm -rf /app/common/.venv
WORKDIR /app/ingestors
+# Install dependencies based on variant
+# Default includes Playwright for JS rendering; slim variant excludes it for smaller image
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=ingestors/uv.lock,target=uv.lock \
--mount=type=bind,source=ingestors/pyproject.toml,target=pyproject.toml \
- UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev
+ if [ "$VARIANT" = "slim" ]; then \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev; \
+ else \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev --extra playwright; \
+ fi
COPY ingestors .
RUN --mount=type=cache,target=/root/.cache/uv \
- UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev
+ if [ "$VARIANT" = "slim" ]; then \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev; \
+ else \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev --extra playwright; \
+ fi
+
+# Clean up .venv to reduce image size (remove ~100MB of unnecessary files)
+RUN find /app/ingestors/.venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
+ find /app/ingestors/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
+ find /app/ingestors/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
+ find /app/ingestors/.venv -type f -name "*.pyi" -delete 2>/dev/null || true && \
+ rm -rf /app/ingestors/.venv/include 2>/dev/null || true
# Then, use a final image without uv
@@ -29,13 +51,34 @@ FROM python:3.13-slim-bookworm
# Python executable must be the same, e.g., using `python:3.13-slim-bookworm`
# will fail.
-# Install AWS CLI v2 for EKS authentication (This is for k8s ingestor)
+# Re-declare ARG after FROM to make it available in this stage
+ARG VARIANT=default
+
+# Install system dependencies:
+# - AWS CLI v2 for EKS authentication (k8s ingestor)
+# - Playwright/Chromium dependencies (default variant only, not slim)
RUN apt-get update && \
- apt-get install -y --no-install-recommends curl unzip && \
- curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip" && \
+ apt-get install -y --no-install-recommends \
+ # For AWS CLI installation
+ curl unzip && \
+ # Conditionally install Playwright dependencies (default variant)
+ if [ "$VARIANT" != "slim" ]; then \
+ apt-get install -y --no-install-recommends \
+ libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
+ libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
+ libgbm1 libasound2 libpango-1.0-0 libcairo2 libatspi2.0-0; \
+ fi && \
+ # Install AWS CLI v2 (detect architecture)
+ ARCH=$(uname -m) && \
+ if [ "$ARCH" = "aarch64" ]; then \
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
+ else \
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
+ fi && \
unzip awscliv2.zip && \
./aws/install && \
rm -rf awscliv2.zip aws && \
+ # Cleanup
apt-get remove -y curl unzip && \
apt-get autoremove -y && \
apt-get clean && \
@@ -53,8 +96,17 @@ WORKDIR /app/ingestors
# Place executables in the environment at the front of the path
ENV PATH="/app/ingestors/.venv/bin:$PATH"
+# Install Playwright browsers (Chromium only) - default variant only
+# This needs to run before switching to non-root user since it downloads to system cache
+ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright
+RUN if [ "$VARIANT" != "slim" ]; then \
+ mkdir -p /opt/playwright && \
+ playwright install chromium && \
+ chmod -R 755 /opt/playwright; \
+ fi
+
# Use a non-root user to run the application
USER app
# Run the application by default - use shell form to enable variable expansion
-CMD python3 src/ingestors/${INGESTOR_TYPE}/ingestor.py
\ No newline at end of file
+CMD python3 src/ingestors/${INGESTOR_TYPE}/ingestor.py
diff --git a/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.server b/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.server
index 5c3c3c1c0..09f4e43cf 100644
--- a/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.server
+++ b/ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.server
@@ -9,19 +9,44 @@ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
# for an example.
ENV UV_PYTHON_DOWNLOADS=0
+# Build argument for variant: "default" or "huggingface"
+# - default: Slim image (~1.4GB) with API-based embedding providers
+# - huggingface: Full image (~2.3GB) with PyTorch for local HuggingFace models
+ARG VARIANT=default
+
# Copy over the local dependencies
COPY common /app/common
WORKDIR /app/server
# Increase timeout for large packages (e.g., pyarrow 39MB)
+# Install dependencies - conditionally include huggingface extra
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=server/uv.lock,target=uv.lock \
--mount=type=bind,source=server/pyproject.toml,target=pyproject.toml \
- UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev
+ if [ "$VARIANT" = "huggingface" ]; then \
+ echo "Installing with huggingface extra (includes PyTorch)..." && \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev --extra huggingface; \
+ else \
+ echo "Installing default (slim) variant..." && \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev; \
+ fi
COPY server .
RUN --mount=type=cache,target=/root/.cache/uv \
- UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev
+ if [ "$VARIANT" = "huggingface" ]; then \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev --extra huggingface; \
+ else \
+ UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev; \
+ fi
+
+# Cleanup step - remove unnecessary files from .venv to reduce image size
+# Saves ~325MB by removing test files, caches, type stubs, and C headers
+RUN find /app/server/.venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
+ find /app/server/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
+ find /app/server/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
+ find /app/server/.venv -name "*.pyc" -delete 2>/dev/null || true && \
+ find /app/server/.venv -name "*.pyi" -delete 2>/dev/null || true && \
+ find /app/server/.venv -type d -name "include" -path "*/site-packages/*" -exec rm -rf {} + 2>/dev/null || true
# Then, use a final image without uv
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/pyproject.toml b/ai_platform_engineering/knowledge_bases/rag/common/pyproject.toml
index 50d3aebf8..c359d6a4e 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/pyproject.toml
+++ b/ai_platform_engineering/knowledge_bases/rag/common/pyproject.toml
@@ -12,14 +12,19 @@ dependencies = [
"neo4j>=5.28.1",
"pydantic>=2.11.7",
"redis>=6.2.0",
- # Embedding providers - all included in Docker image
+ # Embedding providers - lightweight API-based (always included)
"langchain-openai>=0.3.18",
"langchain-aws>=0.2.24",
"langchain-cohere>=0.3.0",
- "langchain-huggingface>=0.3.0",
"langchain-ollama>=0.3.0",
]
+[project.optional-dependencies]
+# Heavy local embedding provider (~900MB due to PyTorch)
+huggingface = [
+ "langchain-huggingface>=0.3.0",
+]
+
[build-system]
requires = ["uv_build>=0.8.17,<0.9.0"]
build-backend = "uv_build"
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/src/common/embeddings_factory.py b/ai_platform_engineering/knowledge_bases/rag/common/src/common/embeddings_factory.py
index 796298533..05622e609 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/src/common/embeddings_factory.py
+++ b/ai_platform_engineering/knowledge_bases/rag/common/src/common/embeddings_factory.py
@@ -9,178 +9,212 @@
- openai
- aws-bedrock
- cohere
-- huggingface (local)
+- huggingface (local) - requires huggingface extra, use -huggingface image variant
- ollama (local)
+- litellm (proxy mode - connects to LiteLLM proxy)
-All embedding provider packages are included in the Docker image.
+Most embedding providers are included in the default Docker image.
+HuggingFace/PyTorch requires the -huggingface image variant (~900MB larger).
"""
import os
from langchain_core.embeddings import Embeddings
+
+# Core providers - always available (lightweight, API-based)
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
-from langchain_aws import BedrockEmbeddings
-from langchain_cohere import CohereEmbeddings
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_ollama import OllamaEmbeddings
+
+from common.utils import get_logger
+
+logger = get_logger(__name__)
class EmbeddingsFactory:
- """Factory for creating embedding models based on provider configuration."""
-
- @staticmethod
- def get_embeddings() -> Embeddings:
- """
- Get embeddings based on EMBEDDINGS_PROVIDER environment variable.
-
- Environment Variables:
- EMBEDDINGS_PROVIDER: Provider name (azure-openai, openai, aws-bedrock, cohere, huggingface, ollama)
- EMBEDDINGS_MODEL: Model name/ID (provider-specific)
-
- Provider-specific variables:
- AWS Bedrock:
- - AWS_REGION: AWS region (default: us-east-1)
- - AWS credentials via standard boto3 methods
-
- OpenAI:
- - OPENAI_API_KEY: API key
-
- Cohere:
- - COHERE_API_KEY: API key
-
- HuggingFace:
- - HUGGINGFACEHUB_API_TOKEN or HF_TOKEN: API token (required for gated models)
- - EMBEDDINGS_DEVICE: Device to use (cpu, cuda, mps) (default: cpu)
- - EMBEDDINGS_BATCH_SIZE: Batch size for embedding inference (default: 32)
-
- Ollama:
- - OLLAMA_BASE_URL: Base URL (default: http://localhost:11434)
-
- Returns:
- Embeddings: Configured embeddings instance
-
- Raises:
- ValueError: If provider is unsupported or credentials are missing
- """
- provider = os.getenv("EMBEDDINGS_PROVIDER", "azure-openai").lower()
- model = os.getenv("EMBEDDINGS_MODEL", "text-embedding-3-small")
-
- if provider == "azure-openai":
- # Azure OpenAI requires these environment variables:
- # AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_VERSION
- return AzureOpenAIEmbeddings(model=model)
-
- elif provider == "openai":
- if not os.getenv("OPENAI_API_KEY"):
- raise ValueError(
- "OPENAI_API_KEY environment variable is required for OpenAI embeddings"
- )
- return OpenAIEmbeddings(model=model)
-
- elif provider == "aws-bedrock":
- # Default to Titan embedding model if not specified
- bedrock_model = os.getenv("EMBEDDINGS_MODEL", "amazon.titan-embed-text-v2:0")
- region = os.getenv("AWS_REGION", "us-east-1")
-
- return BedrockEmbeddings(
- model_id=bedrock_model,
- region_name=region
- )
-
- elif provider == "cohere":
- api_key = os.getenv("COHERE_API_KEY")
- if not api_key:
- raise ValueError(
- "COHERE_API_KEY environment variable is required for Cohere embeddings"
- )
- # client and async_client are automatically created by the validator
- return CohereEmbeddings(model=model, cohere_api_key=api_key) # type: ignore[call-arg]
-
- elif provider == "huggingface":
- # Default to a popular sentence transformer model
- hf_model = os.getenv("EMBEDDINGS_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
- hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
-
- # Configure model kwargs for optimal performance
- model_kwargs = {
- "device": "cpu", # Explicitly set device (can be overridden with EMBEDDINGS_DEVICE env var)
- }
-
- # Add token if available (required for gated models)
- if hf_token:
- model_kwargs["token"] = hf_token
-
- # Allow device override via environment variable
- device = os.getenv("EMBEDDINGS_DEVICE", "cpu")
- model_kwargs["device"] = device
-
- # Encode kwargs for inference optimization
- encode_kwargs = {
- "normalize_embeddings": True, # Normalize embeddings for better similarity search
- "batch_size": int(os.getenv("EMBEDDINGS_BATCH_SIZE", "32")), # Configurable batch size
- }
-
- return HuggingFaceEmbeddings(
- model_name=hf_model,
- model_kwargs=model_kwargs,
- encode_kwargs=encode_kwargs,
- )
-
- elif provider == "ollama":
- ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
- return OllamaEmbeddings(
- base_url=ollama_url,
- model=model
- )
-
- else:
- raise ValueError(
- f"Unsupported embeddings provider: '{provider}'. "
- f"Supported providers: azure-openai, openai, aws-bedrock, cohere, huggingface, ollama"
- )
-
- @staticmethod
- def get_embedding_dimensions() -> int:
- """
- Get the expected embedding dimensions for the configured model.
- This is useful for vector database configuration.
-
- Returns:
- int: Embedding dimensions (defaults to 1536 if unknown)
- """
- model = os.getenv("EMBEDDINGS_MODEL", "text-embedding-3-small")
-
- # Common embedding dimensions by model
- dimension_map = {
- # OpenAI/Azure OpenAI
- "text-embedding-3-small": 1536,
- "text-embedding-3-large": 3072,
- "text-embedding-ada-002": 1536,
-
- # AWS Bedrock
- "amazon.titan-embed-text-v1": 1536,
- "amazon.titan-embed-text-v2:0": 1024,
- "cohere.embed-english-v3": 1024,
- "cohere.embed-multilingual-v3": 1024,
-
- # Cohere
- "embed-english-v3.0": 1024,
- "embed-multilingual-v3.0": 1024,
- "embed-english-light-v3.0": 384,
-
- # HuggingFace common models
- "sentence-transformers/all-MiniLM-L6-v2": 384,
- "sentence-transformers/all-mpnet-base-v2": 768,
- "sentence-transformers/all-MiniLM-L12-v2": 384,
- }
-
- # Try to find exact match
- if model in dimension_map:
- return dimension_map[model]
-
- # Check if set by environment variable
- if os.getenv("EMBEDDINGS_DIMENSIONS"):
- return int(os.getenv("EMBEDDINGS_DIMENSIONS"))
-
- # If not set, return default
- # Default to 1536 (most common)
- return 1536
+ """Factory for creating embedding models based on provider configuration."""
+
+ @staticmethod
+ def get_embeddings() -> Embeddings:
+ """
+ Get embeddings based on EMBEDDINGS_PROVIDER environment variable.
+
+ Environment Variables:
+        EMBEDDINGS_PROVIDER: Provider name (azure-openai, openai, aws-bedrock, cohere, huggingface, ollama, litellm)
+ EMBEDDINGS_MODEL: Model name/ID (provider-specific)
+
+ Provider-specific variables:
+ AWS Bedrock:
+ - AWS_REGION: AWS region (default: us-east-1)
+ - AWS credentials via standard boto3 methods
+
+ OpenAI:
+ - OPENAI_API_KEY: API key
+
+ Cohere:
+ - COHERE_API_KEY: API key
+
+ HuggingFace:
+ - HUGGINGFACEHUB_API_TOKEN or HF_TOKEN: API token (required for gated models)
+ - EMBEDDINGS_DEVICE: Device to use (cpu, cuda, mps) (default: cpu)
+ - EMBEDDINGS_BATCH_SIZE: Batch size for embedding inference (default: 32)
+
+ Ollama:
+ - OLLAMA_BASE_URL: Base URL (default: http://localhost:11434)
+
+ Returns:
+ Embeddings: Configured embeddings instance
+
+ Raises:
+ ValueError: If provider is unsupported or credentials are missing
+ """
+ provider = os.getenv("EMBEDDINGS_PROVIDER", "azure-openai").lower()
+ model = os.getenv("EMBEDDINGS_MODEL", "text-embedding-3-small")
+
+ embeddings: Embeddings
+
+ if provider == "azure-openai":
+ # Azure OpenAI requires these environment variables:
+ # AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_VERSION
+ embeddings = AzureOpenAIEmbeddings(model=model)
+
+ elif provider == "openai":
+ if not os.getenv("OPENAI_API_KEY"):
+ raise ValueError("OPENAI_API_KEY environment variable is required for OpenAI embeddings")
+ embeddings = OpenAIEmbeddings(model=model)
+
+ elif provider == "aws-bedrock":
+ # Lazy import - lightweight API-based provider
+ from langchain_aws import BedrockEmbeddings
+
+ # Default to Titan embedding model if not specified
+ bedrock_model = os.getenv("EMBEDDINGS_MODEL", "amazon.titan-embed-text-v2:0")
+ region = os.getenv("AWS_REGION", "us-east-1")
+
+ embeddings = BedrockEmbeddings(model_id=bedrock_model, region_name=region)
+
+ elif provider == "cohere":
+ # Lazy import - lightweight API-based provider
+ from langchain_cohere import CohereEmbeddings
+
+ api_key = os.getenv("COHERE_API_KEY")
+ if not api_key:
+ raise ValueError("COHERE_API_KEY environment variable is required for Cohere embeddings")
+ # client and async_client are automatically created by the validator
+ embeddings = CohereEmbeddings(model=model, cohere_api_key=api_key) # type: ignore[call-arg]
+
+ elif provider == "huggingface":
+ # Lazy import - heavy provider requiring PyTorch (~900MB)
+            # Only available in the -huggingface image variant
+ try:
+ from langchain_huggingface import HuggingFaceEmbeddings
+ except ImportError:
+                raise ValueError("HuggingFace embeddings require the 'huggingface' extra. Use the -huggingface image variant or install with: pip install server[huggingface]")
+ # Default to a popular sentence transformer model
+ hf_model = os.getenv("EMBEDDINGS_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+ hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
+
+ # Configure model kwargs for optimal performance
+ model_kwargs = {
+ "device": "cpu", # Explicitly set device (can be overridden with EMBEDDINGS_DEVICE env var)
+ }
+
+ # Add token if available (required for gated models)
+ if hf_token:
+ model_kwargs["token"] = hf_token
+
+ # Allow device override via environment variable
+ device = os.getenv("EMBEDDINGS_DEVICE", "cpu")
+ model_kwargs["device"] = device
+
+ # Encode kwargs for inference optimization
+ encode_kwargs = {
+ "normalize_embeddings": True, # Normalize embeddings for better similarity search
+ "batch_size": int(os.getenv("EMBEDDINGS_BATCH_SIZE", "32")), # Configurable batch size
+ }
+
+ embeddings = HuggingFaceEmbeddings(
+ model_name=hf_model,
+ model_kwargs=model_kwargs,
+ encode_kwargs=encode_kwargs,
+ )
+
+ elif provider == "ollama":
+ # Lazy import - lightweight local provider
+ from langchain_ollama import OllamaEmbeddings
+
+ ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+ embeddings = OllamaEmbeddings(base_url=ollama_url, model=model)
+
+ elif provider == "litellm":
+ # LiteLLM proxy mode: requires LITELLM_API_BASE
+ # The proxy is OpenAI-compatible, so we use OpenAIEmbeddings
+ api_base = os.getenv("LITELLM_API_BASE")
+ if not api_base:
+ raise ValueError("LITELLM_API_BASE environment variable is required for litellm provider")
+ api_key = os.getenv("LITELLM_API_KEY", "not-needed")
+
+ embeddings = OpenAIEmbeddings(
+ model=model,
+ api_key=api_key, # type: ignore[arg-type]
+ base_url=api_base,
+ )
+
+ else:
+ raise ValueError(f"Unsupported embeddings provider: '{provider}'. Supported providers: azure-openai, openai, aws-bedrock, cohere, huggingface, ollama, litellm")
+
+ dimensions = EmbeddingsFactory.get_embedding_dimensions()
+ logger.info(f"Embeddings: provider={provider}, model={model}, dimensions={dimensions}")
+ return embeddings
+
+ @staticmethod
+ def get_embedding_dimensions() -> int:
+ """
+ Get the expected embedding dimensions for the configured model.
+ This is useful for vector database configuration.
+
+ Returns:
+ int: Embedding dimensions (defaults to 1536 if unknown)
+ """
+ model = os.getenv("EMBEDDINGS_MODEL", "text-embedding-3-small")
+
+ # Common embedding dimensions by model
+ dimension_map = {
+ # OpenAI/Azure OpenAI
+ "text-embedding-3-small": 1536,
+ "text-embedding-3-large": 3072,
+ "text-embedding-ada-002": 1536,
+ # AWS Bedrock
+ "amazon.titan-embed-text-v1": 1536,
+ "amazon.titan-embed-text-v2:0": 1024,
+ "cohere.embed-english-v3": 1024,
+ "cohere.embed-multilingual-v3": 1024,
+ # Cohere
+ "embed-english-v3.0": 1024,
+ "embed-multilingual-v3.0": 1024,
+ "embed-english-light-v3.0": 384,
+ # HuggingFace common models
+ "sentence-transformers/all-MiniLM-L6-v2": 384,
+ "sentence-transformers/all-mpnet-base-v2": 768,
+ "sentence-transformers/all-MiniLM-L12-v2": 384,
+ # LiteLLM models (with provider prefix)
+ "mistral/mistral-embed": 1024,
+ "gemini/text-embedding-004": 768,
+ "vertex_ai/textembedding-gecko": 768,
+ "vertex_ai/textembedding-gecko@003": 768,
+ "voyage/voyage-01": 1024,
+ "voyage/voyage-lite-01": 1024,
+ "voyage/voyage-3": 1024,
+ "voyage/voyage-3-lite": 512,
+ "voyage/voyage-code-3": 1024,
+ }
+
+ # Try to find exact match
+ if model in dimension_map:
+ return dimension_map[model]
+
+ # Check if set by environment variable
+ dimensions_env = os.getenv("EMBEDDINGS_DIMENSIONS")
+ if dimensions_env:
+ return int(dimensions_env)
+
+ # If not set, return default
+ # Default to 1536 (most common)
+ return 1536
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/src/common/ingestor.py b/ai_platform_engineering/knowledge_bases/rag/common/src/common/ingestor.py
index 1bb13d812..d9a8d3290 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/src/common/ingestor.py
+++ b/ai_platform_engineering/knowledge_bases/rag/common/src/common/ingestor.py
@@ -709,8 +709,14 @@ def with_startup(self, startup_function: Callable) -> "IngestorBuilder":
self._startup_function = startup_function
return self
- def run(self):
- """Build and run the ingestor"""
+ def run(self, loop: asyncio.AbstractEventLoop | None = None):
+ """Build and run the ingestor
+
+ Args:
+ loop: Optional event loop to use. If not provided, asyncio.run() will create one.
+ Use this when you need to install a custom reactor (e.g., Twisted asyncio reactor)
+ before running the ingestor.
+ """
# Validate required parameters
assert self._name, "Ingestor name is required. Use .name('my-ingestor')"
assert self._type, "Ingestor type is required. Use .type('my-type')"
@@ -723,7 +729,11 @@ def run(self):
self._metadata = {}
# Run the ingestor
- asyncio.run(self._run_ingestor())
+ if loop is not None:
+ # Use provided event loop (useful for Twisted reactor integration)
+ loop.run_until_complete(self._run_ingestor())
+ else:
+ asyncio.run(self._run_ingestor())
async def _calculate_next_sync_time(self, client: Client) -> tuple[int, bool]:
"""
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/src/common/job_manager.py b/ai_platform_engineering/knowledge_bases/rag/common/src/common/job_manager.py
index 759055c84..f6e32bb9b 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/src/common/job_manager.py
+++ b/ai_platform_engineering/knowledge_bases/rag/common/src/common/job_manager.py
@@ -5,386 +5,437 @@
import time
from pydantic import BaseModel, Field
from common.constants import (
- REDIS_JOB_PREFIX,
- REDIS_JOB_DATASOURCE_INDEX_PREFIX,
- REDIS_JOB_ERRORS_SUFFIX,
+ REDIS_JOB_PREFIX,
+ REDIS_JOB_DATASOURCE_INDEX_PREFIX,
+ REDIS_JOB_ERRORS_SUFFIX,
)
logger = get_logger(__name__)
+
class JobStatus(str, Enum):
- PENDING = "pending"
- IN_PROGRESS = "in_progress"
- COMPLETED = "completed"
- COMPLETED_WITH_ERRORS = "completed_with_errors"
- TERMINATED = "terminated"
- FAILED = "failed"
+ PENDING = "pending"
+ IN_PROGRESS = "in_progress"
+ COMPLETED = "completed"
+ COMPLETED_WITH_ERRORS = "completed_with_errors"
+ TERMINATED = "terminated"
+ FAILED = "failed"
+
class JobInfo(BaseModel):
- job_id: str = Field(description="Job ID")
- status: JobStatus = Field(description="Job status")
- message: Optional[str] = Field(description="Current message", default=None)
- created_at: int = Field(description="Created at")
- completed_at: Optional[int] = Field(description="Completed at", default=None)
- total: Optional[int] = Field(description="Total items to process", default=None)
- progress_counter: Optional[int] = Field(description="Number of items processed", default=0)
- failed_counter: Optional[int] = Field(description="Number of items failed", default=0)
- error_msgs: Optional[List[str]] = Field(description="Error messages if any", default_factory=list)
- datasource_id: Optional[str] = Field(description="Associated datasource ID", default=None)
+ job_id: str = Field(description="Job ID")
+ status: JobStatus = Field(description="Job status")
+ message: Optional[str] = Field(description="Current message", default=None)
+ created_at: int = Field(description="Created at")
+ completed_at: Optional[int] = Field(description="Completed at", default=None)
+ total: Optional[int] = Field(description="Total items to process", default=None)
+ progress_counter: Optional[int] = Field(description="Number of items processed", default=0)
+ failed_counter: Optional[int] = Field(description="Number of items failed", default=0)
+ error_msgs: Optional[List[str]] = Field(description="Error messages if any", default_factory=list)
+ datasource_id: Optional[str] = Field(description="Associated datasource ID", default=None)
+ document_count: Optional[int] = Field(description="Number of documents ingested", default=0)
+ chunk_count: Optional[int] = Field(description="Number of chunks created", default=0)
+
class JobManager:
- """Manages job status updates in Redis using atomic operations."""
-
- def __init__(self, redis_client: redis.Redis, max_jobs_per_datasource: int = 10):
- """
- Initializes the JobManager with a Redis client.
-
- :param redis_client: An asynchronous Redis client instance.
- :param max_jobs_per_datasource: Maximum number of jobs to keep per datasource (default: 10).
- """
- self.redis_client = redis_client
- self.max_jobs_per_datasource = max_jobs_per_datasource
-
- def _get_job_key(self, job_id: str) -> str:
- """Constructs the Redis key for storing job information (hash)."""
- return f"{REDIS_JOB_PREFIX}{job_id}"
-
- def _get_error_msgs_key(self, job_id: str) -> str:
- """Constructs the Redis key for error messages list."""
- return f"{REDIS_JOB_PREFIX}{job_id}{REDIS_JOB_ERRORS_SUFFIX}"
-
- def _get_datasource_index_key(self, datasource_id: str) -> str:
- """Constructs the Redis key for datasource->job_id index."""
- return f"{REDIS_JOB_DATASOURCE_INDEX_PREFIX}{datasource_id}"
-
- async def _cleanup_old_jobs_for_datasource(self, datasource_id: str) -> int:
- """
- Removes oldest jobs for a datasource if the count exceeds max_jobs_per_datasource.
-
- :param datasource_id: The datasource ID to cleanup jobs for.
- :return: Number of jobs deleted.
- """
+ """Manages job status updates in Redis using atomic operations."""
+
+ def __init__(self, redis_client: redis.Redis, max_jobs_per_datasource: int = 10):
+ """
+ Initializes the JobManager with a Redis client.
+
+ :param redis_client: An asynchronous Redis client instance.
+ :param max_jobs_per_datasource: Maximum number of jobs to keep per datasource (default: 10).
+ """
+ self.redis_client = redis_client
+ self.max_jobs_per_datasource = max_jobs_per_datasource
+
+ def _get_job_key(self, job_id: str) -> str:
+ """Constructs the Redis key for storing job information (hash)."""
+ return f"{REDIS_JOB_PREFIX}{job_id}"
+
+ def _get_error_msgs_key(self, job_id: str) -> str:
+ """Constructs the Redis key for error messages list."""
+ return f"{REDIS_JOB_PREFIX}{job_id}{REDIS_JOB_ERRORS_SUFFIX}"
+
+ def _get_datasource_index_key(self, datasource_id: str) -> str:
+ """Constructs the Redis key for datasource->job_id index."""
+ return f"{REDIS_JOB_DATASOURCE_INDEX_PREFIX}{datasource_id}"
+
+ async def _cleanup_old_jobs_for_datasource(self, datasource_id: str) -> int:
+ """
+ Removes oldest jobs for a datasource if the count exceeds max_jobs_per_datasource.
+
+ :param datasource_id: The datasource ID to cleanup jobs for.
+ :return: Number of jobs deleted.
+ """
+ index_key = self._get_datasource_index_key(datasource_id)
+ job_ids = await self.redis_client.smembers(index_key) # type: ignore
+
+ if not job_ids or len(job_ids) <= self.max_jobs_per_datasource:
+ return 0
+
+ # Fetch creation times for all jobs
+ jobs_with_times = []
+ for job_id in job_ids:
+ if isinstance(job_id, bytes):
+ job_id = job_id.decode()
+
+ job_key = self._get_job_key(job_id)
+ created_at = await self.redis_client.hget(job_key, "created_at") # type: ignore
+
+ if created_at:
+ if isinstance(created_at, bytes):
+ created_at = created_at.decode()
+ jobs_with_times.append((job_id, int(created_at)))
+
+ # Sort by creation time (oldest first)
+ jobs_with_times.sort(key=lambda x: x[1])
+
+ # Calculate how many to delete
+ num_to_delete = len(jobs_with_times) - self.max_jobs_per_datasource
+
+ if num_to_delete <= 0:
+ return 0
+
+ # Delete oldest jobs
+ deleted_count = 0
+ for job_id, _ in jobs_with_times[:num_to_delete]:
+ if await self.delete_job(job_id):
+ deleted_count += 1
+ logger.info(f"Deleted old job {job_id} from datasource {datasource_id} (cleanup)")
+
+ return deleted_count
+
+ async def upsert_job(
+ self,
+ job_id: str,
+ *,
+ status: Optional[JobStatus] = None,
+ message: Optional[str] = None,
+ total: Optional[int] = None,
+ datasource_id: Optional[str] = None,
+ ) -> bool:
+ """
+ Creates a new job or updates an existing job in Redis.
+
+ :param job_id: The ID of the job to create or update.
+ :param status: The status of the job (defaults to PENDING for new jobs).
+ :param message: The message for the job.
+ :param total: The total number of items to process.
+ :param datasource_id: The datasource ID associated with this job.
+ :return: True if the operation was successful, False if job is terminated and cannot be updated.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Check if job exists
+ exists = await self.redis_client.exists(job_key)
+
+ if not exists:
+ # Job doesn't exist, create a new one
+ hash_data = {
+ "job_id": job_id,
+ "status": status.value if status is not None else JobStatus.PENDING.value,
+ "created_at": str(int(time.time())),
+ "progress_counter": "0",
+ "failed_counter": "0",
+ "document_count": "0",
+ "chunk_count": "0",
+ }
+
+ if message is not None:
+ hash_data["message"] = message
+ if total is not None:
+ hash_data["total"] = str(total)
+ if datasource_id is not None:
+ hash_data["datasource_id"] = datasource_id
+
+ # Save as hash (no TTL - jobs are managed via max_jobs_per_datasource limit)
+ await self.redis_client.hset(job_key, mapping=hash_data) # type: ignore
+
+ # Add to datasource index if datasource_id provided
+ if datasource_id is not None:
index_key = self._get_datasource_index_key(datasource_id)
- job_ids = await self.redis_client.smembers(index_key) # type: ignore
-
- if not job_ids or len(job_ids) <= self.max_jobs_per_datasource:
- return 0
-
- # Fetch creation times for all jobs
- jobs_with_times = []
- for job_id in job_ids:
- if isinstance(job_id, bytes):
- job_id = job_id.decode()
-
- job_key = self._get_job_key(job_id)
- created_at = await self.redis_client.hget(job_key, "created_at") # type: ignore
-
- if created_at:
- if isinstance(created_at, bytes):
- created_at = created_at.decode()
- jobs_with_times.append((job_id, int(created_at)))
-
- # Sort by creation time (oldest first)
- jobs_with_times.sort(key=lambda x: x[1])
-
- # Calculate how many to delete
- num_to_delete = len(jobs_with_times) - self.max_jobs_per_datasource
-
- if num_to_delete <= 0:
- return 0
-
- # Delete oldest jobs
- deleted_count = 0
- for job_id, _ in jobs_with_times[:num_to_delete]:
- if await self.delete_job(job_id):
- deleted_count += 1
- logger.info(f"Deleted old job {job_id} from datasource {datasource_id} (cleanup)")
-
- return deleted_count
-
- async def upsert_job(
- self,
- job_id: str,
- *,
- status: Optional[JobStatus] = None,
- message: Optional[str] = None,
- total: Optional[int] = None,
- datasource_id: Optional[str] = None,
- ) -> bool:
- """
- Creates a new job or updates an existing job in Redis.
-
- :param job_id: The ID of the job to create or update.
- :param status: The status of the job (defaults to PENDING for new jobs).
- :param message: The message for the job.
- :param total: The total number of items to process.
- :param datasource_id: The datasource ID associated with this job.
- :return: True if the operation was successful, False if job is terminated and cannot be updated.
- """
- job_key = self._get_job_key(job_id)
-
- # Check if job exists
- exists = await self.redis_client.exists(job_key)
-
- if not exists:
- # Job doesn't exist, create a new one
- hash_data = {
- "job_id": job_id,
- "status": status.value if status is not None else JobStatus.PENDING.value,
- "created_at": str(int(time.time())),
- "progress_counter": "0",
- "failed_counter": "0",
- }
-
- if message is not None:
- hash_data["message"] = message
- if total is not None:
- hash_data["total"] = str(total)
- if datasource_id is not None:
- hash_data["datasource_id"] = datasource_id
-
- # Save as hash (no TTL - jobs are managed via max_jobs_per_datasource limit)
- await self.redis_client.hset(job_key, mapping=hash_data) # type: ignore
-
- # Add to datasource index if datasource_id provided
- if datasource_id is not None:
- index_key = self._get_datasource_index_key(datasource_id)
- await self.redis_client.sadd(index_key, job_id) # type: ignore
-
- # Cleanup old jobs if limit exceeded
- await self._cleanup_old_jobs_for_datasource(datasource_id)
-
- logger.debug(f"Successfully created job {job_id}")
- return True
- else:
- # Job exists, check if it's terminated
- job_status = await self.redis_client.hget(job_key, "status") # type: ignore
- if job_status == JobStatus.TERMINATED.value and status != JobStatus.TERMINATED:
- logger.warning(f"Cannot update job {job_id} - job is terminated")
- return False
-
- # Prepare updates
- updates = {}
- if status is not None:
- updates["status"] = status.value
- if message is not None:
- updates["message"] = message
- if total is not None:
- updates["total"] = str(total)
- if datasource_id is not None:
- updates["datasource_id"] = datasource_id
- # Update datasource index
- old_datasource_id = await self.redis_client.hget(job_key, "datasource_id") # type: ignore
- if old_datasource_id and old_datasource_id != datasource_id:
- # Remove from old index
- await self.redis_client.srem(self._get_datasource_index_key(old_datasource_id), job_id) # type: ignore
- # Add to new index
- index_key = self._get_datasource_index_key(datasource_id)
- await self.redis_client.sadd(index_key, job_id) # type: ignore
-
- # Set completed_at if job is completing
- if status is not None and status in [JobStatus.COMPLETED, JobStatus.COMPLETED_WITH_ERRORS, JobStatus.FAILED, JobStatus.TERMINATED]:
- updates["completed_at"] = str(int(time.time()))
-
- if not updates:
- logger.debug(f"upsert_job called for job {job_id} with no fields to update")
- return True
-
- # Apply updates to hash
- await self.redis_client.hset(job_key, mapping=updates) # type: ignore
-
- logger.debug(f"Successfully updated job {job_id} with: {updates}")
- return True
-
- async def increment_progress(self, job_id: str, increment: int = 1) -> int:
- """
- Atomically increments the progress counter for a job.
-
- :param job_id: The ID of the job.
- :param increment: The amount to increment by (default: 1).
- :return: The new progress counter value, or -1 if job is terminated.
- """
- job_key = self._get_job_key(job_id)
-
- # Check if job is terminated
- job_status = await self.redis_client.hget(job_key, "status") # type: ignore
- if job_status == JobStatus.TERMINATED.value:
- logger.warning(f"Cannot increment progress for job {job_id} - job is terminated")
- return -1
-
- # Use HINCRBY for atomic increment on hash field
- new_value = await self.redis_client.hincrby(job_key, "progress_counter", increment) # type: ignore
- logger.debug(f"Incremented progress for job {job_id} by {increment}, new value: {new_value}")
- return new_value
-
- async def increment_failure(self, job_id: str, increment: int = 1, message: str = "") -> int:
- """
- Atomically increments the failure counter for a job.
-
- :param job_id: The ID of the job.
- :param increment: The amount to increment by (default: 1).
- :param message: An optional error message to add.
- :return: The new failure counter value, or -1 if job is terminated.
- """
- job_key = self._get_job_key(job_id)
-
- # Check if job is terminated
- job_status = await self.redis_client.hget(job_key, "status") # type: ignore
- if job_status == JobStatus.TERMINATED.value:
- logger.warning(f"Cannot increment failure for job {job_id} - job is terminated")
- return -1
-
- if message:
- await self.add_error_msg(job_id, message)
-
- # Use HINCRBY for atomic increment on hash field
- new_value = await self.redis_client.hincrby(job_key, "failed_counter", increment) # type: ignore
- logger.debug(f"Incremented failure counter for job {job_id} by {increment}, new value: {new_value}")
- return new_value
-
- async def add_error_msg(self, job_id: str, error_msg: str) -> int:
- """
- Adds an error message to the job's error list.
-
- :param job_id: The ID of the job.
- :param error_msg: The error message to add.
- :return: The new length of the error messages list, or -1 if job is terminated.
- """
- job_key = self._get_job_key(job_id)
-
- # Check if job is terminated
- job_status = await self.redis_client.hget(job_key, "status") # type: ignore
- if job_status == JobStatus.TERMINATED.value:
- logger.warning(f"Cannot add error message to job {job_id} - job is terminated")
- return -1
-
- error_msgs_key = self._get_error_msgs_key(job_id)
- new_length = await self.redis_client.rpush(error_msgs_key, error_msg) # type: ignore
-
- logger.debug(f"Added error message to job {job_id}, new list length: {new_length}")
- return new_length # type: ignore
-
- async def terminate_job(self, job_id: str) -> bool:
- """
- Marks a job as terminated.
-
- :param job_id: The ID of the job to terminate.
- :return: True if the termination flag was set successfully.
- """
- # Update the job status to terminated
- await self.upsert_job(job_id, status=JobStatus.TERMINATED)
-
- logger.debug(f"Terminated job {job_id}")
- return True
+ await self.redis_client.sadd(index_key, job_id) # type: ignore
+
+ # Cleanup old jobs if limit exceeded
+ await self._cleanup_old_jobs_for_datasource(datasource_id)
+
+ logger.debug(f"Successfully created job {job_id}")
+ return True
+ else:
+ # Job exists, check if it's terminated
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ if job_status == JobStatus.TERMINATED.value and status != JobStatus.TERMINATED:
+ logger.warning(f"Cannot update job {job_id} - job is terminated")
+ return False
- async def is_job_terminated(self, job_id: str) -> bool:
- """
- Checks if a job is terminated.
-
- :param job_id: The ID of the job to check.
- :return: True if the job is terminated, False otherwise.
- """
- job_key = self._get_job_key(job_id)
- job_status = await self.redis_client.hget(job_key, "status") # type: ignore
- return job_status == JobStatus.TERMINATED.value if job_status else False
-
- async def get_job(self, job_id: str) -> Optional[JobInfo]:
- """
- Retrieves a job's information from Redis, including counters and error messages.
-
- :param job_id: The ID of the job to retrieve.
- :return: JobInfo object if job exists, None otherwise.
- """
- job_key = self._get_job_key(job_id)
-
- # Get all hash fields and error messages in parallel
- pipeline = self.redis_client.pipeline()
- pipeline.hgetall(job_key)
- pipeline.lrange(self._get_error_msgs_key(job_id), 0, -1)
- results = await pipeline.execute()
-
- hash_data = results[0]
- error_msgs = results[1]
-
- if not hash_data:
- return None
-
- # Convert hash data to JobInfo
- job_dict = {
- "job_id": hash_data.get(b"job_id", b"").decode() if isinstance(hash_data.get(b"job_id"), bytes) else hash_data.get("job_id", ""),
- "status": hash_data.get(b"status", b"").decode() if isinstance(hash_data.get(b"status"), bytes) else hash_data.get("status", ""),
- "message": hash_data.get(b"message", b"").decode() if isinstance(hash_data.get(b"message"), bytes) else hash_data.get("message") if hash_data.get("message") or hash_data.get(b"message") else None,
- "created_at": int(hash_data.get(b"created_at", b"0").decode() if isinstance(hash_data.get(b"created_at"), bytes) else hash_data.get("created_at", "0")),
- "completed_at": int(hash_data.get(b"completed_at", b"0").decode() if isinstance(hash_data.get(b"completed_at"), bytes) else hash_data.get("completed_at", "0")) if hash_data.get("completed_at") or hash_data.get(b"completed_at") else None,
- "total": int(hash_data.get(b"total", b"0").decode() if isinstance(hash_data.get(b"total"), bytes) else hash_data.get("total", "0")) if hash_data.get("total") or hash_data.get(b"total") else None,
- "progress_counter": int(hash_data.get(b"progress_counter", b"0").decode() if isinstance(hash_data.get(b"progress_counter"), bytes) else hash_data.get("progress_counter", "0")),
- "failed_counter": int(hash_data.get(b"failed_counter", b"0").decode() if isinstance(hash_data.get(b"failed_counter"), bytes) else hash_data.get("failed_counter", "0")),
- "datasource_id": hash_data.get(b"datasource_id", b"").decode() if isinstance(hash_data.get(b"datasource_id"), bytes) else hash_data.get("datasource_id") if hash_data.get("datasource_id") or hash_data.get(b"datasource_id") else None,
- "error_msgs": error_msgs if error_msgs else [],
- }
-
- job_info = JobInfo(**job_dict)
- return job_info
-
- async def get_jobs_by_datasource(self, datasource_id: str, status_filter: Optional[JobStatus] = None) -> Optional[List[JobInfo]]:
- """
- Retrieves jobs associated with a specific datasource. Sorted by creation time descending (latest first).
-
- :param datasource_id: The datasource ID to search for.
- :param status_filter: Optional status to filter by (e.g., JobStatus.IN_PROGRESS).
- :return: List of JobInfo objects if found, None otherwise.
- """
- # Use datasource index for O(1) lookup
+ # Prepare updates
+ updates = {}
+ if status is not None:
+ updates["status"] = status.value
+ if message is not None:
+ updates["message"] = message
+ if total is not None:
+ updates["total"] = str(total)
+ if datasource_id is not None:
+ updates["datasource_id"] = datasource_id
+ # Update datasource index
+ old_datasource_id = await self.redis_client.hget(job_key, "datasource_id") # type: ignore
+ if old_datasource_id and old_datasource_id != datasource_id:
+ # Remove from old index
+ await self.redis_client.srem(self._get_datasource_index_key(old_datasource_id), job_id) # type: ignore
+ # Add to new index
index_key = self._get_datasource_index_key(datasource_id)
- job_ids = await self.redis_client.smembers(index_key) # type: ignore
-
- if not job_ids:
- return None
-
- # Fetch all jobs in parallel using pipeline
- matching_jobs = []
-
- for job_id in job_ids:
- # Decode job_id if it's bytes
- if isinstance(job_id, bytes):
- job_id = job_id.decode()
-
- job_info = await self.get_job(job_id)
- if job_info:
- # Apply status filter if provided
- if status_filter is None or job_info.status == status_filter:
- matching_jobs.append(job_info)
-
- # Sort by created_at descending (most recent first)
- if matching_jobs:
- matching_jobs.sort(key=lambda j: j.created_at, reverse=True)
- return matching_jobs
-
- return None
-
- async def delete_job(self, job_id: str) -> bool:
- """
- Deletes a job and all its associated data from Redis.
-
- :param job_id: The ID of the job to delete.
- :return: True if the job was deleted successfully.
- """
- job_key = self._get_job_key(job_id)
-
- # Get datasource_id to remove from index
- datasource_id = await self.redis_client.hget(job_key, "datasource_id") # type: ignore
-
- keys_to_delete = [
- job_key,
- self._get_error_msgs_key(job_id),
- ]
-
- # Remove from datasource index
- if datasource_id:
- if isinstance(datasource_id, bytes):
- datasource_id = datasource_id.decode()
- await self.redis_client.srem(self._get_datasource_index_key(datasource_id), job_id) # type: ignore
-
- deleted_count = await self.redis_client.delete(*keys_to_delete)
- logger.debug(f"Deleted job {job_id}, removed {deleted_count} keys")
- return deleted_count > 0
\ No newline at end of file
+ await self.redis_client.sadd(index_key, job_id) # type: ignore
+
+ # Set completed_at if job is completing
+ if status is not None and status in [JobStatus.COMPLETED, JobStatus.COMPLETED_WITH_ERRORS, JobStatus.FAILED, JobStatus.TERMINATED]:
+ updates["completed_at"] = str(int(time.time()))
+
+ if not updates:
+ logger.debug(f"upsert_job called for job {job_id} with no fields to update")
+ return True
+
+ # Apply updates to hash
+ await self.redis_client.hset(job_key, mapping=updates) # type: ignore
+
+ logger.debug(f"Successfully updated job {job_id} with: {updates}")
+ return True
+
+ async def increment_progress(self, job_id: str, increment: int = 1) -> int:
+ """
+ Atomically increments the progress counter for a job.
+
+ :param job_id: The ID of the job.
+ :param increment: The amount to increment by (default: 1).
+ :return: The new progress counter value, or -1 if job is terminated.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Check if job is terminated
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ if job_status == JobStatus.TERMINATED.value:
+ logger.warning(f"Cannot increment progress for job {job_id} - job is terminated")
+ return -1
+
+ # Use HINCRBY for atomic increment on hash field
+ new_value = await self.redis_client.hincrby(job_key, "progress_counter", increment) # type: ignore
+ logger.debug(f"Incremented progress for job {job_id} by {increment}, new value: {new_value}")
+ return new_value
+
+ async def increment_failure(self, job_id: str, increment: int = 1, message: str = "") -> int:
+ """
+ Atomically increments the failure counter for a job.
+
+ :param job_id: The ID of the job.
+ :param increment: The amount to increment by (default: 1).
+ :param message: An optional error message to add.
+ :return: The new failure counter value, or -1 if job is terminated.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Check if job is terminated
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ if job_status == JobStatus.TERMINATED.value:
+ logger.warning(f"Cannot increment failure for job {job_id} - job is terminated")
+ return -1
+
+ if message:
+ await self.add_error_msg(job_id, message)
+
+ # Use HINCRBY for atomic increment on hash field
+ new_value = await self.redis_client.hincrby(job_key, "failed_counter", increment) # type: ignore
+ logger.debug(f"Incremented failure counter for job {job_id} by {increment}, new value: {new_value}")
+ return new_value
+
+ async def increment_document_count(self, job_id: str, increment: int = 1) -> int:
+ """
+ Atomically increments the document count for a job.
+
+ :param job_id: The ID of the job.
+ :param increment: The amount to increment by (default: 1).
+ :return: The new document count value, or -1 if job is terminated.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Check if job is terminated
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ if job_status == JobStatus.TERMINATED.value:
+ logger.warning(f"Cannot increment document count for job {job_id} - job is terminated")
+ return -1
+
+ # Use HINCRBY for atomic increment on hash field
+ new_value = await self.redis_client.hincrby(job_key, "document_count", increment) # type: ignore
+ logger.debug(f"Incremented document count for job {job_id} by {increment}, new value: {new_value}")
+ return new_value
+
+ async def increment_chunk_count(self, job_id: str, increment: int = 1) -> int:
+ """
+ Atomically increments the chunk count for a job.
+
+ :param job_id: The ID of the job.
+ :param increment: The amount to increment by (default: 1).
+ :return: The new chunk count value, or -1 if job is terminated.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Check if job is terminated
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ if job_status == JobStatus.TERMINATED.value:
+ logger.warning(f"Cannot increment chunk count for job {job_id} - job is terminated")
+ return -1
+
+ # Use HINCRBY for atomic increment on hash field
+ new_value = await self.redis_client.hincrby(job_key, "chunk_count", increment) # type: ignore
+ logger.debug(f"Incremented chunk count for job {job_id} by {increment}, new value: {new_value}")
+ return new_value
+
+ async def add_error_msg(self, job_id: str, error_msg: str) -> int:
+ """
+ Adds an error message to the job's error list.
+
+ :param job_id: The ID of the job.
+ :param error_msg: The error message to add.
+ :return: The new length of the error messages list, or -1 if job is terminated.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Check if job is terminated
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ if job_status == JobStatus.TERMINATED.value:
+ logger.warning(f"Cannot add error message to job {job_id} - job is terminated")
+ return -1
+
+ error_msgs_key = self._get_error_msgs_key(job_id)
+ new_length = await self.redis_client.rpush(error_msgs_key, error_msg) # type: ignore
+
+ logger.debug(f"Added error message to job {job_id}, new list length: {new_length}")
+ return new_length # type: ignore
+
+ async def terminate_job(self, job_id: str) -> bool:
+ """
+ Marks a job as terminated.
+
+ :param job_id: The ID of the job to terminate.
+ :return: True if the termination flag was set successfully.
+ """
+ # Update the job status to terminated
+ await self.upsert_job(job_id, status=JobStatus.TERMINATED)
+
+ logger.debug(f"Terminated job {job_id}")
+ return True
+
+ async def is_job_terminated(self, job_id: str) -> bool:
+ """
+ Checks if a job is terminated.
+
+ :param job_id: The ID of the job to check.
+ :return: True if the job is terminated, False otherwise.
+ """
+ job_key = self._get_job_key(job_id)
+ job_status = await self.redis_client.hget(job_key, "status") # type: ignore
+ return job_status == JobStatus.TERMINATED.value if job_status else False
+
+ async def get_job(self, job_id: str) -> Optional[JobInfo]:
+ """
+ Retrieves a job's information from Redis, including counters and error messages.
+
+ :param job_id: The ID of the job to retrieve.
+ :return: JobInfo object if job exists, None otherwise.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Get all hash fields and error messages in parallel
+ pipeline = self.redis_client.pipeline()
+ pipeline.hgetall(job_key)
+ pipeline.lrange(self._get_error_msgs_key(job_id), 0, -1)
+ results = await pipeline.execute()
+
+ hash_data = results[0]
+ error_msgs = results[1]
+
+ if not hash_data:
+ return None
+
+ # Convert hash data to JobInfo
+ job_dict = {
+ "job_id": hash_data.get(b"job_id", b"").decode() if isinstance(hash_data.get(b"job_id"), bytes) else hash_data.get("job_id", ""),
+ "status": hash_data.get(b"status", b"").decode() if isinstance(hash_data.get(b"status"), bytes) else hash_data.get("status", ""),
+ "message": hash_data.get(b"message", b"").decode() if isinstance(hash_data.get(b"message"), bytes) else hash_data.get("message") if hash_data.get("message") or hash_data.get(b"message") else None,
+ "created_at": int(hash_data.get(b"created_at", b"0").decode() if isinstance(hash_data.get(b"created_at"), bytes) else hash_data.get("created_at", "0")),
+ "completed_at": int(hash_data.get(b"completed_at", b"0").decode() if isinstance(hash_data.get(b"completed_at"), bytes) else hash_data.get("completed_at", "0")) if hash_data.get("completed_at") or hash_data.get(b"completed_at") else None,
+ "total": int(hash_data.get(b"total", b"0").decode() if isinstance(hash_data.get(b"total"), bytes) else hash_data.get("total", "0")) if hash_data.get("total") or hash_data.get(b"total") else None,
+ "progress_counter": int(hash_data.get(b"progress_counter", b"0").decode() if isinstance(hash_data.get(b"progress_counter"), bytes) else hash_data.get("progress_counter", "0")),
+ "failed_counter": int(hash_data.get(b"failed_counter", b"0").decode() if isinstance(hash_data.get(b"failed_counter"), bytes) else hash_data.get("failed_counter", "0")),
+ "datasource_id": hash_data.get(b"datasource_id", b"").decode() if isinstance(hash_data.get(b"datasource_id"), bytes) else hash_data.get("datasource_id") if hash_data.get("datasource_id") or hash_data.get(b"datasource_id") else None,
+ "document_count": int(hash_data.get(b"document_count", b"0").decode() if isinstance(hash_data.get(b"document_count"), bytes) else hash_data.get("document_count", "0")),
+ "chunk_count": int(hash_data.get(b"chunk_count", b"0").decode() if isinstance(hash_data.get(b"chunk_count"), bytes) else hash_data.get("chunk_count", "0")),
+ "error_msgs": error_msgs if error_msgs else [],
+ }
+
+ job_info = JobInfo(**job_dict)
+ return job_info
+
+ async def get_jobs_by_datasource(self, datasource_id: str, status_filter: Optional[JobStatus] = None) -> Optional[List[JobInfo]]:
+ """
+ Retrieves jobs associated with a specific datasource. Sorted by creation time descending (latest first).
+
+ :param datasource_id: The datasource ID to search for.
+ :param status_filter: Optional status to filter by (e.g., JobStatus.IN_PROGRESS).
+ :return: List of JobInfo objects if found, None otherwise.
+ """
+        # Look up job IDs via the per-datasource index set (avoids scanning all job keys)
+ index_key = self._get_datasource_index_key(datasource_id)
+ job_ids = await self.redis_client.smembers(index_key) # type: ignore
+
+ if not job_ids:
+ return None
+
+        # Fetch each job sequentially; get_job itself batches its reads via a pipeline
+ matching_jobs = []
+
+ for job_id in job_ids:
+ # Decode job_id if it's bytes
+ if isinstance(job_id, bytes):
+ job_id = job_id.decode()
+
+ job_info = await self.get_job(job_id)
+ if job_info:
+ # Apply status filter if provided
+ if status_filter is None or job_info.status == status_filter:
+ matching_jobs.append(job_info)
+
+ # Sort by created_at descending (most recent first)
+ if matching_jobs:
+ matching_jobs.sort(key=lambda j: j.created_at, reverse=True)
+ return matching_jobs
+
+ return None
+
+ async def delete_job(self, job_id: str) -> bool:
+ """
+ Deletes a job and all its associated data from Redis.
+
+ :param job_id: The ID of the job to delete.
+ :return: True if the job was deleted successfully.
+ """
+ job_key = self._get_job_key(job_id)
+
+ # Get datasource_id to remove from index
+ datasource_id = await self.redis_client.hget(job_key, "datasource_id") # type: ignore
+
+ keys_to_delete = [
+ job_key,
+ self._get_error_msgs_key(job_id),
+ ]
+
+ # Remove from datasource index
+ if datasource_id:
+ if isinstance(datasource_id, bytes):
+ datasource_id = datasource_id.decode()
+ await self.redis_client.srem(self._get_datasource_index_key(datasource_id), job_id) # type: ignore
+
+ deleted_count = await self.redis_client.delete(*keys_to_delete)
+ logger.debug(f"Deleted job {job_id}, removed {deleted_count} keys")
+ return deleted_count > 0
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/src/common/models/server.py b/ai_platform_engineering/knowledge_bases/rag/common/src/common/models/server.py
index 469a56661..4298f946d 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/src/common/models/server.py
+++ b/ai_platform_engineering/knowledge_bases/rag/common/src/common/models/server.py
@@ -3,105 +3,174 @@
from typing import Optional, Dict, Any, List
from langchain_core.documents import Document
+
+# ============================================================================
+# Web Scraping Configuration Models
+# ============================================================================
+
+
+class CrawlMode(str, Enum):
+ """How to discover pages to crawl."""
+
+ SINGLE_URL = "single" # Only the specified URL
+ SITEMAP = "sitemap" # Discover and crawl sitemap
+ RECURSIVE = "recursive" # Follow links from starting URL
+
+
+class ScrapySettings(BaseModel):
+ """Scraping configuration exposed to users."""
+
+ # Crawl behavior
+ crawl_mode: CrawlMode = Field(CrawlMode.SINGLE_URL, description="How to discover pages: 'single' (just this URL), 'sitemap' (discover sitemap), 'recursive' (follow links)")
+ max_depth: int = Field(2, description="Maximum link depth for recursive crawling", ge=1, le=10)
+ max_pages: int = Field(2000, description="Maximum pages to crawl", ge=1)
+
+ # JavaScript rendering
+ render_javascript: bool = Field(False, description="Enable JavaScript rendering via Playwright (slower but handles SPAs)")
+ wait_for_selector: Optional[str] = Field(None, description="CSS selector to wait for before extracting content (JS rendering only)")
+ page_load_timeout: int = Field(15, description="Page load timeout in seconds", ge=5, le=120)
+
+ # URL filtering
+ follow_external_links: bool = Field(False, description="Follow links to external domains (recursive mode only)")
+ allowed_url_patterns: Optional[List[str]] = Field(None, description="Regex patterns for URLs to include (whitelist)")
+ denied_url_patterns: Optional[List[str]] = Field(None, description="Regex patterns for URLs to exclude (blacklist)")
+
+ # Rate limiting
+ download_delay: float = Field(0.05, description="Delay between requests to same domain (seconds)", ge=0)
+ concurrent_requests: int = Field(30, description="Maximum concurrent requests", ge=1, le=50)
+ respect_robots_txt: bool = Field(True, description="Obey robots.txt rules")
+
+ # Chunking
+ chunk_size: int = Field(10000, description="Maximum size of each text chunk in characters", ge=100, le=100000)
+ chunk_overlap: int = Field(2000, description="Overlap between chunks in characters", ge=0, le=10000)
+
+ # Misc
+ user_agent: Optional[str] = Field(None, description="Custom user agent string (defaults to Chrome-like UA)")
+
+
# ============================================================================
# Models for Ingestor ping and registration
# ============================================================================
class IngestorPingRequest(BaseModel):
- ingestor_type: str = Field(..., description="Type of the ingestor")
- ingestor_name: str = Field(..., description="Name of the ingestor")
- description: Optional[str] = Field("", description="Description of the ingestor")
- metadata: Optional[Dict[str, Any]] = Field({}, description="Additional metadata for the ingestor")
+ ingestor_type: str = Field(..., description="Type of the ingestor")
+ ingestor_name: str = Field(..., description="Name of the ingestor")
+ description: Optional[str] = Field("", description="Description of the ingestor")
+ metadata: Optional[Dict[str, Any]] = Field({}, description="Additional metadata for the ingestor")
+
class IngestorPingResponse(BaseModel):
- ingestor_id: str = Field(..., description="Unique identifier for the ingestor")
- max_documents_per_ingest: int = Field(..., description="Maximum number of documents the server can handle per request")
- message: str = Field(..., description="Response message from the server")
+ ingestor_id: str = Field(..., description="Unique identifier for the ingestor")
+ max_documents_per_ingest: int = Field(..., description="Maximum number of documents the server can handle per request")
+ message: str = Field(..., description="Response message from the server")
+
# ============================================================================
# General Ingestor Models
# ============================================================================
+
class IngestorRequest(BaseModel):
- ingestor_id: str = Field(..., description="ID of the ingestor performing the ingestion")
- command: str = Field(..., description="Command to execute")
- payload: Optional[Any] = Field(..., description="Data associated with the command")
+ ingestor_id: str = Field(..., description="ID of the ingestor performing the ingestion")
+ command: str = Field(..., description="Command to execute")
+ payload: Optional[Any] = Field(..., description="Data associated with the command")
+
class DocumentIngestRequest(BaseModel):
- documents: List[Document] = Field(..., description="List of langchain Documents to ingest")
- ingestor_id: str = Field(..., description="ID of the ingestor ingesting these documents")
- datasource_id: str = Field(..., description="ID of the datasource associated with these documents")
- job_id: str = Field(None, description="Job ID associated with this ingestion")
- fresh_until: Optional[int] = Field(0, description="Timestamp until which this data is considered fresh (epoch seconds)")
+ documents: List[Document] = Field(..., description="List of langchain Documents to ingest")
+ ingestor_id: str = Field(..., description="ID of the ingestor ingesting these documents")
+ datasource_id: str = Field(..., description="ID of the datasource associated with these documents")
+ job_id: Optional[str] = Field(None, description="Job ID associated with this ingestion")
+ fresh_until: Optional[int] = Field(0, description="Timestamp until which this data is considered fresh (epoch seconds)")
+
# ============================================================================
# Models specific for Web Ingestor
# ============================================================================
+
class UrlIngestRequest(BaseModel):
- url: str = Field(..., description="URL to ingest")
- check_for_sitemaps: bool = Field(False, description="Whether to check for a sitemaps")
- sitemap_max_urls: int = Field(2000, description="Maximum number of URLs to fetch from sitemap - 0 means no limit", ge=0)
- description: str = Field("", description="Description for this data source")
- ingest_type: str = Field("web", description="Type of ingestor to use: 'web' or 'confluence'")
+ """Request to ingest a URL with configurable scraping settings."""
+
+ url: str = Field(..., description="URL to ingest")
+ description: str = Field("", description="Description for this data source")
+ settings: ScrapySettings = Field(default_factory=lambda: ScrapySettings(), description="Scraping configuration (crawl mode, JS rendering, rate limiting, etc.)")
+
+ # DEPRECATED fields - will be removed in a future version.
+ # Use 'settings' object instead.
+ check_for_sitemaps: Optional[bool] = Field(None, description="DEPRECATED: Use settings.crawl_mode instead")
+ sitemap_max_urls: Optional[int] = Field(None, description="DEPRECATED: Use settings.max_pages instead")
+ ingest_type: Optional[str] = Field(None, description="DEPRECATED: No longer used")
+
class UrlReloadRequest(BaseModel):
- datasource_id: str = Field(..., description="ID of the URL datasource to reload")
+ datasource_id: str = Field(..., description="ID of the URL datasource to reload")
+
class WebIngestorCommand(str, Enum):
- INGEST_URL = "ingest-url"
- RELOAD_ALL = "reload-all"
- RELOAD_DATASOURCE = "reload-datasource"
+ INGEST_URL = "ingest-url"
+ RELOAD_ALL = "reload-all"
+ RELOAD_DATASOURCE = "reload-datasource"
+
# ============================================================================
# Models specific for Confluence Ingestor
# ============================================================================
+
class ConfluenceIngestRequest(BaseModel):
- url: str = Field(..., description="Confluence page URL (e.g., 'https://domain.atlassian.net/wiki/spaces/SPACE/pages/PAGE_ID/Title')")
- description: str = Field("", description="Description for this data source")
- get_child_pages: bool = Field(False, description="Whether to ingest direct child pages of this page")
+ url: str = Field(..., description="Confluence page URL (e.g., 'https://domain.atlassian.net/wiki/spaces/SPACE/pages/PAGE_ID/Title')")
+ description: str = Field("", description="Description for this data source")
+ get_child_pages: bool = Field(False, description="Whether to ingest direct child pages of this page")
+
class ConfluenceReloadRequest(BaseModel):
- datasource_id: str = Field(..., description="ID of the Confluence datasource to reload")
+ datasource_id: str = Field(..., description="ID of the Confluence datasource to reload")
+
class ConfluenceIngestorCommand(str, Enum):
- INGEST_PAGE = "ingest-page"
- RELOAD_ALL = "reload-all"
- RELOAD_DATASOURCE = "reload-datasource"
+ INGEST_PAGE = "ingest-page"
+ RELOAD_ALL = "reload-all"
+ RELOAD_DATASOURCE = "reload-datasource"
+
# ============================================================================
# Models for Graph Exploration and Querying
# ============================================================================
class ExploreNeighborhoodRequest(BaseModel):
- entity_type: str = Field(..., description="Type of the entity to explore")
- entity_pk: str = Field(..., description="Primary key of the entity to explore")
- depth: int = Field(1, description="Depth of neighborhood to explore (0 = just entity, 1 = direct neighbors, etc.)", ge=0, le=10)
+ entity_type: str = Field(..., description="Type of the entity to explore")
+ entity_pk: str = Field(..., description="Primary key of the entity to explore")
+ depth: int = Field(1, description="Depth of neighborhood to explore (0 = just entity, 1 = direct neighbors, etc.)", ge=0, le=10)
+
class ExploreDataEntityRequest(BaseModel):
- entity_type: str = Field(..., description="Type of the entity to fetch")
- entity_pk: str = Field(..., description="Primary key of the entity to fetch")
+ entity_type: str = Field(..., description="Type of the entity to fetch")
+ entity_pk: str = Field(..., description="Primary key of the entity to fetch")
+
class ExploreEntityRequest(BaseModel):
- entity_type: Optional[str] = Field(None, description="Type of entity to explore")
- filter_by_properties: Optional[Dict[str, str]] = Field(None, description="Properties to filter by")
+ entity_type: Optional[str] = Field(None, description="Type of entity to explore")
+ filter_by_properties: Optional[Dict[str, str]] = Field(None, description="Properties to filter by")
+
class ExploreRelationsRequest(BaseModel):
- from_type: Optional[str] = Field(None, description="Type of the source entity")
- to_type: Optional[str] = Field(None, description="Type of the target entity")
- relation_name: Optional[str] = Field(None, description="Name of the relation")
- filter_by_properties: Optional[Dict[str, str]] = Field(None, description="Properties to filter relations by")
+ from_type: Optional[str] = Field(None, description="Type of the source entity")
+ to_type: Optional[str] = Field(None, description="Type of the target entity")
+ relation_name: Optional[str] = Field(None, description="Name of the relation")
+ filter_by_properties: Optional[Dict[str, str]] = Field(None, description="Properties to filter relations by")
+
# ============================================================================
# Models for Querying
# ============================================================================
class QueryRequest(BaseModel):
- query: str = Field(..., description="Query string to search for")
- limit: int = Field(3, description="Maximum number of results to return", ge=1, le=100)
- similarity_threshold: float = Field(0.3, description="Minimum similarity score", ge=0.0, le=1.0)
- filters: Optional[Dict[str, str|bool]] = Field(None, description="Additional filters as key-value pairs")
- ranker_type: str = Field("weighted", description="Type of ranker to use")
- ranker_params: Optional[Dict[str, Any]] = Field({"weights": [0.7, 0.3]}, description="Parameters for the ranker")
+ query: str = Field(..., description="Query string to search for")
+ limit: int = Field(3, description="Maximum number of results to return", ge=1, le=100)
+ similarity_threshold: float = Field(0.3, description="Minimum similarity score", ge=0.0, le=1.0)
+ filters: Optional[Dict[str, str | bool]] = Field(None, description="Additional filters as key-value pairs")
+ ranker_type: str = Field("weighted", description="Type of ranker to use")
+ ranker_params: Optional[Dict[str, Any]] = Field({"weights": [0.7, 0.3]}, description="Parameters for the ranker")
+
class QueryResult(BaseModel):
- document: Document
- score: float
+ document: Document
+ score: float
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/tests/test_embeddings_factory.py b/ai_platform_engineering/knowledge_bases/rag/common/tests/test_embeddings_factory.py
index d6b6f65ba..db0c3bb0a 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/tests/test_embeddings_factory.py
+++ b/ai_platform_engineering/knowledge_bases/rag/common/tests/test_embeddings_factory.py
@@ -1,6 +1,7 @@
"""
Tests for EmbeddingsFactory
"""
+
import os
import pytest
from unittest.mock import patch, MagicMock
@@ -8,97 +9,129 @@
class TestEmbeddingsFactory:
- """Test suite for EmbeddingsFactory"""
-
- def test_default_provider_azure_openai(self):
- """Test that default provider is azure-openai"""
- with patch.dict(os.environ, {}, clear=True):
- with patch('common.embeddings_factory.AzureOpenAIEmbeddings') as mock_azure:
- mock_instance = MagicMock()
- mock_azure.return_value = mock_instance
-
- result = EmbeddingsFactory.get_embeddings()
-
- mock_azure.assert_called_once_with(model='text-embedding-3-small')
- assert result == mock_instance
-
- def test_custom_model(self):
- """Test custom model configuration"""
- with patch.dict(os.environ, {
- 'EMBEDDINGS_PROVIDER': 'azure-openai',
- 'EMBEDDINGS_MODEL': 'text-embedding-3-large'
- }):
- with patch('common.embeddings_factory.AzureOpenAIEmbeddings') as mock_azure:
- mock_instance = MagicMock()
- mock_azure.return_value = mock_instance
-
- EmbeddingsFactory.get_embeddings()
-
- mock_azure.assert_called_once_with(model='text-embedding-3-large')
-
- def test_openai_provider(self):
- """Test OpenAI provider"""
- with patch.dict(os.environ, {
- 'EMBEDDINGS_PROVIDER': 'openai',
- 'OPENAI_API_KEY': 'test-key'
- }):
- with patch('common.embeddings_factory.OpenAIEmbeddings') as mock_openai:
- mock_instance = MagicMock()
- mock_openai.return_value = mock_instance
-
- result = EmbeddingsFactory.get_embeddings()
-
- mock_openai.assert_called_once()
- assert result == mock_instance
-
- def test_openai_missing_api_key(self):
- """Test OpenAI provider fails without API key"""
- with patch.dict(os.environ, {'EMBEDDINGS_PROVIDER': 'openai'}, clear=True):
- with pytest.raises(ValueError, match="OPENAI_API_KEY"):
- EmbeddingsFactory.get_embeddings()
-
- def test_bedrock_provider(self):
- """Test AWS Bedrock provider"""
- with patch.dict(os.environ, {
- 'EMBEDDINGS_PROVIDER': 'aws-bedrock',
- 'AWS_REGION': 'us-west-2'
- }):
- with patch('common.embeddings_factory.BedrockEmbeddings') as mock_bedrock:
- mock_instance = MagicMock()
- mock_bedrock.return_value = mock_instance
-
- result = EmbeddingsFactory.get_embeddings()
-
- mock_bedrock.assert_called_once_with(
- model_id='amazon.titan-embed-text-v2:0',
- region_name='us-west-2'
- )
- assert result == mock_instance
-
- def test_unsupported_provider(self):
- """Test unsupported provider raises error"""
- with patch.dict(os.environ, {'EMBEDDINGS_PROVIDER': 'invalid-provider'}):
- with pytest.raises(ValueError, match="Unsupported embeddings provider"):
- EmbeddingsFactory.get_embeddings()
-
- def test_get_embedding_dimensions(self):
- """Test embedding dimensions retrieval"""
- with patch.dict(os.environ, {
- 'EMBEDDINGS_MODEL': 'text-embedding-3-small'
- }):
- dimensions = EmbeddingsFactory.get_embedding_dimensions()
- assert dimensions == 1536
-
- with patch.dict(os.environ, {
- 'EMBEDDINGS_MODEL': 'text-embedding-3-large'
- }):
- dimensions = EmbeddingsFactory.get_embedding_dimensions()
- assert dimensions == 3072
-
- # Test unknown model returns default
- with patch.dict(os.environ, {
- 'EMBEDDINGS_MODEL': 'unknown-model'
- }):
- dimensions = EmbeddingsFactory.get_embedding_dimensions()
- assert dimensions == 1536 # default
+ """Test suite for EmbeddingsFactory"""
+
+ def test_default_provider_azure_openai(self):
+ """Test that default provider is azure-openai"""
+ with patch.dict(os.environ, {}, clear=True):
+ with patch("common.embeddings_factory.AzureOpenAIEmbeddings") as mock_azure:
+ mock_instance = MagicMock()
+ mock_azure.return_value = mock_instance
+
+ result = EmbeddingsFactory.get_embeddings()
+
+ mock_azure.assert_called_once_with(model="text-embedding-3-small")
+ assert result == mock_instance
+
+ def test_custom_model(self):
+ """Test custom model configuration"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "azure-openai", "EMBEDDINGS_MODEL": "text-embedding-3-large"}):
+ with patch("common.embeddings_factory.AzureOpenAIEmbeddings") as mock_azure:
+ mock_instance = MagicMock()
+ mock_azure.return_value = mock_instance
+
+ EmbeddingsFactory.get_embeddings()
+
+ mock_azure.assert_called_once_with(model="text-embedding-3-large")
+
+ def test_openai_provider(self):
+ """Test OpenAI provider"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "openai", "OPENAI_API_KEY": "test-key"}):
+ with patch("common.embeddings_factory.OpenAIEmbeddings") as mock_openai:
+ mock_instance = MagicMock()
+ mock_openai.return_value = mock_instance
+
+ result = EmbeddingsFactory.get_embeddings()
+
+ mock_openai.assert_called_once()
+ assert result == mock_instance
+
+ def test_openai_missing_api_key(self):
+ """Test OpenAI provider fails without API key"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "openai"}, clear=True):
+ with pytest.raises(ValueError, match="OPENAI_API_KEY"):
+ EmbeddingsFactory.get_embeddings()
+
+ def test_bedrock_provider(self):
+ """Test AWS Bedrock provider"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "aws-bedrock", "AWS_REGION": "us-west-2"}):
+ with patch("common.embeddings_factory.BedrockEmbeddings") as mock_bedrock:
+ mock_instance = MagicMock()
+ mock_bedrock.return_value = mock_instance
+
+ result = EmbeddingsFactory.get_embeddings()
+
+ mock_bedrock.assert_called_once_with(model_id="amazon.titan-embed-text-v2:0", region_name="us-west-2")
+ assert result == mock_instance
+
+ def test_unsupported_provider(self):
+ """Test unsupported provider raises error"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "invalid-provider"}):
+ with pytest.raises(ValueError, match="Unsupported embeddings provider"):
+ EmbeddingsFactory.get_embeddings()
+
+ def test_get_embedding_dimensions(self):
+ """Test embedding dimensions retrieval"""
+ with patch.dict(os.environ, {"EMBEDDINGS_MODEL": "text-embedding-3-small"}):
+ dimensions = EmbeddingsFactory.get_embedding_dimensions()
+ assert dimensions == 1536
+
+ with patch.dict(os.environ, {"EMBEDDINGS_MODEL": "text-embedding-3-large"}):
+ dimensions = EmbeddingsFactory.get_embedding_dimensions()
+ assert dimensions == 3072
+
+ # Test unknown model returns default
+ with patch.dict(os.environ, {"EMBEDDINGS_MODEL": "unknown-model"}):
+ dimensions = EmbeddingsFactory.get_embedding_dimensions()
+ assert dimensions == 1536 # default
+
+ def test_litellm_provider_requires_api_base(self):
+ """Test LiteLLM provider fails without LITELLM_API_BASE"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "litellm", "EMBEDDINGS_MODEL": "azure/text-embedding-3-small"}, clear=True):
+ with pytest.raises(ValueError, match="LITELLM_API_BASE"):
+ EmbeddingsFactory.get_embeddings()
+
+ def test_litellm_provider_with_proxy(self):
+ """Test LiteLLM proxy mode uses OpenAIEmbeddings with correct params"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "litellm", "EMBEDDINGS_MODEL": "azure/text-embedding-3-small", "LITELLM_API_KEY": "test-api-key", "LITELLM_API_BASE": "https://my-proxy.example.com"}):
+ with patch("common.embeddings_factory.OpenAIEmbeddings") as mock_openai:
+ mock_instance = MagicMock()
+ mock_openai.return_value = mock_instance
+
+ result = EmbeddingsFactory.get_embeddings()
+
+ # Verify OpenAIEmbeddings was called with proxy params
+ mock_openai.assert_called_once_with(
+ model="azure/text-embedding-3-small",
+ api_key="test-api-key",
+ base_url="https://my-proxy.example.com",
+ )
+ assert result == mock_instance
+
+ def test_litellm_provider_default_api_key(self):
+ """Test LiteLLM provider uses default api_key when not provided"""
+ with patch.dict(os.environ, {"EMBEDDINGS_PROVIDER": "litellm", "EMBEDDINGS_MODEL": "azure/text-embedding-3-small", "LITELLM_API_BASE": "https://my-proxy.example.com"}, clear=True):
+ with patch("common.embeddings_factory.OpenAIEmbeddings") as mock_openai:
+ mock_instance = MagicMock()
+ mock_openai.return_value = mock_instance
+
+ result = EmbeddingsFactory.get_embeddings()
+
+ # Verify OpenAIEmbeddings was called with default api_key
+ mock_openai.assert_called_once_with(
+ model="azure/text-embedding-3-small",
+ api_key="not-needed",
+ base_url="https://my-proxy.example.com",
+ )
+ assert result == mock_instance
+
+ def test_litellm_dimensions(self):
+ """Test dimension mappings for LiteLLM models"""
+ with patch.dict(os.environ, {"EMBEDDINGS_MODEL": "mistral/mistral-embed"}):
+ assert EmbeddingsFactory.get_embedding_dimensions() == 1024
+
+ with patch.dict(os.environ, {"EMBEDDINGS_MODEL": "gemini/text-embedding-004"}):
+ assert EmbeddingsFactory.get_embedding_dimensions() == 768
+ with patch.dict(os.environ, {"EMBEDDINGS_MODEL": "voyage/voyage-01"}):
+ assert EmbeddingsFactory.get_embedding_dimensions() == 1024
diff --git a/ai_platform_engineering/knowledge_bases/rag/common/uv.lock b/ai_platform_engineering/knowledge_bases/rag/common/uv.lock
index 6b7ecc6a1..139182828 100644
--- a/ai_platform_engineering/knowledge_bases/rag/common/uv.lock
+++ b/ai_platform_engineering/knowledge_bases/rag/common/uv.lock
@@ -237,7 +237,6 @@ dependencies = [
{ name = "cymple" },
{ name = "langchain-aws" },
{ name = "langchain-cohere" },
- { name = "langchain-huggingface" },
{ name = "langchain-ollama" },
{ name = "langchain-openai" },
{ name = "neo4j" },
@@ -245,18 +244,24 @@ dependencies = [
{ name = "redis" },
]
+[package.optional-dependencies]
+huggingface = [
+ { name = "langchain-huggingface" },
+]
+
[package.metadata]
requires-dist = [
{ name = "cymple", specifier = ">=0.12.0" },
{ name = "langchain-aws", specifier = ">=0.2.24" },
{ name = "langchain-cohere", specifier = ">=0.3.0" },
- { name = "langchain-huggingface", specifier = ">=0.3.0" },
+ { name = "langchain-huggingface", marker = "extra == 'huggingface'", specifier = ">=0.3.0" },
{ name = "langchain-ollama", specifier = ">=0.3.0" },
{ name = "langchain-openai", specifier = ">=0.3.18" },
{ name = "neo4j", specifier = ">=5.28.1" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "redis", specifier = ">=6.2.0" },
]
+provides-extras = ["huggingface"]
[[package]]
name = "cymple"
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/pyproject.toml b/ai_platform_engineering/knowledge_bases/rag/ingestors/pyproject.toml
index 18c08f362..8d3cb5fd6 100644
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/pyproject.toml
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/pyproject.toml
@@ -22,9 +22,21 @@ dependencies = [
"python-dotenv>=1.1.0",
"redis>=7.0.1",
"requests>=2.32.0",
+ "scrapy>=2.12.0",
"slack-sdk>=3.38.0",
]
+[project.optional-dependencies]
+# Playwright for JavaScript rendering in webloader ingestor
+# Adds ~1GB to image size (Chromium browser + dependencies)
+playwright = [
+ "scrapy-playwright>=0.0.43",
+]
+dev = [
+ "pytest>=8.0.0",
+ "pytest-asyncio>=0.23.0",
+]
+
[project.scripts]
ingestors = "ingestors:main"
@@ -32,5 +44,11 @@ ingestors = "ingestors:main"
requires = ["uv_build>=0.8.17,<0.9.0"]
build-backend = "uv_build"
+[dependency-groups]
+dev = [
+ "pytest>=9.0.2",
+ "pytest-asyncio>=1.3.0",
+]
+
[tool.uv.sources]
common = { path = "../common" }
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/pytest.ini b/ai_platform_engineering/knowledge_bases/rag/ingestors/pytest.ini
new file mode 100644
index 000000000..ae4b83575
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+testpaths = tests
+pythonpath = src
+addopts = -v --tb=short
+asyncio_mode = auto
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/README.md b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/README.md
index 4403675c4..07c9137d4 100644
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/README.md
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/README.md
@@ -1,6 +1,6 @@
# Webloader Ingestor
-The Webloader ingestor is a specialized ingestor that crawls and ingests web pages and documentation sites into the RAG system. It supports sitemap parsing, automatic reloading, and concurrent URL processing.
+The Webloader ingestor is a specialized ingestor that crawls and ingests web pages and documentation sites into the RAG system. It supports sitemap parsing, recursive crawling, JavaScript rendering, and concurrent URL processing.
## Overview
@@ -8,8 +8,8 @@ The Webloader operates differently from other ingestors:
- **Event-Driven**: Listens to Redis queue for ingestion requests from the RAG server
- **Concurrent Processing**: Handles multiple URL ingestion tasks simultaneously
- **Automatic Reloading**: Periodically re-ingests datasources to keep content fresh
-- **Sitemap Support**: Can automatically discover and crawl URLs from sitemaps
-- **Smart Scrapers**: Includes specialized scrapers for Docusaurus and MkDocs sites
+- **Scrapy-Powered**: Uses Scrapy with Playwright for robust web scraping
+- **Smart Parsers**: Includes specialized parsers for Docusaurus, MkDocs, Sphinx, ReadTheDocs, and VitePress sites
## Architecture
@@ -28,11 +28,30 @@ The Webloader must run **alongside the RAG server** with access to the **same Re
## Optional Environment Variables
-- `WEBLOADER_MAX_CONCURRENCY` - Max concurrent HTTP requests per ingestion (default: `10`)
- `WEBLOADER_MAX_INGESTION_TASKS` - Max concurrent ingestion tasks (default: `5`)
- `WEBLOADER_RELOAD_INTERVAL` - Auto-reload interval in seconds (default: `86400` = 24 hours)
- `LOG_LEVEL` - Logging level (default: `INFO`)
+## Scraping Configuration
+
+Scraping behavior is configured per-request via `ScrapySettings`. Key options:
+
+| Setting | Default | Description |
+|---------|---------|-------------|
+| `crawl_mode` | `single` | `single` (one page), `sitemap` (discover sitemap), `recursive` (follow links) |
+| `max_depth` | `2` | Max link depth for recursive crawling (1-10) |
+| `max_pages` | `2000` | Maximum pages to crawl |
+| `render_javascript` | `false` | Enable Playwright for JavaScript-heavy sites |
+| `wait_for_selector` | `null` | CSS selector to wait for (JS rendering only) |
+| `page_load_timeout` | `15` | Page load timeout in seconds |
+| `follow_external_links` | `false` | Follow links to external domains |
+| `allowed_url_patterns` | `null` | Regex whitelist for URLs to include |
+| `denied_url_patterns` | `null` | Regex blacklist for URLs to exclude |
+| `download_delay` | `0.05` | Delay between requests (seconds) |
+| `concurrent_requests` | `30` | Max concurrent requests per crawl |
+| `respect_robots_txt` | `true` | Obey robots.txt rules |
+| `user_agent` | `null` | Custom user agent string |
+
## Features
### 1. URL Ingestion
@@ -41,22 +60,30 @@ The Webloader must run **alongside the RAG server** with access to the **same Re
- Extracts metadata (title, description, etc.)
- Stores documents with source URL tracking
-### 2. Sitemap Support
-- Automatically checks for and parses sitemaps
-- Supports both XML sitemaps and sitemap indexes
-- Can limit maximum URLs to crawl from sitemap
+### 2. Crawl Modes
+- **Single URL**: Scrape only the specified URL
+- **Sitemap**: Discover and crawl sitemap.xml automatically
+- **Recursive**: Follow links from the starting URL up to `max_depth`
+
+### 3. JavaScript Rendering
+- Enable `render_javascript: true` for JavaScript-heavy sites (SPAs)
+- Uses Playwright for headless browser rendering
+- Supports waiting for specific selectors before extraction
-### 3. Specialized Scrapers
+### 4. Specialized Parsers
- **Docusaurus**: Optimized for Docusaurus documentation sites
- **MkDocs**: Optimized for MkDocs documentation sites
+- **Sphinx**: Supports various Sphinx themes (Alabaster, RTD, Furo, PyData)
+- **ReadTheDocs**: Optimized for ReadTheDocs-hosted documentation
+- **VitePress**: Optimized for VitePress sites
- **Generic**: Falls back to generic HTML parsing for other sites
-### 4. Automatic Reloading
+### 5. Automatic Reloading
- Periodically re-ingests all datasources
- Keeps content up-to-date automatically
- Can be triggered on-demand via Redis
-### 5. Concurrent Processing
+### 6. Concurrent Processing
- Processes multiple URLs simultaneously
- Rate limiting to prevent overwhelming servers
- Task queue management with configurable limits
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/ingestor.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/ingestor.py
index 07da7052b..f001555c4 100644
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/ingestor.py
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/ingestor.py
@@ -1,361 +1,392 @@
+"""
+Webloader Ingestor - Main entry point for web content ingestion.
+
+This ingestor listens for URL ingestion requests via Redis and uses a Scrapy
+worker pool to crawl and extract content from websites.
+
+NOTE: Scrapy runs in separate subprocess workers to avoid Twisted/asyncio
+event loop conflicts. The main process uses pure asyncio.
+"""
+
import os
import asyncio
import time
import traceback
from typing import Set
+
from redis.asyncio import Redis
+
from common.ingestor import IngestorBuilder, Client
from common.models.rag import DataSourceInfo
-from common.models.server import IngestorRequest, UrlIngestRequest, WebIngestorCommand, UrlReloadRequest
+from common.models.server import IngestorRequest, UrlIngestRequest, WebIngestorCommand, UrlReloadRequest, ScrapySettings, CrawlMode
from common.job_manager import JobStatus, JobManager
from common.constants import WEBLOADER_INGESTOR_REDIS_QUEUE, WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE
from common.utils import get_logger, generate_datasource_id_from_url
-from loader.loader import Loader
+
+from loader.scrapy_loader import ScrapyLoader
+from loader.worker_pool import get_worker_pool, shutdown_worker_pool
logger = get_logger(__name__)
# Redis configuration
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
-
# Webloader configuration
-MAX_CONCURRENCY = int(os.getenv("WEBLOADER_MAX_CONCURRENCY", "10"))
RELOAD_INTERVAL = int(os.getenv("WEBLOADER_RELOAD_INTERVAL", "86400")) # 24 hours default
MAX_INGESTION_TASKS = int(os.getenv("WEBLOADER_MAX_INGESTION_TASKS", "5")) # Max concurrent ingestion tasks
redis_client = Redis.from_url(REDIS_URL, decode_responses=True)
-async def process_url_ingestion(
- client: Client,
- job_manager: JobManager,
- url_request: UrlIngestRequest
-):
- """Process a single URL ingestion request."""
-
- try:
- # Generate datasource ID from URL
- datasource_id = generate_datasource_id_from_url(url_request.url)
-
- # Fetch existing datasource (created by server)
- datasources = await client.list_datasources(ingestor_id=client.ingestor_id)
- datasource_info = next((ds for ds in datasources if ds.datasource_id == datasource_id), None)
-
- if not datasource_info:
- logger.error(f"Datasource not found: {datasource_id}")
- raise ValueError(f"Datasource not found: {datasource_id}")
-
- # Fetch existing job for this datasource (created by server)
- jobs = await job_manager.get_jobs_by_datasource(datasource_id)
- if not jobs:
- logger.error(f"No job found for datasource: {datasource_id}")
- raise ValueError(f"No job found for datasource: {datasource_id}")
-
- job = jobs[0] # Get the most recent job
- job_id = job.job_id
-
- # Check if job was terminated before we started
- if job.status == JobStatus.TERMINATED:
- logger.info(f"Job {job_id} was already terminated, skipping processing")
- return
-
- # Update job status to IN_PROGRESS
- await job_manager.upsert_job(
- job_id=job_id,
- status=JobStatus.IN_PROGRESS,
- message=f"Starting URL ingestion for {url_request.url}"
- )
- logger.info(f"Processing job: {job_id} for datasource: {datasource_id}")
-
- # Process the URL using Loader
- async with Loader(
- rag_client=client,
- jobmanager=job_manager,
- datasourceinfo=datasource_info,
- max_concurrency=MAX_CONCURRENCY
- ) as loader:
- await loader.load_url(
- url=url_request.url,
- job_id=job_id,
- check_for_site_map=url_request.check_for_sitemaps,
- sitemap_max_urls=url_request.sitemap_max_urls
- )
-
- logger.info(f"Completed URL ingestion for {url_request.url}")
-
- except Exception as e:
- error_msg = f"Error processing URL {url_request.url}: {str(e)}"
- logger.error(error_msg)
- logger.error(traceback.format_exc())
-
- # Try to update job with error if we have job_id
- try:
- if 'job_id' in locals():
- await job_manager.add_error_msg(job_id, error_msg)
- except Exception:
- pass
-
- raise
-
-
-async def reload_datasource(
- client: Client,
- job_manager: JobManager,
- datasource_info: DataSourceInfo):
- """Reload a single datasource."""
- # Extract UrlIngestRequest from metadata
- if not datasource_info.metadata:
- logger.warning(f"No metadata for datasource {datasource_info.datasource_id}, skipping")
- return
- url_ingest_request_data = datasource_info.metadata.get("url_ingest_request")
- if not url_ingest_request_data:
- logger.warning(f"No url_ingest_request in metadata for {datasource_info.datasource_id}, skipping")
- return
-
- # Parse the UrlIngestRequest model
- url_request = UrlIngestRequest.model_validate(url_ingest_request_data)
-
- logger.info(f"Reloading datasource: {datasource_info.datasource_id}")
-
- # Create new job for reload
- job_response = await client.create_job(
- datasource_id=datasource_info.datasource_id,
- job_status=JobStatus.IN_PROGRESS,
- message=f"Reloading data from {url_request.url}"
+def _get_effective_settings(request: UrlIngestRequest, datasource_id: str) -> tuple[ScrapySettings, list[str]]:
+ """
+ Get effective settings, mapping deprecated fields if present.
+
+ Args:
+ request: The URL ingest request
+ datasource_id: ID of the datasource (for logging)
+
+ Returns:
+ Tuple of (effective_settings, list_of_deprecated_field_names)
+ """
+ # Start with provided settings or defaults
+ settings = request.settings or ScrapySettings()
+ deprecated_fields = []
+
+ # Map deprecated check_for_sitemaps -> crawl_mode
+ if request.check_for_sitemaps is not None:
+ deprecated_fields.append("check_for_sitemaps")
+ logger.warning(f"Deprecated field 'check_for_sitemaps' detected for datasource '{datasource_id}'. Use 'settings.crawl_mode' instead. Delete and re-ingest datasource to update.")
+ # Only apply if crawl_mode is still default (single)
+ if settings.crawl_mode == CrawlMode.SINGLE_URL:
+ settings.crawl_mode = CrawlMode.SITEMAP if request.check_for_sitemaps else CrawlMode.SINGLE_URL
+
+ # Map deprecated sitemap_max_urls -> max_pages
+ if request.sitemap_max_urls is not None:
+ deprecated_fields.append("sitemap_max_urls")
+ logger.warning(f"Deprecated field 'sitemap_max_urls' detected for datasource '{datasource_id}'. Use 'settings.max_pages' instead. Delete and re-ingest datasource to update.")
+ # Only apply if max_pages is still at its default (2000 — must stay in sync with ScrapySettings.max_pages)
+ if settings.max_pages == 2000:
+ settings.max_pages = request.sitemap_max_urls
+
+ # Log warning for deprecated ingest_type (no mapping needed)
+ if request.ingest_type is not None:
+ deprecated_fields.append("ingest_type")
+ logger.warning(f"Deprecated field 'ingest_type' detected for datasource '{datasource_id}'. This field is no longer used. Delete and re-ingest datasource to update.")
+
+ return settings, deprecated_fields
+
+
+async def process_url_ingestion(client: Client, job_manager: JobManager, url_request: UrlIngestRequest):
+ """Process a single URL ingestion request."""
+ job_id = None
+
+ try:
+ # Generate datasource ID from URL
+ datasource_id = generate_datasource_id_from_url(url_request.url)
+
+ # Fetch existing datasource (created by server)
+ datasources = await client.list_datasources(ingestor_id=client.ingestor_id)
+ datasource_info = next((ds for ds in datasources if ds.datasource_id == datasource_id), None)
+
+ if not datasource_info:
+ logger.error(f"Datasource not found: {datasource_id}")
+ raise ValueError(f"Datasource not found: {datasource_id}")
+
+ # Fetch existing job for this datasource (created by server)
+ jobs = await job_manager.get_jobs_by_datasource(datasource_id)
+ if not jobs:
+ logger.error(f"No job found for datasource: {datasource_id}")
+ raise ValueError(f"No job found for datasource: {datasource_id}")
+
+ job = jobs[0] # Get the most recent job
+ job_id = job.job_id
+
+ # Check if job was terminated before we started
+ if job.status == JobStatus.TERMINATED:
+ logger.info(f"Job {job_id} was already terminated, skipping processing")
+ return
+
+ # Update job status to IN_PROGRESS
+ await job_manager.upsert_job(job_id=job_id, status=JobStatus.IN_PROGRESS, message=f"Starting URL ingestion for {url_request.url}")
+ logger.info(f"Processing job: {job_id} for datasource: {datasource_id}")
+
+ # Get effective settings, mapping deprecated fields if present
+ settings, deprecated_fields = _get_effective_settings(url_request, datasource_id)
+
+ # Add warning to job status if deprecated fields detected
+ if deprecated_fields:
+ fields_str = ", ".join(deprecated_fields)
+ await job_manager.upsert_job(job_id=job_id, status=JobStatus.IN_PROGRESS, message=f"Warning: Deprecated settings detected ({fields_str}). Delete and re-ingest to update.")
+
+ # Process the URL using ScrapyLoader (which uses worker pool)
+ loader = ScrapyLoader(
+ rag_client=client,
+ job_manager=job_manager,
+ datasource_info=datasource_info,
+ )
+ await loader.load(
+ url=url_request.url,
+ settings=settings,
+ job_id=job_id,
)
- job_id = job_response["job_id"]
- logger.info(f"Created reload job: {job_id}")
-
+
+ logger.info(f"Completed URL ingestion for {url_request.url}")
+
+ except Exception as e:
+ error_msg = f"Error processing URL {url_request.url}: {str(e)}"
+ logger.error(error_msg)
+ logger.error(traceback.format_exc())
+
+ # Try to update job with error if we have job_id
try:
- # Update datasource last_updated timestamp (no need to update job_id)
- datasource_info.last_updated = int(time.time())
- await client.upsert_datasource(datasource_info)
-
- # Process the URL using Loader (which will delete old data via fresh_until)
- async with Loader(
- rag_client=client,
- jobmanager=job_manager,
- datasourceinfo=datasource_info,
- max_concurrency=MAX_CONCURRENCY
- ) as loader:
- await loader.load_url(
- url=url_request.url,
- job_id=job_id,
- check_for_site_map=url_request.check_for_sitemaps,
- sitemap_max_urls=url_request.sitemap_max_urls
- )
-
- logger.info(f"Completed reload for {datasource_info.datasource_id}")
-
- except Exception as e:
- error_msg = f"Error reloading datasource {datasource_info.datasource_id}: {str(e)}"
- logger.error(error_msg)
- logger.error(traceback.format_exc())
-
+ if job_id:
await job_manager.add_error_msg(job_id, error_msg)
-
- raise
+ except Exception:
+ pass
+
+ raise
+
+
+async def reload_datasource(client: Client, job_manager: JobManager, datasource_info: DataSourceInfo):
+ """Reload a single datasource."""
+ # Extract UrlIngestRequest from metadata
+ if not datasource_info.metadata:
+ logger.warning(f"No metadata for datasource {datasource_info.datasource_id}, skipping")
+ return
+
+ url_ingest_request_data = datasource_info.metadata.get("url_ingest_request")
+ if not url_ingest_request_data:
+ logger.warning(f"No url_ingest_request in metadata for {datasource_info.datasource_id}, skipping")
+ return
+
+ # Parse the UrlIngestRequest model
+ url_request = UrlIngestRequest.model_validate(url_ingest_request_data)
+
+ logger.info(f"Reloading datasource: {datasource_info.datasource_id}")
+
+ # Create new job for reload
+ job_response = await client.create_job(datasource_id=datasource_info.datasource_id, job_status=JobStatus.IN_PROGRESS, message=f"Reloading data from {url_request.url}")
+ job_id = job_response["job_id"]
+ logger.info(f"Created reload job: {job_id}")
+
+ try:
+ # Update datasource last_updated timestamp
+ datasource_info.last_updated = int(time.time())
+ await client.upsert_datasource(datasource_info)
+
+ # Get effective settings, mapping deprecated fields if present
+ settings, deprecated_fields = _get_effective_settings(url_request, datasource_info.datasource_id)
+
+ # Add warning to job status if deprecated fields detected
+ if deprecated_fields:
+ fields_str = ", ".join(deprecated_fields)
+ await job_manager.upsert_job(job_id=job_id, status=JobStatus.IN_PROGRESS, message=f"Warning: Deprecated settings detected ({fields_str}). Delete and re-ingest to update.")
+
+ # Process the URL using ScrapyLoader (which uses worker pool)
+ loader = ScrapyLoader(
+ rag_client=client,
+ job_manager=job_manager,
+ datasource_info=datasource_info,
+ )
+ await loader.load(
+ url=url_request.url,
+ settings=settings,
+ job_id=job_id,
+ )
+
+ logger.info(f"Completed reload for {datasource_info.datasource_id}")
+
+ except Exception as e:
+ error_msg = f"Error reloading datasource {datasource_info.datasource_id}: {str(e)}"
+ logger.error(error_msg)
+ logger.error(traceback.format_exc())
+
+ await job_manager.add_error_msg(job_id, error_msg)
+
+ raise
async def redis_listener(client: Client):
- """
- Listen to Redis queue for new URL ingestion requests.
- Processes IngestorRequest messages with UrlIngestRequest payloads.
- Manages concurrent ingestion tasks with a semaphore.
- """
-
- # Since this will be run in a trusted environment, we can use redis_client instead of server apis for job management
- job_manager = JobManager(redis_client)
-
- # Track active ingestion tasks
- active_tasks: Set[asyncio.Task] = set()
-
- logger.info(f"Starting Redis listener on {REDIS_URL} queue: {WEBLOADER_INGESTOR_REDIS_QUEUE}")
- logger.info(f"Max concurrent ingestion tasks: {MAX_INGESTION_TASKS}")
-
- async def handle_ingestion_task(coro, task_name: str):
- """Wrapper to handle task completion and cleanup."""
+ """
+ Listen to Redis queue for new URL ingestion requests.
+ Processes IngestorRequest messages with UrlIngestRequest payloads.
+ Manages concurrent ingestion tasks with a semaphore.
+ """
+
+ # Initialize the worker pool at startup
+ logger.info("Initializing Scrapy worker pool...")
+ await get_worker_pool()
+ logger.info("Worker pool initialized")
+
+ # Since this runs in a trusted environment, we use redis_client directly instead of the server APIs for job management
+ job_manager = JobManager(redis_client)
+
+ # Track active ingestion tasks
+ active_tasks: Set[asyncio.Task] = set()
+
+ logger.info(f"Starting Redis listener on {REDIS_URL} queue: {WEBLOADER_INGESTOR_REDIS_QUEUE}")
+ logger.info(f"Max concurrent ingestion tasks: {MAX_INGESTION_TASKS}")
+
+ async def handle_ingestion_task(coro, task_name: str):
+ """Wrapper to handle task completion and cleanup."""
+ try:
+ await coro
+ except Exception as e:
+ logger.error(f"Error in {task_name}: {e}")
+ logger.error(traceback.format_exc())
+
+ try:
+ while True:
+ try:
+ # Clean up completed tasks
+ done_tasks = {task for task in active_tasks if task.done()}
+ for task in done_tasks:
+ try:
+ task.result() # Raise any exceptions that occurred
+ except Exception as e:
+ logger.error(f"Task failed: {e}")
+ active_tasks -= done_tasks
+
+ # Check if we can accept more tasks
+ if len(active_tasks) >= MAX_INGESTION_TASKS:
+ logger.debug(f"At max capacity ({MAX_INGESTION_TASKS} tasks), waiting for tasks to complete...")
+ # Wait a bit before checking again
+ await asyncio.sleep(0.5)
+ continue
+
+ # Blocking pop from Redis list (timeout 1 second to allow for task cleanup)
+ result = await redis_client.blpop([WEBLOADER_INGESTOR_REDIS_QUEUE], timeout=1) # type: ignore
+
+ if result is None:
+ # Timeout - continue loop to check for shutdown and cleanup tasks
+ continue
+
+ _, message = result
+ logger.info(f"Received message from Redis: {message}")
+
+ # Parse the IngestorRequest
try:
- await coro
+ ingestor_request = IngestorRequest.model_validate_json(message)
+
+ # Verify this request is for our ingestor
+ if ingestor_request.ingestor_id != client.ingestor_id:
+ logger.warning(f"Ignoring request for different ingestor: {ingestor_request.ingestor_id}")
+ continue
+
+ # Handle different commands
+ if ingestor_request.command == WebIngestorCommand.INGEST_URL:
+ url_request = UrlIngestRequest.model_validate(ingestor_request.payload)
+ logger.info(f"Processing URL ingestion request: {url_request.url} (active tasks: {len(active_tasks)})")
+
+ # Create task for concurrent processing
+ task = asyncio.create_task(handle_ingestion_task(process_url_ingestion(client=client, job_manager=job_manager, url_request=url_request), f"URL ingestion: {url_request.url}"))
+ active_tasks.add(task)
+
+ elif ingestor_request.command == WebIngestorCommand.RELOAD_ALL:
+ logger.info("Processing on-demand reload request")
+
+ # Create task for concurrent processing
+ task = asyncio.create_task(handle_ingestion_task(periodic_reload(client), "Reload all datasources"))
+ active_tasks.add(task)
+
+ elif ingestor_request.command == WebIngestorCommand.RELOAD_DATASOURCE:
+ # Reload specific datasource
+ if not ingestor_request.payload:
+ logger.error("Missing payload in reload-datasource request")
+ continue
+
+ datasource_id = UrlReloadRequest.model_validate(ingestor_request.payload).datasource_id
+ if not datasource_id:
+ logger.error("Missing datasource_id in reload-datasource request")
+ continue
+
+ logger.info(f"Processing reload request for datasource: {datasource_id}")
+
+ # Fetch the specific datasource
+ datasources = await client.list_datasources(ingestor_id=client.ingestor_id)
+ datasource_info = next((ds for ds in datasources if ds.datasource_id == datasource_id), None)
+
+ if not datasource_info:
+ logger.error(f"Datasource not found: {datasource_id}")
+ continue
+
+ # Create task for concurrent processing
+ task = asyncio.create_task(handle_ingestion_task(reload_datasource(client, job_manager, datasource_info), f"Reload datasource: {datasource_id}"))
+ active_tasks.add(task)
+
+ else:
+ logger.warning(f"Unknown command: {ingestor_request.command}")
+
except Exception as e:
- logger.error(f"Error in {task_name}: {e}")
- logger.error(traceback.format_exc())
-
- try:
- while True:
- try:
- # Clean up completed tasks
- done_tasks = {task for task in active_tasks if task.done()}
- for task in done_tasks:
- try:
- task.result() # Raise any exceptions that occurred
- except Exception as e:
- logger.error(f"Task failed: {e}")
- active_tasks -= done_tasks
-
- # Check if we can accept more tasks
- if len(active_tasks) >= MAX_INGESTION_TASKS:
- logger.debug(f"At max capacity ({MAX_INGESTION_TASKS} tasks), waiting for tasks to complete...")
- # Wait a bit before checking again
- await asyncio.sleep(0.5)
- continue
-
- # Blocking pop from Redis list (timeout 1 second to allow for task cleanup)
- result = await redis_client.blpop([WEBLOADER_INGESTOR_REDIS_QUEUE], timeout=1) # type: ignore
-
- if result is None:
- # Timeout - continue loop to check for shutdown and cleanup tasks
- continue
-
- _, message = result
- logger.info(f"Received message from Redis: {message}")
-
- # Parse the IngestorRequest
- try:
- ingestor_request = IngestorRequest.model_validate_json(message)
-
- # Verify this request is for our ingestor
- if ingestor_request.ingestor_id != client.ingestor_id:
- logger.warning(f"Ignoring request for different ingestor: {ingestor_request.ingestor_id}")
- continue
-
- # Handle different commands
- if ingestor_request.command == WebIngestorCommand.INGEST_URL:
- url_request = UrlIngestRequest.model_validate(ingestor_request.payload)
- logger.info(f"Processing URL ingestion request: {url_request.url} (active tasks: {len(active_tasks)})")
-
- # Create task for concurrent processing
- task = asyncio.create_task(
- handle_ingestion_task(
- process_url_ingestion(
- client=client,
- job_manager=job_manager,
- url_request=url_request
- ),
- f"URL ingestion: {url_request.url}"
- )
- )
- active_tasks.add(task)
-
- elif ingestor_request.command == WebIngestorCommand.RELOAD_ALL:
- logger.info("Processing on-demand reload request")
-
- # Create task for concurrent processing
- task = asyncio.create_task(
- handle_ingestion_task(
- periodic_reload(client),
- "Reload all datasources"
- )
- )
- active_tasks.add(task)
-
- elif ingestor_request.command == WebIngestorCommand.RELOAD_DATASOURCE:
- # Reload specific datasource
- if not ingestor_request.payload:
- logger.error("Missing payload in reload-datasource request")
- continue
-
- datasource_id = UrlReloadRequest.model_validate(ingestor_request.payload).datasource_id
- if not datasource_id:
- logger.error("Missing datasource_id in reload-datasource request")
- continue
-
- logger.info(f"Processing reload request for datasource: {datasource_id}")
-
- # Fetch the specific datasource
- datasources = await client.list_datasources(ingestor_id=client.ingestor_id)
- datasource_info = next((ds for ds in datasources if ds.datasource_id == datasource_id), None)
-
- if not datasource_info:
- logger.error(f"Datasource not found: {datasource_id}")
- continue
-
- # Create task for concurrent processing
- task = asyncio.create_task(
- handle_ingestion_task(
- reload_datasource(client, job_manager, datasource_info),
- f"Reload datasource: {datasource_id}"
- )
- )
- active_tasks.add(task)
-
- else:
- logger.warning(f"Unknown command: {ingestor_request.command}")
-
- except Exception as e:
- logger.error(f"Error processing message: {e}")
- logger.error(traceback.format_exc())
-
- except asyncio.CancelledError:
- logger.info("Redis listener cancelled, waiting for active tasks to complete...")
- # Wait for all active tasks to complete
- if active_tasks:
- logger.info(f"Waiting for {len(active_tasks)} active tasks to complete...")
- await asyncio.gather(*active_tasks, return_exceptions=True)
- break
- except Exception as e:
- logger.error(f"Error in Redis listener loop: {e}")
- logger.error(traceback.format_exc())
- await asyncio.sleep(5) # Back off on errors
-
- finally:
- # Cancel any remaining tasks
+ logger.error(f"Error processing message: {e}")
+ logger.error(traceback.format_exc())
+
+ except asyncio.CancelledError:
+ logger.info("Redis listener cancelled, waiting for active tasks to complete...")
+ # Wait for all active tasks to complete
if active_tasks:
- logger.info(f"Cancelling {len(active_tasks)} remaining tasks...")
- for task in active_tasks:
- task.cancel()
- await asyncio.gather(*active_tasks, return_exceptions=True)
-
- await redis_client.close()
- logger.info("Redis listener stopped")
+ logger.info(f"Waiting for {len(active_tasks)} active tasks to complete...")
+ await asyncio.gather(*active_tasks, return_exceptions=True)
+ break
+ except Exception as e:
+ logger.error(f"Error in Redis listener loop: {e}")
+ logger.error(traceback.format_exc())
+ await asyncio.sleep(5) # Back off on errors
+
+ finally:
+ # Shutdown worker pool
+ logger.info("Shutting down worker pool...")
+ await shutdown_worker_pool()
+
+ # Cancel any remaining tasks
+ if active_tasks:
+ logger.info(f"Cancelling {len(active_tasks)} remaining tasks...")
+ for task in active_tasks:
+ task.cancel()
+ await asyncio.gather(*active_tasks, return_exceptions=True)
+
+ await redis_client.close()
+ logger.info("Redis listener stopped")
async def periodic_reload(client: Client):
- """
- Reload all datasources for this ingestor.
- Fetches datasources filtered by ingestor_id and re-ingests them.
- Called periodically by IngestorBuilder or on-demand via Redis.
- """
- logger.info("Starting datasource reload...")
- job_manager = JobManager(redis_client)
-
- try:
- datasources = await client.list_datasources(ingestor_id=client.ingestor_id)
- logger.info(f"Found {len(datasources)} datasources to reload")
-
- for datasource_info in datasources:
- try:
- await reload_datasource(client, job_manager, datasource_info)
- except Exception as e:
- logger.error(f"Error reloading datasource {datasource_info.datasource_id}: {e}")
- logger.error(traceback.format_exc())
-
- logger.info("Datasource reload completed")
-
- except Exception as e:
- logger.error(f"Error in datasource reload: {e}")
+ """
+ Reload all datasources for this ingestor.
+ Fetches datasources filtered by ingestor_id and re-ingests them.
+ Called periodically by IngestorBuilder or on-demand via Redis.
+ """
+ logger.info("Starting datasource reload...")
+ job_manager = JobManager(redis_client)
+
+ try:
+ datasources = await client.list_datasources(ingestor_id=client.ingestor_id)
+ logger.info(f"Found {len(datasources)} datasources to reload")
+
+ for datasource_info in datasources:
+ try:
+ await reload_datasource(client, job_manager, datasource_info)
+ except Exception as e:
+ logger.error(f"Error reloading datasource {datasource_info.datasource_id}: {e}")
logger.error(traceback.format_exc())
+ logger.info("Datasource reload completed")
+
+ except Exception as e:
+ logger.error(f"Error in datasource reload: {e}")
+ logger.error(traceback.format_exc())
+
if __name__ == "__main__":
- try:
- logger.info("Starting Webloader Ingestor...")
-
- # Build and run the ingestor with both Redis listener and periodic reload
- IngestorBuilder()\
- .name(WEBLOADER_INGESTOR_NAME)\
- .type(WEBLOADER_INGESTOR_TYPE)\
- .description("Default ingestor for websites and sitemaps")\
- .metadata({
- "reload_interval": RELOAD_INTERVAL
- })\
- .sync_with_fn(periodic_reload)\
- .with_startup(redis_listener)\
- .every(RELOAD_INTERVAL)\
- .run()
-
- except KeyboardInterrupt:
- logger.info("Webloader ingestor interrupted by user")
- except Exception as e:
- logger.error(f"Webloader ingestor failed: {e}")
- logger.error(traceback.format_exc())
+ try:
+ logger.info("Starting Webloader Ingestor...")
+
+ # Build and run the ingestor with standard asyncio
+ # No Twisted reactor needed - Scrapy runs in subprocess workers
+ IngestorBuilder().name(WEBLOADER_INGESTOR_NAME).type(WEBLOADER_INGESTOR_TYPE).description("Default ingestor for websites and sitemaps").metadata({"reload_interval": RELOAD_INTERVAL}).sync_with_fn(periodic_reload).with_startup(redis_listener).every(RELOAD_INTERVAL).run()
+
+ except KeyboardInterrupt:
+ logger.info("Webloader ingestor interrupted by user")
+ except Exception as e:
+ logger.error(f"Webloader ingestor failed: {e}")
+ logger.error(traceback.format_exc())
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/items.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/items.py
new file mode 100644
index 000000000..770afae9f
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/items.py
@@ -0,0 +1,43 @@
+"""
+Scrapy Item definitions for web scraping.
+
+Items define the structure of scraped data before it's processed
+by pipelines and converted to LangChain Documents.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any
+
+
+@dataclass
+class ScrapedPageItem:
+ """
+ Item representing a scraped web page.
+
+ This is the output of spider parsing and the input to pipelines.
+ """
+
+ # Required fields
+ url: str
+ content: str
+
+ # Metadata fields
+ title: str = ""
+ description: str = ""
+ language: str = ""
+
+ # Optional extended metadata
+ generator: Optional[str] = None
+ extra_metadata: Dict[str, Any] = field(default_factory=dict)
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for pipeline processing."""
+ return {
+ "url": self.url,
+ "content": self.content,
+ "title": self.title,
+ "description": self.description,
+ "language": self.language,
+ "generator": self.generator,
+ "extra_metadata": self.extra_metadata,
+ }
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/loader.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/loader.py
deleted file mode 100644
index 60df6e0ab..000000000
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/loader.py
+++ /dev/null
@@ -1,502 +0,0 @@
-import gc
-import time
-from common import utils
-from loader.url.docsaurus_scraper import scrape_docsaurus
-from loader.url.mkdocs_scraper import scrape_mkdocs
-import aiohttp
-from aiohttp_retry import RetryClient, ExponentialRetry
-from bs4 import BeautifulSoup
-import gzip
-from typing import List
-from langchain_core.documents import Document
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from typing import Tuple, Dict, Any
-from common.job_manager import JobStatus, JobManager
-from common.models.rag import DataSourceInfo, DocumentMetadata
-from common.utils import get_logger
-import traceback
-from common.task_scheduler import TaskScheduler
-from common.ingestor import Client
-from urllib.parse import urlparse
-
-class Loader:
- def __init__(self, rag_client: Client, jobmanager: JobManager, datasourceinfo: DataSourceInfo, max_concurrency: int):
- """
- Initialize the loader with the given vstore, logger, metadata storage, and datasource.
-
- Args:
- vstore (VectorStore): The vector storage to use for storing documents.
- metadata_storage (MetadataStorage): The metadata storage to use for storing metadata.
- datasourceinfo (DataSourceInfo): The datasource configuration to use for loading documents.
- """
- self.session = None
- self.logger = get_logger(f"loader:{datasourceinfo.datasource_id[12:]}")
- self.chunk_size = datasourceinfo.default_chunk_size
- self.chunk_overlap = datasourceinfo.default_chunk_overlap
- self.datasourceinfo = datasourceinfo
- self.max_concurrency = max_concurrency
- self.jobmanager = jobmanager
- self.client = rag_client
-
- # Chrome user agent for better web scraping compatibility
- self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-
- # Initialize text splitter for chunking large documents
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=self.chunk_size,
- chunk_overlap=self.chunk_overlap,
- length_function=len,
- separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""]
- )
-
- async def __aenter__(self):
- # Configure retry policy for rate limiting and transient errors
- retry_options = ExponentialRetry(
- attempts=4, # 3 retries + 1 initial attempt
- start_timeout=1.0, # Start with 1 second delay
- max_timeout=60.0, # Cap at 60 seconds
- factor=2.0, # Double delay each time
- statuses={429, 502, 503, 504}, # Retry on rate limit and server errors
- exceptions={aiohttp.ClientError, aiohttp.ServerTimeoutError}
- )
-
- base_session = aiohttp.ClientSession(
- timeout=aiohttp.ClientTimeout(total=30),
- headers={"User-Agent": self.user_agent}
- )
- self.session = RetryClient(client_session=base_session, retry_options=retry_options)
- return self
-
- async def __aexit__(self, exc_type, exc_val, exc_tb):
- if self.session:
- await self.session.close()
-
- def set_chunking_config(self, chunk_size: int, chunk_overlap: int):
- """Update chunking configuration and recreate text splitter"""
- self.chunk_size = chunk_size
- self.chunk_overlap = chunk_overlap
-
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=self.chunk_size,
- chunk_overlap=self.chunk_overlap,
- length_function=len,
- separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""]
- )
- self.logger.debug(f"Updated chunking config: size={chunk_size}, overlap={chunk_overlap}")
-
- async def get_sitemaps(self, url: str) -> List[str]:
- """Return a list of sitemap URLs for the given site.
-
- Order of checks:
- 1) robots.txt for one or more Sitemap: entries
- 2) /sitemap.xml (or the URL itself if it already ends with sitemap.xml)
- 3) :///sitemap.xml
- """
- if self.session is None:
- raise Exception("Session is not initialized")
- sitemaps: List[str] = []
- parsed = urlparse(url)
- if not parsed.scheme:
- parsed = urlparse("https://" + url)
- base = f"{parsed.scheme}://{parsed.netloc}"
-
- self.logger.debug(f"Checking robots.txt at: {base}/robots.txt")
- # 1) robots.txt
- robots_url = f"{base}/robots.txt"
- try:
- async with self.session.get(robots_url, allow_redirects=True) as resp:
- if resp.status == 200:
- text = await resp.text()
- for line in text.splitlines():
- line = line.strip()
- if line.lower().startswith("sitemap:"):
- sitemap_url = line.split(":", 1)[1].strip()
- if sitemap_url and sitemap_url not in sitemaps:
- sitemaps.append(sitemap_url)
- else:
- self.logger.debug(f"robots.txt not found or not accessible: {robots_url} (status {resp.status})")
- except Exception as e:
- self.logger.error(traceback.format_exc())
- self.logger.debug(f"Error fetching robots.txt {robots_url}: {e}")
-
- if sitemaps:
- self.logger.debug(f"Found sitemaps: {sitemaps}")
- return sitemaps
-
- # 2) /sitemap.xml
- if url.endswith("/sitemap.xml"):
- candidate = url
- elif url.endswith("/"):
- candidate = url + "sitemap.xml"
- else:
- candidate = url + "/sitemap.xml"
-
- self.logger.debug(f"Checking sitemap at: {candidate}")
- try:
- async with self.session.get(candidate, allow_redirects=True) as resp:
- if resp.status == 200:
- sitemaps.append(str(resp.url))
- except Exception as e:
- self.logger.warning(traceback.format_exc())
- self.logger.debug(f"Error checking sitemap at {candidate}: {e}")
-
- if sitemaps:
- self.logger.debug(f"Found sitemaps: {sitemaps}")
- return sitemaps
-
- # 3) /sitemap.xml
- base_sitemap = f"{base}/sitemap.xml"
- self.logger.debug(f"Checking base sitemap at: {base_sitemap}")
- try:
- async with self.session.get(base_sitemap, allow_redirects=True) as resp:
- if resp.status == 200:
- sitemaps.append(str(resp.url))
- except Exception as e:
- self.logger.warning(traceback.format_exc())
- self.logger.debug(f"Error checking base sitemap at {base_sitemap}: {e}")
-
- if sitemaps:
- self.logger.debug(f"Found sitemaps: {sitemaps}")
- return sitemaps
-
- self.logger.debug(f"No sitemaps found at: {url}")
- return []
-
- async def custom_parser(self, soup: BeautifulSoup, url) -> Tuple[str, Dict[str, Any]]:
- """
- Processes a webpage and save contents to a file.
- Parses webpage based on generator.
- Returns (content, metadata)
- """
- # check meta tag for generator
- content = ""
- generator = None
- generator_tag = soup.find('meta', attrs={'name': 'generator'})
- if generator_tag:
- self.logger.debug(f"Generator tag found: {generator_tag}")
- generator = generator_tag.get('content') # type: ignore
- self.logger.debug(f"Generator: {generator}")
- if generator and "docusaurus" in generator.lower(): # type: ignore
- content = scrape_docsaurus(soup)
- elif generator and "mkdocs" in generator.lower(): # type: ignore
- content = scrape_mkdocs(soup)
- # TODO: Add more processors
- else:
- # If no generator is found, just remove nav and header
- self.logger.debug("No generator found, just removing nav and header")
- # Find all 'nav' and 'header' elements in the BeautifulSoup object
- nav_elements = soup.find_all("nav")
- header_elements = soup.find_all("header")
-
- # Remove each 'nav' and 'header' element from the BeautifulSoup object
- for element in nav_elements + header_elements:
- element.decompose()
-
- content = soup.get_text(separator='\n', strip=True)
-
- # Build metadata from BeautifulSoup output.
- # Borrowed from: https://python.langchain.com/api_reference/_modules/langchain_community/document_loaders/web_base.html#WebBaseLoader._build_metadata
- metadata = {"source": url}
- if title := soup.find("title"):
- metadata["title"] = title.get_text()
- else:
- metadata["title"] = ""
-
- if description := soup.find("meta", attrs={"name": "description"}):
- metadata["description"] = description.get("content", "") # type: ignore
- else:
- metadata["description"] = ""
-
- if html := soup.find("html"):
- metadata["language"] = html.get("lang", "") # type: ignore
- else:
- metadata["language"] = ""
-
- return content, metadata
-
-
-
- async def process_url(self, url: str, job_id: str, batch: List[Document]):
- """
- Process a URL, fetching the document and adding it to the batch.
- """
- try:
- self.logger.info(f"Processing URL {url}")
-
- assert self.client.ingestor_id is not None, "Ingestor ID is None, Ingestor client not initialized properly"
-
- # Check if job is terminated
- if await self.jobmanager.is_job_terminated(job_id):
- self.logger.debug(f"Job {job_id} is terminated. Stopping processing of URL {url}.")
- return
-
- # Sanitize URL
- url = utils.sanitize_url(url)
- self.logger.debug(f"Processing sanitized URL {url}")
- if self.session is None:
- raise Exception("Session is not initialized")
-
- # Fetch the URL content
- async with self.session.get(url, allow_redirects=True, max_redirects=10) as resp:
- self.logger.debug(f"Received response: {resp.status} for URL: {url}")
- resp.raise_for_status()
- html_content = await resp.text()
- soup = BeautifulSoup(html_content, 'html.parser')
- content, metadata = await self.custom_parser(soup, url)
- doc_id = utils.generate_document_id_from_url(self.datasourceinfo.datasource_id, url)
- doc = Document(id=doc_id, page_content=content,
- metadata=DocumentMetadata(
- datasource_id=self.datasourceinfo.datasource_id,
- description=metadata.get("description", ""),
- title=metadata.get("title", ""),
- document_id=doc_id,
- document_ingested_at=int(time.time()),
- document_type="webpage",
- fresh_until=0, # to be set later
- ingestor_id=self.client.ingestor_id,
- is_graph_entity=False,
- metadata=metadata
- ).model_dump())
-
- # Add document to batch instead of ingesting immediately
- batch.append(doc)
-
- except aiohttp.TooManyRedirects as e:
- self.logger.error(f"TooManyRedirects error: {e}")
- # Print redirect history for debugging
- if hasattr(e, 'history'):
- for resp in e.history:
- self.logger.error(f"Redirected from: {resp.url} with status {resp.status}")
-
- await self.jobmanager.increment_failure(
- job_id=job_id,
- message=f"Failed: Too many redirects - URL: {url} : {type(e).__name__} {e} "
- )
- except Exception as e:
- self.logger.error(traceback.format_exc())
- self.logger.error(f"Failed to load URL {url}: {e}")
-
- await self.jobmanager.increment_failure(
- job_id=job_id,
- message=f"Failed to load URL {url} : {type(e).__name__} {e} "
- )
- finally:
- await self.jobmanager.increment_progress(
- job_id=job_id,
- )
- await self.jobmanager.upsert_job(
- job_id=job_id,
- message=f"Processed URL: {url}"
- )
- self.logger.debug(f"DONE Processing URL {url}")
-
-
- async def get_urls_from_sitemap(self, sitemap_url: str) -> List[str]:
- """
- Fetch a sitemap (or sitemap index) and return a list of page URLs using BeautifulSoup.
- Supports .xml and .xml.gz. Recurses into sitemap indexes. Namespace-safe.
- """
- if self.session is None:
- raise Exception("Session is not initialized")
- self.logger.info(f"Fetching sitemap: {sitemap_url}")
- async with self.session.get(sitemap_url, allow_redirects=True) as resp:
- if resp.status != 200:
- self.logger.warning(f"Failed to fetch sitemap {sitemap_url}: HTTP {resp.status}")
- return []
-
- raw = await resp.read()
- content_type = resp.headers.get("Content-Type", "").lower()
- if sitemap_url.endswith(".gz") or "gzip" in content_type:
- try:
- xml_bytes = gzip.decompress(raw)
- except Exception:
- self.logger.warning(traceback.format_exc())
- self.logger.warning(f"Failed to decompress sitemap {sitemap_url}, using raw content")
- # Some servers send gzip content without actual compression
- xml_bytes = raw
- else:
- xml_bytes = raw
-
- # Parse XML via BeautifulSoup with XML parser
- soup = BeautifulSoup(xml_bytes, "xml")
-
- # Determine if this is a sitemap index
- root = soup.find(True) # first tag
- root_name = root.name.lower() if root else "" # type: ignore
-
- def find_all_locs(s: BeautifulSoup) -> List[str]:
- locs: List[str] = []
- for tag in s.find_all(lambda t: isinstance(t.name, str) and t.name.lower().endswith("loc")):
- text = tag.get_text(strip=True)
- if text:
- locs.append(text)
- return locs
-
- if root_name.endswith("sitemapindex"):
- # Recurse into child sitemaps
- child_sitemaps = find_all_locs(soup)
- self.logger.info(f"Found {len(child_sitemaps)} child sitemaps in index")
- urls: List[str] = []
- for child in child_sitemaps:
- urls.extend(await self.get_urls_from_sitemap(child))
- # Deduplicate preserving order
- seen = set()
- deduped: List[str] = []
- for u in urls:
- if u not in seen:
- deduped.append(u)
- seen.add(u)
- return deduped
- else:
- # Regular urlset sitemap
- urls = find_all_locs(soup)
- self.logger.info(f"Extracted {len(urls)} URLs from sitemap")
- return urls
-
- async def load_url(self, url: str, job_id: str, check_for_site_map: bool = False, sitemap_max_urls: int = 0):
- """
- Loads documents from a URL and save contents to a files.
- # TODO: Support saving to S3 or MinIO for storage
-
- Returns: List of document_ids (filenames for now)
- """
- try:
- urls = [] # URLs to process
- if check_for_site_map:
- # Check if the URL has sitemap
- await self.jobmanager.upsert_job(
- job_id=job_id,
- status=JobStatus.IN_PROGRESS,
- message="Checking for sitemaps..."
- )
- self.logger.info(f"Checking for sitemaps at: {url}")
- sitemaps = await self.get_sitemaps(url)
- self.logger.debug(f"Found {len(sitemaps)} sitemaps")
-
- else:
- self.logger.info("Skipping sitemap check as per request")
- sitemaps = []
-
- if not sitemaps: # If no sitemaps found, process the URL directly
- self.logger.info(f"No sitemaps, processing the URL directly: {url}")
- urls = [url]
- else: # If sitemaps found, get URLs from sitemaps
- # Load documents from URLs with streaming processing
- for sitemap_url in sitemaps:
- self.logger.info(f"Loading sitemap: {sitemap_url}")
- await self.jobmanager.upsert_job(
- job_id=job_id,
- message=f"Getting URLs from sitemap: {sitemap_url}..."
- )
- urls.extend(await self.get_urls_from_sitemap(sitemap_url))
-
- # Respect maximum URLs limit if set
- if sitemap_max_urls > 0 and len(urls) >= sitemap_max_urls:
- self.logger.info(f"Reached maximum URLs limit from sitemap: {sitemap_max_urls}")
- urls = urls[:sitemap_max_urls]
- break
-
- await self.jobmanager.upsert_job(
- job_id=job_id,
- message=f"Found {len(urls)} URLs to process",
- total=len(urls))
-
- # Process URLs concurrently with batching
- self.logger.info(f"Processing {len(urls)} URLs with max concurrency {self.max_concurrency}")
- batch: List[Document] = []
- batch_size = 100
-
- async def process_and_flush(url: str):
- """Process URL and flush batch if needed"""
-
- # process the URL and add to batch
- await self.process_url(url, job_id, batch)
-
- # Check if batch is full
- if len(batch) >= batch_size:
- # Flush the batch
- docs_to_send = batch[:batch_size]
- del batch[:batch_size]
-
- self.logger.info(f"Flushing batch of {len(docs_to_send)} documents")
- await self.client.ingest_documents(
- job_id=job_id,
- datasource_id=self.datasourceinfo.datasource_id,
- documents=docs_to_send
- )
-
- tasks = [process_and_flush(url) for url in urls]
- scheduler = TaskScheduler(max_parallel_tasks=self.max_concurrency)
- await scheduler.run(tasks) # type: ignore
-
- # Flush remaining documents in batch
- if batch:
- self.logger.info(f"Flushing final batch of {len(batch)} documents")
- await self.client.ingest_documents(
- job_id=job_id,
- datasource_id=self.datasourceinfo.datasource_id,
- documents=batch
- )
-
- # Invoke garbage collection to free up memory
- gc.collect()
-
- # Check if job was deleted during processing
- job = await self.jobmanager.get_job(job_id)
- if job is None:
- self.logger.error(f"Job not found when finalizing: {job_id}")
- return
-
- # Check if job was terminated
- if job.status == JobStatus.TERMINATED:
- self.logger.info(f"Job {job_id} was terminated during URL processing.")
- await self.jobmanager.upsert_job(
- job_id=job_id,
- status=JobStatus.TERMINATED,
- message="Job was terminated during URL processing."
- )
- # Determine final job status
- elif job.failed_counter and job.failed_counter == job.total:
- await self.jobmanager.upsert_job(
- job_id=job_id,
- status=JobStatus.FAILED,
- message=f"All {job.total} URLs failed to process",
- )
- elif job.failed_counter and job.failed_counter > 0:
- await self.jobmanager.upsert_job(
- job_id=job_id,
- status=JobStatus.COMPLETED_WITH_ERRORS,
- message=f"Processed {job.progress_counter} URLs with {job.failed_counter} failures",
- )
- else:
- await self.jobmanager.upsert_job(
- job_id=job_id,
- status=JobStatus.COMPLETED,
- message=f"Processed: {job.total} URLs",
- )
-
- except Exception as e:
- self.logger.error(traceback.format_exc())
- self.logger.error(f"Error during URL ingestion: {e}")
- await self.jobmanager.increment_failure(
- job_id=job_id,
- message=f"Error during URL ingestion: {type(e).__name__} {e}"
- )
- await self.jobmanager.upsert_job(
- job_id=job_id,
- status=JobStatus.FAILED,
- message="Failed to process URLs"
- )
- raise
-
- async def close(self):
- """Close the aiohttp session"""
- if self.session:
- await self.session.close()
- self.session = None
-
- async def cleanup(self):
- """Clean up resources."""
- if self.session:
- await self.session.close()
- self.session = None
\ No newline at end of file
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/__init__.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/__init__.py
new file mode 100644
index 000000000..28c09dc2a
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/__init__.py
@@ -0,0 +1,12 @@
+"""
+Content parsers for different documentation site generators.
+
+This module provides a registry of parsers that can automatically detect
+and extract content from various documentation frameworks like Docusaurus,
+MkDocs, Sphinx, ReadTheDocs, VitePress, and generic HTML pages.
+"""
+
+from .registry import ParserRegistry
+from .base import BaseParser, ParseResult
+
+__all__ = ["ParserRegistry", "BaseParser", "ParseResult"]
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/base.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/base.py
new file mode 100644
index 000000000..8a84faf3b
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/base.py
@@ -0,0 +1,131 @@
+"""
+Base parser interface for content extraction.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+from scrapy.http import Response
+
+
+@dataclass
+class ParseResult:
+ """Result of parsing a webpage."""
+
+ content: str
+ title: str
+ description: str
+ language: str
+ generator: Optional[str] = None
+
+
+class BaseParser(ABC):
+ """Abstract base class for content parsers."""
+
+ # Human-readable name for this parser
+ name: str = "base"
+
+ @classmethod
+ @abstractmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Check if this parser can handle the given response.
+
+ Args:
+ response: Scrapy Response object
+
+ Returns:
+ True if this parser should be used for this page
+ """
+ pass
+
+ @classmethod
+ @abstractmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content and metadata from the response.
+
+ Args:
+ response: Scrapy Response object
+
+ Returns:
+ ParseResult with extracted content and metadata
+ """
+ pass
+
+ @classmethod
+ def _get_meta_content(cls, response: Response, name: str) -> str:
+ """Helper to get meta tag content by name."""
+ # Try name attribute first
+ content = response.css(f'meta[name="{name}"]::attr(content)').get()
+ if content:
+ return content.strip()
+
+ # Try property attribute (for Open Graph tags)
+ content = response.css(f'meta[property="{name}"]::attr(content)').get()
+ if content:
+ return content.strip()
+
+ return ""
+
+ @classmethod
+ def _get_title(cls, response: Response) -> str:
+ """Extract page title."""
+ # Try title tag first
+ title = response.css("title::text").get()
+ if title:
+ return title.strip()
+
+ # Try og:title
+ title = cls._get_meta_content(response, "og:title")
+ if title:
+ return title
+
+ # Try h1
+ title = response.css("h1::text").get()
+ if title:
+ return title.strip()
+
+ return ""
+
+ @classmethod
+ def _get_description(cls, response: Response) -> str:
+ """Extract page description."""
+ desc = cls._get_meta_content(response, "description")
+ if desc:
+ return desc
+
+ desc = cls._get_meta_content(response, "og:description")
+ return desc
+
+ @classmethod
+ def _get_language(cls, response: Response) -> str:
+ """Extract page language."""
+ lang = response.css("html::attr(lang)").get()
+ if lang:
+ return lang.strip()
+ return ""
+
+ @classmethod
+ def _get_generator(cls, response: Response) -> Optional[str]:
+ """Extract generator meta tag value."""
+ return cls._get_meta_content(response, "generator") or None
+
+ @classmethod
+ def _clean_text(cls, text: str) -> str:
+ """Clean up extracted text content."""
+ if not text:
+ return ""
+
+ # Normalize whitespace
+ lines = text.split("\n")
+ cleaned_lines = []
+
+ for line in lines:
+ # Strip each line
+ line = line.strip()
+ # Skip empty lines
+ if line:
+ cleaned_lines.append(line)
+
+ return "\n".join(cleaned_lines)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/docusaurus.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/docusaurus.py
new file mode 100644
index 000000000..e09803a27
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/docusaurus.py
@@ -0,0 +1,119 @@
+"""
+Parser for Docusaurus documentation sites.
+
+Docusaurus (https://docusaurus.io/) is a popular static site generator
+for documentation, often used by open source projects.
+"""
+
+from scrapy.http import Response
+
+from .base import BaseParser, ParseResult
+from .registry import register_parser
+
+
+@register_parser()
+class DocusaurusParser(BaseParser):
+ """Parser for Docusaurus documentation sites."""
+
+ name = "docusaurus"
+
+ # Tags to exclude from content extraction
+ EXCLUDE_TAGS = ["nav", "header", "footer", "script", "style", "noscript"]
+
+ # Classes to exclude from content extraction
+ EXCLUDE_CLASSES = ["navbar", "sidebar", "toc", "theme-doc-toc-mobile"]
+
+ @classmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Detect Docusaurus sites by:
+ 1. Generator meta tag containing "docusaurus"
+ 2. Docusaurus-specific data attributes
+ 3. Docusaurus class patterns
+ """
+ generator = cls._get_generator(response)
+ if generator and "docusaurus" in generator.lower():
+ return True
+
+ # Check for Docusaurus-specific attributes
+ if response.css("[data-theme]").get():
+ # Check for docusaurus class patterns
+ if response.css('.theme-doc-markdown, .docusaurus-mt-lg, [class*="docSidebarContainer"]').get():
+ return True
+
+ return False
+
+ @classmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content from Docusaurus pages.
+
+        Docusaurus uses <article> tags for main content.
+ """
+ # Try to get main article content
+ article = response.css("article")
+
+ if article:
+ # Use XPath to exclude nav elements (CSS :not() doesn't support comma-separated selectors)
+ content = cls._extract_from_article(article[0])
+
+ # If that's empty, fallback to getting all text from article
+ if not content or len(content.strip()) < 100:
+ content = article.css("::text").getall()
+ content = "\n".join(t.strip() for t in content if t.strip())
+ else:
+ # Fallback: get all text, remove nav/header
+ content = cls._extract_fallback(response)
+
+ return ParseResult(
+ content=cls._clean_text(content),
+ title=cls._get_title(response),
+ description=cls._get_description(response),
+ language=cls._get_language(response),
+ generator=cls._get_generator(response),
+ )
+
+ @classmethod
+ def _extract_from_article(cls, article) -> str:
+ """Extract text from article, excluding nav/header/footer elements using XPath."""
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for classes
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{class_name}")])' for class_name in cls.EXCLUDE_CLASSES])
+
+ # Combine all exclusions
+ xpath = f".//text()[{tag_exclusions} and {class_exclusions}]"
+
+ texts = []
+ for text in article.xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
+
+ @classmethod
+ def _extract_fallback(cls, response: Response) -> str:
+ """Fallback extraction when article tag is not found."""
+ # Get body, remove nav/header/footer using XPath
+ body = response.css("body")
+ if not body:
+ return "\n".join(response.css("::text").getall())
+
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for classes
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{class_name}")])' for class_name in cls.EXCLUDE_CLASSES])
+
+ # Combine all exclusions
+ xpath = f".//text()[{tag_exclusions} and {class_exclusions}]"
+
+ texts = []
+ for text in body[0].xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/generic.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/generic.py
new file mode 100644
index 000000000..9fa30d474
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/generic.py
@@ -0,0 +1,136 @@
+"""
+Generic parser for HTML pages.
+
+This is the fallback parser used when no specific documentation framework
+is detected. It attempts to extract main content by removing navigation,
+headers, footers, and other non-content elements.
+"""
+
+from scrapy.http import Response
+
+from .base import BaseParser, ParseResult
+from .registry import register_parser
+
+
+@register_parser(is_fallback=True)
+class GenericParser(BaseParser):
+ """Generic HTML content parser used as fallback."""
+
+ name = "generic"
+
+ # Elements to exclude from content extraction
+ EXCLUDE_TAGS = [
+ "nav",
+ "header",
+ "footer",
+ "aside",
+ "script",
+ "style",
+ "noscript",
+ "iframe",
+ "svg",
+ "form",
+ "button",
+ "input",
+ "select",
+ "textarea",
+ ]
+
+ # Class patterns to exclude (common navigation/UI elements)
+ EXCLUDE_CLASS_PATTERNS = [
+ "nav",
+ "menu",
+ "sidebar",
+ "footer",
+ "header",
+ "toolbar",
+ "breadcrumb",
+ "pagination",
+ "social",
+ "share",
+ "comment",
+ "cookie",
+ "banner",
+ "ad",
+ "popup",
+ "modal",
+ "overlay",
+ "tooltip",
+ "dropdown",
+ ]
+
+ @classmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Generic parser always returns True as it's the fallback.
+        Note: the registry applies the fallback directly without calling this.
+ """
+ return True
+
+ @classmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content from a generic HTML page.
+
+ Strategy:
+        1. Try to find semantic main content (<main>, <article>, [role="main"])
+ 2. Fall back to body content with navigation removed
+ 3. Clean and deduplicate text
+ """
+ content = ""
+
+ # Try semantic main content first
+ main = response.css('main, article, [role="main"], .main-content, #main-content, #content, .content')
+ if main:
+ # Use the first match (most likely the main content)
+ content = cls._extract_from_element(main[0])
+
+ # Fallback to body
+ if not content or len(content.strip()) < 100:
+ content = cls._extract_from_body(response)
+
+ return ParseResult(
+ content=cls._clean_text(content),
+ title=cls._get_title(response),
+ description=cls._get_description(response),
+ language=cls._get_language(response),
+ generator=cls._get_generator(response),
+ )
+
+ @classmethod
+ def _extract_from_element(cls, element) -> str:
+ """Extract text from a single element."""
+ texts = []
+
+ # Use XPath for complex exclusion logic since CSS :not() doesn't support comma-separated selectors
+ # Exclude script, style, and other non-content elements
+ exclude_xpath = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ for text in element.xpath(f".//text()[{exclude_xpath}]").getall():
+ text = text.strip()
+ if text and len(text) > 1: # Skip single characters
+ texts.append(text)
+
+ return "\n".join(texts)
+
+ @classmethod
+ def _extract_from_body(cls, response: Response) -> str:
+ """Extract content from body, removing navigation elements."""
+ texts = []
+
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for class patterns (contains check)
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{pattern}")])' for pattern in cls.EXCLUDE_CLASS_PATTERNS])
+
+ # Combine all exclusions
+ xpath = f".//body//text()[{tag_exclusions} and {class_exclusions}]"
+
+ # Get all text nodes not in excluded elements
+ for text in response.xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/mkdocs.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/mkdocs.py
new file mode 100644
index 000000000..8d2f58cb3
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/mkdocs.py
@@ -0,0 +1,107 @@
+"""
+Parser for MkDocs documentation sites.
+
+MkDocs (https://www.mkdocs.org/) is a fast, simple static site generator
+geared towards project documentation, often styled with Material theme.
+"""
+
+from scrapy.http import Response
+
+from .base import BaseParser, ParseResult
+from .registry import register_parser
+
+
+@register_parser()
+class MkDocsParser(BaseParser):
+ """Parser for MkDocs documentation sites."""
+
+ name = "mkdocs"
+
+ # Tags to exclude from content extraction
+ EXCLUDE_TAGS = ["nav", "header", "footer", "script", "style", "noscript"]
+
+ # Classes to exclude from content extraction
+ EXCLUDE_CLASSES = ["md-sidebar", "md-header", "md-footer"]
+
+ @classmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Detect MkDocs sites by:
+ 1. Generator meta tag containing "mkdocs"
+ 2. MkDocs-specific class patterns (md-main, md-content)
+ """
+ generator = cls._get_generator(response)
+ if generator and "mkdocs" in generator.lower():
+ return True
+
+ # Check for Material for MkDocs specific classes
+ if response.css(".md-main, .md-content, [data-md-component]").get():
+ return True
+
+ # Check for classic MkDocs theme
+ if response.css(".rst-content, .wy-nav-content").get():
+ return True
+
+ return False
+
+ @classmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content from MkDocs pages.
+
+ Material for MkDocs uses main.md-main div.md-content
+ Classic MkDocs uses .rst-content or similar
+ """
+ content = ""
+
+ # Try Material for MkDocs first
+ main_content = response.css("main.md-main div.md-content")
+ if main_content:
+ # Get the article within md-content
+ article = main_content.css("article")
+ if article:
+ content = "\n".join(article.css("::text").getall())
+ else:
+ content = "\n".join(main_content.css("::text").getall())
+
+ # Try classic MkDocs / ReadTheDocs theme
+ if not content:
+ rst_content = response.css('.rst-content, .wy-nav-content, [role="main"]')
+ if rst_content:
+ content = "\n".join(rst_content.css("::text").getall())
+
+ # Fallback to body without nav/header
+ if not content:
+ content = cls._extract_body_content(response)
+
+ return ParseResult(
+ content=cls._clean_text(content),
+ title=cls._get_title(response),
+ description=cls._get_description(response),
+ language=cls._get_language(response),
+ generator=cls._get_generator(response),
+ )
+
+ @classmethod
+ def _extract_body_content(cls, response: Response) -> str:
+ """Extract body content excluding navigation elements using XPath."""
+ body = response.css("body")
+ if not body:
+ return "\n".join(response.css("::text").getall())
+
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for classes
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{class_name}")])' for class_name in cls.EXCLUDE_CLASSES])
+
+ # Combine all exclusions
+ xpath = f".//text()[{tag_exclusions} and {class_exclusions}]"
+
+ texts = []
+ for text in body[0].xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/readthedocs.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/readthedocs.py
new file mode 100644
index 000000000..2ce3b31c9
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/readthedocs.py
@@ -0,0 +1,111 @@
+"""
+Parser for ReadTheDocs-hosted documentation.
+
+ReadTheDocs (https://readthedocs.org/) hosts documentation for many open source
+projects. It can use various themes but has its own characteristic elements.
+"""
+
+from scrapy.http import Response
+
+from .base import BaseParser, ParseResult
+from .registry import register_parser
+
+
+@register_parser()
+class ReadTheDocsParser(BaseParser):
+ """Parser for ReadTheDocs-hosted documentation."""
+
+ name = "readthedocs"
+
+ # Tags to exclude from content extraction
+ EXCLUDE_TAGS = ["nav", "header", "footer", "script", "style", "noscript"]
+
+ # Classes to exclude from content extraction
+ EXCLUDE_CLASSES = ["wy-nav-side", "rst-versions"]
+
+ @classmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Detect ReadTheDocs sites by:
+ 1. ReadTheDocs-specific scripts or elements
+ 2. RTD theme class patterns
+ 3. readthedocs.io domain
+ """
+ # Check for RTD embed script
+ if response.css('script[src*="readthedocs"]').get():
+ return True
+
+ # Check for RTD version selector or flyout
+ if response.css(".rst-versions, .injected, [data-readthedocs-analytics]").get():
+ return True
+
+ # Check for RTD theme specific classes
+ if response.css(".wy-body-for-nav, .wy-nav-content-wrap").get():
+ return True
+
+ # Check URL for readthedocs domain
+ if "readthedocs.io" in response.url or "readthedocs.org" in response.url:
+ return True
+
+ return False
+
+ @classmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content from ReadTheDocs pages.
+
+ RTD typically uses .wy-nav-content or .rst-content for main content.
+ """
+ content = ""
+
+ # Try RTD Sphinx theme structure
+ main_content = response.css(".wy-nav-content, .rst-content")
+ if main_content:
+ # Get the document div within
+ document = main_content.css('[role="main"], .document')
+ if document:
+ content = "\n".join(document.css("::text").getall())
+ else:
+ content = "\n".join(main_content.css("::text").getall())
+
+ # Try generic main content area
+ if not content:
+ main = response.css('[role="main"], main, .main-content')
+ if main:
+ content = "\n".join(main.css("::text").getall())
+
+ # Fallback
+ if not content:
+ content = cls._extract_body_content(response)
+
+ return ParseResult(
+ content=cls._clean_text(content),
+ title=cls._get_title(response),
+ description=cls._get_description(response),
+ language=cls._get_language(response),
+ generator=cls._get_generator(response),
+ )
+
+ @classmethod
+ def _extract_body_content(cls, response: Response) -> str:
+ """Extract body content excluding navigation elements using XPath."""
+ body = response.css("body")
+ if not body:
+ return "\n".join(response.css("::text").getall())
+
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for classes
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{class_name}")])' for class_name in cls.EXCLUDE_CLASSES])
+
+ # Combine all exclusions
+ xpath = f".//text()[{tag_exclusions} and {class_exclusions}]"
+
+ texts = []
+ for text in body[0].xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/registry.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/registry.py
new file mode 100644
index 000000000..bfc98d148
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/registry.py
@@ -0,0 +1,104 @@
+"""
+Parser registry for automatic content parser detection and selection.
+"""
+
+from typing import List, Type
+from scrapy.http import Response
+from common.utils import get_logger
+
+from .base import BaseParser, ParseResult
+
+logger = get_logger(__name__)
+
+
+class ParserRegistry:
+ """
+ Registry that manages content parsers and automatically selects
+ the appropriate parser based on page characteristics.
+ """
+
+ _parsers: List[Type[BaseParser]] = []
+ _fallback_parser: Type[BaseParser] | None = None
+
+ @classmethod
+ def register(cls, parser: Type[BaseParser], is_fallback: bool = False) -> None:
+ """
+ Register a parser class.
+
+ Args:
+ parser: Parser class to register
+ is_fallback: If True, use this parser as the fallback
+ """
+ if is_fallback:
+ cls._fallback_parser = parser
+ else:
+ # Insert at beginning to prefer more specific parsers
+ cls._parsers.insert(0, parser)
+ logger.debug(f"Registered parser: {parser.name} (fallback={is_fallback})")
+
+ @classmethod
+ def get_parser(cls, response: Response) -> Type[BaseParser]:
+ """
+ Get the appropriate parser for a response.
+
+ Args:
+ response: Scrapy Response object
+
+ Returns:
+ Parser class that can handle this response
+ """
+ # Try each registered parser
+ for parser in cls._parsers:
+ try:
+ if parser.can_parse(response):
+ logger.debug(f"Selected parser: {parser.name} for {response.url}")
+ return parser
+ except Exception as e:
+ logger.warning(f"Error checking parser {parser.name}: {e}")
+ continue
+
+ # Use fallback parser
+ if cls._fallback_parser:
+ logger.debug(f"Using fallback parser: {cls._fallback_parser.name} for {response.url}")
+ return cls._fallback_parser
+
+ raise ValueError(f"No parser found for {response.url}")
+
+ @classmethod
+ def parse(cls, response: Response) -> ParseResult:
+ """
+ Parse a response using the appropriate parser.
+
+ Args:
+ response: Scrapy Response object
+
+ Returns:
+ ParseResult with extracted content
+ """
+ parser = cls.get_parser(response)
+ return parser.extract(response)
+
+ @classmethod
+ def list_parsers(cls) -> List[str]:
+ """List all registered parser names."""
+ names = [p.name for p in cls._parsers]
+ if cls._fallback_parser:
+ names.append(f"{cls._fallback_parser.name} (fallback)")
+ return names
+
+
+def register_parser(is_fallback: bool = False):
+ """
+ Decorator to register a parser class.
+
+ Usage:
+ @register_parser()
+ class MyParser(BaseParser):
+ ...
+ """
+
+ def decorator(parser_class: Type[BaseParser]) -> Type[BaseParser]:
+ ParserRegistry.register(parser_class, is_fallback=is_fallback)
+ return parser_class
+
+ return decorator
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/sphinx.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/sphinx.py
new file mode 100644
index 000000000..8645a1cb9
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/sphinx.py
@@ -0,0 +1,121 @@
+"""
+Parser for Sphinx documentation sites.
+
+Sphinx (https://www.sphinx-doc.org/) is the de facto standard for Python
+documentation. Used by Python docs, many PyPI packages, and ReadTheDocs.
+"""
+
+from scrapy.http import Response
+
+from .base import BaseParser, ParseResult
+from .registry import register_parser
+
+
+@register_parser()
+class SphinxParser(BaseParser):
+ """Parser for Sphinx documentation sites."""
+
+ name = "sphinx"
+
+ @classmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Detect Sphinx sites by:
+ 1. Generator meta tag containing "sphinx"
+ 2. Sphinx-specific class patterns
+ 3. Alabaster or other Sphinx theme markers
+ """
+ generator = cls._get_generator(response)
+ if generator and "sphinx" in generator.lower():
+ return True
+
+ # Check for Sphinx-specific elements
+ if response.css(".sphinxsidebar, .sphinxsidebarwrapper").get():
+ return True
+
+ # Check for Sphinx document structure
+ if response.css("div.document, div.documentwrapper").get():
+ if response.css("div.bodywrapper, div.body").get():
+ return True
+
+ # Check for Furo theme (popular Sphinx theme)
+ if response.css(".sidebar-container, .content-container").get():
+ if response.css("[data-content_root]").get():
+ return True
+
+ # Check for PyData Sphinx theme
+ if response.css(".bd-main, .bd-content").get():
+ return True
+
+ return False
+
+ @classmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content from Sphinx pages.
+
+ Sphinx typically uses div.document > div.documentwrapper > div.bodywrapper > div.body
+ """
+ content = ""
+
+ # Try standard Sphinx structure
+ body = response.css("div.body, div.bodywrapper div.body")
+ if body:
+ # Exclude table of contents and sidebar references
+ texts = []
+ for element in body.css("*:not(.toctree-wrapper):not(.contents)"):
+ # Use XPath to get direct text children (CSS "> ::text" is not valid)
+ for text in element.xpath("text()").getall():
+ if text.strip():
+ texts.append(text.strip())
+ content = "\n".join(texts)
+
+ # Try Furo theme
+ if not content:
+ article = response.css('article.bd-article, article[role="main"]')
+ if article:
+ content = "\n".join(article.css("::text").getall())
+
+ # Try PyData theme
+ if not content:
+ main_content = response.css(".bd-content main, #main-content")
+ if main_content:
+ content = "\n".join(main_content.css("::text").getall())
+
+ # Try role="main" fallback
+ if not content:
+ main = response.css('[role="main"], main')
+ if main:
+ content = "\n".join(main.css("::text").getall())
+
+ # Final fallback
+ if not content:
+ content = cls._extract_body_content(response)
+
+ return ParseResult(
+ content=cls._clean_text(content),
+ title=cls._get_title(response),
+ description=cls._get_description(response),
+ language=cls._get_language(response),
+ generator=cls._get_generator(response),
+ )
+
+ @classmethod
+ def _extract_body_content(cls, response: Response) -> str:
+ """Extract body content excluding navigation elements."""
+ texts = []
+
+ # Use XPath for exclusions since CSS :not() doesn't support comma-separated selectors
+ exclude_tags = ["nav", "header", "footer", "script", "style"]
+ exclude_classes = ["sphinxsidebar", "sidebar", "toctree-wrapper"]
+
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in exclude_tags])
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{cls_name}")])' for cls_name in exclude_classes])
+
+ xpath = f".//body//text()[{tag_exclusions} and {class_exclusions}]"
+
+ for text in response.xpath(xpath).getall():
+ if text.strip():
+ texts.append(text.strip())
+
+ return "\n".join(texts)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/vitepress.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/vitepress.py
new file mode 100644
index 000000000..9e2c1f8ac
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/parsers/vitepress.py
@@ -0,0 +1,128 @@
+"""
+Parser for VitePress documentation sites.
+
+VitePress (https://vitepress.dev/) is a Vue-powered static site generator
+designed for building fast, content-centric websites.
+"""
+
+from scrapy.http import Response
+
+from .base import BaseParser, ParseResult
+from .registry import register_parser
+
+
+@register_parser()
+class VitePressParser(BaseParser):
+ """Parser for VitePress documentation sites."""
+
+ name = "vitepress"
+
+ # Tags to exclude from content extraction
+ EXCLUDE_TAGS = ["nav", "header", "footer", "script", "style", "noscript", "aside"]
+
+ # Classes to exclude from content extraction
+ EXCLUDE_CLASSES = ["VPNav", "VPSidebar", "VPLocalNav", "aside", "outline"]
+
+ @classmethod
+ def can_parse(cls, response: Response) -> bool:
+ """
+ Detect VitePress sites by:
+ 1. Generator meta tag containing "vitepress"
+ 2. VitePress-specific class patterns
+ 3. VitePress data attributes
+ """
+ generator = cls._get_generator(response)
+ if generator and "vitepress" in generator.lower():
+ return True
+
+ # Check for VitePress-specific classes
+ if response.css('.VPDoc, .vp-doc, [class*="VPContent"]').get():
+ return True
+
+ # Check for VitePress app container
+ if response.css("#VPContent, #app[data-server-rendered]").get():
+ if response.css(".Layout, .VPNav").get():
+ return True
+
+ return False
+
+ @classmethod
+ def extract(cls, response: Response) -> ParseResult:
+ """
+ Extract content from VitePress pages.
+
+ VitePress uses .vp-doc for documentation content.
+ """
+ content = ""
+
+ # Try VitePress document container
+ doc = response.css(".vp-doc, .VPDoc")
+ if doc:
+ # Use XPath to exclude aside/outline elements (CSS :not() doesn't support comma-separated selectors)
+ content = cls._extract_from_element(doc[0])
+
+ # Fallback to all text if extraction is empty
+ if not content or len(content.strip()) < 100:
+ content = "\n".join(doc.css("::text").getall())
+
+ # Try main content area
+ if not content:
+ main = response.css('main, .main, [class*="VPContent"]')
+ if main:
+ content = "\n".join(main.css("::text").getall())
+
+ # Fallback
+ if not content:
+ content = cls._extract_body_content(response)
+
+ return ParseResult(
+ content=cls._clean_text(content),
+ title=cls._get_title(response),
+ description=cls._get_description(response),
+ language=cls._get_language(response),
+ generator=cls._get_generator(response),
+ )
+
+ @classmethod
+ def _extract_from_element(cls, element) -> str:
+ """Extract text from element, excluding aside/nav elements using XPath."""
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for classes
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{class_name}")])' for class_name in cls.EXCLUDE_CLASSES])
+
+ # Combine all exclusions
+ xpath = f".//text()[{tag_exclusions} and {class_exclusions}]"
+
+ texts = []
+ for text in element.xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
+
+ @classmethod
+ def _extract_body_content(cls, response: Response) -> str:
+ """Extract body content excluding navigation elements using XPath."""
+ body = response.css("body")
+ if not body:
+ return "\n".join(response.css("::text").getall())
+
+ # Build XPath exclusion for tags
+ tag_exclusions = " and ".join([f"not(ancestor-or-self::{tag})" for tag in cls.EXCLUDE_TAGS])
+
+ # Build XPath exclusion for classes
+ class_exclusions = " and ".join([f'not(ancestor-or-self::*[contains(@class, "{class_name}")])' for class_name in cls.EXCLUDE_CLASSES])
+
+ # Combine all exclusions
+ xpath = f".//text()[{tag_exclusions} and {class_exclusions}]"
+
+ texts = []
+ for text in body[0].xpath(xpath).getall():
+ text = text.strip()
+ if text and len(text) > 1:
+ texts.append(text)
+
+ return "\n".join(texts)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/pipelines/__init__.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/pipelines/__init__.py
new file mode 100644
index 000000000..5388f352f
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/pipelines/__init__.py
@@ -0,0 +1,7 @@
+"""
+Scrapy pipelines for processing scraped items.
+"""
+
+from .document import DocumentPipeline
+
+__all__ = ["DocumentPipeline"]
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/pipelines/document.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/pipelines/document.py
new file mode 100644
index 000000000..bdbed8a2d
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/pipelines/document.py
@@ -0,0 +1,158 @@
+"""
+Document pipeline for converting scraped items to LangChain Documents.
+
+This pipeline batches scraped pages and sends them to the RAG server
+for embedding and storage.
+"""
+
+import time
+from typing import List
+
+from scrapy import Spider
+from scrapy.exceptions import DropItem
+from langchain_core.documents import Document
+
+from common.models.rag import DocumentMetadata
+from common.utils import generate_document_id_from_url, get_logger
+
+from ..items import ScrapedPageItem
+
+logger = get_logger(__name__)
+
+
+class DocumentPipeline:
+    """
+    Pipeline that converts ScrapedPageItem to LangChain Documents
+    and sends them to the RAG server in batches.
+    """
+
+    # Number of documents to batch before sending
+    batch_size: int = 100
+
+    def __init__(self):
+        # Buffered documents awaiting a flush to the RAG server.
+        self.batch: List[Document] = []
+        # The fields below are populated in open_spider() from the spider's
+        # shared resources; they remain None until the spider starts.
+        self.client = None
+        self.job_manager = None
+        self.datasource_info = None
+        self.job_id = None
+        self.ingestor_id = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """
+        Create pipeline instance from crawler.
+
+        This is called by Scrapy to instantiate the pipeline.
+        """
+        # The crawler argument is currently unused; configuration is taken
+        # from the spider in open_spider() instead.
+        pipeline = cls()
+        return pipeline
+
+    def open_spider(self, spider: Spider):
+        """
+        Called when spider opens.
+
+        Initialize pipeline with spider's shared resources.
+        """
+        # NOTE(review): assumes the spider exposes client/job_manager/
+        # datasource_info/job_id attributes - set up by the spider elsewhere.
+        self.client = spider.client
+        self.job_manager = spider.job_manager
+        self.datasource_info = spider.datasource_info
+        self.job_id = spider.job_id
+        self.ingestor_id = spider.client.ingestor_id
+
+        logger.info(f"Document pipeline opened for job {self.job_id}")
+
+    async def process_item(self, item: ScrapedPageItem, spider: Spider):
+        """
+        Process a scraped item.
+
+        Converts item to Document and adds to batch. When batch is full,
+        sends documents to RAG server.
+
+        Args:
+            item: Scraped page item
+            spider: Spider instance
+
+        Returns:
+            The processed item
+
+        Raises:
+            DropItem: If item has no content
+        """
+        # Validate item: pages with fewer than 10 non-whitespace-trimmed
+        # characters of content are treated as empty and dropped.
+        if not item.content or len(item.content.strip()) < 10:
+            logger.warning(f"Dropping item with no content: {item.url}")
+            raise DropItem(f"No content extracted from {item.url}")
+
+        # Generate document ID (deterministic, derived from datasource + URL
+        # so re-scrapes update the same document).
+        doc_id = generate_document_id_from_url(self.datasource_info.datasource_id, item.url)
+
+        # Build metadata
+        metadata = DocumentMetadata(
+            datasource_id=self.datasource_info.datasource_id,
+            document_id=doc_id,
+            title=item.title or "",
+            description=item.description or "",
+            document_type="webpage",
+            document_ingested_at=int(time.time()),
+            ingestor_id=self.ingestor_id,
+            fresh_until=0,  # Will be set by server
+            is_graph_entity=False,
+            metadata={
+                "source": item.url,
+                "language": item.language or "",
+                "generator": item.generator or "",
+                **item.extra_metadata,
+            },
+        )
+
+        # Create Document
+        doc = Document(
+            id=doc_id,
+            page_content=item.content,
+            metadata=metadata.model_dump(),
+        )
+
+        # Add to batch
+        self.batch.append(doc)
+
+        # Update job progress
+        await self.job_manager.increment_progress(self.job_id)
+
+        # Flush batch if full
+        if len(self.batch) >= self.batch_size:
+            await self._flush_batch()
+
+        return item
+
+    async def close_spider(self, spider: Spider):
+        """
+        Called when spider closes.
+
+        Flushes any remaining documents in the batch.
+        """
+        logger.info(f"Closing document pipeline, flushing {len(self.batch)} remaining documents")
+        await self._flush_batch()
+
+    async def _flush_batch(self):
+        """
+        Send current batch to RAG server.
+
+        On failure the batch is kept intact (and the exception re-raised) so
+        a later flush can retry the same documents.
+        """
+        if not self.batch:
+            return
+
+        try:
+            logger.info(f"Flushing batch of {len(self.batch)} documents to RAG server")
+
+            await self.client.ingest_documents(
+                job_id=self.job_id,
+                datasource_id=self.datasource_info.datasource_id,
+                documents=self.batch,
+            )
+
+            # Clear batch after successful send
+            self.batch = []
+
+        except Exception as e:
+            logger.error(f"Failed to flush batch: {e}")
+            # Keep batch to retry on next flush
+            raise
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/scrapy_loader.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/scrapy_loader.py
new file mode 100644
index 000000000..cc0691a3d
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/scrapy_loader.py
@@ -0,0 +1,303 @@
+"""
+Scrapy-based web loader for scraping and ingesting web content.
+
+This module provides the main entry point for Scrapy-based web scraping,
+using a subprocess worker pool to avoid Twisted/asyncio event loop conflicts.
+"""
+
+import gc
+
+from langchain_core.documents import Document
+
+from common.models.server import ScrapySettings
+from common.models.rag import DataSourceInfo
+from common.job_manager import JobManager, JobStatus
+from common.ingestor import Client
+from common.utils import get_logger
+
+from .worker_pool import get_worker_pool
+from .worker_types import CrawlRequest, CrawlProgress, CrawlResult, CrawlStatus
+
+logger = get_logger(__name__)
+
+
+class ScrapyLoader:
+    """
+    Main entry point for Scrapy-based web scraping.
+
+    This class manages crawl requests using a worker pool and handles
+    document ingestion to the RAG server.
+    """
+
+    def __init__(
+        self,
+        rag_client: Client,
+        job_manager: JobManager,
+        datasource_info: DataSourceInfo,
+    ):
+        """
+        Initialize the Scrapy loader.
+
+        Args:
+            rag_client: Client for communicating with RAG server
+            job_manager: Manager for job status updates
+            datasource_info: Metadata about the datasource
+        """
+        self.client = rag_client
+        self.job_manager = job_manager
+        self.datasource_info = datasource_info
+        # Per-datasource logger name, truncated to the first 12 chars of the id.
+        self.logger = get_logger(f"scrapy-loader:{datasource_info.datasource_id[:12]}")
+
+    async def load(
+        self,
+        url: str,
+        settings: ScrapySettings,
+        job_id: str,
+    ) -> None:
+        """
+        Load content from a URL using Scrapy worker pool.
+
+        Args:
+            url: URL to scrape
+            settings: Scraping configuration
+            job_id: ID of the ingestion job
+
+        Raises:
+            Exception: Re-raised after marking the job FAILED if the crawl
+                or result processing throws.
+        """
+        self.logger.info(f"Starting Scrapy crawl for {url} with mode {settings.crawl_mode}")
+
+        try:
+            # Update job status with mode info
+            if settings.render_javascript:
+                await self.job_manager.upsert_job(
+                    job_id=job_id,
+                    status=JobStatus.IN_PROGRESS,
+                    message=f"Starting {settings.crawl_mode.value} crawl with JavaScript rendering (Chromium)",
+                )
+                self.logger.info("JavaScript rendering enabled - using Playwright/Chromium")
+            else:
+                await self.job_manager.upsert_job(
+                    job_id=job_id,
+                    status=JobStatus.IN_PROGRESS,
+                    message=f"Starting {settings.crawl_mode.value} crawl of {url}",
+                )
+
+            # Get worker pool (subprocess-based, avoids Twisted/asyncio clashes)
+            pool = await get_worker_pool()
+
+            # Build crawl request
+            request = CrawlRequest(
+                job_id=job_id,
+                url=url,
+                datasource_id=self.datasource_info.datasource_id,
+                crawl_mode=settings.crawl_mode.value,
+                max_depth=settings.max_depth,
+                max_pages=settings.max_pages,
+                render_javascript=settings.render_javascript,
+                wait_for_selector=settings.wait_for_selector,
+                page_load_timeout=settings.page_load_timeout,
+                follow_external_links=settings.follow_external_links,
+                allowed_url_patterns=settings.allowed_url_patterns,
+                denied_url_patterns=settings.denied_url_patterns,
+                download_delay=settings.download_delay,
+                concurrent_requests=settings.concurrent_requests,
+                respect_robots_txt=settings.respect_robots_txt,
+                user_agent=settings.user_agent,
+                ingestor_id=self.client.ingestor_id or "",
+                datasource_name=getattr(self.datasource_info, "name", "") or "",
+            )
+
+            # Progress tracking state
+            last_pages_crawled = 0
+            total_set = False
+
+            # Progress callback invoked by the pool as worker messages arrive.
+            async def on_progress(progress: CrawlProgress):
+                nonlocal last_pages_crawled, total_set
+
+                # Set total if we now know it (from sitemap) and haven't set it yet
+                if progress.total_pages and not total_set:
+                    await self.job_manager.upsert_job(
+                        job_id=job_id,
+                        status=JobStatus.IN_PROGRESS,
+                        message=progress.message,
+                        total=progress.total_pages,
+                    )
+                    total_set = True
+                else:
+                    await self.job_manager.upsert_job(
+                        job_id=job_id,
+                        status=JobStatus.IN_PROGRESS,
+                        message=progress.message,
+                    )
+
+                # Increment progress counter by the delta since last callback
+                # (worker reports cumulative pages_crawled).
+                delta = progress.pages_crawled - last_pages_crawled
+                if delta > 0:
+                    await self.job_manager.increment_progress(job_id, delta)
+                    last_pages_crawled = progress.pages_crawled
+
+            # Run crawl
+            self.logger.info(f"Submitting crawl to worker pool: {url}")
+            result = await pool.crawl(
+                request=request,
+                on_progress=on_progress,
+                timeout=settings.max_pages * 30,  # ~30 seconds per page max
+            )
+
+            self.logger.info(f"Crawl completed: {result.pages_crawled} pages, status: {result.status}")
+
+            # Process results
+            await self._process_result(result, job_id, url)
+
+        except Exception as e:
+            self.logger.error(f"Crawl failed: {e}")
+
+            await self.job_manager.upsert_job(
+                job_id=job_id,
+                status=JobStatus.FAILED,
+                message=f"Crawl failed: {str(e)}",
+            )
+
+            raise
+
+        finally:
+            # Crawl results can be large; nudge the collector after each run.
+            gc.collect()
+
+    async def _process_result(self, result: CrawlResult, job_id: str, url: str):
+        """
+        Process crawl result and ingest documents.
+
+        Args:
+            result: Crawl result from worker
+            job_id: Job ID
+            url: Original URL
+        """
+        if result.status == CrawlStatus.FAILED:
+            fatal_error = result.fatal_error or f"Failed to crawl {url}"
+            self.logger.error(f"Crawl failed: {fatal_error}")
+
+            # Log filtering stats for debugging
+            if result.urls_found_in_sitemap > 0:
+                self.logger.info(f"Filtering stats: {result.urls_found_in_sitemap} URLs in sitemap, {result.urls_filtered_external} filtered as external, {result.urls_filtered_pattern} filtered by pattern, {result.urls_filtered_max_pages} filtered by max pages")
+
+            # Add individual error messages to the job
+            for error in result.errors:
+                await self.job_manager.add_error_msg(job_id, error)
+
+            await self.job_manager.upsert_job(
+                job_id=job_id,
+                status=JobStatus.FAILED,
+                message=fatal_error,
+            )
+            return
+
+        if not result.documents:
+            # Build a more helpful message if we have filtering stats
+            fatal_error = result.fatal_error
+            if not fatal_error:
+                if result.urls_found_in_sitemap > 0:
+                    fatal_error = f"No content extracted from {url}. Found {result.urls_found_in_sitemap} URLs in sitemap but none were successfully scraped."
+                else:
+                    fatal_error = f"No content extracted from {url}"
+
+            self.logger.error(f"No documents extracted: {fatal_error}")
+
+            # Add individual error messages to the job
+            for error in result.errors:
+                await self.job_manager.add_error_msg(job_id, error)
+
+            await self.job_manager.upsert_job(
+                job_id=job_id,
+                status=JobStatus.FAILED,
+                message=fatal_error,
+            )
+            return
+
+        # Convert document dicts (serialized across the process boundary)
+        # back to LangChain Documents.
+        documents = []
+        for doc_dict in result.documents:
+            doc = Document(
+                id=doc_dict.get("id"),
+                page_content=doc_dict.get("page_content", ""),
+                metadata=doc_dict.get("metadata", {}),
+            )
+            documents.append(doc)
+
+        self.logger.info(f"Ingesting {len(documents)} documents to RAG server")
+
+        # Send to RAG server in batches
+        batch_size = 100
+        total_batches = (len(documents) + batch_size - 1) // batch_size  # Ceiling division
+
+        for i in range(0, len(documents), batch_size):
+            batch = documents[i : i + batch_size]
+            batch_num = i // batch_size + 1
+
+            # Update job message with batch progress
+            await self.job_manager.upsert_job(
+                job_id=job_id,
+                status=JobStatus.IN_PROGRESS,
+                message=f"Sending batch {batch_num}/{total_batches} to server ({len(batch)} documents)",
+            )
+
+            try:
+                await self.client.ingest_documents(
+                    job_id=job_id,
+                    datasource_id=self.datasource_info.datasource_id,
+                    documents=batch,
+                )
+                self.logger.info(f"Ingested batch {batch_num}/{total_batches} ({len(batch)} documents)")
+
+                # Track document count
+                await self.job_manager.increment_document_count(job_id, len(batch))
+
+            except Exception as e:
+                # A failed batch is recorded on the job but does not abort the
+                # remaining batches.
+                error_msg = f"Failed to ingest batch {batch_num}/{total_batches}: {e}"
+                self.logger.error(error_msg)
+                await self.job_manager.add_error_msg(job_id, error_msg)
+                # Continue with next batch
+
+        # Update final status
+        if result.status == CrawlStatus.PARTIAL:
+            # Add individual error messages to the job
+            for error in result.errors:
+                await self.job_manager.add_error_msg(job_id, error)
+
+            await self.job_manager.upsert_job(
+                job_id=job_id,
+                status=JobStatus.COMPLETED_WITH_ERRORS,
+                message=f"Crawled {result.pages_crawled} pages with {result.pages_failed} errors",
+            )
+        else:
+            await self.job_manager.upsert_job(
+                job_id=job_id,
+                status=JobStatus.COMPLETED,
+                message=f"Successfully crawled {result.pages_crawled} pages in {result.elapsed_seconds:.1f}s",
+            )
+
+
+async def run_scrapy_loader(
+ url: str,
+ settings: ScrapySettings,
+ job_id: str,
+ client: Client,
+ job_manager: JobManager,
+ datasource_info: DataSourceInfo,
+) -> None:
+ """
+ Convenience function to run the Scrapy loader.
+
+ Args:
+ url: URL to scrape
+ settings: Scraping configuration
+ job_id: ID of the ingestion job
+ client: RAG server client
+ job_manager: Job status manager
+ datasource_info: Datasource metadata
+ """
+ loader = ScrapyLoader(
+ rag_client=client,
+ job_manager=job_manager,
+ datasource_info=datasource_info,
+ )
+
+ await loader.load(url=url, settings=settings, job_id=job_id)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/scrapy_worker.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/scrapy_worker.py
new file mode 100644
index 000000000..1c687789c
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/scrapy_worker.py
@@ -0,0 +1,627 @@
+#!/usr/bin/env python3
+"""
+Scrapy worker subprocess.
+
+This module runs in a separate process with Twisted's reactor.run() as the main loop.
+It receives crawl requests via multiprocessing Queue and sends results back.
+
+Usage:
+ This module is spawned by ScrapyWorkerPool using multiprocessing.Process.
+ Do not run directly.
+"""
+
+# Install Twisted reactor FIRST before any other imports
+from scrapy.utils.reactor import install_reactor
+
+# Selecting a reactor must happen before any scrapy/twisted module locks in
+# the default one; the asyncio-backed reactor lets this subprocess interoperate
+# with asyncio-based code (see module docstring).
+install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
+
+import hashlib
+import re
+import sys
+import time
+import traceback
+from multiprocessing import Queue
+from typing import List
+from urllib.parse import urlparse, urljoin
+
+from twisted.internet import reactor
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerRunner
+from scrapy.http import Response
+from scrapy.utils.log import configure_logging
+
+from common import utils as common_utils
+from common.models.server import ScrapySettings, CrawlMode
+
+from .worker_types import (
+ WorkerMessage,
+ MessageType,
+ CrawlRequest,
+ CrawlProgress,
+ CrawlResult,
+ CrawlStatus,
+)
+from .settings import build_scrapy_settings
+from .parsers import ParserRegistry
+
+# Import all parsers to register them
+from .parsers import docusaurus, mkdocs, sphinx, readthedocs, vitepress, generic # noqa: F401
+
+
+class WorkerSpider(Spider):
+ """
+ Generic spider that handles all crawl modes.
+
+ This spider is configured at runtime based on the CrawlRequest.
+ """
+
+ name = "worker_spider"
+
+    def __init__(
+        self,
+        request: CrawlRequest,
+        result_queue: Queue,
+        *args,
+        **kwargs,
+    ):
+        """
+        Configure the spider from a CrawlRequest.
+
+        Args:
+            request: Crawl parameters received from the parent process.
+            result_queue: multiprocessing queue used to stream progress and
+                results back to the worker pool in the parent process.
+        """
+        super().__init__(*args, **kwargs)
+        self.crawl_request = request
+        self.result_queue = result_queue
+
+        # Frequently used request fields, unpacked for convenience.
+        self.start_url = request.url
+        self.max_pages = request.max_pages
+        self.crawl_mode = request.crawl_mode
+        self.follow_external = request.follow_external_links
+        self.allowed_patterns = request.allowed_url_patterns or []
+        self.denied_patterns = request.denied_url_patterns or []
+
+        # Track the effective domain (may change after redirect for sitemap mode)
+        self.effective_domain: str | None = None
+
+        # Tracking counters and per-crawl state.
+        self.pages_crawled = 0
+        self.pages_failed = 0
+        self.documents: List[dict] = []
+        self.visited_urls: set = set()
+        self.start_time = time.time()
+
+        # Track filtering stats for better error messages
+        self.urls_found_in_sitemap = 0
+        self.urls_filtered_external = 0
+        self.urls_filtered_pattern = 0
+        self.urls_filtered_max_pages = 0
+
+        # Collect error messages for reporting
+        self.errors: list[str] = []
+        self.max_errors = 50  # Limit to prevent memory issues
+
+        # Progress tracking
+        self.total_pages_to_crawl: int | None = None  # Known total (from sitemap)
+        self.pending_urls: set = set()  # URLs queued but not yet crawled
+
+        # Progress reporting
+        self.last_progress_time = 0
+        self.progress_interval = 2  # Report progress every 2 seconds (was 5)
+
+    def start_requests(self):
+        """Generate initial request(s) based on crawl mode.
+
+        Yields:
+            The sitemap.xml request (sitemap mode) or a request for the
+            start URL (single/recursive modes).
+        """
+        # Send initial progress message for JS rendering
+        if self.crawl_request.render_javascript:
+            self.logger.info("JavaScript rendering enabled - starting Chromium browser")
+            progress = CrawlProgress(
+                job_id=self.crawl_request.job_id,
+                pages_crawled=0,
+                pages_failed=0,
+                message="Starting Chromium browser for JavaScript rendering...",
+            )
+            # Progress flows back to the parent process over the result queue.
+            self.result_queue.put(WorkerMessage.crawl_progress(progress).to_dict())
+
+        if self.crawl_mode == "sitemap":
+            # For sitemap mode, first try to fetch the sitemap
+            parsed = urlparse(self.start_url)
+            base_url = f"{parsed.scheme}://{parsed.netloc}"
+
+            # Try sitemap.xml first; on failure the errback falls back to
+            # robots.txt (see handle_sitemap_error).
+            yield Request(
+                f"{base_url}/sitemap.xml",
+                callback=self.parse_sitemap,
+                errback=self.handle_sitemap_error,
+                meta={"base_url": base_url},
+            )
+        else:
+            # Single URL or recursive mode - start with the URL
+            yield Request(self.start_url, callback=self.parse_page, errback=self.handle_error)
+
+ def parse_sitemap(self, response: Response):
+ """Parse sitemap.xml and yield requests for each URL."""
+ # Update effective domain based on where we actually landed (handles redirects)
+ self.effective_domain = urlparse(response.url).netloc
+ self.logger.info(f"Sitemap loaded from {response.url}, effective domain: {self.effective_domain}")
+
+ # Extract URLs from sitemap
+ urls = re.findall(r"(.*?)", response.text)
+ self.urls_found_in_sitemap = len(urls)
+
+ self.logger.info(f"Found {len(urls)} URLs in sitemap")
+
+ # Track how many URLs we'll actually crawl
+ urls_to_crawl = []
+ for url in urls[: self.max_pages]:
+ if self._should_follow(url):
+ urls_to_crawl.append(url)
+ self.pending_urls.add(url)
+
+ # Set total for progress tracking
+ self.total_pages_to_crawl = len(urls_to_crawl)
+
+ self.logger.info(f"Queued {len(urls_to_crawl)} URLs for crawling. Filtered: {self.urls_filtered_external} external, {self.urls_filtered_pattern} by pattern, {self.urls_filtered_max_pages} over max pages limit")
+
+ # Yield requests
+ for url in urls_to_crawl:
+ yield Request(url, callback=self.parse_page, errback=self.handle_error)
+
+    def handle_sitemap_error(self, failure):
+        """Handle sitemap fetch failure - fall back to robots.txt.
+
+        Yields:
+            A Request for /robots.txt, which may advertise the real sitemap
+            location via Sitemap: directives.
+        """
+        base_url = failure.request.meta.get("base_url", self.start_url)
+        error_detail = self._get_failure_reason(failure)
+        error_msg = f"Sitemap fetch failed ({error_detail}), trying robots.txt: {failure.request.url}"
+        self.logger.warning(error_msg)
+        # Cap stored errors so a large crawl cannot grow memory unboundedly.
+        if len(self.errors) < self.max_errors:
+            self.errors.append(error_msg)
+
+        yield Request(
+            f"{base_url}/robots.txt",
+            callback=self.parse_robots,
+            errback=self.handle_robots_error,
+            meta={"base_url": base_url},
+        )
+
+ def parse_robots(self, response: Response):
+ """Parse robots.txt for sitemap URLs."""
+ sitemaps = re.findall(r"Sitemap:\s*(\S+)", response.text, re.IGNORECASE)
+
+ if sitemaps:
+ for sitemap_url in sitemaps:
+ yield Request(sitemap_url, callback=self.parse_sitemap, errback=self.handle_error)
+ else:
+ # No sitemap in robots.txt, fall back to crawling the start URL
+ self.logger.warning("No sitemap found in robots.txt, falling back to start URL")
+ yield Request(self.start_url, callback=self.parse_page, errback=self.handle_error)
+
+    def handle_robots_error(self, failure):
+        """Handle robots.txt fetch failure.
+
+        Yields:
+            A last-resort Request for the originally configured start URL.
+        """
+        error_detail = self._get_failure_reason(failure)
+        error_msg = f"robots.txt fetch failed ({error_detail}), crawling start URL: {failure.request.url}"
+        self.logger.warning(error_msg)
+        # Same bounded error collection as the sitemap errback.
+        if len(self.errors) < self.max_errors:
+            self.errors.append(error_msg)
+        yield Request(self.start_url, callback=self.parse_page, errback=self.handle_error)
+
+ def parse_page(self, response: Response):
+ """Parse a page and extract content."""
+ # Remove from pending set
+ self.pending_urls.discard(response.url)
+
+ # Check limits
+ if self.pages_crawled >= self.max_pages:
+ return
+
+ # Skip if already visited
+ if response.url in self.visited_urls:
+ return
+ self.visited_urls.add(response.url)
+
+ # Handle non-200 responses
+ if response.status != 200:
+ error_msg = f"Ignoring non-200 response ({response.status}): {response.url}"
+ self.logger.warning(error_msg)
+ self.pages_failed += 1
+ if len(self.errors) < self.max_errors:
+ self.errors.append(error_msg)
+ return
+
+ try:
+ # Extract content using parser registry
+ result = ParserRegistry.parse(response)
+
+ if result.content and len(result.content.strip()) >= 10:
+ # Create document
+ now = int(time.time())
+ # Use the system default freshness (configurable via DEFAULT_FRESH_UNTIL_SECONDS env var)
+ fresh_until = common_utils.get_default_fresh_until()
+
+ doc = {
+ "id": self._generate_doc_id(response.url),
+ "page_content": result.content,
+ "metadata": {
+ "datasource_id": self.crawl_request.datasource_id,
+ "document_id": self._generate_doc_id(response.url),
+ "title": result.title or "",
+ "description": result.description or "",
+ "document_type": "webpage",
+ "document_ingested_at": now,
+ "fresh_until": fresh_until,
+ "ingestor_id": self.crawl_request.ingestor_id,
+ "is_graph_entity": False,
+ "metadata": {
+ "source": response.url,
+ "language": result.language or "",
+ "generator": result.generator or "",
+ },
+ },
+ }
+ self.documents.append(doc)
+ self.pages_crawled += 1
+
+ self.logger.debug(f"Parsed page {self.pages_crawled}: {response.url}")
+ else:
+ # Skip pages with no meaningful content (redirects, images, etc.)
+ # This is not an error, just nothing to extract
+ self.logger.debug(f"Skipped page with no content: {response.url}")
+
+ except Exception as e:
+ error_msg = f"Error parsing {response.url}: {e}"
+ self.logger.error(error_msg)
+ self.pages_failed += 1
+ # Collect error messages for reporting
+ if len(self.errors) < self.max_errors:
+ self.errors.append(error_msg)
+
+ # Report progress periodically
+ self._maybe_report_progress(response.url)
+
+ # Follow links if in recursive mode
+ if self.crawl_mode == "recursive" and self.pages_crawled < self.max_pages:
+ for link in self._extract_links(response):
+ if self._should_follow(link) and link not in self.pending_urls:
+ self.pending_urls.add(link)
+ yield Request(link, callback=self.parse_page, errback=self.handle_error)
+
+ def handle_error(self, failure):
+ """Handle request errors."""
+ url = failure.request.url
+
+ # Extract meaningful error details from the failure
+ error_detail = self._get_failure_reason(failure)
+ error_msg = f"{error_detail}: {url}"
+
+ self.logger.error(error_msg)
+ self.pages_failed += 1
+ # Collect error messages for reporting
+ if len(self.errors) < self.max_errors:
+ self.errors.append(error_msg)
+
+ def _get_failure_reason(self, failure) -> str:
+ """Extract a human-readable reason from a Twisted Failure."""
+ from twisted.internet.error import DNSLookupError, TimeoutError, ConnectionRefusedError, TCPTimedOutError
+ from scrapy.spidermiddlewares.httperror import HttpError
+
+ exc = failure.value
+
+ # Check for specific exception types
+ if failure.check(HttpError):
+ response = exc.response
+ return f"HTTP {response.status}"
+ elif failure.check(DNSLookupError):
+ return "DNS lookup failed"
+ elif failure.check(TimeoutError, TCPTimedOutError):
+ return "Connection timed out"
+ elif failure.check(ConnectionRefusedError):
+ return "Connection refused"
+ else:
+ # For other errors, use the exception class name and message
+ exc_name = type(exc).__name__
+ exc_msg = str(exc)
+ if exc_msg and exc_msg != exc_name:
+ return f"{exc_name}: {exc_msg}"
+ return exc_name
+
+ def _should_follow(self, url: str, track_filtering: bool = True) -> bool:
+ """
+ Check if a URL should be followed.
+
+ Args:
+ url: The URL to check
+ track_filtering: If True, increment filtering counters when rejecting URLs
+ """
+ if url in self.visited_urls:
+ return False
+
+ if self.pages_crawled >= self.max_pages:
+ if track_filtering:
+ self.urls_filtered_max_pages += 1
+ return False
+
+ # Check external links
+ if not self.follow_external:
+ # Use effective_domain if set (e.g., after following sitemap redirect)
+ # Otherwise use the original start_url domain
+ if self.effective_domain:
+ allowed_domain = self.effective_domain
+ else:
+ allowed_domain = urlparse(self.start_url).netloc
+
+ url_domain = urlparse(url).netloc
+ if url_domain != allowed_domain:
+ if track_filtering:
+ self.urls_filtered_external += 1
+ # Log the first few for debugging
+ if self.urls_filtered_external <= 3:
+ self.logger.debug(f"Filtered external URL: {url} (domain {url_domain} != {allowed_domain})")
+ return False
+
+ # Check allowed patterns
+ if self.allowed_patterns:
+ if not any(re.search(p, url) for p in self.allowed_patterns):
+ if track_filtering:
+ self.urls_filtered_pattern += 1
+ return False
+
+ # Check denied patterns
+ if self.denied_patterns:
+ if any(re.search(p, url) for p in self.denied_patterns):
+ if track_filtering:
+ self.urls_filtered_pattern += 1
+ return False
+
+ return True
+
+ def _extract_links(self, response: Response) -> List[str]:
+ """Extract links from a response."""
+ links = []
+ for href in response.css("a::attr(href)").getall():
+ # Skip anchors, javascript, mailto, etc.
+ if href.startswith(("#", "javascript:", "mailto:", "tel:")):
+ continue
+
+ # Convert to absolute URL
+ absolute_url = urljoin(response.url, href)
+
+ # Only follow http/https
+ if absolute_url.startswith(("http://", "https://")):
+ links.append(absolute_url)
+
+ return links
+
+ def _generate_doc_id(self, url: str) -> str:
+ """Generate a document ID from URL."""
+ url_hash = hashlib.sha256(url.encode()).hexdigest()[:12]
+ return f"doc_{self.crawl_request.datasource_id}_{url_hash}"
+
+ def _maybe_report_progress(self, current_url: str):
+ """Report progress if enough time has passed."""
+ now = time.time()
+ if now - self.last_progress_time >= self.progress_interval:
+ self.last_progress_time = now
+
+ # Build progress message based on crawl mode
+ queue_size = len(self.pending_urls)
+ if self.total_pages_to_crawl:
+ # Sitemap mode - we know the total
+ message = f"Crawling {self.pages_crawled}/{self.total_pages_to_crawl} pages"
+ elif queue_size > 0:
+ # Recursive mode - show queue size
+ message = f"Crawling... {self.pages_crawled} pages ({queue_size} queued)"
+ else:
+ message = f"Crawling... {self.pages_crawled} pages"
+
+ progress = CrawlProgress(
+ job_id=self.crawl_request.job_id,
+ pages_crawled=self.pages_crawled,
+ pages_failed=self.pages_failed,
+ current_url=current_url,
+ message=message,
+ total_pages=self.total_pages_to_crawl,
+ queue_size=queue_size,
+ )
+ self.result_queue.put(WorkerMessage.crawl_progress(progress).to_dict())
+
+ def closed(self, reason):
+ """Called when spider closes."""
+ elapsed = time.time() - self.start_time
+
+ # Determine status and build fatal error message
+ fatal_error = None
+ if self.pages_crawled == 0:
+ status = CrawlStatus.FAILED
+ # Build detailed error message explaining why no pages were crawled
+ fatal_error = self._build_failure_message()
+ elif self.pages_failed > 0:
+ status = CrawlStatus.PARTIAL
+ else:
+ status = CrawlStatus.SUCCESS
+
+ result = CrawlResult(
+ job_id=self.crawl_request.job_id,
+ status=status,
+ pages_crawled=self.pages_crawled,
+ pages_failed=self.pages_failed,
+ documents=self.documents,
+ elapsed_seconds=elapsed,
+ fatal_error=fatal_error,
+ errors=self.errors,
+ # Include filtering stats for debugging
+ urls_found_in_sitemap=self.urls_found_in_sitemap,
+ urls_filtered_external=self.urls_filtered_external,
+ urls_filtered_pattern=self.urls_filtered_pattern,
+ urls_filtered_max_pages=self.urls_filtered_max_pages,
+ )
+
+ self.result_queue.put(WorkerMessage.crawl_result(result).to_dict())
+ self.logger.info(f"Spider closed: {reason}, crawled {self.pages_crawled} pages in {elapsed:.1f}s")
+
+ def _build_failure_message(self) -> str:
+ """Build a detailed message explaining why the crawl failed."""
+ parts = []
+
+ # Check if this was a sitemap crawl that found URLs but none were followed
+ if self.urls_found_in_sitemap > 0:
+ parts.append(f"Found {self.urls_found_in_sitemap} URLs in sitemap but 0 were scraped.")
+
+ filter_details = []
+ if self.urls_filtered_external > 0:
+ original_domain = urlparse(self.start_url).netloc
+ effective = self.effective_domain or original_domain
+ if original_domain != effective:
+ filter_details.append(f"{self.urls_filtered_external} filtered as external (sitemap domain '{effective}' differs from start URL domain '{original_domain}')")
+ else:
+ filter_details.append(f"{self.urls_filtered_external} filtered as external links")
+
+ if self.urls_filtered_pattern > 0:
+ filter_details.append(f"{self.urls_filtered_pattern} filtered by URL patterns")
+
+ if self.urls_filtered_max_pages > 0:
+ filter_details.append(f"{self.urls_filtered_max_pages} filtered by max pages limit ({self.max_pages})")
+
+ if filter_details:
+ parts.append("Filtering breakdown: " + "; ".join(filter_details) + ".")
+
+ # Suggest fix for domain mismatch
+ if self.urls_filtered_external > 0 and self.effective_domain:
+ original_domain = urlparse(self.start_url).netloc
+ if original_domain != self.effective_domain:
+ parts.append(f"Tip: The site redirects from '{original_domain}' to '{self.effective_domain}'. Try using 'https://{self.effective_domain}' as the start URL, or enable 'Follow external links' to allow cross-domain crawling.")
+ else:
+ # Generic failure message
+ parts.append("No pages were crawled.")
+ if self.pages_failed > 0:
+ parts.append(f"{self.pages_failed} requests failed.")
+
+ # Include collected error messages for more detail
+ if self.errors:
+ parts.append("Errors: " + "; ".join(self.errors[:5])) # Show first 5 errors
+ if len(self.errors) > 5:
+ parts.append(f"(and {len(self.errors) - 5} more)")
+
+ return " ".join(parts)
+
+
+def build_spider_settings(request: CrawlRequest) -> dict:
+ """Build Scrapy settings from a CrawlRequest."""
+ # Convert to ScrapySettings
+ settings = ScrapySettings(
+ crawl_mode=CrawlMode(request.crawl_mode),
+ max_depth=request.max_depth,
+ max_pages=request.max_pages,
+ render_javascript=request.render_javascript,
+ wait_for_selector=request.wait_for_selector,
+ page_load_timeout=request.page_load_timeout,
+ follow_external_links=request.follow_external_links,
+ allowed_url_patterns=request.allowed_url_patterns,
+ denied_url_patterns=request.denied_url_patterns,
+ download_delay=request.download_delay,
+ concurrent_requests=request.concurrent_requests,
+ respect_robots_txt=request.respect_robots_txt,
+ user_agent=request.user_agent,
+ )
+
+ return build_scrapy_settings(settings)
+
+
+def run_crawl(request: CrawlRequest, result_queue: Queue):
+ """
+ Run a single crawl using Scrapy.
+
+ This function sets up the CrawlerRunner and runs the spider.
+ """
+ # Build settings
+ scrapy_settings = build_spider_settings(request)
+
+ # Configure logging
+ configure_logging({"LOG_LEVEL": "INFO"})
+
+ # Create runner
+ runner = CrawlerRunner(settings=scrapy_settings)
+
+ # Run spider
+ deferred = runner.crawl(WorkerSpider, request=request, result_queue=result_queue)
+
+ return deferred
+
+
+def worker_main(worker_id: int, request_queue: Queue, result_queue: Queue):
+ """
+ Main entry point for worker subprocess.
+
+ This function runs the Twisted reactor and processes crawl requests.
+
+ Args:
+ worker_id: Unique ID for this worker
+ request_queue: Queue to receive crawl requests from
+ result_queue: Queue to send results back to main process
+ """
+ print(f"[Worker {worker_id}] Starting...")
+
+ # Signal that we're ready
+ result_queue.put(WorkerMessage.worker_ready(worker_id).to_dict())
+
+ def check_queue():
+ """Check for new requests in the queue."""
+ try:
+            # Non-blocking check (Queue.empty() is advisory; a lost race makes get_nowait() raise queue.Empty, caught by the generic handler below)
+ if not request_queue.empty():
+ msg_dict = request_queue.get_nowait()
+ msg = WorkerMessage.from_dict(msg_dict)
+
+ if msg.type == MessageType.SHUTDOWN:
+ print(f"[Worker {worker_id}] Received shutdown signal")
+ reactor.stop()
+ return
+
+ if msg.type == MessageType.CRAWL_REQUEST:
+ # Parse request
+ request = CrawlRequest(**msg.payload)
+ print(f"[Worker {worker_id}] Starting crawl: {request.url}")
+
+ # Signal crawl started
+ result_queue.put(WorkerMessage.crawl_started(request.job_id).to_dict())
+
+ # Run the crawl
+ d = run_crawl(request, result_queue)
+
+ # When done, check for more work
+ d.addCallback(lambda _: reactor.callLater(0.1, check_queue))
+ d.addErrback(lambda f: handle_crawl_error(f, request.job_id, result_queue))
+ return
+
+ except Exception as e:
+ print(f"[Worker {worker_id}] Error checking queue: {e}")
+ traceback.print_exc()
+
+ # Schedule next check
+ reactor.callLater(0.5, check_queue)
+
+ def handle_crawl_error(failure, job_id: str, result_queue: Queue):
+ """Handle crawl errors."""
+ error_msg = str(failure.value)
+ print(f"[Worker {worker_id}] Crawl error: {error_msg}")
+
+ result = CrawlResult(
+ job_id=job_id,
+ status=CrawlStatus.FAILED,
+ pages_crawled=0,
+ pages_failed=0,
+ fatal_error=error_msg,
+ )
+ result_queue.put(WorkerMessage.crawl_result(result).to_dict())
+
+ # Continue checking for more work
+ reactor.callLater(0.1, check_queue)
+
+ # Start checking queue after reactor starts
+ reactor.callWhenRunning(check_queue)
+
+ # Run the reactor - this blocks until reactor.stop() is called
+ try:
+ reactor.run(installSignalHandlers=False)
+ except Exception as e:
+ print(f"[Worker {worker_id}] Reactor error: {e}")
+ result_queue.put(WorkerMessage.worker_error(str(e)).to_dict())
+
+ print(f"[Worker {worker_id}] Exiting")
+
+
+if __name__ == "__main__":
+ # This should not be run directly - it's spawned by ScrapyWorkerPool
+ print("This module should be spawned by ScrapyWorkerPool, not run directly")
+ sys.exit(1)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/settings.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/settings.py
new file mode 100644
index 000000000..c7448a106
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/settings.py
@@ -0,0 +1,117 @@
+"""
+Scrapy settings builder for configuring crawl behavior.
+
+Builds Scrapy settings dict from ScrapySettings model.
+"""
+
+from typing import Dict, Any
+from common.models.server import ScrapySettings
+
+
+# Default user agent mimicking Chrome browser
+DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
+
+def build_scrapy_settings(settings: ScrapySettings) -> Dict[str, Any]:
+ """
+ Build Scrapy settings dictionary from ScrapySettings model.
+
+ Args:
+ settings: User-provided scraping settings
+
+ Returns:
+ Dictionary of Scrapy settings
+ """
+ scrapy_settings: Dict[str, Any] = {
+ # NOTE: We don't set TWISTED_REACTOR here because the asyncio reactor
+ # is installed manually in ingestor.py before the event loop starts.
+ # Setting it here would cause Scrapy to try to install it again and fail.
+ # Rate limiting
+ "DOWNLOAD_DELAY": settings.download_delay,
+ "CONCURRENT_REQUESTS": settings.concurrent_requests,
+ "CONCURRENT_REQUESTS_PER_DOMAIN": min(settings.concurrent_requests, 16),
+ "RANDOMIZE_DOWNLOAD_DELAY": True,
+ # Respect robots.txt
+ "ROBOTSTXT_OBEY": settings.respect_robots_txt,
+ # Depth and page limits
+ "DEPTH_LIMIT": settings.max_depth,
+ "CLOSESPIDER_PAGECOUNT": settings.max_pages,
+ # User agent
+ "USER_AGENT": settings.user_agent or DEFAULT_USER_AGENT,
+ # Auto-throttle for polite crawling
+ "AUTOTHROTTLE_ENABLED": True,
+ "AUTOTHROTTLE_START_DELAY": settings.download_delay,
+ "AUTOTHROTTLE_MAX_DELAY": 10.0,
+ "AUTOTHROTTLE_TARGET_CONCURRENCY": float(settings.concurrent_requests),
+ # Retry settings - only retry on specific HTTP errors, not connection/timeout failures
+ "RETRY_ENABLED": True,
+ "RETRY_TIMES": 2,
+ "RETRY_HTTP_CODES": [429, 500, 502, 503, 504],
+ # Don't retry on connection/timeout errors - fail fast
+ # Default includes TimeoutError, ConnectionRefusedError, etc. which we want to skip
+ "RETRY_EXCEPTIONS": [],
+ # Timeout settings
+ "DOWNLOAD_TIMEOUT": settings.page_load_timeout,
+ # DNS timeout (helps with unreachable hosts)
+ "DNS_TIMEOUT": 5,
+ # Use threaded DNS resolver to work around Twisted DNS bug with Python 3.13
+ # (Twisted's async DNS resolver has a str/bytes mismatch issue)
+ "DNS_RESOLVER": "scrapy.resolver.CachingThreadedResolver",
+ # Disable cookies by default (for scraping)
+ "COOKIES_ENABLED": False,
+ # Logging
+ "LOG_LEVEL": "INFO",
+        # Log duplicate requests for debugging (note: DUPEFILTER_DEBUG does NOT disable Scrapy's dupe filter; we also track visited URLs ourselves)
+ "DUPEFILTER_DEBUG": True,
+ # Memory management
+ "MEMUSAGE_ENABLED": True,
+ "MEMUSAGE_LIMIT_MB": 1024,
+ "MEMUSAGE_WARNING_MB": 512,
+ }
+
+ # Add Playwright settings if JS rendering is enabled
+ if settings.render_javascript:
+ scrapy_settings.update(
+ {
+ "DOWNLOAD_HANDLERS": {
+ "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+ "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+ },
+ "PLAYWRIGHT_BROWSER_TYPE": "chromium",
+ "PLAYWRIGHT_LAUNCH_OPTIONS": {
+ "headless": True,
+ },
+ "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": settings.page_load_timeout * 1000,
+ "PLAYWRIGHT_CONTEXTS": {
+ "default": {
+ "ignore_https_errors": True,
+ }
+ },
+ }
+ )
+
+ return scrapy_settings
+
+
+def get_playwright_page_methods(settings: ScrapySettings) -> list:
+ """
+ Build list of Playwright PageMethod calls based on settings.
+
+ Args:
+ settings: User-provided scraping settings
+
+ Returns:
+ List of PageMethod objects to execute on each page
+ """
+ from scrapy_playwright.page import PageMethod
+
+ page_methods = []
+
+ # Wait for specific selector if configured
+ if settings.wait_for_selector:
+ page_methods.append(PageMethod("wait_for_selector", settings.wait_for_selector, timeout=settings.page_load_timeout * 1000))
+
+    # Wait for network idle so dynamic content loads — NOTE(review): Playwright discourages "networkidle"; consider "load" or a selector wait
+ page_methods.append(PageMethod("wait_for_load_state", "networkidle", timeout=settings.page_load_timeout * 1000))
+
+ return page_methods
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/__init__.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/__init__.py
new file mode 100644
index 000000000..6f0175508
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/__init__.py
@@ -0,0 +1,20 @@
+"""
+Scrapy spiders for web scraping.
+
+This package contains spiders for different crawling modes:
+- SingleUrlSpider: Scrape a single URL
+- SitemapCrawlSpider: Crawl using sitemap
+- RecursiveCrawlSpider: Follow links recursively
+"""
+
+from .base import BaseWebSpider
+from .single_url import SingleUrlSpider
+from .sitemap import SitemapCrawlSpider
+from .recursive import RecursiveCrawlSpider
+
+__all__ = [
+ "BaseWebSpider",
+ "SingleUrlSpider",
+ "SitemapCrawlSpider",
+ "RecursiveCrawlSpider",
+]
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/base.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/base.py
new file mode 100644
index 000000000..d2747f4f5
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/base.py
@@ -0,0 +1,178 @@
+"""
+Base spider with shared functionality for all web scrapers.
+
+Provides common parsing logic, Playwright integration, and job management.
+"""
+
+from typing import Optional, Iterator, Any
+import scrapy
+from scrapy.http import Response, Request
+
+from common.models.server import ScrapySettings
+from common.models.rag import DataSourceInfo
+from common.job_manager import JobManager
+from common.ingestor import Client
+from common.utils import get_logger
+
+from ..items import ScrapedPageItem
+from ..parsers import ParserRegistry
+
+# Import all parsers to register them
+from ..parsers import docusaurus, mkdocs, sphinx, readthedocs, vitepress, generic # noqa: F401
+
+
+class BaseWebSpider(scrapy.Spider):
+ """
+ Base spider with shared parsing logic and Playwright support.
+
+ All web scrapers inherit from this class to get:
+ - Automatic parser selection based on site generator
+ - Playwright integration for JS rendering
+ - Job progress tracking
+ - Page count limiting
+ """
+
+ name = "base_web_spider"
+
+ def __init__(self, start_url: str, scrape_settings: ScrapySettings, job_id: str, client: Client, job_manager: JobManager, datasource_info: DataSourceInfo, *args, **kwargs):
+ """
+ Initialize the spider.
+
+ Args:
+ start_url: Initial URL to crawl
+ scrape_settings: User-provided scraping configuration
+ job_id: ID of the ingestion job
+ client: RAG server client
+ job_manager: Job status manager
+ datasource_info: Datasource metadata
+ """
+ super().__init__(*args, **kwargs)
+
+ self.start_url = start_url
+ self.scrape_settings = scrape_settings
+ self.job_id = job_id
+ self.client = client
+ self.job_manager = job_manager
+ self.datasource_info = datasource_info
+
+ self.logger_custom = get_logger(f"spider:{self.name}")
+ self.pages_crawled = 0
+ self.max_pages = scrape_settings.max_pages
+
+ def start_requests(self) -> Iterator[Request]:
+ """
+ Generate initial requests.
+
+ Override in subclasses for custom start behavior.
+ """
+ yield self._make_request(self.start_url, callback=self.parse)
+
+ def _make_request(self, url: str, callback: Any = None, meta: Optional[dict] = None, **kwargs) -> Request:
+ """
+ Create a request with Playwright meta if JS rendering is enabled.
+
+ Args:
+ url: URL to request
+ callback: Callback function for response
+ meta: Additional meta data
+ **kwargs: Additional Request arguments
+
+ Returns:
+ Scrapy Request object
+ """
+ request_meta = meta or {}
+
+ # Add Playwright settings if JS rendering is enabled
+ if self.scrape_settings.render_javascript:
+ from ..settings import get_playwright_page_methods
+
+ request_meta.update(
+ {
+ "playwright": True,
+ "playwright_include_page": False,
+ "playwright_page_methods": get_playwright_page_methods(self.scrape_settings),
+ }
+ )
+
+ return Request(url=url, callback=callback or self.parse, meta=request_meta, errback=self.handle_error, **kwargs)
+
+ def parse(self, response: Response) -> Iterator[ScrapedPageItem]:
+ """
+ Parse a response and yield scraped items.
+
+ Args:
+ response: Scrapy Response object
+
+ Yields:
+ ScrapedPageItem for each successfully parsed page
+ """
+ # Check page limit
+ if self.pages_crawled >= self.max_pages:
+ self.logger_custom.info(f"Reached max pages limit ({self.max_pages}), stopping")
+ return
+
+ self.pages_crawled += 1
+ self.logger_custom.debug(f"Parsing page {self.pages_crawled}: {response.url}")
+
+ try:
+ # Use parser registry to extract content
+ result = ParserRegistry.parse(response)
+
+ yield ScrapedPageItem(
+ url=response.url,
+ content=result.content,
+ title=result.title,
+ description=result.description,
+ language=result.language,
+ generator=result.generator,
+ )
+
+ except Exception as e:
+ self.logger_custom.error(f"Error parsing {response.url}: {e}")
+ # Don't re-raise - let pipeline handle the error
+
+ def handle_error(self, failure):
+ """
+ Handle request failures.
+
+ Args:
+ failure: Twisted Failure object
+ """
+ request = failure.request
+ self.logger_custom.error(f"Request failed: {request.url} - {failure.value}")
+
+ def should_follow_url(self, url: str) -> bool:
+ """
+ Check if a URL should be followed based on settings.
+
+ Args:
+ url: URL to check
+
+ Returns:
+ True if URL should be followed
+ """
+ import re
+ from urllib.parse import urlparse
+
+ # Check page limit
+ if self.pages_crawled >= self.max_pages:
+ return False
+
+ # Check external links
+ if not self.scrape_settings.follow_external_links:
+ start_domain = urlparse(self.start_url).netloc
+ url_domain = urlparse(url).netloc
+ if url_domain != start_domain:
+ return False
+
+ # Check allowed patterns
+ if self.scrape_settings.allowed_url_patterns:
+ if not any(re.search(p, url) for p in self.scrape_settings.allowed_url_patterns):
+ return False
+
+ # Check denied patterns
+ if self.scrape_settings.denied_url_patterns:
+ if any(re.search(p, url) for p in self.scrape_settings.denied_url_patterns):
+ return False
+
+ return True
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/recursive.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/recursive.py
new file mode 100644
index 000000000..5ee04e8f9
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/recursive.py
@@ -0,0 +1,225 @@
+"""
+Recursive crawl spider for following links.
+
+Uses Scrapy's CrawlSpider with LinkExtractor for discovering pages
+by following links from the start URL.
+"""
+
+from typing import Iterator, Set
+from urllib.parse import urlparse
+
+from scrapy.http import Request, Response
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+
+from common.models.server import ScrapySettings
+from common.models.rag import DataSourceInfo
+from common.job_manager import JobManager
+from common.ingestor import Client
+from common.utils import get_logger
+
+from ..items import ScrapedPageItem
+from ..parsers import ParserRegistry
+
+# Import all parsers to register them
+from ..parsers import docusaurus, mkdocs, sphinx, readthedocs, vitepress, generic # noqa: F401
+
+
+class RecursiveCrawlSpider(CrawlSpider):
+ """
+ Spider that follows links recursively.
+
+ Uses Scrapy's CrawlSpider with configurable rules for
+ discovering and following links.
+ """
+
+ name = "recursive_spider"
+
+ def __init__(self, start_url: str, scrape_settings: ScrapySettings, job_id: str, client: Client, job_manager: JobManager, datasource_info: DataSourceInfo, *args, **kwargs):
+ """
+ Initialize the recursive spider.
+
+ Args:
+ start_url: Initial URL to start crawling from
+ scrape_settings: User-provided scraping configuration
+ job_id: ID of the ingestion job
+ client: RAG server client
+ job_manager: Job status manager
+ datasource_info: Datasource metadata
+ """
+ self.start_url = start_url
+ self.scrape_settings = scrape_settings
+ self.job_id = job_id
+ self.client = client
+ self.job_manager = job_manager
+ self.datasource_info = datasource_info
+
+ self.logger_custom = get_logger(f"spider:{self.name}")
+ self.pages_crawled = 0
+ self.max_pages = scrape_settings.max_pages
+ self.visited_urls: Set[str] = set()
+
+ # Set start URLs
+ self.start_urls = [start_url]
+
+ # Set allowed domains
+ parsed = urlparse(start_url)
+ if scrape_settings.follow_external_links:
+ self.allowed_domains = [] # Allow all domains
+ else:
+ self.allowed_domains = [parsed.netloc]
+
+ # Build link extractor with URL patterns
+ link_extractor = LinkExtractor(
+ allow=scrape_settings.allowed_url_patterns or (),
+ deny=scrape_settings.denied_url_patterns or (),
+ allow_domains=self.allowed_domains if self.allowed_domains else None,
+ deny_extensions=[
+ "png",
+ "jpg",
+ "jpeg",
+ "gif",
+ "svg",
+ "ico",
+ "webp", # Images
+ "pdf",
+ "doc",
+ "docx",
+ "xls",
+ "xlsx",
+ "ppt",
+ "pptx", # Documents
+ "zip",
+ "tar",
+ "gz",
+ "rar", # Archives
+ "mp3",
+ "mp4",
+ "avi",
+ "mov",
+ "wmv", # Media
+ "css",
+ "js",
+ "woff",
+ "woff2",
+ "ttf",
+ "eot", # Assets
+ ],
+ )
+
+ # Set up rules for following links
+ self.rules = (
+ Rule(
+ link_extractor,
+ callback="parse_page",
+ follow=True,
+ process_request="process_request",
+ ),
+ )
+
+ super().__init__(*args, **kwargs)
+
+ def start_requests(self) -> Iterator[Request]:
+ """Generate initial requests with Playwright support."""
+ for url in self.start_urls:
+ yield self._make_request(url, callback=self.parse_page)
+
+ def _make_request(self, url: str, callback=None, **kwargs) -> Request:
+ """
+ Create a request with Playwright meta if needed.
+
+ Args:
+ url: URL to request
+ callback: Callback function
+ **kwargs: Additional request arguments
+
+ Returns:
+ Scrapy Request object
+ """
+ meta = kwargs.pop("meta", {})
+
+ if self.scrape_settings.render_javascript:
+ from ..settings import get_playwright_page_methods
+
+ meta.update(
+ {
+ "playwright": True,
+ "playwright_include_page": False,
+ "playwright_page_methods": get_playwright_page_methods(self.scrape_settings),
+ }
+ )
+
+ return Request(url, callback=callback or self.parse_page, meta=meta, **kwargs)
+
+ def process_request(self, request: Request, response: Response) -> Request | None:
+ """
+ Process each request before sending.
+
+ Used to add Playwright settings and check limits.
+
+ Args:
+ request: The request to process
+ response: The response that generated this request
+
+ Returns:
+ Modified request or None to skip
+ """
+ # Check page limit
+ if self.pages_crawled >= self.max_pages:
+ return None
+
+ # Check if already visited
+ if request.url in self.visited_urls:
+ return None
+
+ # Add Playwright settings if needed
+ if self.scrape_settings.render_javascript:
+ from ..settings import get_playwright_page_methods
+
+ request.meta.update(
+ {
+ "playwright": True,
+ "playwright_include_page": False,
+ "playwright_page_methods": get_playwright_page_methods(self.scrape_settings),
+ }
+ )
+
+ return request
+
+ def parse_page(self, response: Response) -> Iterator[ScrapedPageItem]:
+ """
+ Parse a response and yield scraped items.
+
+ Args:
+ response: Scrapy Response object
+
+ Yields:
+ ScrapedPageItem for each successfully parsed page
+ """
+ # Check page limit
+ if self.pages_crawled >= self.max_pages:
+ return
+
+ # Track visited URLs
+ if response.url in self.visited_urls:
+ return
+ self.visited_urls.add(response.url)
+
+ self.pages_crawled += 1
+ self.logger_custom.debug(f"Parsing page {self.pages_crawled}: {response.url}")
+
+ try:
+ # Use parser registry to extract content
+ result = ParserRegistry.parse(response)
+
+ yield ScrapedPageItem(
+ url=response.url,
+ content=result.content,
+ title=result.title,
+ description=result.description,
+ language=result.language,
+ generator=result.generator,
+ )
+
+ except Exception as e:
+ self.logger_custom.error(f"Error parsing {response.url}: {e}")
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/single_url.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/single_url.py
new file mode 100644
index 000000000..2adb86c55
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/single_url.py
@@ -0,0 +1,26 @@
+"""
+Single URL spider for scraping a single page.
+
+The simplest spider - just fetches and parses one URL.
+"""
+
+from typing import Iterator
+from scrapy.http import Request
+
+from .base import BaseWebSpider
+
+
+class SingleUrlSpider(BaseWebSpider):
+ """
+ Spider that scrapes a single URL.
+
+ This is the default spider when crawl_mode is 'single'.
+ It simply fetches the provided URL and extracts content.
+ """
+
+ name = "single_url_spider"
+
+ def start_requests(self) -> Iterator[Request]:
+ """Generate a single request for the start URL."""
+ self.logger_custom.info(f"Starting single URL scrape: {self.start_url}")
+ yield self._make_request(self.start_url, callback=self.parse)
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/sitemap.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/sitemap.py
new file mode 100644
index 000000000..bfdc9866c
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/spiders/sitemap.py
@@ -0,0 +1,164 @@
+"""
+Sitemap-based spider for crawling documentation sites.
+
+Uses Scrapy's SitemapSpider to discover pages from sitemap.xml or robots.txt.
+"""
+
+import re
+from typing import Iterator, Any
+from urllib.parse import urlparse
+
+from scrapy.http import Request, Response
+from scrapy.spiders import SitemapSpider
+
+from common.models.server import ScrapySettings
+from common.models.rag import DataSourceInfo
+from common.job_manager import JobManager
+from common.ingestor import Client
+from common.utils import get_logger
+
+from ..items import ScrapedPageItem
+from ..parsers import ParserRegistry
+
+# Import all parsers to register them
+from ..parsers import docusaurus, mkdocs, sphinx, readthedocs, vitepress, generic # noqa: F401
+
+
+class SitemapCrawlSpider(SitemapSpider):
+    """
+    Spider that crawls from sitemap.
+
+    Uses Scrapy's built-in SitemapSpider for efficient sitemap parsing,
+    including support for sitemap indexes and gzipped sitemaps.
+    """
+
+    name = "sitemap_spider"
+
+    def __init__(self, start_url: str, scrape_settings: ScrapySettings, job_id: str, client: Client, job_manager: JobManager, datasource_info: DataSourceInfo, *args, **kwargs):
+        """
+        Initialize the sitemap spider.
+
+        Args:
+            start_url: Base URL or sitemap URL to crawl
+            scrape_settings: User-provided scraping configuration
+            job_id: ID of the ingestion job
+            client: RAG server client
+            job_manager: Job status manager
+            datasource_info: Datasource metadata
+        """
+        self.start_url = start_url
+        self.scrape_settings = scrape_settings
+        self.job_id = job_id
+        self.client = client
+        self.job_manager = job_manager
+        self.datasource_info = datasource_info
+
+        self.logger_custom = get_logger(f"spider:{self.name}")  # separate from Scrapy's built-in self.logger
+        self.pages_crawled = 0
+        self.max_pages = scrape_settings.max_pages
+
+        # Configure sitemap URLs
+        # If URL ends with sitemap.xml, use it directly
+        # Otherwise, try robots.txt first, then sitemap.xml
+        if start_url.endswith("sitemap.xml") or start_url.endswith("sitemap.xml.gz"):
+            self.sitemap_urls = [start_url]
+        else:
+            # SitemapSpider will check robots.txt for Sitemap: directives
+            parsed = urlparse(start_url)
+            base_url = f"{parsed.scheme}://{parsed.netloc}"
+            self.sitemap_urls = [
+                f"{base_url}/robots.txt",
+                f"{base_url}/sitemap.xml",  # fallback in case robots.txt lists no Sitemap: entries
+            ]
+
+        # Configure URL pattern filters
+        if scrape_settings.allowed_url_patterns:
+            self.sitemap_follow = scrape_settings.allowed_url_patterns  # NOTE(review): per Scrapy docs, sitemap_follow filters which child sitemaps of a sitemap *index* are followed, not page URLs (pages are filtered in sitemap_filter below) — confirm intent
+
+        super().__init__(*args, **kwargs)  # attributes assigned first so SitemapSpider's init sees sitemap_urls
+
+    def sitemap_filter(self, entries: Iterator[dict]) -> Iterator[dict]:
+        """
+        Filter sitemap entries based on settings.
+
+        Args:
+            entries: Iterator of sitemap entry dicts with 'loc' key
+
+        Yields:
+            Filtered entries that should be crawled
+        """
+        for entry in entries:
+            # Check page limit
+            if self.pages_crawled >= self.max_pages:  # best-effort: counts *parsed* pages, so more URLs than max_pages may already be scheduled
+                self.logger_custom.info(f"Reached max pages limit ({self.max_pages}), stopping sitemap processing")
+                return
+
+            url = entry.get("loc", "")
+
+            # Check denied patterns
+            if self.scrape_settings.denied_url_patterns:
+                if any(re.search(p, url) for p in self.scrape_settings.denied_url_patterns):
+                    self.logger_custom.debug(f"Skipping denied URL: {url}")
+                    continue
+
+            # Check allowed patterns (if specified)
+            if self.scrape_settings.allowed_url_patterns:
+                if not any(re.search(p, url) for p in self.scrape_settings.allowed_url_patterns):
+                    self.logger_custom.debug(f"Skipping non-matching URL: {url}")
+                    continue
+
+            yield entry
+
+    def _build_request(self, url: str, callback: Any) -> Request:  # NOTE(review): Scrapy's SitemapSpider defines no _build_request hook (CrawlSpider does) — verify this override is actually invoked, otherwise Playwright meta is never attached to sitemap-discovered requests
+        """
+        Build a request with Playwright meta if needed.
+
+        Override SitemapSpider's method to add JS rendering support.
+        """
+        meta = {}
+
+        if self.scrape_settings.render_javascript:
+            from ..settings import get_playwright_page_methods
+
+            meta.update(
+                {
+                    "playwright": True,
+                    "playwright_include_page": False,  # page handle not needed after render
+                    "playwright_page_methods": get_playwright_page_methods(self.scrape_settings),
+                }
+            )
+
+        return Request(url, callback=callback, meta=meta)
+
+    def parse(self, response: Response) -> Iterator[ScrapedPageItem]:
+        """
+        Parse a response and yield scraped items.
+
+        Args:
+            response: Scrapy Response object
+
+        Yields:
+            ScrapedPageItem for each successfully parsed page
+        """
+        # Check page limit
+        if self.pages_crawled >= self.max_pages:  # silently drop responses that arrive after the budget is spent
+            return
+
+        self.pages_crawled += 1
+        self.logger_custom.debug(f"Parsing page {self.pages_crawled}: {response.url}")
+
+        try:
+            # Use parser registry to extract content
+            result = ParserRegistry.parse(response)
+
+            yield ScrapedPageItem(
+                url=response.url,
+                content=result.content,
+                title=result.title,
+                description=result.description,
+                language=result.language,
+                generator=result.generator,
+            )
+
+        except Exception as e:
+            self.logger_custom.error(f"Error parsing {response.url}: {e}")  # page-level failure: log and continue the crawl
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/__init__.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/docsaurus_scraper.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/docsaurus_scraper.py
deleted file mode 100644
index 01b81f0e4..000000000
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/docsaurus_scraper.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from bs4 import BeautifulSoup
-
-def scrape_docsaurus(soup: BeautifulSoup) -> str:
- article_tag = soup.find('article')
- if article_tag:
- content = article_tag.get_text(separator='\n', strip=True)
- else:
- content = soup.get_text(separator='\n', strip=True)
- return content
\ No newline at end of file
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/mkdocs_scraper.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/mkdocs_scraper.py
deleted file mode 100644
index 8493a0926..000000000
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/url/mkdocs_scraper.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from bs4 import BeautifulSoup
-
-def scrape_mkdocs(soup: BeautifulSoup) -> str:
- main_content = soup.select_one("main.md-main div.md-content")
- if main_content:
- content = main_content.get_text(separator='\n', strip=True)
- else:
- content = soup.get_text(separator='\n', strip=True)
- return content
\ No newline at end of file
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/worker_pool.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/worker_pool.py
new file mode 100644
index 000000000..36c0ddca4
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/worker_pool.py
@@ -0,0 +1,404 @@
+"""
+Scrapy Worker Pool for managing crawler subprocess workers.
+
+This module provides a pool of worker processes that run Scrapy crawlers.
+Each worker runs Twisted's reactor.run() in its own process, avoiding
+event loop conflicts with the main asyncio-based ingestor.
+"""
+
+import asyncio
+import multiprocessing
+import os
+from typing import Dict, Optional, Callable, Awaitable
+from dataclasses import dataclass
+
+# Use 'spawn' context to create fresh processes without inheriting parent's event loop
+# This is critical for Twisted reactor to work properly in subprocess
+mp_context = multiprocessing.get_context("spawn")
+Process = mp_context.Process  # spawn-context Process: child starts in a fresh interpreter
+Queue = mp_context.Queue  # spawn-context Queue: items are pickled across the process boundary
+
+from common.utils import get_logger
+
+from .worker_types import (
+ WorkerMessage,
+ MessageType,
+ CrawlRequest,
+ CrawlProgress,
+ CrawlResult,
+ CrawlStatus,
+)
+
+logger = get_logger(__name__)
+
+# Default pool size
+DEFAULT_POOL_SIZE = int(os.getenv("SCRAPY_WORKER_POOL_SIZE", "3"))  # env-overridable; raises ValueError at import time if the value is not an integer
+
+
+@dataclass
+class PendingJob:
+    """A job waiting for a result."""
+
+    request: CrawlRequest  # the original crawl request (kept for context/retries)
+    future: asyncio.Future  # resolved by _process_results with a CrawlResult (or an exception)
+    on_progress: Optional[Callable[[CrawlProgress], Awaitable[None]]] = None  # optional async progress callback
+
+
+class ScrapyWorkerPool:
+    """
+    Pool of Scrapy worker processes.
+
+    This class manages a fixed number of worker processes that run Scrapy crawlers.
+    Crawl requests are dispatched to available workers, and results are returned
+    asynchronously.
+
+    Usage:
+        pool = ScrapyWorkerPool(max_workers=3)
+        await pool.start()
+
+        result = await pool.crawl(request, on_progress=callback)
+
+        await pool.shutdown()
+    """
+
+    def __init__(self, max_workers: int = DEFAULT_POOL_SIZE):
+        """
+        Initialize the worker pool.
+
+        Args:
+            max_workers: Maximum number of concurrent worker processes
+        """
+        self.max_workers = max_workers
+        self.workers: Dict[int, Process] = {}  # worker_id -> subprocess handle
+        self.request_queues: Dict[int, Queue] = {}  # one private request queue per worker
+        self.result_queue: Optional[Queue] = None  # single shared queue all workers report on
+        self.pending_jobs: Dict[str, PendingJob] = {}  # job_id -> PendingJob
+        self.available_workers: asyncio.Queue = asyncio.Queue()  # worker_ids that are idle and ready for a job
+        self._running = False
+        self._result_processor_task: Optional[asyncio.Task] = None
+
+    async def start(self):
+        """
+        Start the worker pool.
+
+        This spawns worker processes and waits for them to become ready.
+        """
+        if self._running:
+            logger.warning("Worker pool already running")
+            return
+
+        logger.info(f"Starting Scrapy worker pool with {self.max_workers} workers")
+
+        # Create shared result queue
+        self.result_queue = Queue()
+
+        # Spawn workers
+        for worker_id in range(self.max_workers):
+            request_queue = Queue()
+            self.request_queues[worker_id] = request_queue
+
+            # Import here to avoid circular imports
+            from .scrapy_worker import worker_main
+
+            process = Process(
+                target=worker_main,
+                args=(worker_id, request_queue, self.result_queue),
+                daemon=True,  # workers are killed automatically if the parent dies
+            )
+            process.start()
+            self.workers[worker_id] = process
+
+            logger.info(f"Started worker {worker_id} (PID: {process.pid})")
+
+        # Wait for all workers to signal ready
+        ready_count = 0
+        timeout = 30  # seconds
+        start_time = asyncio.get_event_loop().time()
+
+        while ready_count < self.max_workers:
+            if asyncio.get_event_loop().time() - start_time > timeout:
+                raise TimeoutError(f"Workers failed to start within {timeout}s")
+
+            try:
+                # Check result queue in executor to avoid blocking (mp Queue.get is a blocking call)
+                msg_dict = await asyncio.get_event_loop().run_in_executor(None, self._get_message_timeout, 1.0)
+
+                if msg_dict:
+                    msg = WorkerMessage.from_dict(msg_dict)
+                    if msg.type == MessageType.WORKER_READY:  # non-READY messages during startup are silently dropped
+                        worker_id = msg.payload.get("worker_id")
+                        await self.available_workers.put(worker_id)
+                        ready_count += 1
+                        logger.info(f"Worker {worker_id} ready ({ready_count}/{self.max_workers})")
+
+            except Exception as e:
+                logger.warning(f"Error waiting for workers: {e}")
+                await asyncio.sleep(0.5)
+
+        self._running = True
+
+        # Start result processor
+        self._result_processor_task = asyncio.create_task(self._process_results())
+
+        logger.info("Worker pool started successfully")
+
+    def _get_message_timeout(self, timeout: float) -> Optional[dict]:
+        """Get a message from result queue with timeout (runs in executor)."""
+        try:
+            return self.result_queue.get(timeout=timeout)
+        except Exception:  # broad on purpose: queue.Empty and any IPC failure both read as "no message"
+            return None
+
+    async def _process_results(self):
+        """
+        Background task to process results from workers.
+
+        This runs continuously and dispatches results to waiting futures.
+        """
+        logger.info("Starting result processor")
+
+        while self._running:
+            try:
+                # Check for results in executor
+                msg_dict = await asyncio.get_event_loop().run_in_executor(None, self._get_message_timeout, 0.5)
+
+                if msg_dict is None:
+                    continue
+
+                msg = WorkerMessage.from_dict(msg_dict)
+
+                if msg.type == MessageType.CRAWL_STARTED:
+                    job_id = msg.payload.get("job_id")
+                    logger.debug(f"Crawl started: {job_id}")
+
+                elif msg.type == MessageType.CRAWL_PROGRESS:
+                    job_id = msg.payload.get("job_id")
+                    pending = self.pending_jobs.get(job_id)
+
+                    if pending and pending.on_progress:  # progress for unknown/finished jobs is ignored
+                        progress = CrawlProgress(
+                            job_id=job_id,
+                            pages_crawled=msg.payload.get("pages_crawled", 0),
+                            pages_failed=msg.payload.get("pages_failed", 0),
+                            current_url=msg.payload.get("current_url"),
+                            message=msg.payload.get("message", ""),
+                            total_pages=msg.payload.get("total_pages"),
+                            queue_size=msg.payload.get("queue_size", 0),
+                        )
+                        try:
+                            await pending.on_progress(progress)
+                        except Exception:
+                            logger.warning("Progress callback error")  # callback failures must not kill this loop
+
+                elif msg.type == MessageType.CRAWL_RESULT:
+                    job_id = msg.payload.get("job_id")
+                    pending = self.pending_jobs.pop(job_id, None)
+
+                    if pending:
+                        result = CrawlResult(
+                            job_id=job_id,
+                            status=CrawlStatus(msg.payload.get("status", "failed")),
+                            pages_crawled=msg.payload.get("pages_crawled", 0),
+                            pages_failed=msg.payload.get("pages_failed", 0),
+                            documents=msg.payload.get("documents", []),
+                            fatal_error=msg.payload.get("fatal_error"),
+                            errors=msg.payload.get("errors", []),
+                            elapsed_seconds=msg.payload.get("elapsed_seconds", 0),
+                            urls_found_in_sitemap=msg.payload.get("urls_found_in_sitemap", 0),
+                            urls_filtered_external=msg.payload.get("urls_filtered_external", 0),
+                            urls_filtered_pattern=msg.payload.get("urls_filtered_pattern", 0),
+                            urls_filtered_max_pages=msg.payload.get("urls_filtered_max_pages", 0),
+                        )
+
+                        if not pending.future.done():  # may already hold a timeout/error outcome
+                            pending.future.set_result(result)
+
+                        logger.info(f"Crawl completed: {job_id} - {result.pages_crawled} pages in {result.elapsed_seconds:.1f}s")
+
+                    # Find which worker handled this and mark available
+                    # For now, we track by job -> worker mapping
+                    # TODO: Add worker_id to result messages
+
+                elif msg.type == MessageType.WORKER_READY:
+                    worker_id = msg.payload.get("worker_id")
+                    await self.available_workers.put(worker_id)  # NOTE(review): crawl() also re-queues the worker after a successful result — if workers emit WORKER_READY per finished job, the id is enqueued twice; confirm the worker protocol
+                    logger.debug(f"Worker {worker_id} available")
+
+                elif msg.type == MessageType.WORKER_ERROR:
+                    error = msg.payload.get("error")
+                    job_id = msg.payload.get("job_id")
+                    logger.error(f"Worker error: {error}")
+
+                    if job_id:
+                        pending = self.pending_jobs.pop(job_id, None)
+                        if pending and not pending.future.done():
+                            pending.future.set_exception(Exception(error))  # surfaces to crawl()'s awaiter
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Error processing results: {e}")
+                await asyncio.sleep(0.5)
+
+        logger.info("Result processor stopped")
+
+    async def crawl(
+        self,
+        request: CrawlRequest,
+        on_progress: Optional[Callable[[CrawlProgress], Awaitable[None]]] = None,
+        timeout: float = 3600,  # 1 hour default
+    ) -> CrawlResult:
+        """
+        Submit a crawl request and wait for the result.
+
+        Args:
+            request: Crawl request configuration
+            on_progress: Optional callback for progress updates
+            timeout: Maximum time to wait for crawl completion
+
+        Returns:
+            CrawlResult with crawled documents (status=FAILED with fatal_error set on timeout)
+
+        Raises:
+            RuntimeError: If the pool is not running, or no worker frees up within 60s
+            Exception: If the worker reports an error for this job (a timeout does NOT raise)
+        """
+        if not self._running:
+            raise RuntimeError("Worker pool not running. Call start() first.")
+
+        # Get an available worker
+        try:
+            worker_id = await asyncio.wait_for(self.available_workers.get(), timeout=60)
+        except asyncio.TimeoutError:
+            raise RuntimeError("No workers available within 60s")
+
+        logger.info(f"Dispatching crawl {request.job_id} to worker {worker_id}")
+
+        # Create future for result
+        loop = asyncio.get_event_loop()
+        future = loop.create_future()
+
+        # Track pending job
+        self.pending_jobs[request.job_id] = PendingJob(
+            request=request,
+            future=future,
+            on_progress=on_progress,
+        )
+
+        # Send request to worker
+        msg = WorkerMessage.crawl_request(request)
+        self.request_queues[worker_id].put(msg.to_dict())
+
+        try:
+            # Wait for result
+            result = await asyncio.wait_for(future, timeout=timeout)
+
+            # Mark worker as available again
+            await self.available_workers.put(worker_id)
+
+            return result
+
+        except asyncio.TimeoutError:
+            # Remove from pending
+            self.pending_jobs.pop(request.job_id, None)
+
+            # Worker might be stuck - don't return it to pool
+            logger.error(f"Crawl {request.job_id} timed out after {timeout}s")  # NOTE(review): pool capacity permanently shrinks by one here unless the worker later reports READY
+
+            return CrawlResult(
+                job_id=request.job_id,
+                status=CrawlStatus.FAILED,
+                pages_crawled=0,
+                pages_failed=0,
+                fatal_error=f"Crawl timed out after {timeout}s",
+            )
+
+        except Exception:
+            # Remove from pending
+            self.pending_jobs.pop(request.job_id, None)
+
+            # Return worker to pool (it might still be healthy)
+            await self.available_workers.put(worker_id)
+
+            raise
+
+    async def shutdown(self, timeout: float = 10):
+        """
+        Shutdown the worker pool.
+
+        Args:
+            timeout: Maximum time to wait for workers to exit gracefully
+        """
+        if not self._running:
+            return
+
+        logger.info("Shutting down worker pool")
+        self._running = False  # stops the _process_results loop on its next iteration
+
+        # Cancel result processor
+        if self._result_processor_task:
+            self._result_processor_task.cancel()
+            try:
+                await self._result_processor_task
+            except asyncio.CancelledError:
+                pass
+
+        # Send shutdown signal to all workers
+        for worker_id, request_queue in self.request_queues.items():
+            try:
+                request_queue.put(WorkerMessage.shutdown().to_dict())
+            except Exception as e:
+                logger.warning(f"Error sending shutdown to worker {worker_id}: {e}")
+
+        # Wait for workers to exit
+        for worker_id, process in self.workers.items():
+            process.join(timeout=timeout)
+            if process.is_alive():
+                logger.warning(f"Worker {worker_id} did not exit gracefully, terminating")
+                process.terminate()
+                process.join(timeout=2)
+
+        # Cleanup
+        self.workers.clear()
+        self.request_queues.clear()
+        self.pending_jobs.clear()  # NOTE(review): any still-pending futures are dropped unresolved — their awaiters hang unless they passed a timeout
+
+        logger.info("Worker pool shutdown complete")
+
+    @property
+    def active_crawls(self) -> int:
+        """Number of crawls currently in progress."""
+        return len(self.pending_jobs)
+
+    @property
+    def available_worker_count(self) -> int:
+        """Number of workers available for new crawls."""
+        return self.available_workers.qsize()
+
+
+# Global pool instance (lazy initialized)
+_pool: Optional[ScrapyWorkerPool] = None  # managed only via get_worker_pool() / shutdown_worker_pool()
+
+
+async def get_worker_pool() -> ScrapyWorkerPool:
+    """
+    Get the global worker pool instance.
+
+    Creates and starts the pool if it doesn't exist.
+    """
+    global _pool
+
+    if _pool is None:  # NOTE(review): _pool is assigned before start() completes, so a concurrent caller can receive a pool that is not yet running (its crawl() would raise RuntimeError) — confirm single-caller usage
+        _pool = ScrapyWorkerPool()
+        await _pool.start()
+
+    return _pool
+
+
+async def shutdown_worker_pool():
+    """Shutdown the global worker pool."""
+    global _pool
+
+    if _pool is not None:
+        await _pool.shutdown()
+        _pool = None  # allows a fresh pool to be created by the next get_worker_pool() call
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/worker_types.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/worker_types.py
new file mode 100644
index 000000000..15602e6ba
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/src/ingestors/webloader/loader/worker_types.py
@@ -0,0 +1,177 @@
+"""
+IPC message types for Scrapy worker subprocess communication.
+
+These dataclasses define the messages passed between the main ingestor process
+and the Scrapy worker subprocess via multiprocessing queues.
+"""
+
+from dataclasses import dataclass, field
+from enum import Enum
+
+
+class MessageType(str, Enum):  # str mixin: the plain string value survives the IPC round-trip (see WorkerMessage.to_dict/from_dict)
+    """Types of messages sent between main process and worker."""
+
+    # Main -> Worker
+    CRAWL_REQUEST = "crawl_request"
+    SHUTDOWN = "shutdown"
+
+    # Worker -> Main
+    CRAWL_STARTED = "crawl_started"
+    CRAWL_PROGRESS = "crawl_progress"
+    CRAWL_RESULT = "crawl_result"
+    WORKER_READY = "worker_ready"
+    WORKER_ERROR = "worker_error"
+
+
+class CrawlStatus(str, Enum):  # str mixin: serialized as its plain value and reconstructed via CrawlStatus(value)
+    """Status of a crawl operation."""
+
+    SUCCESS = "success"
+    FAILED = "failed"
+    PARTIAL = "partial"  # Some pages failed
+
+
+@dataclass
+class CrawlRequest:
+    """
+    Request to crawl a URL.
+
+    Sent from main process to worker.
+    """
+
+    job_id: str  # correlates this request with progress/result messages
+    url: str
+    datasource_id: str
+
+    # Scrapy settings (serialized as dict for IPC)
+    crawl_mode: str  # "single", "sitemap", "recursive"
+    max_depth: int = 2
+    max_pages: int = 100
+    render_javascript: bool = False  # when True, requests are routed through Playwright
+    wait_for_selector: str | None = None
+    page_load_timeout: int = 30
+    follow_external_links: bool = False
+    allowed_url_patterns: list[str] | None = None  # regexes (re.search); None = allow all
+    denied_url_patterns: list[str] | None = None  # regexes (re.search); None = deny none
+    download_delay: float = 0.5  # seconds between requests (Scrapy DOWNLOAD_DELAY semantics)
+    concurrent_requests: int = 8
+    respect_robots_txt: bool = True
+    user_agent: str | None = None
+
+    # Metadata for document creation
+    ingestor_id: str = ""
+    datasource_name: str = ""
+
+
+@dataclass
+class CrawlProgress:
+    """
+    Progress update during a crawl.
+
+    Sent from worker to main process periodically.
+    """
+
+    job_id: str
+    pages_crawled: int
+    pages_failed: int
+    current_url: str | None = None
+    message: str = ""  # free-form human-readable status text
+    # For progress bar calculation
+    total_pages: int | None = None  # Known total (e.g., from sitemap); None when unknown
+    queue_size: int = 0  # Pending URLs in queue (for recursive mode)
+
+
+@dataclass
+class CrawlResult:
+    """
+    Final result of a crawl operation.
+
+    Sent from worker to main process when crawl completes.
+    """
+
+    job_id: str
+    status: CrawlStatus  # SUCCESS / FAILED / PARTIAL
+    pages_crawled: int
+    pages_failed: int
+    documents: list[dict] = field(default_factory=list)  # Serialized Document objects
+    fatal_error: str | None = None  # Fatal error message when entire crawl fails
+    errors: list[str] = field(default_factory=list)  # Individual page errors (for partial failures)
+    elapsed_seconds: float = 0.0
+
+    # Filtering stats for debugging failed crawls
+    urls_found_in_sitemap: int = 0
+    urls_filtered_external: int = 0
+    urls_filtered_pattern: int = 0
+    urls_filtered_max_pages: int = 0
+
+
+@dataclass
+class WorkerMessage:
+    """
+    Wrapper for all worker messages with type discrimination.
+
+    This is the actual message format sent through queues.
+    """
+
+    type: MessageType
+    payload: dict = field(default_factory=dict)
+
+    def to_dict(self) -> dict:
+        """Serialize to dict for queue transport."""
+        return {"type": self.type.value, "payload": self.payload}
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "WorkerMessage":
+        """Deserialize from dict."""
+        return cls(type=MessageType(data["type"]), payload=data.get("payload", {}))  # raises KeyError/ValueError on malformed messages
+
+    @classmethod
+    def crawl_request(cls, request: CrawlRequest) -> "WorkerMessage":
+        """Create a crawl request message."""
+        return cls(type=MessageType.CRAWL_REQUEST, payload=request.__dict__)  # aliases the live __dict__ (no copy); the mp queue pickles it on put()
+
+    @classmethod
+    def shutdown(cls) -> "WorkerMessage":
+        """Create a shutdown message."""
+        return cls(type=MessageType.SHUTDOWN)
+
+    @classmethod
+    def crawl_started(cls, job_id: str) -> "WorkerMessage":
+        """Create a crawl started message."""
+        return cls(type=MessageType.CRAWL_STARTED, payload={"job_id": job_id})
+
+    @classmethod
+    def crawl_progress(cls, progress: CrawlProgress) -> "WorkerMessage":
+        """Create a progress update message."""
+        return cls(type=MessageType.CRAWL_PROGRESS, payload=progress.__dict__)
+
+    @classmethod
+    def crawl_result(cls, result: CrawlResult) -> "WorkerMessage":
+        """Create a crawl result message."""
+        payload = {
+            "job_id": result.job_id,
+            "status": result.status.value,  # enum flattened to its string value for transport
+            "pages_crawled": result.pages_crawled,
+            "pages_failed": result.pages_failed,
+            "documents": result.documents,
+            "fatal_error": result.fatal_error,
+            "errors": result.errors,
+            "elapsed_seconds": result.elapsed_seconds,
+            # Filtering stats for debugging
+            "urls_found_in_sitemap": result.urls_found_in_sitemap,
+            "urls_filtered_external": result.urls_filtered_external,
+            "urls_filtered_pattern": result.urls_filtered_pattern,
+            "urls_filtered_max_pages": result.urls_filtered_max_pages,
+        }
+        return cls(type=MessageType.CRAWL_RESULT, payload=payload)
+
+    @classmethod
+    def worker_ready(cls, worker_id: int) -> "WorkerMessage":
+        """Create a worker ready message."""
+        return cls(type=MessageType.WORKER_READY, payload={"worker_id": worker_id})
+
+    @classmethod
+    def worker_error(cls, error: str, job_id: str | None = None) -> "WorkerMessage":
+        """Create a worker error message."""
+        return cls(type=MessageType.WORKER_ERROR, payload={"error": error, "job_id": job_id})
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/__init__.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/__init__.py
new file mode 100644
index 000000000..0a788d088
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/__init__.py
@@ -0,0 +1 @@
+# Ingestors tests
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/__init__.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/__init__.py
new file mode 100644
index 000000000..c451e3d76
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/__init__.py
@@ -0,0 +1 @@
+# Webloader tests
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_parsers.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_parsers.py
new file mode 100644
index 000000000..14cec715f
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_parsers.py
@@ -0,0 +1,283 @@
+"""
+Simple tests for the web content parsers.
+
+These tests use mock Scrapy Response objects to verify that parsers
+correctly detect and extract content from different documentation sites.
+"""
+
+from scrapy.http import HtmlResponse
+
+
+# ============================================================================
+# Test Fixtures - Sample HTML for different documentation frameworks
+# ============================================================================
+
+
+def make_response(html: str, url: str = "http://example.com/docs/page") -> HtmlResponse:
+    """Create a mock HtmlResponse from HTML string."""
+    return HtmlResponse(url=url, body=html.encode("utf-8"))  # body must be bytes; encoding left for HtmlResponse to infer
+
+
+# Sample Docusaurus HTML
+DOCUSAURUS_HTML = """
+
+
+
+
+
+ Getting Started | My Docs
+
+
+
+
+ Getting Started
+ Welcome to the documentation.
+ This is the main content of the page.
+
+
+
+
+"""
+
+# Sample MkDocs HTML
+MKDOCS_HTML = """
+
+
+
+
+
+ Welcome - My MkDocs Site
+
+
+
+
+
+
+ Welcome
+ This is MkDocs content.
+ More documentation text here.
+
+
+
+
+
+"""
+
+# Sample generic HTML (no specific framework)
+GENERIC_HTML = """
+
+
+
+
+ Welcome Page
+
+
+
+
+
+ Welcome
+ This is the main content.
+ More text content here.
+
+
+
+
+"""
+
+# Sample Sphinx HTML
+SPHINX_HTML = """
+
+
+
+
+
+ Introduction — My Project documentation
+
+
+
+
+
+
+ Introduction
+ This is Sphinx documentation content.
+
+
+
+
+
+"""
+
+
+# ============================================================================
+# Parser Detection Tests
+# ============================================================================
+
+
+class TestParserDetection:
+    """Tests for parser can_parse() detection logic."""
+
+    def test_docusaurus_detected_by_generator(self):
+        """Docusaurus parser should detect pages with Docusaurus generator meta tag."""
+        from ingestors.webloader.loader.parsers.docusaurus import DocusaurusParser
+
+        response = make_response(DOCUSAURUS_HTML)
+        assert DocusaurusParser.can_parse(response) is True  # can_parse is invoked on the class itself — no instance needed
+
+    def test_mkdocs_detected_by_generator(self):
+        """MkDocs parser should detect pages with mkdocs generator meta tag."""
+        from ingestors.webloader.loader.parsers.mkdocs import MkDocsParser
+
+        response = make_response(MKDOCS_HTML)
+        assert MkDocsParser.can_parse(response) is True
+
+    def test_sphinx_detected_by_generator(self):
+        """Sphinx parser should detect pages with Sphinx generator meta tag."""
+        from ingestors.webloader.loader.parsers.sphinx import SphinxParser
+
+        response = make_response(SPHINX_HTML)
+        assert SphinxParser.can_parse(response) is True
+
+    def test_generic_always_matches(self):
+        """Generic parser should always return True (it's the fallback)."""
+        from ingestors.webloader.loader.parsers.generic import GenericParser
+
+        response = make_response(GENERIC_HTML)
+        assert GenericParser.can_parse(response) is True
+
+
+# ============================================================================
+# Content Extraction Tests
+# ============================================================================
+
+
+class TestContentExtraction:
+    """Tests for parser extract() content extraction."""
+
+    def test_docusaurus_extracts_article_content(self):
+        """Docusaurus parser should extract content from article tag."""
+        from ingestors.webloader.loader.parsers.docusaurus import DocusaurusParser
+
+        response = make_response(DOCUSAURUS_HTML)
+        result = DocusaurusParser.extract(response)
+
+        assert result.title == "Getting Started | My Docs"
+        assert result.description == "This is a Docusaurus page"
+        assert "Welcome to the documentation" in result.content
+        assert "main content" in result.content
+        # Should not include navigation
+        assert "Navigation here" not in result.content
+
+    def test_mkdocs_extracts_md_content(self):
+        """MkDocs parser should extract content from md-content class."""
+        from ingestors.webloader.loader.parsers.mkdocs import MkDocsParser
+
+        response = make_response(MKDOCS_HTML)
+        result = MkDocsParser.extract(response)
+
+        assert result.title == "Welcome - My MkDocs Site"
+        assert "MkDocs content" in result.content
+        # Should not include navigation
+        assert "Navigation" not in result.content or result.content.count("Navigation") == 0  # NOTE(review): second clause is redundant — "not in" already implies count() == 0
+
+    def test_generic_extracts_main_content(self):
+        """Generic parser should extract content from main tag."""
+        from ingestors.webloader.loader.parsers.generic import GenericParser
+
+        response = make_response(GENERIC_HTML)
+        result = GenericParser.extract(response)
+
+        assert result.title == "Welcome Page"
+        assert result.description == "A generic website"
+        assert "main content" in result.content
+
+    def test_sphinx_extracts_body_content(self):
+        """Sphinx parser should extract content from document body."""
+        from ingestors.webloader.loader.parsers.sphinx import SphinxParser
+
+        response = make_response(SPHINX_HTML)
+        result = SphinxParser.extract(response)
+
+        assert "Introduction" in result.title or "Introduction" in result.content  # lenient: Sphinx titles vary by theme
+        assert "Sphinx documentation content" in result.content
+
+
+# ============================================================================
+# Metadata Extraction Tests
+# ============================================================================
+
+
+class TestMetadataExtraction:
+ """Tests for metadata extraction (title, description, language)."""
+
+ def test_extracts_language(self):
+ """Parsers should extract the html lang attribute."""
+ from ingestors.webloader.loader.parsers.generic import GenericParser
+
+ response = make_response(GENERIC_HTML)
+ result = GenericParser.extract(response)
+
+ assert result.language == "en"
+
+ def test_extracts_generator(self):
+ """Parsers should extract the generator meta tag."""
+ from ingestors.webloader.loader.parsers.docusaurus import DocusaurusParser
+
+ response = make_response(DOCUSAURUS_HTML)
+ result = DocusaurusParser.extract(response)
+
+ assert result.generator is not None
+ assert "Docusaurus" in result.generator
+
+ def test_missing_description_returns_empty(self):
+ """Missing description meta tag should return empty string."""
+ from ingestors.webloader.loader.parsers.generic import GenericParser
+
+ html = """
+
+ No Description
+ Content
+ """
+ response = make_response(html)
+ result = GenericParser.extract(response)
+
+ assert result.description == ""
+
+
+# ============================================================================
+# Registry Tests
+# ============================================================================
+
+
+class TestParserRegistry:
+    """Tests for the parser registry automatic selection."""
+
+    def test_registry_selects_docusaurus_for_docusaurus_page(self):
+        """Registry should select DocusaurusParser for Docusaurus pages."""
+        from ingestors.webloader.loader.parsers.registry import ParserRegistry
+        from ingestors.webloader.loader.parsers.docusaurus import DocusaurusParser
+
+        response = make_response(DOCUSAURUS_HTML)
+        parser = ParserRegistry.get_parser(response)
+
+        assert parser == DocusaurusParser  # registry returns the parser class, not an instance
+
+    def test_registry_selects_generic_for_unknown_pages(self):
+        """Registry should fall back to GenericParser for unknown pages."""
+        from ingestors.webloader.loader.parsers.registry import ParserRegistry
+        from ingestors.webloader.loader.parsers.generic import GenericParser
+
+        response = make_response(GENERIC_HTML)
+        parser = ParserRegistry.get_parser(response)
+
+        assert parser == GenericParser
+
+    def test_registry_parse_returns_result(self):
+        """Registry.parse() should return a ParseResult."""
+        from ingestors.webloader.loader.parsers.registry import ParserRegistry
+        from ingestors.webloader.loader.parsers.base import ParseResult
+
+        response = make_response(GENERIC_HTML)
+        result = ParserRegistry.parse(response)
+
+        assert isinstance(result, ParseResult)
+        assert result.title == "Welcome Page"
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_pipeline.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_pipeline.py
new file mode 100644
index 000000000..915723fc4
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_pipeline.py
@@ -0,0 +1,290 @@
+"""
+Simple tests for the document pipeline.
+
+These tests verify that the pipeline correctly converts ScrapedPageItem
+objects to LangChain Documents and handles validation.
+"""
+
+import pytest
+from unittest.mock import Mock, AsyncMock
+
+
+# ============================================================================
+# Test Fixtures
+# ============================================================================
+
+
+def make_mock_spider() -> Mock:
+ """Create a mock spider exposing the attributes DocumentPipeline reads in open_spider."""
+ spider = Mock()
+ spider.client = Mock()
+ spider.client.ingestor_id = "test-ingestor-123"
+ spider.client.ingest_documents = AsyncMock()  # AsyncMock so the pipeline can await it
+ spider.job_manager = Mock()
+ spider.job_manager.increment_progress = AsyncMock()  # AsyncMock: awaited once per processed item
+ spider.datasource_info = Mock()
+ spider.datasource_info.datasource_id = "test-datasource-456"
+ spider.job_id = "test-job-789"
+ return spider
+
+
+def make_scraped_item(
+ url: str = "https://example.com/page",
+ content: str = "This is the page content with enough text.",  # >= 10 chars so the pipeline keeps it
+ title: str = "Test Page",
+ description: str = "A test page description",
+ language: str = "en",
+ generator: str | None = None,
+):
+ """Build a ScrapedPageItem with sensible defaults for pipeline tests."""
+ from ingestors.webloader.loader.items import ScrapedPageItem
+
+ return ScrapedPageItem(
+ url=url,
+ content=content,
+ title=title,
+ description=description,
+ language=language,
+ generator=generator,
+ )
+
+
+# ============================================================================
+# Pipeline Initialization Tests
+# ============================================================================
+
+
+class TestPipelineInitialization:
+ """Tests for DocumentPipeline construction and open_spider wiring."""
+
+ def test_pipeline_creates_empty_batch(self):
+ """Pipeline should initialize with an empty batch."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+
+ assert pipeline.batch == []
+ assert pipeline.client is None  # client is only wired up later, in open_spider
+ assert pipeline.job_id is None
+
+ def test_open_spider_initializes_from_spider(self):
+ """open_spider should copy attributes from spider."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+
+ pipeline.open_spider(spider)
+
+ assert pipeline.client == spider.client
+ assert pipeline.job_manager == spider.job_manager
+ assert pipeline.datasource_info == spider.datasource_info
+ assert pipeline.job_id == "test-job-789"
+ assert pipeline.ingestor_id == "test-ingestor-123"  # lifted off spider.client
+
+
+# ============================================================================
+# Item Processing Tests
+# ============================================================================
+
+
+class TestItemProcessing:
+ """Tests for DocumentPipeline.process_item conversion and validation."""
+
+ @pytest.mark.asyncio
+ async def test_process_item_creates_document(self):
+ """process_item should add a Document to the batch."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ item = make_scraped_item()
+ result = await pipeline.process_item(item, spider)
+
+ # Scrapy convention: the item is returned unchanged for downstream pipelines
+ assert result == item
+
+ # Should have one document in batch
+ assert len(pipeline.batch) == 1
+
+ # Document should have correct content
+ doc = pipeline.batch[0]
+ assert doc.page_content == item.content
+ assert doc.metadata["title"] == "Test Page"
+ assert doc.metadata["description"] == "A test page description"
+
+ @pytest.mark.asyncio
+ async def test_process_item_increments_progress(self):
+ """process_item should call increment_progress on job manager."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ item = make_scraped_item()
+ await pipeline.process_item(item, spider)
+
+ spider.job_manager.increment_progress.assert_called_once_with("test-job-789")
+
+ @pytest.mark.asyncio
+ async def test_drops_item_with_no_content(self):
+ """process_item should drop items with empty content."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+ from scrapy.exceptions import DropItem
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ item = make_scraped_item(content="")
+
+ with pytest.raises(DropItem):
+ await pipeline.process_item(item, spider)
+
+ # Dropped items must not reach the batch
+ assert len(pipeline.batch) == 0
+
+ @pytest.mark.asyncio
+ async def test_drops_item_with_short_content(self):
+ """process_item should drop items with content shorter than 10 chars."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+ from scrapy.exceptions import DropItem
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ item = make_scraped_item(content="Short") # Less than the 10-char minimum
+
+ with pytest.raises(DropItem):
+ await pipeline.process_item(item, spider)
+
+
+# ============================================================================
+# Batch Flushing Tests
+# ============================================================================
+
+
+class TestBatchFlushing:
+ """Tests for batch flushing on size threshold and at spider close."""
+
+ @pytest.mark.asyncio
+ async def test_flushes_batch_when_full(self):
+ """Pipeline should flush batch when batch_size is reached."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ pipeline.batch_size = 3 # Small batch so three items trigger a flush
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ # Add items up to batch size
+ for i in range(3):
+ item = make_scraped_item(
+ url=f"https://example.com/page{i}",
+ content=f"Content for page {i} with enough text",
+ )
+ await pipeline.process_item(item, spider)
+
+ # Should have called ingest_documents exactly once for the full batch
+ spider.client.ingest_documents.assert_called_once()
+
+ # Batch should be empty after flush
+ assert len(pipeline.batch) == 0
+
+ @pytest.mark.asyncio
+ async def test_close_spider_flushes_remaining(self):
+ """close_spider should flush remaining documents."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ pipeline.batch_size = 100 # Larger than the item count, so no mid-crawl flush
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ # Add a few items (less than batch_size)
+ for i in range(3):
+ item = make_scraped_item(
+ url=f"https://example.com/page{i}",
+ content=f"Content for page {i} with enough text",
+ )
+ await pipeline.process_item(item, spider)
+
+ # ingest_documents should not have been called yet
+ spider.client.ingest_documents.assert_not_called()
+
+ # Close spider
+ await pipeline.close_spider(spider)
+
+ # Now the partial batch should have been flushed
+ spider.client.ingest_documents.assert_called_once()
+
+
+# ============================================================================
+# Document Metadata Tests
+# ============================================================================
+
+
+class TestDocumentMetadata:
+ """Tests for Document metadata fields and deterministic document IDs."""
+
+ @pytest.mark.asyncio
+ async def test_document_has_correct_metadata_structure(self):
+ """Generated documents should have correct metadata structure."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ item = make_scraped_item(
+ url="https://example.com/docs/guide",
+ title="User Guide",
+ description="How to use the product",
+ language="en",
+ generator="Docusaurus v2.4",
+ )
+ await pipeline.process_item(item, spider)
+
+ doc = pipeline.batch[0]
+ metadata = doc.metadata
+
+ # Check required top-level fields (IDs come from the mock spider)
+ assert metadata["datasource_id"] == "test-datasource-456"
+ assert metadata["ingestor_id"] == "test-ingestor-123"
+ assert metadata["document_type"] == "webpage"
+ assert metadata["title"] == "User Guide"
+ assert metadata["description"] == "How to use the product"
+
+ # Check nested page-level metadata (source URL, language, generator)
+ assert metadata["metadata"]["source"] == "https://example.com/docs/guide"
+ assert metadata["metadata"]["language"] == "en"
+ assert metadata["metadata"]["generator"] == "Docusaurus v2.4"
+
+ @pytest.mark.asyncio
+ async def test_document_id_is_generated_from_url(self):
+ """Document ID should be deterministically generated from URL."""
+ from ingestors.webloader.loader.pipelines.document import DocumentPipeline
+
+ pipeline = DocumentPipeline()
+ spider = make_mock_spider()
+ pipeline.open_spider(spider)
+
+ item = make_scraped_item(url="https://example.com/docs/page")
+ await pipeline.process_item(item, spider)
+
+ doc = pipeline.batch[0]
+
+ # ID should exist and be non-empty
+ assert doc.id is not None
+ assert len(doc.id) > 0
+
+ # Same URL should generate same ID (deterministic derivation)
+ item2 = make_scraped_item(url="https://example.com/docs/page")
+ await pipeline.process_item(item2, spider)
+
+ doc2 = pipeline.batch[1]
+ assert doc.id == doc2.id
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_spiders.py b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_spiders.py
new file mode 100644
index 000000000..3db6dbc1a
--- /dev/null
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/tests/webloader/test_spiders.py
@@ -0,0 +1,293 @@
+"""
+Simple tests for the Scrapy spiders.
+
+These tests verify spider initialization, URL filtering logic,
+and basic request generation without running actual crawls.
+"""
+
+from unittest.mock import Mock
+
+
+# ============================================================================
+# Mock Objects for Testing
+# ============================================================================
+
+
+def make_mock_settings(
+ crawl_mode: str = "single",
+ max_pages: int = 100,
+ max_depth: int = 2,
+ render_javascript: bool = False,
+ follow_external_links: bool = False,
+ allowed_url_patterns: list | None = None,
+ denied_url_patterns: list | None = None,
+) -> Mock:
+ """Create a mock ScrapySettings object with the attributes spiders read."""
+ mock = Mock()
+ mock.crawl_mode = crawl_mode
+ mock.max_pages = max_pages
+ mock.max_depth = max_depth
+ mock.render_javascript = render_javascript
+ mock.follow_external_links = follow_external_links
+ mock.allowed_url_patterns = allowed_url_patterns
+ mock.denied_url_patterns = denied_url_patterns
+ mock.wait_for_selector = None
+ mock.page_load_timeout = 30
+ return mock
+
+
+def make_mock_client() -> Mock:
+ """Create a mock RAG Client (no behavior is exercised by these tests)."""
+ return Mock()
+
+
+def make_mock_job_manager() -> Mock:
+ """Create a mock JobManager (no behavior is exercised by these tests)."""
+ return Mock()
+
+
+def make_mock_datasource_info() -> Mock:
+ """Create a mock DataSourceInfo carrying only a datasource_id."""
+ mock = Mock()
+ mock.datasource_id = "test-datasource-123"
+ return mock
+
+
+# ============================================================================
+# ScrapedPageItem Tests
+# ============================================================================
+
+
+class TestScrapedPageItem:
+ """Tests for the ScrapedPageItem item class."""
+
+ def test_item_creation(self):
+ """url and content are required; other fields default to empty."""
+ from ingestors.webloader.loader.items import ScrapedPageItem
+
+ item = ScrapedPageItem(
+ url="https://example.com/page",
+ content="This is the page content.",
+ )
+
+ assert item.url == "https://example.com/page"
+ assert item.content == "This is the page content."
+ assert item.title == "" # defaults to empty string
+ assert item.description == "" # defaults to empty string
+
+ def test_item_with_metadata(self):
+ """Should create an item with all metadata fields."""
+ from ingestors.webloader.loader.items import ScrapedPageItem
+
+ item = ScrapedPageItem(
+ url="https://example.com/docs",
+ content="Documentation content",
+ title="My Docs",
+ description="Documentation description",
+ language="en",
+ generator="Docusaurus v2.4",
+ )
+
+ assert item.title == "My Docs"
+ assert item.description == "Documentation description"
+ assert item.language == "en"
+ assert item.generator == "Docusaurus v2.4"
+
+ def test_item_to_dict(self):
+ """to_dict should round-trip the item's fields into a plain dict."""
+ from ingestors.webloader.loader.items import ScrapedPageItem
+
+ item = ScrapedPageItem(
+ url="https://example.com",
+ content="Content",
+ title="Title",
+ )
+
+ result = item.to_dict()
+
+ assert isinstance(result, dict)
+ assert result["url"] == "https://example.com"
+ assert result["content"] == "Content"
+ assert result["title"] == "Title"
+
+
+# ============================================================================
+# URL Filtering Tests
+# ============================================================================
+
+
+class TestUrlFiltering:
+ """Tests for BaseWebSpider.should_follow_url filtering rules."""
+
+ def test_blocks_external_links_by_default(self):
+ """Spider should block external links when follow_external_links=False."""
+ from ingestors.webloader.loader.spiders.base import BaseWebSpider
+
+ spider = BaseWebSpider(
+ start_url="https://docs.example.com/guide",
+ scrape_settings=make_mock_settings(follow_external_links=False),
+ job_id="test-job",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ # Same domain as start_url should be allowed
+ assert spider.should_follow_url("https://docs.example.com/other-page") is True
+
+ # Different domain should be blocked
+ assert spider.should_follow_url("https://other-site.com/page") is False
+
+ def test_allows_external_links_when_enabled(self):
+ """Spider should allow external links when follow_external_links=True."""
+ from ingestors.webloader.loader.spiders.base import BaseWebSpider
+
+ spider = BaseWebSpider(
+ start_url="https://docs.example.com/guide",
+ scrape_settings=make_mock_settings(follow_external_links=True),
+ job_id="test-job",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ assert spider.should_follow_url("https://other-site.com/page") is True
+
+ def test_respects_allowed_patterns(self):
+ """Spider should only follow URLs matching allowed patterns."""
+ from ingestors.webloader.loader.spiders.base import BaseWebSpider
+
+ spider = BaseWebSpider(
+ start_url="https://docs.example.com/",
+ scrape_settings=make_mock_settings(allowed_url_patterns=[r"/docs/", r"/api/"]),
+ job_id="test-job",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ # Matches the /docs/ pattern
+ assert spider.should_follow_url("https://docs.example.com/docs/getting-started") is True
+ # Matches the /api/ pattern
+ assert spider.should_follow_url("https://docs.example.com/api/reference") is True
+ # Doesn't match any allowed pattern
+ assert spider.should_follow_url("https://docs.example.com/blog/post") is False
+
+ def test_respects_denied_patterns(self):
+ """Spider should skip URLs matching denied patterns."""
+ from ingestors.webloader.loader.spiders.base import BaseWebSpider
+
+ spider = BaseWebSpider(
+ start_url="https://docs.example.com/",
+ scrape_settings=make_mock_settings(denied_url_patterns=[r"/blog/", r"\.pdf$"]),
+ job_id="test-job",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ # Should be blocked by the /blog/ pattern
+ assert spider.should_follow_url("https://docs.example.com/blog/post") is False
+ # Should be blocked by the \.pdf$ pattern
+ assert spider.should_follow_url("https://docs.example.com/files/doc.pdf") is False
+ # Should be allowed (matches no denied pattern)
+ assert spider.should_follow_url("https://docs.example.com/docs/page") is True
+
+ def test_respects_max_pages_limit(self):
+ """Spider should stop following URLs when max_pages is reached."""
+ from ingestors.webloader.loader.spiders.base import BaseWebSpider
+
+ spider = BaseWebSpider(
+ start_url="https://docs.example.com/",
+ scrape_settings=make_mock_settings(max_pages=10),
+ job_id="test-job",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ # Initially should allow
+ assert spider.should_follow_url("https://docs.example.com/page") is True
+
+ # Simulate having crawled max_pages (10) pages
+ spider.pages_crawled = 10
+
+ # Should now block
+ assert spider.should_follow_url("https://docs.example.com/page") is False
+
+
+# ============================================================================
+# Spider Initialization Tests
+# ============================================================================
+
+
+class TestSpiderInitialization:
+ """Tests for spider constructors and spider names."""
+
+ def test_base_spider_stores_settings(self):
+ """Base spider should store all provided settings."""
+ from ingestors.webloader.loader.spiders.base import BaseWebSpider
+
+ settings = make_mock_settings(max_pages=500)
+ client = make_mock_client()
+ job_manager = make_mock_job_manager()
+ datasource_info = make_mock_datasource_info()
+
+ spider = BaseWebSpider(
+ start_url="https://example.com",
+ scrape_settings=settings,
+ job_id="test-123",
+ client=client,
+ job_manager=job_manager,
+ datasource_info=datasource_info,
+ )
+
+ assert spider.start_url == "https://example.com"
+ assert spider.job_id == "test-123"
+ assert spider.max_pages == 500  # copied from scrape_settings
+ assert spider.pages_crawled == 0  # crawl counter starts at zero
+
+ def test_single_url_spider_name(self):
+ """SingleUrlSpider should have correct name."""
+ from ingestors.webloader.loader.spiders.single_url import SingleUrlSpider
+
+ spider = SingleUrlSpider(
+ start_url="https://example.com",
+ scrape_settings=make_mock_settings(),
+ job_id="test",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ assert spider.name == "single_url_spider"
+
+ def test_sitemap_spider_name(self):
+ """SitemapCrawlSpider should have correct name."""
+ from ingestors.webloader.loader.spiders.sitemap import SitemapCrawlSpider
+
+ spider = SitemapCrawlSpider(
+ start_url="https://example.com",
+ scrape_settings=make_mock_settings(),
+ job_id="test",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ assert spider.name == "sitemap_spider"
+
+ def test_recursive_spider_name(self):
+ """RecursiveCrawlSpider should have correct name."""
+ from ingestors.webloader.loader.spiders.recursive import RecursiveCrawlSpider
+
+ spider = RecursiveCrawlSpider(
+ start_url="https://example.com",
+ scrape_settings=make_mock_settings(),
+ job_id="test",
+ client=make_mock_client(),
+ job_manager=make_mock_job_manager(),
+ datasource_info=make_mock_datasource_info(),
+ )
+
+ assert spider.name == "recursive_spider"
diff --git a/ai_platform_engineering/knowledge_bases/rag/ingestors/uv.lock b/ai_platform_engineering/knowledge_bases/rag/ingestors/uv.lock
index bc058dbb2..20b09f7d7 100644
--- a/ai_platform_engineering/knowledge_bases/rag/ingestors/uv.lock
+++ b/ai_platform_engineering/knowledge_bases/rag/ingestors/uv.lock
@@ -143,6 +143,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" },
]
+[[package]]
+name = "automat"
+version = "25.4.16"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e3/0f/d40bbe294bbf004d436a8bcbcfaadca8b5140d39ad0ad3d73d1a8ba15f14/automat-25.4.16.tar.gz", hash = "sha256:0017591a5477066e90d26b0e696ddc143baafd87b588cfac8100bc6be9634de0", size = 129977, upload-time = "2025-04-16T20:12:16.002Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/02/ff/1175b0b7371e46244032d43a56862d0af455823b5280a50c63d99cc50f18/automat-25.4.16-py3-none-any.whl", hash = "sha256:04e9bce696a8d5671ee698005af6e5a9fa15354140a87f4870744604dcdd3ba1", size = 42842, upload-time = "2025-04-16T20:12:14.447Z" },
+]
+
[[package]]
name = "beautifulsoup4"
version = "4.14.2"
@@ -158,30 +167,30 @@ wheels = [
[[package]]
name = "boto3"
-version = "1.40.74"
+version = "1.42.45"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "botocore" },
{ name = "jmespath" },
{ name = "s3transfer" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/a2/37/0db5fc46548b347255310893f1a47971a1d8eb0dbc46dfb5ace8a1e7d45e/boto3-1.40.74.tar.gz", hash = "sha256:484e46bf394b03a7c31b34f90945ebe1390cb1e2ac61980d128a9079beac87d4", size = 111592, upload-time = "2025-11-14T20:29:10.991Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/9e/e6/8fdd78825de6d8086aa3097955f83d8db3c5a3868b73da233c49977a7444/boto3-1.42.45.tar.gz", hash = "sha256:4db50b8b39321fab87ff7f40ab407887d436d004c1f2b0dfdf56e42b4884709b", size = 112846, upload-time = "2026-02-09T21:50:14.925Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/d2/08/c52751748762901c0ca3c3019e3aa950010217f0fdf9940ebe68e6bb2f5a/boto3-1.40.74-py3-none-any.whl", hash = "sha256:41fc8844b37ae27b24bcabf8369769df246cc12c09453988d0696ad06d6aa9ef", size = 139360, upload-time = "2025-11-14T20:29:09.477Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/e0/d59a178799412cfe38c2757d6e49c337a5e71b18cdc3641dd6d9daf52151/boto3-1.42.45-py3-none-any.whl", hash = "sha256:5074e074a718a6f3c2b519cbb9ceab258f17b331a143d23351d487984f2a412f", size = 140604, upload-time = "2026-02-09T21:50:13.113Z" },
]
[[package]]
name = "botocore"
-version = "1.40.74"
+version = "1.42.45"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jmespath" },
{ name = "python-dateutil" },
{ name = "urllib3" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/81/dc/0412505f05286f282a75bb0c650e525ddcfaf3f6f1a05cd8e99d32a2db06/botocore-1.40.74.tar.gz", hash = "sha256:57de0b9ffeada06015b3c7e5186c77d0692b210d9e5efa294f3214df97e2f8ee", size = 14452479, upload-time = "2025-11-14T20:29:00.949Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7a/b1/c36ad705d67bb935eac3085052b5dc03ec22d5ac12e7aedf514f3d76cac8/botocore-1.42.45.tar.gz", hash = "sha256:40b577d07b91a0ed26879da9e4658d82d3a400382446af1014d6ad3957497545", size = 14941217, upload-time = "2026-02-09T21:50:01.966Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/7d/a2/306dec16e3c84f3ca7aaead0084358c1c7fbe6501f6160844cbc93bc871e/botocore-1.40.74-py3-none-any.whl", hash = "sha256:f39f5763e35e75f0bd91212b7b36120b1536203e8003cd952ef527db79702b15", size = 14117911, upload-time = "2025-11-14T20:28:58.153Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/ec/6681b8e4884f8663d7650220e702c503e4ba6bd09a5b91d44803b0b1d0a8/botocore-1.42.45-py3-none-any.whl", hash = "sha256:a5ea5d1b7c46c2d5d113879e45b21eaf7d60dc865f4bcb46dfcf0703fe3429f4", size = 14615557, upload-time = "2026-02-09T21:49:57.066Z" },
]
[[package]]
@@ -202,6 +211,51 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" },
]
+[[package]]
+name = "cffi"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pycparser", marker = "implementation_name != 'PyPy'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" },
+ { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" },
+ { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" },
+ { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
+ { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
+ { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
+ { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
+ { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
+ { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
+ { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
+ { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
+ { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
+]
+
[[package]]
name = "charset-normalizer"
version = "3.4.4"
@@ -257,12 +311,11 @@ wheels = [
[[package]]
name = "cohere"
-version = "5.20.0"
+version = "5.20.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "fastavro" },
{ name = "httpx" },
- { name = "httpx-sse" },
{ name = "pydantic" },
{ name = "pydantic-core" },
{ name = "requests" },
@@ -270,9 +323,9 @@ dependencies = [
{ name = "types-requests" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/0e5dcfa9d111b82de4f3c7d83fbc92f478d229c8a004cc63c321fe44bb42/cohere-5.20.0.tar.gz", hash = "sha256:fb5ad5afa47447dd7eb090ad29bdb3a8181b0e758a3b03ba6ed8ca48d68d11a7", size = 168600, upload-time = "2025-10-24T20:24:05.903Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/35/37/65af9c50b5d5772a5528c6a949799f98ae286b8ebb924e0fac0619b3ae88/cohere-5.20.4.tar.gz", hash = "sha256:3b3017048ff5d5b4f113180947d538ca3d0f274de5875f0345be4c8cb3d5119a", size = 180737, upload-time = "2026-02-05T14:47:54.639Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/69/5c/e312678fb4dff827c748980ec18918307d25e39ce006c84f7c6b32bc5641/cohere-5.20.0-py3-none-any.whl", hash = "sha256:a95f17ed22be3f978363703beb6008b55000ce0e85124ddb976fa5b688014fea", size = 303306, upload-time = "2025-10-24T20:24:04.237Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/14/5c5077c6b01aed7a18dfc5ab775a35c7a6cb118e5bc1dafcfc06abdb9d9e/cohere-5.20.4-py3-none-any.whl", hash = "sha256:9cc6ebb0adac3d9f96ac0efffde6a2484534fb0aec3684a62c250d49da958f29", size = 318987, upload-time = "2026-02-05T14:47:53.505Z" },
]
[[package]]
@@ -292,7 +345,6 @@ dependencies = [
{ name = "cymple" },
{ name = "langchain-aws" },
{ name = "langchain-cohere" },
- { name = "langchain-huggingface" },
{ name = "langchain-ollama" },
{ name = "langchain-openai" },
{ name = "neo4j" },
@@ -305,13 +357,85 @@ requires-dist = [
{ name = "cymple", specifier = ">=0.12.0" },
{ name = "langchain-aws", specifier = ">=0.2.24" },
{ name = "langchain-cohere", specifier = ">=0.3.0" },
- { name = "langchain-huggingface", specifier = ">=0.3.0" },
+ { name = "langchain-huggingface", marker = "extra == 'huggingface'", specifier = ">=0.3.0" },
{ name = "langchain-ollama", specifier = ">=0.3.0" },
{ name = "langchain-openai", specifier = ">=0.3.18" },
{ name = "neo4j", specifier = ">=5.28.1" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "redis", specifier = ">=6.2.0" },
]
+provides-extras = ["huggingface"]
+
+[[package]]
+name = "constantly"
+version = "23.10.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4d/6f/cb2a94494ff74aa9528a36c5b1422756330a75a8367bf20bd63171fc324d/constantly-23.10.4.tar.gz", hash = "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd", size = 13300, upload-time = "2023-10-28T23:18:24.316Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b8/40/c199d095151addf69efdb4b9ca3a4f20f70e20508d6222bffb9b76f58573/constantly-23.10.4-py3-none-any.whl", hash = "sha256:3fd9b4d1c3dc1ec9757f3c52aef7e53ad9323dbe39f51dfd4c43853b68dfa3f9", size = 13547, upload-time = "2023-10-28T23:18:23.038Z" },
+]
+
+[[package]]
+name = "cryptography"
+version = "46.0.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/78/19/f748958276519adf6a0c1e79e7b8860b4830dda55ccdf29f2719b5fc499c/cryptography-46.0.4.tar.gz", hash = "sha256:bfd019f60f8abc2ed1b9be4ddc21cfef059c841d86d710bb69909a688cbb8f59", size = 749301, upload-time = "2026-01-28T00:24:37.379Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8d/99/157aae7949a5f30d51fcb1a9851e8ebd5c74bf99b5285d8bb4b8b9ee641e/cryptography-46.0.4-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:281526e865ed4166009e235afadf3a4c4cba6056f99336a99efba65336fd5485", size = 7173686, upload-time = "2026-01-28T00:23:07.515Z" },
+ { url = "https://files.pythonhosted.org/packages/87/91/874b8910903159043b5c6a123b7e79c4559ddd1896e38967567942635778/cryptography-46.0.4-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f14fba5bf6f4390d7ff8f086c566454bff0411f6d8aa7af79c88b6f9267aecc", size = 4275871, upload-time = "2026-01-28T00:23:09.439Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/35/690e809be77896111f5b195ede56e4b4ed0435b428c2f2b6d35046fbb5e8/cryptography-46.0.4-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47bcd19517e6389132f76e2d5303ded6cf3f78903da2158a671be8de024f4cd0", size = 4423124, upload-time = "2026-01-28T00:23:11.529Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/5b/a26407d4f79d61ca4bebaa9213feafdd8806dc69d3d290ce24996d3cfe43/cryptography-46.0.4-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:01df4f50f314fbe7009f54046e908d1754f19d0c6d3070df1e6268c5a4af09fa", size = 4277090, upload-time = "2026-01-28T00:23:13.123Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/d8/4bb7aec442a9049827aa34cee1aa83803e528fa55da9a9d45d01d1bb933e/cryptography-46.0.4-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5aa3e463596b0087b3da0dbe2b2487e9fc261d25da85754e30e3b40637d61f81", size = 4947652, upload-time = "2026-01-28T00:23:14.554Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/08/f83e2e0814248b844265802d081f2fac2f1cbe6cd258e72ba14ff006823a/cryptography-46.0.4-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0a9ad24359fee86f131836a9ac3bffc9329e956624a2d379b613f8f8abaf5255", size = 4455157, upload-time = "2026-01-28T00:23:16.443Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/05/19d849cf4096448779d2dcc9bb27d097457dac36f7273ffa875a93b5884c/cryptography-46.0.4-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:dc1272e25ef673efe72f2096e92ae39dea1a1a450dd44918b15351f72c5a168e", size = 3981078, upload-time = "2026-01-28T00:23:17.838Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/89/f7bac81d66ba7cde867a743ea5b37537b32b5c633c473002b26a226f703f/cryptography-46.0.4-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:de0f5f4ec8711ebc555f54735d4c673fc34b65c44283895f1a08c2b49d2fd99c", size = 4276213, upload-time = "2026-01-28T00:23:19.257Z" },
+ { url = "https://files.pythonhosted.org/packages/da/9f/7133e41f24edd827020ad21b068736e792bc68eecf66d93c924ad4719fb3/cryptography-46.0.4-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:eeeb2e33d8dbcccc34d64651f00a98cb41b2dc69cef866771a5717e6734dfa32", size = 4912190, upload-time = "2026-01-28T00:23:21.244Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/f7/6d43cbaddf6f65b24816e4af187d211f0bc536a29961f69faedc48501d8e/cryptography-46.0.4-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:3d425eacbc9aceafd2cb429e42f4e5d5633c6f873f5e567077043ef1b9bbf616", size = 4454641, upload-time = "2026-01-28T00:23:22.866Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/4f/ebd0473ad656a0ac912a16bd07db0f5d85184924e14fc88feecae2492834/cryptography-46.0.4-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:91627ebf691d1ea3976a031b61fb7bac1ccd745afa03602275dda443e11c8de0", size = 4405159, upload-time = "2026-01-28T00:23:25.278Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/f7/7923886f32dc47e27adeff8246e976d77258fd2aa3efdd1754e4e323bf49/cryptography-46.0.4-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2d08bc22efd73e8854b0b7caff402d735b354862f1145d7be3b9c0f740fef6a0", size = 4666059, upload-time = "2026-01-28T00:23:26.766Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/a7/0fca0fd3591dffc297278a61813d7f661a14243dd60f499a7a5b48acb52a/cryptography-46.0.4-cp311-abi3-win32.whl", hash = "sha256:82a62483daf20b8134f6e92898da70d04d0ef9a75829d732ea1018678185f4f5", size = 3026378, upload-time = "2026-01-28T00:23:28.317Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/12/652c84b6f9873f0909374864a57b003686c642ea48c84d6c7e2c515e6da5/cryptography-46.0.4-cp311-abi3-win_amd64.whl", hash = "sha256:6225d3ebe26a55dbc8ead5ad1265c0403552a63336499564675b29eb3184c09b", size = 3478614, upload-time = "2026-01-28T00:23:30.275Z" },
+ { url = "https://files.pythonhosted.org/packages/b9/27/542b029f293a5cce59349d799d4d8484b3b1654a7b9a0585c266e974a488/cryptography-46.0.4-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:485e2b65d25ec0d901bca7bcae0f53b00133bf3173916d8e421f6fddde103908", size = 7116417, upload-time = "2026-01-28T00:23:31.958Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/f5/559c25b77f40b6bf828eabaf988efb8b0e17b573545edb503368ca0a2a03/cryptography-46.0.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:078e5f06bd2fa5aea5a324f2a09f914b1484f1d0c2a4d6a8a28c74e72f65f2da", size = 4264508, upload-time = "2026-01-28T00:23:34.264Z" },
+ { url = "https://files.pythonhosted.org/packages/49/a1/551fa162d33074b660dc35c9bc3616fefa21a0e8c1edd27b92559902e408/cryptography-46.0.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dce1e4f068f03008da7fa51cc7abc6ddc5e5de3e3d1550334eaf8393982a5829", size = 4409080, upload-time = "2026-01-28T00:23:35.793Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/6a/4d8d129a755f5d6df1bbee69ea2f35ebfa954fa1847690d1db2e8bca46a5/cryptography-46.0.4-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:2067461c80271f422ee7bdbe79b9b4be54a5162e90345f86a23445a0cf3fd8a2", size = 4270039, upload-time = "2026-01-28T00:23:37.263Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/f5/ed3fcddd0a5e39321e595e144615399e47e7c153a1fb8c4862aec3151ff9/cryptography-46.0.4-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:c92010b58a51196a5f41c3795190203ac52edfd5dc3ff99149b4659eba9d2085", size = 4926748, upload-time = "2026-01-28T00:23:38.884Z" },
+ { url = "https://files.pythonhosted.org/packages/43/ae/9f03d5f0c0c00e85ecb34f06d3b79599f20630e4db91b8a6e56e8f83d410/cryptography-46.0.4-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:829c2b12bbc5428ab02d6b7f7e9bbfd53e33efd6672d21341f2177470171ad8b", size = 4442307, upload-time = "2026-01-28T00:23:40.56Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/22/e0f9f2dae8040695103369cf2283ef9ac8abe4d51f68710bec2afd232609/cryptography-46.0.4-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:62217ba44bf81b30abaeda1488686a04a702a261e26f87db51ff61d9d3510abd", size = 3959253, upload-time = "2026-01-28T00:23:42.827Z" },
+ { url = "https://files.pythonhosted.org/packages/01/5b/6a43fcccc51dae4d101ac7d378a8724d1ba3de628a24e11bf2f4f43cba4d/cryptography-46.0.4-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:9c2da296c8d3415b93e6053f5a728649a87a48ce084a9aaf51d6e46c87c7f2d2", size = 4269372, upload-time = "2026-01-28T00:23:44.655Z" },
+ { url = "https://files.pythonhosted.org/packages/17/b7/0f6b8c1dd0779df2b526e78978ff00462355e31c0a6f6cff8a3e99889c90/cryptography-46.0.4-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:9b34d8ba84454641a6bf4d6762d15847ecbd85c1316c0a7984e6e4e9f748ec2e", size = 4891908, upload-time = "2026-01-28T00:23:46.48Z" },
+ { url = "https://files.pythonhosted.org/packages/83/17/259409b8349aa10535358807a472c6a695cf84f106022268d31cea2b6c97/cryptography-46.0.4-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:df4a817fa7138dd0c96c8c8c20f04b8aaa1fac3bbf610913dcad8ea82e1bfd3f", size = 4441254, upload-time = "2026-01-28T00:23:48.403Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/fe/e4a1b0c989b00cee5ffa0764401767e2d1cf59f45530963b894129fd5dce/cryptography-46.0.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b1de0ebf7587f28f9190b9cb526e901bf448c9e6a99655d2b07fff60e8212a82", size = 4396520, upload-time = "2026-01-28T00:23:50.26Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/81/ba8fd9657d27076eb40d6a2f941b23429a3c3d2f56f5a921d6b936a27bc9/cryptography-46.0.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9b4d17bc7bd7cdd98e3af40b441feaea4c68225e2eb2341026c84511ad246c0c", size = 4651479, upload-time = "2026-01-28T00:23:51.674Z" },
+ { url = "https://files.pythonhosted.org/packages/00/03/0de4ed43c71c31e4fe954edd50b9d28d658fef56555eba7641696370a8e2/cryptography-46.0.4-cp314-cp314t-win32.whl", hash = "sha256:c411f16275b0dea722d76544a61d6421e2cc829ad76eec79280dbdc9ddf50061", size = 3001986, upload-time = "2026-01-28T00:23:53.485Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/70/81830b59df7682917d7a10f833c4dab2a5574cd664e86d18139f2b421329/cryptography-46.0.4-cp314-cp314t-win_amd64.whl", hash = "sha256:728fedc529efc1439eb6107b677f7f7558adab4553ef8669f0d02d42d7b959a7", size = 3468288, upload-time = "2026-01-28T00:23:55.09Z" },
+ { url = "https://files.pythonhosted.org/packages/56/f7/f648fdbb61d0d45902d3f374217451385edc7e7768d1b03ff1d0e5ffc17b/cryptography-46.0.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a9556ba711f7c23f77b151d5798f3ac44a13455cc68db7697a1096e6d0563cab", size = 7169583, upload-time = "2026-01-28T00:23:56.558Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/cc/8f3224cbb2a928de7298d6ed4790f5ebc48114e02bdc9559196bfb12435d/cryptography-46.0.4-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8bf75b0259e87fa70bddc0b8b4078b76e7fd512fd9afae6c1193bcf440a4dbef", size = 4275419, upload-time = "2026-01-28T00:23:58.364Z" },
+ { url = "https://files.pythonhosted.org/packages/17/43/4a18faa7a872d00e4264855134ba82d23546c850a70ff209e04ee200e76f/cryptography-46.0.4-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3c268a3490df22270955966ba236d6bc4a8f9b6e4ffddb78aac535f1a5ea471d", size = 4419058, upload-time = "2026-01-28T00:23:59.867Z" },
+ { url = "https://files.pythonhosted.org/packages/ee/64/6651969409821d791ba12346a124f55e1b76f66a819254ae840a965d4b9c/cryptography-46.0.4-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:812815182f6a0c1d49a37893a303b44eaac827d7f0d582cecfc81b6427f22973", size = 4278151, upload-time = "2026-01-28T00:24:01.731Z" },
+ { url = "https://files.pythonhosted.org/packages/20/0b/a7fce65ee08c3c02f7a8310cc090a732344066b990ac63a9dfd0a655d321/cryptography-46.0.4-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:a90e43e3ef65e6dcf969dfe3bb40cbf5aef0d523dff95bfa24256be172a845f4", size = 4939441, upload-time = "2026-01-28T00:24:03.175Z" },
+ { url = "https://files.pythonhosted.org/packages/db/a7/20c5701e2cd3e1dfd7a19d2290c522a5f435dd30957d431dcb531d0f1413/cryptography-46.0.4-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a05177ff6296644ef2876fce50518dffb5bcdf903c85250974fc8bc85d54c0af", size = 4451617, upload-time = "2026-01-28T00:24:05.403Z" },
+ { url = "https://files.pythonhosted.org/packages/00/dc/3e16030ea9aa47b63af6524c354933b4fb0e352257c792c4deeb0edae367/cryptography-46.0.4-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:daa392191f626d50f1b136c9b4cf08af69ca8279d110ea24f5c2700054d2e263", size = 3977774, upload-time = "2026-01-28T00:24:06.851Z" },
+ { url = "https://files.pythonhosted.org/packages/42/c8/ad93f14118252717b465880368721c963975ac4b941b7ef88f3c56bf2897/cryptography-46.0.4-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e07ea39c5b048e085f15923511d8121e4a9dc45cee4e3b970ca4f0d338f23095", size = 4277008, upload-time = "2026-01-28T00:24:08.926Z" },
+ { url = "https://files.pythonhosted.org/packages/00/cf/89c99698151c00a4631fbfcfcf459d308213ac29e321b0ff44ceeeac82f1/cryptography-46.0.4-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:d5a45ddc256f492ce42a4e35879c5e5528c09cd9ad12420828c972951d8e016b", size = 4903339, upload-time = "2026-01-28T00:24:12.009Z" },
+ { url = "https://files.pythonhosted.org/packages/03/c3/c90a2cb358de4ac9309b26acf49b2a100957e1ff5cc1e98e6c4996576710/cryptography-46.0.4-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:6bb5157bf6a350e5b28aee23beb2d84ae6f5be390b2f8ee7ea179cda077e1019", size = 4451216, upload-time = "2026-01-28T00:24:13.975Z" },
+ { url = "https://files.pythonhosted.org/packages/96/2c/8d7f4171388a10208671e181ca43cdc0e596d8259ebacbbcfbd16de593da/cryptography-46.0.4-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dd5aba870a2c40f87a3af043e0dee7d9eb02d4aff88a797b48f2b43eff8c3ab4", size = 4404299, upload-time = "2026-01-28T00:24:16.169Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/23/cbb2036e450980f65c6e0a173b73a56ff3bccd8998965dea5cc9ddd424a5/cryptography-46.0.4-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:93d8291da8d71024379ab2cb0b5c57915300155ad42e07f76bea6ad838d7e59b", size = 4664837, upload-time = "2026-01-28T00:24:17.629Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/21/f7433d18fe6d5845329cbdc597e30caf983229c7a245bcf54afecc555938/cryptography-46.0.4-cp38-abi3-win32.whl", hash = "sha256:0563655cb3c6d05fb2afe693340bc050c30f9f34e15763361cf08e94749401fc", size = 3009779, upload-time = "2026-01-28T00:24:20.198Z" },
+ { url = "https://files.pythonhosted.org/packages/3a/6a/bd2e7caa2facffedf172a45c1a02e551e6d7d4828658c9a245516a598d94/cryptography-46.0.4-cp38-abi3-win_amd64.whl", hash = "sha256:fa0900b9ef9c49728887d1576fd8d9e7e3ea872fa9b25ef9b64888adc434e976", size = 3466633, upload-time = "2026-01-28T00:24:21.851Z" },
+]
+
+[[package]]
+name = "cssselect"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/2e/cdfd8b01c37cbf4f9482eefd455853a3cf9c995029a46acd31dfaa9c1dd6/cssselect-1.4.0.tar.gz", hash = "sha256:fdaf0a1425e17dfe8c5cf66191d211b357cf7872ae8afc4c6762ddd8ac47fc92", size = 40589, upload-time = "2026-01-29T07:00:26.701Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/20/0c/7bb51e3acfafd16c48875bf3db03607674df16f5b6ef8d056586af7e2b8b/cssselect-1.4.0-py3-none-any.whl", hash = "sha256:c0ec5c0191c8ee39fcc8afc1540331d8b55b0183478c50e9c8a79d44dbceb1d8", size = 18540, upload-time = "2026-01-29T07:00:24.994Z" },
+]
[[package]]
name = "cymple"
@@ -335,6 +459,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686, upload-time = "2024-06-09T16:20:16.715Z" },
]
+[[package]]
+name = "defusedxml"
+version = "0.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
+]
+
[[package]]
name = "distro"
version = "1.9.0"
@@ -551,11 +684,11 @@ wheels = [
[[package]]
name = "fsspec"
-version = "2025.12.0"
+version = "2026.2.0"
source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/b6/27/954057b0d1f53f086f681755207dda6de6c660ce133c829158e8e8fe7895/fsspec-2025.12.0.tar.gz", hash = "sha256:c505de011584597b1060ff778bb664c1bc022e87921b0e4f10cc9c44f9635973", size = 309748, upload-time = "2025-12-03T15:23:42.687Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/51/c7/b64cae5dba3a1b138d7123ec36bb5ccd39d39939f18454407e5468f4763f/fsspec-2025.12.0-py3-none-any.whl", hash = "sha256:8bf1fe301b7d8acfa6e8571e3b1c3d158f909666642431cc78a1b7b4dbc5ec5b", size = 201422, upload-time = "2025-12-03T15:23:41.434Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" },
]
[[package]]
@@ -693,16 +826,16 @@ wheels = [
[[package]]
name = "httpx-sse"
-version = "0.4.0"
+version = "0.4.3"
source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/4c/60/8f4281fa9bbf3c8034fd54c0e7412e66edbab6bc74c4996bd616f8d0406e/httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721", size = 12624, upload-time = "2023-12-22T08:01:21.083Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload-time = "2025-10-10T21:48:22.271Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f", size = 7819, upload-time = "2023-12-22T08:01:19.89Z" },
+ { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload-time = "2025-10-10T21:48:21.158Z" },
]
[[package]]
name = "huggingface-hub"
-version = "0.36.0"
+version = "0.36.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock" },
@@ -714,9 +847,21 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" },
+]
+
+[[package]]
+name = "hyperlink"
+version = "21.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3a/51/1947bd81d75af87e3bb9e34593a4cf118115a8feb451ce7a69044ef1412e/hyperlink-21.0.0.tar.gz", hash = "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b", size = 140743, upload-time = "2021-01-08T05:51:20.972Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/aa/8caf6a0a3e62863cbb9dab27135660acba46903b703e224f14f447e57934/hyperlink-21.0.0-py2.py3-none-any.whl", hash = "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4", size = 74638, upload-time = "2021-01-08T05:51:22.906Z" },
]
[[package]]
@@ -728,6 +873,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
]
+[[package]]
+name = "incremental"
+version = "24.11.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ef/3c/82e84109e02c492f382c711c58a3dd91badda6d746def81a1465f74dc9f5/incremental-24.11.0.tar.gz", hash = "sha256:87d3480dbb083c1d736222511a8cf380012a8176c2456d01ef483242abbbcf8c", size = 24000, upload-time = "2025-11-28T02:30:17.861Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/1d/55/0f4df2a44053867ea9cbea73fc588b03c55605cd695cee0a3d86f0029cb2/incremental-24.11.0-py3-none-any.whl", hash = "sha256:a34450716b1c4341fe6676a0598e88a39e04189f4dce5dc96f656e040baa10b3", size = 21109, upload-time = "2025-11-28T02:30:16.442Z" },
+]
+
[[package]]
name = "ingestors"
version = "0.1.0"
@@ -747,9 +904,25 @@ dependencies = [
{ name = "python-dotenv" },
{ name = "redis" },
{ name = "requests" },
+ { name = "scrapy" },
{ name = "slack-sdk" },
]
+[package.optional-dependencies]
+dev = [
+ { name = "pytest" },
+ { name = "pytest-asyncio" },
+]
+playwright = [
+ { name = "scrapy-playwright" },
+]
+
+[package.dev-dependencies]
+dev = [
+ { name = "pytest" },
+ { name = "pytest-asyncio" },
+]
+
[package.metadata]
requires-dist = [
{ name = "aiohttp", specifier = ">=3.13.2" },
@@ -763,11 +936,54 @@ requires-dist = [
{ name = "langchain-text-splitters", specifier = ">=1.0.0" },
{ name = "lxml", specifier = ">=6.0.2" },
{ name = "pyjwt", specifier = ">=2.10.1" },
+ { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
+ { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "redis", specifier = ">=7.0.1" },
{ name = "requests", specifier = ">=2.32.0" },
+ { name = "scrapy", specifier = ">=2.12.0" },
+ { name = "scrapy-playwright", marker = "extra == 'playwright'", specifier = ">=0.0.43" },
{ name = "slack-sdk", specifier = ">=3.38.0" },
]
+provides-extras = ["playwright", "dev"]
+
+[package.metadata.requires-dev]
+dev = [
+ { name = "pytest", specifier = ">=9.0.2" },
+ { name = "pytest-asyncio", specifier = ">=1.3.0" },
+]
+
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
+[[package]]
+name = "itemadapter"
+version = "0.13.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/52/47/4c75c5396941e653d5f864389964da6951e8f338c6739602dd778f62333e/itemadapter-0.13.1.tar.gz", hash = "sha256:fa139c7be2aa80f8874b2f23d165d5d4aa47c4b85c54ab530b567fd5f684f1b4", size = 32343, upload-time = "2026-01-08T17:56:38.863Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/db/a6/48805cef65b13644f1c23545dc525a7051581c84f5227efb1cd9a8ac9b02/itemadapter-0.13.1-py3-none-any.whl", hash = "sha256:f3c6b1babb4fb6cca4aa9061ef0b0c25c783c24a571c30e3667e7bcfea41815b", size = 18540, upload-time = "2026-01-08T17:56:37.29Z" },
+]
+
+[[package]]
+name = "itemloaders"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "itemadapter" },
+ { name = "jmespath" },
+ { name = "parsel" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/05/bd/916f4fd26e14e6ad292b69693ccca4f192bcaf9f817ba7d6f7162dbbd835/itemloaders-1.4.0.tar.gz", hash = "sha256:b5338308a819098f43525b7afc5f7d46ba338ba4710f5ebe7a21b3b47bb29929", size = 29740, upload-time = "2026-01-29T12:50:38.04Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ac/71/d9cd0e4c6a4aace991009fc47362ce9251be0fbcf2b6c533f918b31854d5/itemloaders-1.4.0-py3-none-any.whl", hash = "sha256:202b6f855299b4cadfdf78bb93a6cf977899e3c40c4c54524e120a444e65b5ac", size = 12188, upload-time = "2026-01-29T12:50:36.148Z" },
+]
[[package]]
name = "jinja2"
@@ -783,53 +999,53 @@ wheels = [
[[package]]
name = "jiter"
-version = "0.12.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/45/9d/e0660989c1370e25848bb4c52d061c71837239738ad937e83edca174c273/jiter-0.12.0.tar.gz", hash = "sha256:64dfcd7d5c168b38d3f9f8bba7fc639edb3418abcc74f22fdbe6b8938293f30b", size = 168294, upload-time = "2025-11-09T20:49:23.302Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/3d/a6/97209693b177716e22576ee1161674d1d58029eb178e01866a0422b69224/jiter-0.12.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6cc49d5130a14b732e0612bc76ae8db3b49898732223ef8b7599aa8d9810683e", size = 313658, upload-time = "2025-11-09T20:47:44.424Z" },
- { url = "https://files.pythonhosted.org/packages/06/4d/125c5c1537c7d8ee73ad3d530a442d6c619714b95027143f1b61c0b4dfe0/jiter-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37f27a32ce36364d2fa4f7fdc507279db604d27d239ea2e044c8f148410defe1", size = 318605, upload-time = "2025-11-09T20:47:45.973Z" },
- { url = "https://files.pythonhosted.org/packages/99/bf/a840b89847885064c41a5f52de6e312e91fa84a520848ee56c97e4fa0205/jiter-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc0944aa3d4b4773e348cda635252824a78f4ba44328e042ef1ff3f6080d1cf", size = 349803, upload-time = "2025-11-09T20:47:47.535Z" },
- { url = "https://files.pythonhosted.org/packages/8a/88/e63441c28e0db50e305ae23e19c1d8fae012d78ed55365da392c1f34b09c/jiter-0.12.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da25c62d4ee1ffbacb97fac6dfe4dcd6759ebdc9015991e92a6eae5816287f44", size = 365120, upload-time = "2025-11-09T20:47:49.284Z" },
- { url = "https://files.pythonhosted.org/packages/0a/7c/49b02714af4343970eb8aca63396bc1c82fa01197dbb1e9b0d274b550d4e/jiter-0.12.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:048485c654b838140b007390b8182ba9774621103bd4d77c9c3f6f117474ba45", size = 479918, upload-time = "2025-11-09T20:47:50.807Z" },
- { url = "https://files.pythonhosted.org/packages/69/ba/0a809817fdd5a1db80490b9150645f3aae16afad166960bcd562be194f3b/jiter-0.12.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:635e737fbb7315bef0037c19b88b799143d2d7d3507e61a76751025226b3ac87", size = 379008, upload-time = "2025-11-09T20:47:52.211Z" },
- { url = "https://files.pythonhosted.org/packages/5f/c3/c9fc0232e736c8877d9e6d83d6eeb0ba4e90c6c073835cc2e8f73fdeef51/jiter-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e017c417b1ebda911bd13b1e40612704b1f5420e30695112efdbed8a4b389ed", size = 361785, upload-time = "2025-11-09T20:47:53.512Z" },
- { url = "https://files.pythonhosted.org/packages/96/61/61f69b7e442e97ca6cd53086ddc1cf59fb830549bc72c0a293713a60c525/jiter-0.12.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:89b0bfb8b2bf2351fba36bb211ef8bfceba73ef58e7f0c68fb67b5a2795ca2f9", size = 386108, upload-time = "2025-11-09T20:47:54.893Z" },
- { url = "https://files.pythonhosted.org/packages/e9/2e/76bb3332f28550c8f1eba3bf6e5efe211efda0ddbbaf24976bc7078d42a5/jiter-0.12.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f5aa5427a629a824a543672778c9ce0c5e556550d1569bb6ea28a85015287626", size = 519937, upload-time = "2025-11-09T20:47:56.253Z" },
- { url = "https://files.pythonhosted.org/packages/84/d6/fa96efa87dc8bff2094fb947f51f66368fa56d8d4fc9e77b25d7fbb23375/jiter-0.12.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed53b3d6acbcb0fd0b90f20c7cb3b24c357fe82a3518934d4edfa8c6898e498c", size = 510853, upload-time = "2025-11-09T20:47:58.32Z" },
- { url = "https://files.pythonhosted.org/packages/8a/28/93f67fdb4d5904a708119a6ab58a8f1ec226ff10a94a282e0215402a8462/jiter-0.12.0-cp313-cp313-win32.whl", hash = "sha256:4747de73d6b8c78f2e253a2787930f4fffc68da7fa319739f57437f95963c4de", size = 204699, upload-time = "2025-11-09T20:47:59.686Z" },
- { url = "https://files.pythonhosted.org/packages/c4/1f/30b0eb087045a0abe2a5c9c0c0c8da110875a1d3be83afd4a9a4e548be3c/jiter-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:e25012eb0c456fcc13354255d0338cd5397cce26c77b2832b3c4e2e255ea5d9a", size = 204258, upload-time = "2025-11-09T20:48:01.01Z" },
- { url = "https://files.pythonhosted.org/packages/2c/f4/2b4daf99b96bce6fc47971890b14b2a36aef88d7beb9f057fafa032c6141/jiter-0.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:c97b92c54fe6110138c872add030a1f99aea2401ddcdaa21edf74705a646dd60", size = 185503, upload-time = "2025-11-09T20:48:02.35Z" },
- { url = "https://files.pythonhosted.org/packages/39/ca/67bb15a7061d6fe20b9b2a2fd783e296a1e0f93468252c093481a2f00efa/jiter-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53839b35a38f56b8be26a7851a48b89bc47e5d88e900929df10ed93b95fea3d6", size = 317965, upload-time = "2025-11-09T20:48:03.783Z" },
- { url = "https://files.pythonhosted.org/packages/18/af/1788031cd22e29c3b14bc6ca80b16a39a0b10e611367ffd480c06a259831/jiter-0.12.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94f669548e55c91ab47fef8bddd9c954dab1938644e715ea49d7e117015110a4", size = 345831, upload-time = "2025-11-09T20:48:05.55Z" },
- { url = "https://files.pythonhosted.org/packages/05/17/710bf8472d1dff0d3caf4ced6031060091c1320f84ee7d5dcbed1f352417/jiter-0.12.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:351d54f2b09a41600ffea43d081522d792e81dcfb915f6d2d242744c1cc48beb", size = 361272, upload-time = "2025-11-09T20:48:06.951Z" },
- { url = "https://files.pythonhosted.org/packages/fb/f1/1dcc4618b59761fef92d10bcbb0b038b5160be653b003651566a185f1a5c/jiter-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2a5e90604620f94bf62264e7c2c038704d38217b7465b863896c6d7c902b06c7", size = 204604, upload-time = "2025-11-09T20:48:08.328Z" },
- { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" },
- { url = "https://files.pythonhosted.org/packages/a8/99/45c9f0dbe4a1416b2b9a8a6d1236459540f43d7fb8883cff769a8db0612d/jiter-0.12.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c46d927acd09c67a9fb1416df45c5a04c27e83aae969267e98fba35b74e99525", size = 312478, upload-time = "2025-11-09T20:48:10.898Z" },
- { url = "https://files.pythonhosted.org/packages/4c/a7/54ae75613ba9e0f55fcb0bc5d1f807823b5167cc944e9333ff322e9f07dd/jiter-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:774ff60b27a84a85b27b88cd5583899c59940bcc126caca97eb2a9df6aa00c49", size = 318706, upload-time = "2025-11-09T20:48:12.266Z" },
- { url = "https://files.pythonhosted.org/packages/59/31/2aa241ad2c10774baf6c37f8b8e1f39c07db358f1329f4eb40eba179c2a2/jiter-0.12.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5433fab222fb072237df3f637d01b81f040a07dcac1cb4a5c75c7aa9ed0bef1", size = 351894, upload-time = "2025-11-09T20:48:13.673Z" },
- { url = "https://files.pythonhosted.org/packages/54/4f/0f2759522719133a9042781b18cc94e335b6d290f5e2d3e6899d6af933e3/jiter-0.12.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8c593c6e71c07866ec6bfb790e202a833eeec885022296aff6b9e0b92d6a70e", size = 365714, upload-time = "2025-11-09T20:48:15.083Z" },
- { url = "https://files.pythonhosted.org/packages/dc/6f/806b895f476582c62a2f52c453151edd8a0fde5411b0497baaa41018e878/jiter-0.12.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90d32894d4c6877a87ae00c6b915b609406819dce8bc0d4e962e4de2784e567e", size = 478989, upload-time = "2025-11-09T20:48:16.706Z" },
- { url = "https://files.pythonhosted.org/packages/86/6c/012d894dc6e1033acd8db2b8346add33e413ec1c7c002598915278a37f79/jiter-0.12.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:798e46eed9eb10c3adbbacbd3bdb5ecd4cf7064e453d00dbef08802dae6937ff", size = 378615, upload-time = "2025-11-09T20:48:18.614Z" },
- { url = "https://files.pythonhosted.org/packages/87/30/d718d599f6700163e28e2c71c0bbaf6dace692e7df2592fd793ac9276717/jiter-0.12.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3f1368f0a6719ea80013a4eb90ba72e75d7ea67cfc7846db2ca504f3df0169a", size = 364745, upload-time = "2025-11-09T20:48:20.117Z" },
- { url = "https://files.pythonhosted.org/packages/8f/85/315b45ce4b6ddc7d7fceca24068543b02bdc8782942f4ee49d652e2cc89f/jiter-0.12.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65f04a9d0b4406f7e51279710b27484af411896246200e461d80d3ba0caa901a", size = 386502, upload-time = "2025-11-09T20:48:21.543Z" },
- { url = "https://files.pythonhosted.org/packages/74/0b/ce0434fb40c5b24b368fe81b17074d2840748b4952256bab451b72290a49/jiter-0.12.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:fd990541982a24281d12b67a335e44f117e4c6cbad3c3b75c7dea68bf4ce3a67", size = 519845, upload-time = "2025-11-09T20:48:22.964Z" },
- { url = "https://files.pythonhosted.org/packages/e8/a3/7a7a4488ba052767846b9c916d208b3ed114e3eb670ee984e4c565b9cf0d/jiter-0.12.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:b111b0e9152fa7df870ecaebb0bd30240d9f7fff1f2003bcb4ed0f519941820b", size = 510701, upload-time = "2025-11-09T20:48:24.483Z" },
- { url = "https://files.pythonhosted.org/packages/c3/16/052ffbf9d0467b70af24e30f91e0579e13ded0c17bb4a8eb2aed3cb60131/jiter-0.12.0-cp314-cp314-win32.whl", hash = "sha256:a78befb9cc0a45b5a5a0d537b06f8544c2ebb60d19d02c41ff15da28a9e22d42", size = 205029, upload-time = "2025-11-09T20:48:25.749Z" },
- { url = "https://files.pythonhosted.org/packages/e4/18/3cf1f3f0ccc789f76b9a754bdb7a6977e5d1d671ee97a9e14f7eb728d80e/jiter-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:e1fe01c082f6aafbe5c8faf0ff074f38dfb911d53f07ec333ca03f8f6226debf", size = 204960, upload-time = "2025-11-09T20:48:27.415Z" },
- { url = "https://files.pythonhosted.org/packages/02/68/736821e52ecfdeeb0f024b8ab01b5a229f6b9293bbdb444c27efade50b0f/jiter-0.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:d72f3b5a432a4c546ea4bedc84cce0c3404874f1d1676260b9c7f048a9855451", size = 185529, upload-time = "2025-11-09T20:48:29.125Z" },
- { url = "https://files.pythonhosted.org/packages/30/61/12ed8ee7a643cce29ac97c2281f9ce3956eb76b037e88d290f4ed0d41480/jiter-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6ded41aeba3603f9728ed2b6196e4df875348ab97b28fc8afff115ed42ba7a7", size = 318974, upload-time = "2025-11-09T20:48:30.87Z" },
- { url = "https://files.pythonhosted.org/packages/2d/c6/f3041ede6d0ed5e0e79ff0de4c8f14f401bbf196f2ef3971cdbe5fd08d1d/jiter-0.12.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a947920902420a6ada6ad51892082521978e9dd44a802663b001436e4b771684", size = 345932, upload-time = "2025-11-09T20:48:32.658Z" },
- { url = "https://files.pythonhosted.org/packages/d5/5d/4d94835889edd01ad0e2dbfc05f7bdfaed46292e7b504a6ac7839aa00edb/jiter-0.12.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:add5e227e0554d3a52cf390a7635edaffdf4f8fce4fdbcef3cc2055bb396a30c", size = 367243, upload-time = "2025-11-09T20:48:34.093Z" },
- { url = "https://files.pythonhosted.org/packages/fd/76/0051b0ac2816253a99d27baf3dda198663aff882fa6ea7deeb94046da24e/jiter-0.12.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9b1cda8fcb736250d7e8711d4580ebf004a46771432be0ae4796944b5dfa5d", size = 479315, upload-time = "2025-11-09T20:48:35.507Z" },
- { url = "https://files.pythonhosted.org/packages/70/ae/83f793acd68e5cb24e483f44f482a1a15601848b9b6f199dacb970098f77/jiter-0.12.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deeb12a2223fe0135c7ff1356a143d57f95bbf1f4a66584f1fc74df21d86b993", size = 380714, upload-time = "2025-11-09T20:48:40.014Z" },
- { url = "https://files.pythonhosted.org/packages/b1/5e/4808a88338ad2c228b1126b93fcd8ba145e919e886fe910d578230dabe3b/jiter-0.12.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c596cc0f4cb574877550ce4ecd51f8037469146addd676d7c1a30ebe6391923f", size = 365168, upload-time = "2025-11-09T20:48:41.462Z" },
- { url = "https://files.pythonhosted.org/packages/0c/d4/04619a9e8095b42aef436b5aeb4c0282b4ff1b27d1db1508df9f5dc82750/jiter-0.12.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ab4c823b216a4aeab3fdbf579c5843165756bd9ad87cc6b1c65919c4715f783", size = 387893, upload-time = "2025-11-09T20:48:42.921Z" },
- { url = "https://files.pythonhosted.org/packages/17/ea/d3c7e62e4546fdc39197fa4a4315a563a89b95b6d54c0d25373842a59cbe/jiter-0.12.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e427eee51149edf962203ff8db75a7514ab89be5cb623fb9cea1f20b54f1107b", size = 520828, upload-time = "2025-11-09T20:48:44.278Z" },
- { url = "https://files.pythonhosted.org/packages/cc/0b/c6d3562a03fd767e31cb119d9041ea7958c3c80cb3d753eafb19b3b18349/jiter-0.12.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:edb868841f84c111255ba5e80339d386d937ec1fdce419518ce1bd9370fac5b6", size = 511009, upload-time = "2025-11-09T20:48:45.726Z" },
- { url = "https://files.pythonhosted.org/packages/aa/51/2cb4468b3448a8385ebcd15059d325c9ce67df4e2758d133ab9442b19834/jiter-0.12.0-cp314-cp314t-win32.whl", hash = "sha256:8bbcfe2791dfdb7c5e48baf646d37a6a3dcb5a97a032017741dea9f817dca183", size = 205110, upload-time = "2025-11-09T20:48:47.033Z" },
- { url = "https://files.pythonhosted.org/packages/b2/c5/ae5ec83dec9c2d1af805fd5fe8f74ebded9c8670c5210ec7820ce0dbeb1e/jiter-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2fa940963bf02e1d8226027ef461e36af472dea85d36054ff835aeed944dd873", size = 205223, upload-time = "2025-11-09T20:48:49.076Z" },
- { url = "https://files.pythonhosted.org/packages/97/9a/3c5391907277f0e55195550cf3fa8e293ae9ee0c00fb402fec1e38c0c82f/jiter-0.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:506c9708dd29b27288f9f8f1140c3cb0e3d8ddb045956d7757b1fa0e0f39a473", size = 185564, upload-time = "2025-11-09T20:48:50.376Z" },
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/5e/4ec91646aee381d01cdb9974e30882c9cd3b8c5d1079d6b5ff4af522439a/jiter-0.13.0.tar.gz", hash = "sha256:f2839f9c2c7e2dffc1bc5929a510e14ce0a946be9365fd1219e7ef342dae14f4", size = 164847, upload-time = "2026-02-02T12:37:56.441Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/91/9c/7ee5a6ff4b9991e1a45263bfc46731634c4a2bde27dfda6c8251df2d958c/jiter-0.13.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1f8a55b848cbabf97d861495cd65f1e5c590246fabca8b48e1747c4dfc8f85bf", size = 306897, upload-time = "2026-02-02T12:36:16.748Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/02/be5b870d1d2be5dd6a91bdfb90f248fbb7dcbd21338f092c6b89817c3dbf/jiter-0.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f556aa591c00f2c45eb1b89f68f52441a016034d18b65da60e2d2875bbbf344a", size = 317507, upload-time = "2026-02-02T12:36:18.351Z" },
+ { url = "https://files.pythonhosted.org/packages/da/92/b25d2ec333615f5f284f3a4024f7ce68cfa0604c322c6808b2344c7f5d2b/jiter-0.13.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7e1d61da332ec412350463891923f960c3073cf1aae93b538f0bb4c8cd46efb", size = 350560, upload-time = "2026-02-02T12:36:19.746Z" },
+ { url = "https://files.pythonhosted.org/packages/be/ec/74dcb99fef0aca9fbe56b303bf79f6bd839010cb18ad41000bf6cc71eec0/jiter-0.13.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3097d665a27bc96fd9bbf7f86178037db139f319f785e4757ce7ccbf390db6c2", size = 363232, upload-time = "2026-02-02T12:36:21.243Z" },
+ { url = "https://files.pythonhosted.org/packages/1b/37/f17375e0bb2f6a812d4dd92d7616e41917f740f3e71343627da9db2824ce/jiter-0.13.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d01ecc3a8cbdb6f25a37bd500510550b64ddf9f7d64a107d92f3ccb25035d0f", size = 483727, upload-time = "2026-02-02T12:36:22.688Z" },
+ { url = "https://files.pythonhosted.org/packages/77/d2/a71160a5ae1a1e66c1395b37ef77da67513b0adba73b993a27fbe47eb048/jiter-0.13.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ed9bbc30f5d60a3bdf63ae76beb3f9db280d7f195dfcfa61af792d6ce912d159", size = 370799, upload-time = "2026-02-02T12:36:24.106Z" },
+ { url = "https://files.pythonhosted.org/packages/01/99/ed5e478ff0eb4e8aa5fd998f9d69603c9fd3f32de3bd16c2b1194f68361c/jiter-0.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98fbafb6e88256f4454de33c1f40203d09fc33ed19162a68b3b257b29ca7f663", size = 359120, upload-time = "2026-02-02T12:36:25.519Z" },
+ { url = "https://files.pythonhosted.org/packages/16/be/7ffd08203277a813f732ba897352797fa9493faf8dc7995b31f3d9cb9488/jiter-0.13.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5467696f6b827f1116556cb0db620440380434591e93ecee7fd14d1a491b6daa", size = 390664, upload-time = "2026-02-02T12:36:26.866Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/84/e0787856196d6d346264d6dcccb01f741e5f0bd014c1d9a2ebe149caf4f3/jiter-0.13.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:2d08c9475d48b92892583df9da592a0e2ac49bcd41fae1fec4f39ba6cf107820", size = 513543, upload-time = "2026-02-02T12:36:28.217Z" },
+ { url = "https://files.pythonhosted.org/packages/65/50/ecbd258181c4313cf79bca6c88fb63207d04d5bf5e4f65174114d072aa55/jiter-0.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:aed40e099404721d7fcaf5b89bd3b4568a4666358bcac7b6b15c09fb6252ab68", size = 547262, upload-time = "2026-02-02T12:36:29.678Z" },
+ { url = "https://files.pythonhosted.org/packages/27/da/68f38d12e7111d2016cd198161b36e1f042bd115c169255bcb7ec823a3bf/jiter-0.13.0-cp313-cp313-win32.whl", hash = "sha256:36ebfbcffafb146d0e6ffb3e74d51e03d9c35ce7c625c8066cdbfc7b953bdc72", size = 200630, upload-time = "2026-02-02T12:36:31.808Z" },
+ { url = "https://files.pythonhosted.org/packages/25/65/3bd1a972c9a08ecd22eb3b08a95d1941ebe6938aea620c246cf426ae09c2/jiter-0.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:8d76029f077379374cf0dbc78dbe45b38dec4a2eb78b08b5194ce836b2517afc", size = 202602, upload-time = "2026-02-02T12:36:33.679Z" },
+ { url = "https://files.pythonhosted.org/packages/15/fe/13bd3678a311aa67686bb303654792c48206a112068f8b0b21426eb6851e/jiter-0.13.0-cp313-cp313-win_arm64.whl", hash = "sha256:bb7613e1a427cfcb6ea4544f9ac566b93d5bf67e0d48c787eca673ff9c9dff2b", size = 185939, upload-time = "2026-02-02T12:36:35.065Z" },
+ { url = "https://files.pythonhosted.org/packages/49/19/a929ec002ad3228bc97ca01dbb14f7632fffdc84a95ec92ceaf4145688ae/jiter-0.13.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fa476ab5dd49f3bf3a168e05f89358c75a17608dbabb080ef65f96b27c19ab10", size = 316616, upload-time = "2026-02-02T12:36:36.579Z" },
+ { url = "https://files.pythonhosted.org/packages/52/56/d19a9a194afa37c1728831e5fb81b7722c3de18a3109e8f282bfc23e587a/jiter-0.13.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ade8cb6ff5632a62b7dbd4757d8c5573f7a2e9ae285d6b5b841707d8363205ef", size = 346850, upload-time = "2026-02-02T12:36:38.058Z" },
+ { url = "https://files.pythonhosted.org/packages/36/4a/94e831c6bf287754a8a019cb966ed39ff8be6ab78cadecf08df3bb02d505/jiter-0.13.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9950290340acc1adaded363edd94baebcee7dabdfa8bee4790794cd5cfad2af6", size = 358551, upload-time = "2026-02-02T12:36:39.417Z" },
+ { url = "https://files.pythonhosted.org/packages/a2/ec/a4c72c822695fa80e55d2b4142b73f0012035d9fcf90eccc56bc060db37c/jiter-0.13.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2b4972c6df33731aac0742b64fd0d18e0a69bc7d6e03108ce7d40c85fd9e3e6d", size = 201950, upload-time = "2026-02-02T12:36:40.791Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/00/393553ec27b824fbc29047e9c7cd4a3951d7fbe4a76743f17e44034fa4e4/jiter-0.13.0-cp313-cp313t-win_arm64.whl", hash = "sha256:701a1e77d1e593c1b435315ff625fd071f0998c5f02792038a5ca98899261b7d", size = 185852, upload-time = "2026-02-02T12:36:42.077Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/f5/f1997e987211f6f9bd71b8083047b316208b4aca0b529bb5f8c96c89ef3e/jiter-0.13.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:cc5223ab19fe25e2f0bf2643204ad7318896fe3729bf12fde41b77bfc4fafff0", size = 308804, upload-time = "2026-02-02T12:36:43.496Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/8f/5482a7677731fd44881f0204981ce2d7175db271f82cba2085dd2212e095/jiter-0.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9776ebe51713acf438fd9b4405fcd86893ae5d03487546dae7f34993217f8a91", size = 318787, upload-time = "2026-02-02T12:36:45.071Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/b9/7257ac59778f1cd025b26a23c5520a36a424f7f1b068f2442a5b499b7464/jiter-0.13.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:879e768938e7b49b5e90b7e3fecc0dbec01b8cb89595861fb39a8967c5220d09", size = 353880, upload-time = "2026-02-02T12:36:47.365Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/87/719eec4a3f0841dad99e3d3604ee4cba36af4419a76f3cb0b8e2e691ad67/jiter-0.13.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:682161a67adea11e3aae9038c06c8b4a9a71023228767477d683f69903ebc607", size = 366702, upload-time = "2026-02-02T12:36:48.871Z" },
+ { url = "https://files.pythonhosted.org/packages/d2/65/415f0a75cf6921e43365a1bc227c565cb949caca8b7532776e430cbaa530/jiter-0.13.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a13b68cd1cd8cc9de8f244ebae18ccb3e4067ad205220ef324c39181e23bbf66", size = 486319, upload-time = "2026-02-02T12:36:53.006Z" },
+ { url = "https://files.pythonhosted.org/packages/54/a2/9e12b48e82c6bbc6081fd81abf915e1443add1b13d8fc586e1d90bb02bb8/jiter-0.13.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87ce0f14c6c08892b610686ae8be350bf368467b6acd5085a5b65441e2bf36d2", size = 372289, upload-time = "2026-02-02T12:36:54.593Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/c1/e4693f107a1789a239c759a432e9afc592366f04e901470c2af89cfd28e1/jiter-0.13.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c365005b05505a90d1c47856420980d0237adf82f70c4aff7aebd3c1cc143ad", size = 360165, upload-time = "2026-02-02T12:36:56.112Z" },
+ { url = "https://files.pythonhosted.org/packages/17/08/91b9ea976c1c758240614bd88442681a87672eebc3d9a6dde476874e706b/jiter-0.13.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1317fdffd16f5873e46ce27d0e0f7f4f90f0cdf1d86bf6abeaea9f63ca2c401d", size = 389634, upload-time = "2026-02-02T12:36:57.495Z" },
+ { url = "https://files.pythonhosted.org/packages/18/23/58325ef99390d6d40427ed6005bf1ad54f2577866594bcf13ce55675f87d/jiter-0.13.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:c05b450d37ba0c9e21c77fef1f205f56bcee2330bddca68d344baebfc55ae0df", size = 514933, upload-time = "2026-02-02T12:36:58.909Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/25/69f1120c7c395fd276c3996bb8adefa9c6b84c12bb7111e5c6ccdcd8526d/jiter-0.13.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:775e10de3849d0631a97c603f996f518159272db00fdda0a780f81752255ee9d", size = 548842, upload-time = "2026-02-02T12:37:00.433Z" },
+ { url = "https://files.pythonhosted.org/packages/18/05/981c9669d86850c5fbb0d9e62bba144787f9fba84546ba43d624ee27ef29/jiter-0.13.0-cp314-cp314-win32.whl", hash = "sha256:632bf7c1d28421c00dd8bbb8a3bac5663e1f57d5cd5ed962bce3c73bf62608e6", size = 202108, upload-time = "2026-02-02T12:37:01.718Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/96/cdcf54dd0b0341db7d25413229888a346c7130bd20820530905fdb65727b/jiter-0.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:f22ef501c3f87ede88f23f9b11e608581c14f04db59b6a801f354397ae13739f", size = 204027, upload-time = "2026-02-02T12:37:03.075Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/f9/724bcaaab7a3cd727031fe4f6995cb86c4bd344909177c186699c8dec51a/jiter-0.13.0-cp314-cp314-win_arm64.whl", hash = "sha256:07b75fe09a4ee8e0c606200622e571e44943f47254f95e2436c8bdcaceb36d7d", size = 187199, upload-time = "2026-02-02T12:37:04.414Z" },
+ { url = "https://files.pythonhosted.org/packages/62/92/1661d8b9fd6a3d7a2d89831db26fe3c1509a287d83ad7838831c7b7a5c7e/jiter-0.13.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:964538479359059a35fb400e769295d4b315ae61e4105396d355a12f7fef09f0", size = 318423, upload-time = "2026-02-02T12:37:05.806Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/3b/f77d342a54d4ebcd128e520fc58ec2f5b30a423b0fd26acdfc0c6fef8e26/jiter-0.13.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e104da1db1c0991b3eaed391ccd650ae8d947eab1480c733e5a3fb28d4313e40", size = 351438, upload-time = "2026-02-02T12:37:07.189Z" },
+ { url = "https://files.pythonhosted.org/packages/76/b3/ba9a69f0e4209bd3331470c723c2f5509e6f0482e416b612431a5061ed71/jiter-0.13.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e3a5f0cde8ff433b8e88e41aa40131455420fb3649a3c7abdda6145f8cb7202", size = 364774, upload-time = "2026-02-02T12:37:08.579Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/16/6cdb31fa342932602458dbb631bfbd47f601e03d2e4950740e0b2100b570/jiter-0.13.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57aab48f40be1db920a582b30b116fe2435d184f77f0e4226f546794cedd9cf0", size = 487238, upload-time = "2026-02-02T12:37:10.066Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/b1/956cc7abaca8d95c13aa8d6c9b3f3797241c246cd6e792934cc4c8b250d2/jiter-0.13.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7772115877c53f62beeb8fd853cab692dbc04374ef623b30f997959a4c0e7e95", size = 372892, upload-time = "2026-02-02T12:37:11.656Z" },
+ { url = "https://files.pythonhosted.org/packages/26/c4/97ecde8b1e74f67b8598c57c6fccf6df86ea7861ed29da84629cdbba76c4/jiter-0.13.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1211427574b17b633cfceba5040de8081e5abf114f7a7602f73d2e16f9fdaa59", size = 360309, upload-time = "2026-02-02T12:37:13.244Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/d7/eabe3cf46715854ccc80be2cd78dd4c36aedeb30751dbf85a1d08c14373c/jiter-0.13.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7beae3a3d3b5212d3a55d2961db3c292e02e302feb43fce6a3f7a31b90ea6dfe", size = 389607, upload-time = "2026-02-02T12:37:14.881Z" },
+ { url = "https://files.pythonhosted.org/packages/df/2d/03963fc0804e6109b82decfb9974eb92df3797fe7222428cae12f8ccaa0c/jiter-0.13.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e5562a0f0e90a6223b704163ea28e831bd3a9faa3512a711f031611e6b06c939", size = 514986, upload-time = "2026-02-02T12:37:16.326Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/6c/8c83b45eb3eb1c1e18d841fe30b4b5bc5619d781267ca9bc03e005d8fd0a/jiter-0.13.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:6c26a424569a59140fb51160a56df13f438a2b0967365e987889186d5fc2f6f9", size = 548756, upload-time = "2026-02-02T12:37:17.736Z" },
+ { url = "https://files.pythonhosted.org/packages/47/66/eea81dfff765ed66c68fd2ed8c96245109e13c896c2a5015c7839c92367e/jiter-0.13.0-cp314-cp314t-win32.whl", hash = "sha256:24dc96eca9f84da4131cdf87a95e6ce36765c3b156fc9ae33280873b1c32d5f6", size = 201196, upload-time = "2026-02-02T12:37:19.101Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/32/4ac9c7a76402f8f00d00842a7f6b83b284d0cf7c1e9d4227bc95aa6d17fa/jiter-0.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0a8d76c7524087272c8ae913f5d9d608bd839154b62c4322ef65723d2e5bb0b8", size = 204215, upload-time = "2026-02-02T12:37:20.495Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/8e/7def204fea9f9be8b3c21a6f2dd6c020cf56c7d5ff753e0e23ed7f9ea57e/jiter-0.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2c26cf47e2cad140fa23b6d58d435a7c0161f5c514284802f25e87fddfe11024", size = 187152, upload-time = "2026-02-02T12:37:22.124Z" },
]
[[package]]
@@ -899,7 +1115,7 @@ wheels = [
[[package]]
name = "langchain-aws"
-version = "1.1.0"
+version = "1.2.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "boto3" },
@@ -907,9 +1123,9 @@ dependencies = [
{ name = "numpy" },
{ name = "pydantic" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/52/1d/bb306951b1c394b7a27effb8eb6c9ee65dd77fcc4be7c20f76e3299a9e1e/langchain_aws-1.1.0.tar.gz", hash = "sha256:1e2f8570328eae4907c3cf7e900dc68d8034ddc865d9dc96823c9f9d8cccb901", size = 393899, upload-time = "2025-11-24T14:35:24.216Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/72/57060b4fd67ce55dac9f3b57e2f7d84405b944ec19a102ec6ee0a76b5382/langchain_aws-1.2.3.tar.gz", hash = "sha256:5e2311e22ea22d1d056bc4b5a226bef88091056d0da841604b291ae51cd7f804", size = 401022, upload-time = "2026-02-10T05:07:47.723Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/26/33/91b8d2a7570657b371382b45054142c54165a51706990a5c1b4cc40c0e9a/langchain_aws-1.1.0-py3-none-any.whl", hash = "sha256:8ec074615b42839e035354063717374c32c63f5028ef5221ba073fd5f3ef5e37", size = 152432, upload-time = "2025-11-24T14:35:23.004Z" },
+ { url = "https://files.pythonhosted.org/packages/58/ed/7388c465d620e5ca50a5927a8fd6e47223455bc29f3c19c0e24746d1e845/langchain_aws-1.2.3-py3-none-any.whl", hash = "sha256:420bc8e72f955ea58ead26b44dd4ff7baf385cb3d3e14f528217b4f90fadb3ec", size = 164733, upload-time = "2026-02-10T05:07:46.569Z" },
]
[[package]]
@@ -971,7 +1187,7 @@ wheels = [
[[package]]
name = "langchain-core"
-version = "1.1.0"
+version = "1.2.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jsonpatch" },
@@ -981,51 +1197,38 @@ dependencies = [
{ name = "pyyaml" },
{ name = "tenacity" },
{ name = "typing-extensions" },
+ { name = "uuid-utils" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/1e/17/67c1cc2ace919e2b02dd9d783154d7fb3f1495a4ef835d9cd163b7855ac2/langchain_core-1.1.0.tar.gz", hash = "sha256:2b76a82d427922c8bc51c08404af4fc2a29e9f161dfe2297cb05091e810201e7", size = 781995, upload-time = "2025-11-21T21:01:26.958Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/71/1e/e129fc471a2d2a7b3804480a937b5ab9319cab9f4142624fcb115f925501/langchain_core-1.1.0-py3-none-any.whl", hash = "sha256:2c9f27dadc6d21ed4aa46506a37a56e6a7e2d2f9141922dc5c251ba921822ee6", size = 473752, upload-time = "2025-11-21T21:01:25.841Z" },
-]
-
-[[package]]
-name = "langchain-huggingface"
-version = "1.1.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "huggingface-hub" },
- { name = "langchain-core" },
- { name = "tokenizers" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/9f/d7/ffcf97cd977c535df2c621c59eafa82df73f801323f670d88819c23fc304/langchain_huggingface-1.1.0.tar.gz", hash = "sha256:43c3b06413158b0cd1edcdbadf545c24d5f64f180bb71c80dc960959a728c1fd", size = 252295, upload-time = "2025-11-24T14:18:30.366Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/a6/85/f501592b5d76b27a198f1102bafe365151a0a6f69444122fad6d10e6f4bf/langchain_core-1.2.9.tar.gz", hash = "sha256:a3768febc762307241d153b0f8bc58fd4b70c0ff077fda3274606741fca3f5a7", size = 815900, upload-time = "2026-02-05T14:21:43.942Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/b1/4b/2bdd63464a7bb3aa7911777636cff8e54a2a1edc7b7a85a4acb7decebb23/langchain_huggingface-1.1.0-py3-none-any.whl", hash = "sha256:a3a5218a839062941cb616992bcbc4fe73352454727bafc351a452e76aead1a8", size = 29925, upload-time = "2025-11-24T14:18:29.036Z" },
+ { url = "https://files.pythonhosted.org/packages/94/46/77846a98913e444d0d564070a9056bd999daada52bd099dc1e8812272810/langchain_core-1.2.9-py3-none-any.whl", hash = "sha256:7e5ecba5ed7a65852e8d5288e9ceeba05340fa9baf32baf672818b497bbaea8f", size = 496296, upload-time = "2026-02-05T14:21:42.816Z" },
]
[[package]]
name = "langchain-ollama"
-version = "1.0.0"
+version = "1.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "langchain-core" },
{ name = "ollama" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/8b/45/d996b4c0e3e7155ff7a110bf24dcb5043fc1405559a2089c11fe97511cc2/langchain_ollama-1.0.0.tar.gz", hash = "sha256:2ea9ad1b0f0ab319d600b9193d1124a8925523a3b943d75a967718e24ec09a8a", size = 151042, upload-time = "2025-10-17T15:41:50.277Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/73/51/72cd04d74278f3575f921084f34280e2f837211dc008c9671c268c578afe/langchain_ollama-1.0.1.tar.gz", hash = "sha256:e37880c2f41cdb0895e863b1cfd0c2c840a117868b3f32e44fef42569e367443", size = 153850, upload-time = "2025-12-12T21:48:28.68Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/91/08/7be292aee722692b13a93316247b57eefb83d4309f5fdfe636cc47786efe/langchain_ollama-1.0.0-py3-none-any.whl", hash = "sha256:5828523fcbd137847490841110a6aedf96b68534e7fe2735715ecf3e835b2391", size = 29006, upload-time = "2025-10-17T15:41:49.497Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/46/f2907da16dc5a5a6c679f83b7de21176178afad8d2ca635a581429580ef6/langchain_ollama-1.0.1-py3-none-any.whl", hash = "sha256:37eb939a4718a0255fe31e19fbb0def044746c717b01b97d397606ebc3e9b440", size = 29207, upload-time = "2025-12-12T21:48:27.832Z" },
]
[[package]]
name = "langchain-openai"
-version = "1.1.0"
+version = "1.1.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "langchain-core" },
{ name = "openai" },
{ name = "tiktoken" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/18/29/cc7a7d1c42d19c903efa3ef9c9f00042942d28a00da1af12be5b7035375d/langchain_openai-1.1.0.tar.gz", hash = "sha256:9a33280c2e8315d013d64e6b15e583be347beb0d0f281755c335ae504ad0c184", size = 1034339, upload-time = "2025-11-24T14:20:48.929Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/8f/21/8053d1c41413521eae31cc6e71b0ac3048a53da0c7780c6e1b6ee4bf85b1/langchain_openai-1.1.8.tar.gz", hash = "sha256:3ff66966812a2362a8ccbcad24b517708f39c42471517abe77b07adc818a2c22", size = 1003370, upload-time = "2026-02-09T15:33:03.834Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/da/ff/82699ef76f36818d86571a0b086ce07af5503a63d7430fc49e7d6aeb5dc1/langchain_openai-1.1.0-py3-none-any.whl", hash = "sha256:243bb345d0260ea1326c2b6ac2237ec29f082ab457c59e9306bac349df4577e8", size = 84282, upload-time = "2025-11-24T14:20:47.717Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/c0/8059e020fc308c321dcdca9768d2b0931e2b3b0685669a75124c9a675ee4/langchain_openai-1.1.8-py3-none-any.whl", hash = "sha256:c37d99eac78700cca2bef8d503cf367065685e1ac4cbdf674056046f3bebf118", size = 84939, upload-time = "2026-02-09T15:33:01.973Z" },
]
[[package]]
@@ -1242,14 +1445,14 @@ wheels = [
[[package]]
name = "marshmallow"
-version = "3.26.1"
+version = "3.26.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "packaging" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/ab/5e/5e53d26b42ab75491cda89b871dab9e97c840bf12c63ec58a1919710cd06/marshmallow-3.26.1.tar.gz", hash = "sha256:e6d8affb6cb61d39d26402096dc0aee12d5a26d490a121f118d2e81dc0719dc6", size = 221825, upload-time = "2025-02-03T15:32:25.093Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/55/79/de6c16cc902f4fc372236926b0ce2ab7845268dcc30fb2fbb7f71b418631/marshmallow-3.26.2.tar.gz", hash = "sha256:bbe2adb5a03e6e3571b573f42527c6fe926e17467833660bebd11593ab8dfd57", size = 222095, upload-time = "2025-12-22T06:53:53.309Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/34/75/51952c7b2d3873b44a0028b1bd26a25078c18f92f256608e8d1dc61b39fd/marshmallow-3.26.1-py3-none-any.whl", hash = "sha256:3350409f20a70a7e4e11a27661187b77cdcaeb20abca41c1454fe33636bea09c", size = 50878, upload-time = "2025-02-03T15:32:22.295Z" },
+ { url = "https://files.pythonhosted.org/packages/be/2f/5108cb3ee4ba6501748c4908b908e55f42a5b66245b4cfe0c99326e1ef6e/marshmallow-3.26.2-py3-none-any.whl", hash = "sha256:013fa8a3c4c276c24d26d84ce934dc964e2aa794345a0f8c7e5a7191482c8a73", size = 50964, upload-time = "2025-12-22T06:53:51.801Z" },
]
[[package]]
@@ -1353,66 +1556,64 @@ wheels = [
[[package]]
name = "neo4j"
-version = "6.0.3"
+version = "6.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pytz" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/60/b2/87534fc0520e5f9db1432bacc3f8d0ce024608010babc4f65b96e0c34906/neo4j-6.0.3.tar.gz", hash = "sha256:7fb79e166e281aafd67d521f6611763ebcdc529f26db506c5605f91ddcd825ea", size = 239653, upload-time = "2025-11-06T16:57:57.012Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/01/d6ce65e4647f6cb2b9cca3b813978f7329b54b4e36660aaec1ddf0ccce7a/neo4j-6.1.0.tar.gz", hash = "sha256:b5dde8c0d8481e7b6ae3733569d990dd3e5befdc5d452f531ad1884ed3500b84", size = 239629, upload-time = "2026-01-12T11:27:34.777Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/ba/fe/55ed1d4636defb57fae1f7be7818820aa8071d45949c91ef8649930e70c5/neo4j-6.0.3-py3-none-any.whl", hash = "sha256:a92023854da96aed4270e0d03d6429cdd7f0d3335eae977370934f4732de5678", size = 325433, upload-time = "2025-11-06T16:57:55.03Z" },
+ { url = "https://files.pythonhosted.org/packages/70/5c/ee71e2dd955045425ef44283f40ba1da67673cf06404916ca2950ac0cd39/neo4j-6.1.0-py3-none-any.whl", hash = "sha256:3bd93941f3a3559af197031157220af9fd71f4f93a311db687bd69ffa417b67d", size = 325326, upload-time = "2026-01-12T11:27:33.196Z" },
]
[[package]]
name = "numpy"
-version = "2.3.5"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/db/69/9cde09f36da4b5a505341180a3f2e6fadc352fd4d2b7096ce9778db83f1a/numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff", size = 16728251, upload-time = "2025-11-16T22:50:19.013Z" },
- { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" },
- { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" },
- { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" },
- { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" },
- { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" },
- { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" },
- { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" },
- { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" },
- { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" },
- { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" },
- { url = "https://files.pythonhosted.org/packages/13/cb/71744144e13389d577f867f745b7df2d8489463654a918eea2eeb166dfc9/numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd", size = 16827292, upload-time = "2025-11-16T22:50:47.715Z" },
- { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" },
- { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" },
- { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" },
- { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" },
- { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" },
- { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" },
- { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" },
- { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" },
- { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" },
- { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" },
- { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" },
- { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" },
- { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" },
- { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" },
- { url = "https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b", size = 14262696, upload-time = "2025-11-16T22:51:29.402Z" },
- { url = "https://files.pythonhosted.org/packages/c8/f6/07ec185b90ec9d7217a00eeeed7383b73d7e709dae2a9a021b051542a708/numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520", size = 16597350, upload-time = "2025-11-16T22:51:32.167Z" },
- { url = "https://files.pythonhosted.org/packages/75/37/164071d1dde6a1a84c9b8e5b414fa127981bad47adf3a6b7e23917e52190/numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c", size = 16040190, upload-time = "2025-11-16T22:51:35.403Z" },
- { url = "https://files.pythonhosted.org/packages/08/3c/f18b82a406b04859eb026d204e4e1773eb41c5be58410f41ffa511d114ae/numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8", size = 18536749, upload-time = "2025-11-16T22:51:39.698Z" },
- { url = "https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" },
- { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" },
- { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" },
- { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" },
- { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" },
- { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" },
- { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" },
- { url = "https://files.pythonhosted.org/packages/02/c6/7c34b528740512e57ef1b7c8337ab0b4f0bddf34c723b8996c675bc2bc91/numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52", size = 14308962, upload-time = "2025-11-16T22:52:01.698Z" },
- { url = "https://files.pythonhosted.org/packages/80/35/09d433c5262bc32d725bafc619e095b6a6651caf94027a03da624146f655/numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b", size = 16655054, upload-time = "2025-11-16T22:52:04.267Z" },
- { url = "https://files.pythonhosted.org/packages/7a/ab/6a7b259703c09a88804fa2430b43d6457b692378f6b74b356155283566ac/numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3", size = 16091613, upload-time = "2025-11-16T22:52:08.651Z" },
- { url = "https://files.pythonhosted.org/packages/c2/88/330da2071e8771e60d1038166ff9d73f29da37b01ec3eb43cb1427464e10/numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227", size = 18591147, upload-time = "2025-11-16T22:52:11.453Z" },
- { url = "https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" },
- { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" },
- { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" },
+version = "2.4.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/fd/0005efbd0af48e55eb3c7208af93f2862d4b1a56cd78e84309a2d959208d/numpy-2.4.2.tar.gz", hash = "sha256:659a6107e31a83c4e33f763942275fd278b21d095094044eb35569e86a21ddae", size = 20723651, upload-time = "2026-01-31T23:13:10.135Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a1/22/815b9fe25d1d7ae7d492152adbc7226d3eff731dffc38fe970589fcaaa38/numpy-2.4.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25f2059807faea4b077a2b6837391b5d830864b3543627f381821c646f31a63c", size = 16663696, upload-time = "2026-01-31T23:11:17.516Z" },
+ { url = "https://files.pythonhosted.org/packages/09/f0/817d03a03f93ba9c6c8993de509277d84e69f9453601915e4a69554102a1/numpy-2.4.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bd3a7a9f5847d2fb8c2c6d1c862fa109c31a9abeca1a3c2bd5a64572955b2979", size = 14688322, upload-time = "2026-01-31T23:11:19.883Z" },
+ { url = "https://files.pythonhosted.org/packages/da/b4/f805ab79293c728b9a99438775ce51885fd4f31b76178767cfc718701a39/numpy-2.4.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8e4549f8a3c6d13d55041925e912bfd834285ef1dd64d6bc7d542583355e2e98", size = 5198157, upload-time = "2026-01-31T23:11:22.375Z" },
+ { url = "https://files.pythonhosted.org/packages/74/09/826e4289844eccdcd64aac27d13b0fd3f32039915dd5b9ba01baae1f436c/numpy-2.4.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:aea4f66ff44dfddf8c2cffd66ba6538c5ec67d389285292fe428cb2c738c8aef", size = 6546330, upload-time = "2026-01-31T23:11:23.958Z" },
+ { url = "https://files.pythonhosted.org/packages/19/fb/cbfdbfa3057a10aea5422c558ac57538e6acc87ec1669e666d32ac198da7/numpy-2.4.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3cd545784805de05aafe1dde61752ea49a359ccba9760c1e5d1c88a93bbf2b7", size = 15660968, upload-time = "2026-01-31T23:11:25.713Z" },
+ { url = "https://files.pythonhosted.org/packages/04/dc/46066ce18d01645541f0186877377b9371b8fa8017fa8262002b4ef22612/numpy-2.4.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0d9b7c93578baafcbc5f0b83eaf17b79d345c6f36917ba0c67f45226911d499", size = 16607311, upload-time = "2026-01-31T23:11:28.117Z" },
+ { url = "https://files.pythonhosted.org/packages/14/d9/4b5adfc39a43fa6bf918c6d544bc60c05236cc2f6339847fc5b35e6cb5b0/numpy-2.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f74f0f7779cc7ae07d1810aab8ac6b1464c3eafb9e283a40da7309d5e6e48fbb", size = 17012850, upload-time = "2026-01-31T23:11:30.888Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/20/adb6e6adde6d0130046e6fdfb7675cc62bc2f6b7b02239a09eb58435753d/numpy-2.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c7ac672d699bf36275c035e16b65539931347d68b70667d28984c9fb34e07fa7", size = 18334210, upload-time = "2026-01-31T23:11:33.214Z" },
+ { url = "https://files.pythonhosted.org/packages/78/0e/0a73b3dff26803a8c02baa76398015ea2a5434d9b8265a7898a6028c1591/numpy-2.4.2-cp313-cp313-win32.whl", hash = "sha256:8e9afaeb0beff068b4d9cd20d322ba0ee1cecfb0b08db145e4ab4dd44a6b5110", size = 5958199, upload-time = "2026-01-31T23:11:35.385Z" },
+ { url = "https://files.pythonhosted.org/packages/43/bc/6352f343522fcb2c04dbaf94cb30cca6fd32c1a750c06ad6231b4293708c/numpy-2.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:7df2de1e4fba69a51c06c28f5a3de36731eb9639feb8e1cf7e4a7b0daf4cf622", size = 12310848, upload-time = "2026-01-31T23:11:38.001Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/8d/6da186483e308da5da1cc6918ce913dcfe14ffde98e710bfeff2a6158d4e/numpy-2.4.2-cp313-cp313-win_arm64.whl", hash = "sha256:0fece1d1f0a89c16b03442eae5c56dc0be0c7883b5d388e0c03f53019a4bfd71", size = 10221082, upload-time = "2026-01-31T23:11:40.392Z" },
+ { url = "https://files.pythonhosted.org/packages/25/a1/9510aa43555b44781968935c7548a8926274f815de42ad3997e9e83680dd/numpy-2.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5633c0da313330fd20c484c78cdd3f9b175b55e1a766c4a174230c6b70ad8262", size = 14815866, upload-time = "2026-01-31T23:11:42.495Z" },
+ { url = "https://files.pythonhosted.org/packages/36/30/6bbb5e76631a5ae46e7923dd16ca9d3f1c93cfa8d4ed79a129814a9d8db3/numpy-2.4.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d9f64d786b3b1dd742c946c42d15b07497ed14af1a1f3ce840cce27daa0ce913", size = 5325631, upload-time = "2026-01-31T23:11:44.7Z" },
+ { url = "https://files.pythonhosted.org/packages/46/00/3a490938800c1923b567b3a15cd17896e68052e2145d8662aaf3e1ffc58f/numpy-2.4.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:b21041e8cb6a1eb5312dd1d2f80a94d91efffb7a06b70597d44f1bd2dfc315ab", size = 6646254, upload-time = "2026-01-31T23:11:46.341Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/e9/fac0890149898a9b609caa5af7455a948b544746e4b8fe7c212c8edd71f8/numpy-2.4.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:00ab83c56211a1d7c07c25e3217ea6695e50a3e2f255053686b081dc0b091a82", size = 15720138, upload-time = "2026-01-31T23:11:48.082Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/5c/08887c54e68e1e28df53709f1893ce92932cc6f01f7c3d4dc952f61ffd4e/numpy-2.4.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fb882da679409066b4603579619341c6d6898fc83a8995199d5249f986e8e8f", size = 16655398, upload-time = "2026-01-31T23:11:50.293Z" },
+ { url = "https://files.pythonhosted.org/packages/4d/89/253db0fa0e66e9129c745e4ef25631dc37d5f1314dad2b53e907b8538e6d/numpy-2.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:66cb9422236317f9d44b67b4d18f44efe6e9c7f8794ac0462978513359461554", size = 17079064, upload-time = "2026-01-31T23:11:52.927Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/d5/cbade46ce97c59c6c3da525e8d95b7abe8a42974a1dc5c1d489c10433e88/numpy-2.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0f01dcf33e73d80bd8dc0f20a71303abbafa26a19e23f6b68d1aa9990af90257", size = 18379680, upload-time = "2026-01-31T23:11:55.22Z" },
+ { url = "https://files.pythonhosted.org/packages/40/62/48f99ae172a4b63d981babe683685030e8a3df4f246c893ea5c6ef99f018/numpy-2.4.2-cp313-cp313t-win32.whl", hash = "sha256:52b913ec40ff7ae845687b0b34d8d93b60cb66dcee06996dd5c99f2fc9328657", size = 6082433, upload-time = "2026-01-31T23:11:58.096Z" },
+ { url = "https://files.pythonhosted.org/packages/07/38/e054a61cfe48ad9f1ed0d188e78b7e26859d0b60ef21cd9de4897cdb5326/numpy-2.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:5eea80d908b2c1f91486eb95b3fb6fab187e569ec9752ab7d9333d2e66bf2d6b", size = 12451181, upload-time = "2026-01-31T23:11:59.782Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/a4/a05c3a6418575e185dd84d0b9680b6bb2e2dc3e4202f036b7b4e22d6e9dc/numpy-2.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:fd49860271d52127d61197bb50b64f58454e9f578cb4b2c001a6de8b1f50b0b1", size = 10290756, upload-time = "2026-01-31T23:12:02.438Z" },
+ { url = "https://files.pythonhosted.org/packages/18/88/b7df6050bf18fdcfb7046286c6535cabbdd2064a3440fca3f069d319c16e/numpy-2.4.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:444be170853f1f9d528428eceb55f12918e4fda5d8805480f36a002f1415e09b", size = 16663092, upload-time = "2026-01-31T23:12:04.521Z" },
+ { url = "https://files.pythonhosted.org/packages/25/7a/1fee4329abc705a469a4afe6e69b1ef7e915117747886327104a8493a955/numpy-2.4.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d1240d50adff70c2a88217698ca844723068533f3f5c5fa6ee2e3220e3bdb000", size = 14698770, upload-time = "2026-01-31T23:12:06.96Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/0b/f9e49ba6c923678ad5bc38181c08ac5e53b7a5754dbca8e581aa1a56b1ff/numpy-2.4.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:7cdde6de52fb6664b00b056341265441192d1291c130e99183ec0d4b110ff8b1", size = 5208562, upload-time = "2026-01-31T23:12:09.632Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/12/d7de8f6f53f9bb76997e5e4c069eda2051e3fe134e9181671c4391677bb2/numpy-2.4.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:cda077c2e5b780200b6b3e09d0b42205a3d1c68f30c6dceb90401c13bff8fe74", size = 6543710, upload-time = "2026-01-31T23:12:11.969Z" },
+ { url = "https://files.pythonhosted.org/packages/09/63/c66418c2e0268a31a4cf8a8b512685748200f8e8e8ec6c507ce14e773529/numpy-2.4.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d30291931c915b2ab5717c2974bb95ee891a1cf22ebc16a8006bd59cd210d40a", size = 15677205, upload-time = "2026-01-31T23:12:14.33Z" },
+ { url = "https://files.pythonhosted.org/packages/5d/6c/7f237821c9642fb2a04d2f1e88b4295677144ca93285fd76eff3bcba858d/numpy-2.4.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bba37bc29d4d85761deed3954a1bc62be7cf462b9510b51d367b769a8c8df325", size = 16611738, upload-time = "2026-01-31T23:12:16.525Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/a7/39c4cdda9f019b609b5c473899d87abff092fc908cfe4d1ecb2fcff453b0/numpy-2.4.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b2f0073ed0868db1dcd86e052d37279eef185b9c8db5bf61f30f46adac63c909", size = 17028888, upload-time = "2026-01-31T23:12:19.306Z" },
+ { url = "https://files.pythonhosted.org/packages/da/b3/e84bb64bdfea967cc10950d71090ec2d84b49bc691df0025dddb7c26e8e3/numpy-2.4.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7f54844851cdb630ceb623dcec4db3240d1ac13d4990532446761baede94996a", size = 18339556, upload-time = "2026-01-31T23:12:21.816Z" },
+ { url = "https://files.pythonhosted.org/packages/88/f5/954a291bc1192a27081706862ac62bb5920fbecfbaa302f64682aa90beed/numpy-2.4.2-cp314-cp314-win32.whl", hash = "sha256:12e26134a0331d8dbd9351620f037ec470b7c75929cb8a1537f6bfe411152a1a", size = 6006899, upload-time = "2026-01-31T23:12:24.14Z" },
+ { url = "https://files.pythonhosted.org/packages/05/cb/eff72a91b2efdd1bc98b3b8759f6a1654aa87612fc86e3d87d6fe4f948c4/numpy-2.4.2-cp314-cp314-win_amd64.whl", hash = "sha256:068cdb2d0d644cdb45670810894f6a0600797a69c05f1ac478e8d31670b8ee75", size = 12443072, upload-time = "2026-01-31T23:12:26.33Z" },
+ { url = "https://files.pythonhosted.org/packages/37/75/62726948db36a56428fce4ba80a115716dc4fad6a3a4352487f8bb950966/numpy-2.4.2-cp314-cp314-win_arm64.whl", hash = "sha256:6ed0be1ee58eef41231a5c943d7d1375f093142702d5723ca2eb07db9b934b05", size = 10494886, upload-time = "2026-01-31T23:12:28.488Z" },
+ { url = "https://files.pythonhosted.org/packages/36/2f/ee93744f1e0661dc267e4b21940870cabfae187c092e1433b77b09b50ac4/numpy-2.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:98f16a80e917003a12c0580f97b5f875853ebc33e2eaa4bccfc8201ac6869308", size = 14818567, upload-time = "2026-01-31T23:12:30.709Z" },
+ { url = "https://files.pythonhosted.org/packages/a7/24/6535212add7d76ff938d8bdc654f53f88d35cddedf807a599e180dcb8e66/numpy-2.4.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:20abd069b9cda45874498b245c8015b18ace6de8546bf50dfa8cea1696ed06ef", size = 5328372, upload-time = "2026-01-31T23:12:32.962Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/9d/c48f0a035725f925634bf6b8994253b43f2047f6778a54147d7e213bc5a7/numpy-2.4.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:e98c97502435b53741540a5717a6749ac2ada901056c7db951d33e11c885cc7d", size = 6649306, upload-time = "2026-01-31T23:12:34.797Z" },
+ { url = "https://files.pythonhosted.org/packages/81/05/7c73a9574cd4a53a25907bad38b59ac83919c0ddc8234ec157f344d57d9a/numpy-2.4.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da6cad4e82cb893db4b69105c604d805e0c3ce11501a55b5e9f9083b47d2ffe8", size = 15722394, upload-time = "2026-01-31T23:12:36.565Z" },
+ { url = "https://files.pythonhosted.org/packages/35/fa/4de10089f21fc7d18442c4a767ab156b25c2a6eaf187c0db6d9ecdaeb43f/numpy-2.4.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e4424677ce4b47fe73c8b5556d876571f7c6945d264201180db2dc34f676ab5", size = 16653343, upload-time = "2026-01-31T23:12:39.188Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/f9/d33e4ffc857f3763a57aa85650f2e82486832d7492280ac21ba9efda80da/numpy-2.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2b8f157c8a6f20eb657e240f8985cc135598b2b46985c5bccbde7616dc9c6b1e", size = 17078045, upload-time = "2026-01-31T23:12:42.041Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/b8/54bdb43b6225badbea6389fa038c4ef868c44f5890f95dd530a218706da3/numpy-2.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5daf6f3914a733336dab21a05cdec343144600e964d2fcdabaac0c0269874b2a", size = 18380024, upload-time = "2026-01-31T23:12:44.331Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/55/6e1a61ded7af8df04016d81b5b02daa59f2ea9252ee0397cb9f631efe9e5/numpy-2.4.2-cp314-cp314t-win32.whl", hash = "sha256:8c50dd1fc8826f5b26a5ee4d77ca55d88a895f4e4819c7ecc2a9f5905047a443", size = 6153937, upload-time = "2026-01-31T23:12:47.229Z" },
+ { url = "https://files.pythonhosted.org/packages/45/aa/fa6118d1ed6d776b0983f3ceac9b1a5558e80df9365b1c3aa6d42bf9eee4/numpy-2.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:fcf92bee92742edd401ba41135185866f7026c502617f422eb432cfeca4fe236", size = 12631844, upload-time = "2026-01-31T23:12:48.997Z" },
+ { url = "https://files.pythonhosted.org/packages/32/0a/2ec5deea6dcd158f254a7b372fb09cfba5719419c8d66343bab35237b3fb/numpy-2.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:1f92f53998a17265194018d1cc321b2e96e900ca52d54c7c77837b71b9465181", size = 10565379, upload-time = "2026-01-31T23:12:51.345Z" },
]
[[package]]
@@ -1439,7 +1640,7 @@ wheels = [
[[package]]
name = "openai"
-version = "2.9.0"
+version = "2.19.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@@ -1451,9 +1652,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/09/48/516290f38745cc1e72856f50e8afed4a7f9ac396a5a18f39e892ab89dfc2/openai-2.9.0.tar.gz", hash = "sha256:b52ec65727fc8f1eed2fbc86c8eac0998900c7ef63aa2eb5c24b69717c56fa5f", size = 608202, upload-time = "2025-12-04T18:15:09.01Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/86/e3/9161d27c725ff69ff8feb1c97834b9bd993d54f41e5aa04de1cc6b998ad7/openai-2.19.0.tar.gz", hash = "sha256:4ba78da821b44e0ea38fd182f252a45c237c340f303a9fbf9da600701fa16c75", size = 642278, upload-time = "2026-02-10T18:21:39.902Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/59/fd/ae2da789cd923dd033c99b8d544071a827c92046b150db01cfa5cea5b3fd/openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad", size = 1030836, upload-time = "2025-12-04T18:15:07.063Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/10/7c60f66c74e02eb0375cda9c2b28c7cb43e8f95cab7338124449fae1bde7/openai-2.19.0-py3-none-any.whl", hash = "sha256:425d657ce4dcc9d6294b78f0f41dc36aa43230750e61f49c551fa490c8575803", size = 1098416, upload-time = "2026-02-10T18:21:37.928Z" },
]
[[package]]
@@ -1532,6 +1733,50 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
]
+[[package]]
+name = "parsel"
+version = "1.11.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cssselect" },
+ { name = "jmespath" },
+ { name = "lxml" },
+ { name = "packaging" },
+ { name = "w3lib" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/91/c8/4ace3a5c61e39ca21734a5715d0e076eea6200dd8daea2a5b99452f5a0d6/parsel-1.11.0.tar.gz", hash = "sha256:5925fe087eb16fc404a7ed91e31e2c1e2a9b230da4b64f34d81358c0d0e27e88", size = 106849, upload-time = "2026-01-29T07:19:23.388Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/08/23/4e0dae5e5bee14aea26dba003a682e621563451a20f751ed985810f818b6/parsel-1.11.0-py3-none-any.whl", hash = "sha256:bda82575df1774dd64e1c1396163f3cadb3e383e0f8080d43d45fa6705355daa", size = 14176, upload-time = "2026-01-29T07:19:22.255Z" },
+]
+
+[[package]]
+name = "playwright"
+version = "1.58.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "greenlet" },
+ { name = "pyee" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f8/c9/9c6061d5703267f1baae6a4647bfd1862e386fbfdb97d889f6f6ae9e3f64/playwright-1.58.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:96e3204aac292ee639edbfdef6298b4be2ea0a55a16b7068df91adac077cc606", size = 42251098, upload-time = "2026-01-30T15:09:24.028Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/40/59d34a756e02f8c670f0fee987d46f7ee53d05447d43cd114ca015cb168c/playwright-1.58.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:70c763694739d28df71ed578b9c8202bb83e8fe8fb9268c04dd13afe36301f71", size = 41039625, upload-time = "2026-01-30T15:09:27.558Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/ee/3ce6209c9c74a650aac9028c621f357a34ea5cd4d950700f8e2c4b7fe2c4/playwright-1.58.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:185e0132578733d02802dfddfbbc35f42be23a45ff49ccae5081f25952238117", size = 42251098, upload-time = "2026-01-30T15:09:30.461Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/af/009958cbf23fac551a940d34e3206e6c7eed2b8c940d0c3afd1feb0b0589/playwright-1.58.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:c95568ba1eda83812598c1dc9be60b4406dffd60b149bc1536180ad108723d6b", size = 46235268, upload-time = "2026-01-30T15:09:33.787Z" },
+ { url = "https://files.pythonhosted.org/packages/d9/a6/0e66ad04b6d3440dae73efb39540c5685c5fc95b17c8b29340b62abbd952/playwright-1.58.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f9999948f1ab541d98812de25e3a8c410776aa516d948807140aff797b4bffa", size = 45964214, upload-time = "2026-01-30T15:09:36.751Z" },
+ { url = "https://files.pythonhosted.org/packages/0e/4b/236e60ab9f6d62ed0fd32150d61f1f494cefbf02304c0061e78ed80c1c32/playwright-1.58.0-py3-none-win32.whl", hash = "sha256:1e03be090e75a0fabbdaeab65ce17c308c425d879fa48bb1d7986f96bfad0b99", size = 36815998, upload-time = "2026-01-30T15:09:39.627Z" },
+ { url = "https://files.pythonhosted.org/packages/41/f8/5ec599c5e59d2f2f336a05b4f318e733077cd5044f24adb6f86900c3e6a7/playwright-1.58.0-py3-none-win_amd64.whl", hash = "sha256:a2bf639d0ce33b3ba38de777e08697b0d8f3dc07ab6802e4ac53fb65e3907af8", size = 36816005, upload-time = "2026-01-30T15:09:42.449Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/c4/cc0229fea55c87d6c9c67fe44a21e2cd28d1d558a5478ed4d617e9fb0c93/playwright-1.58.0-py3-none-win_arm64.whl", hash = "sha256:32ffe5c303901a13a0ecab91d1c3f74baf73b84f4bedbb6b935f5bc11cc98e1b", size = 33085919, upload-time = "2026-01-30T15:09:45.71Z" },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
[[package]]
name = "propcache"
version = "0.4.1"
@@ -1601,6 +1846,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
]
+[[package]]
+name = "protego"
+version = "0.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/07/a7/955c422611d00a6e4a06d30b367ea9bb4fb09d48552e92aef1ba312493c7/protego-0.6.0.tar.gz", hash = "sha256:3466f41438421cf90008e98534d5fde47dc16a17482571d021143ac18b70ace9", size = 3137423, upload-time = "2026-01-29T10:58:28.267Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d8/8c/f4dd590f48addf31398f78a78962eaa99eb4c87ac09c1927497032644731/protego-0.6.0-py3-none-any.whl", hash = "sha256:7210e6e06a8db839502baf1bfbcb810689a58e394d31408ef1ef9e4e3d79fc44", size = 10313, upload-time = "2026-01-29T10:58:26.748Z" },
+]
+
[[package]]
name = "pyasn1"
version = "0.6.1"
@@ -1622,6 +1876,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
]
+[[package]]
+name = "pycparser"
+version = "3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
+]
+
[[package]]
name = "pydantic"
version = "2.12.4"
@@ -1709,6 +1972,27 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" },
]
+[[package]]
+name = "pydispatcher"
+version = "2.0.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/db/030d0700ae90d2f9d52c2f3c1f864881e19cef8cba3b0a08759c8494c19c/PyDispatcher-2.0.7.tar.gz", hash = "sha256:b777c6ad080dc1bad74a4c29d6a46914fa6701ac70f94b0d66fbcfde62f5be31", size = 38891, upload-time = "2023-02-17T20:11:13.106Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/66/0e/9ee7bc0b48ec45d93b302fa2d787830dca4dc454d31a237faa5815995988/PyDispatcher-2.0.7-py3-none-any.whl", hash = "sha256:96543bea04115ffde08f851e1d45cacbfd1ee866ac42127d9b476dc5aefa7de0", size = 12040, upload-time = "2023-02-17T20:11:11.991Z" },
+]
+
+[[package]]
+name = "pyee"
+version = "13.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/95/03/1fd98d5841cd7964a27d729ccf2199602fe05eb7a405c1462eb7277945ed/pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37", size = 31250, upload-time = "2025-03-17T18:53:15.955Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9b/4d/b9add7c84060d4c1906abe9a7e5359f2a60f7a9a4f67268b2766673427d8/pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498", size = 15730, upload-time = "2025-03-17T18:53:14.532Z" },
+]
+
[[package]]
name = "pygments"
version = "2.19.2"
@@ -1727,6 +2011,52 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997, upload-time = "2024-11-28T03:43:27.893Z" },
]
+[[package]]
+name = "pyopenssl"
+version = "25.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cryptography" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/80/be/97b83a464498a79103036bc74d1038df4a7ef0e402cfaf4d5e113fb14759/pyopenssl-25.3.0.tar.gz", hash = "sha256:c981cb0a3fd84e8602d7afc209522773b94c1c2446a3c710a75b06fe1beae329", size = 184073, upload-time = "2025-09-17T00:32:21.037Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d1/81/ef2b1dfd1862567d573a4fdbc9f969067621764fbb74338496840a1d2977/pyopenssl-25.3.0-py3-none-any.whl", hash = "sha256:1fda6fc034d5e3d179d39e59c1895c9faeaf40a79de5fc4cbbfbe0d36f4a77b6", size = 57268, upload-time = "2025-09-17T00:32:19.474Z" },
+]
+
+[[package]]
+name = "pypydispatcher"
+version = "2.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d5/7b/65f55513d3c769fd677f90032d8d8703e3dc17e88a41b6074d2177548bca/PyPyDispatcher-2.1.2.tar.gz", hash = "sha256:b6bec5dfcff9d2535bca2b23c80eae367b1ac250a645106948d315fcfa9130f2", size = 23224, upload-time = "2017-07-03T14:20:51.806Z" }
+
+[[package]]
+name = "pytest"
+version = "9.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+ { name = "iniconfig" },
+ { name = "packaging" },
+ { name = "pluggy" },
+ { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
+]
+
+[[package]]
+name = "pytest-asyncio"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" },
+]
+
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -1802,6 +2132,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
]
+[[package]]
+name = "queuelib"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/76/f3/d80ab8c7c91b8c42d9a2aa4dd97a8be1321e7b26000c2675b75e641d958c/queuelib-1.9.0.tar.gz", hash = "sha256:b12fea79fd8c1dd23e212b1f3db58003b773949801d4f4e6f34d882467d4a192", size = 11729, upload-time = "2026-01-29T11:19:37.065Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/4a/1c/8df7b461497b42fcc1e7c44529201975ec77b0e1ebecd00df4b1f096c1d4/queuelib-1.9.0-py3-none-any.whl", hash = "sha256:c5fd3bebf2c924446fa94fca6b72e81168f79cf4c2a9143b8b26f266a423fcf3", size = 13585, upload-time = "2026-01-29T11:19:35.616Z" },
+]
+
[[package]]
name = "redis"
version = "7.0.1"
@@ -1813,66 +2152,74 @@ wheels = [
[[package]]
name = "regex"
-version = "2025.11.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cc/a9/546676f25e573a4cf00fe8e119b78a37b6a8fe2dc95cda877b30889c9c45/regex-2025.11.3.tar.gz", hash = "sha256:1fedc720f9bb2494ce31a58a1631f9c82df6a09b49c19517ea5cc280b4541e01", size = 414669, upload-time = "2025-11-03T21:34:22.089Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/e1/a7/dda24ebd49da46a197436ad96378f17df30ceb40e52e859fc42cac45b850/regex-2025.11.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c1e448051717a334891f2b9a620fe36776ebf3dd8ec46a0b877c8ae69575feb4", size = 489081, upload-time = "2025-11-03T21:31:55.9Z" },
- { url = "https://files.pythonhosted.org/packages/19/22/af2dc751aacf88089836aa088a1a11c4f21a04707eb1b0478e8e8fb32847/regex-2025.11.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9b5aca4d5dfd7fbfbfbdaf44850fcc7709a01146a797536a8f84952e940cca76", size = 291123, upload-time = "2025-11-03T21:31:57.758Z" },
- { url = "https://files.pythonhosted.org/packages/a3/88/1a3ea5672f4b0a84802ee9891b86743438e7c04eb0b8f8c4e16a42375327/regex-2025.11.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:04d2765516395cf7dda331a244a3282c0f5ae96075f728629287dfa6f76ba70a", size = 288814, upload-time = "2025-11-03T21:32:01.12Z" },
- { url = "https://files.pythonhosted.org/packages/fb/8c/f5987895bf42b8ddeea1b315c9fedcfe07cadee28b9c98cf50d00adcb14d/regex-2025.11.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d9903ca42bfeec4cebedba8022a7c97ad2aab22e09573ce9976ba01b65e4361", size = 798592, upload-time = "2025-11-03T21:32:03.006Z" },
- { url = "https://files.pythonhosted.org/packages/99/2a/6591ebeede78203fa77ee46a1c36649e02df9eaa77a033d1ccdf2fcd5d4e/regex-2025.11.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:639431bdc89d6429f6721625e8129413980ccd62e9d3f496be618a41d205f160", size = 864122, upload-time = "2025-11-03T21:32:04.553Z" },
- { url = "https://files.pythonhosted.org/packages/94/d6/be32a87cf28cf8ed064ff281cfbd49aefd90242a83e4b08b5a86b38e8eb4/regex-2025.11.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f117efad42068f9715677c8523ed2be1518116d1c49b1dd17987716695181efe", size = 912272, upload-time = "2025-11-03T21:32:06.148Z" },
- { url = "https://files.pythonhosted.org/packages/62/11/9bcef2d1445665b180ac7f230406ad80671f0fc2a6ffb93493b5dd8cd64c/regex-2025.11.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4aecb6f461316adf9f1f0f6a4a1a3d79e045f9b71ec76055a791affa3b285850", size = 803497, upload-time = "2025-11-03T21:32:08.162Z" },
- { url = "https://files.pythonhosted.org/packages/e5/a7/da0dc273d57f560399aa16d8a68ae7f9b57679476fc7ace46501d455fe84/regex-2025.11.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3b3a5f320136873cc5561098dfab677eea139521cb9a9e8db98b7e64aef44cbc", size = 787892, upload-time = "2025-11-03T21:32:09.769Z" },
- { url = "https://files.pythonhosted.org/packages/da/4b/732a0c5a9736a0b8d6d720d4945a2f1e6f38f87f48f3173559f53e8d5d82/regex-2025.11.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:75fa6f0056e7efb1f42a1c34e58be24072cb9e61a601340cc1196ae92326a4f9", size = 858462, upload-time = "2025-11-03T21:32:11.769Z" },
- { url = "https://files.pythonhosted.org/packages/0c/f5/a2a03df27dc4c2d0c769220f5110ba8c4084b0bfa9ab0f9b4fcfa3d2b0fc/regex-2025.11.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:dbe6095001465294f13f1adcd3311e50dd84e5a71525f20a10bd16689c61ce0b", size = 850528, upload-time = "2025-11-03T21:32:13.906Z" },
- { url = "https://files.pythonhosted.org/packages/d6/09/e1cd5bee3841c7f6eb37d95ca91cdee7100b8f88b81e41c2ef426910891a/regex-2025.11.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:454d9b4ae7881afbc25015b8627c16d88a597479b9dea82b8c6e7e2e07240dc7", size = 789866, upload-time = "2025-11-03T21:32:15.748Z" },
- { url = "https://files.pythonhosted.org/packages/eb/51/702f5ea74e2a9c13d855a6a85b7f80c30f9e72a95493260193c07f3f8d74/regex-2025.11.3-cp313-cp313-win32.whl", hash = "sha256:28ba4d69171fc6e9896337d4fc63a43660002b7da53fc15ac992abcf3410917c", size = 266189, upload-time = "2025-11-03T21:32:17.493Z" },
- { url = "https://files.pythonhosted.org/packages/8b/00/6e29bb314e271a743170e53649db0fdb8e8ff0b64b4f425f5602f4eb9014/regex-2025.11.3-cp313-cp313-win_amd64.whl", hash = "sha256:bac4200befe50c670c405dc33af26dad5a3b6b255dd6c000d92fe4629f9ed6a5", size = 277054, upload-time = "2025-11-03T21:32:19.042Z" },
- { url = "https://files.pythonhosted.org/packages/25/f1/b156ff9f2ec9ac441710764dda95e4edaf5f36aca48246d1eea3f1fd96ec/regex-2025.11.3-cp313-cp313-win_arm64.whl", hash = "sha256:2292cd5a90dab247f9abe892ac584cb24f0f54680c73fcb4a7493c66c2bf2467", size = 270325, upload-time = "2025-11-03T21:32:21.338Z" },
- { url = "https://files.pythonhosted.org/packages/20/28/fd0c63357caefe5680b8ea052131acbd7f456893b69cc2a90cc3e0dc90d4/regex-2025.11.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:1eb1ebf6822b756c723e09f5186473d93236c06c579d2cc0671a722d2ab14281", size = 491984, upload-time = "2025-11-03T21:32:23.466Z" },
- { url = "https://files.pythonhosted.org/packages/df/ec/7014c15626ab46b902b3bcc4b28a7bae46d8f281fc7ea9c95e22fcaaa917/regex-2025.11.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1e00ec2970aab10dc5db34af535f21fcf32b4a31d99e34963419636e2f85ae39", size = 292673, upload-time = "2025-11-03T21:32:25.034Z" },
- { url = "https://files.pythonhosted.org/packages/23/ab/3b952ff7239f20d05f1f99e9e20188513905f218c81d52fb5e78d2bf7634/regex-2025.11.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a4cb042b615245d5ff9b3794f56be4138b5adc35a4166014d31d1814744148c7", size = 291029, upload-time = "2025-11-03T21:32:26.528Z" },
- { url = "https://files.pythonhosted.org/packages/21/7e/3dc2749fc684f455f162dcafb8a187b559e2614f3826877d3844a131f37b/regex-2025.11.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:44f264d4bf02f3176467d90b294d59bf1db9fe53c141ff772f27a8b456b2a9ed", size = 807437, upload-time = "2025-11-03T21:32:28.363Z" },
- { url = "https://files.pythonhosted.org/packages/1b/0b/d529a85ab349c6a25d1ca783235b6e3eedf187247eab536797021f7126c6/regex-2025.11.3-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7be0277469bf3bd7a34a9c57c1b6a724532a0d235cd0dc4e7f4316f982c28b19", size = 873368, upload-time = "2025-11-03T21:32:30.4Z" },
- { url = "https://files.pythonhosted.org/packages/7d/18/2d868155f8c9e3e9d8f9e10c64e9a9f496bb8f7e037a88a8bed26b435af6/regex-2025.11.3-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0d31e08426ff4b5b650f68839f5af51a92a5b51abd8554a60c2fbc7c71f25d0b", size = 914921, upload-time = "2025-11-03T21:32:32.123Z" },
- { url = "https://files.pythonhosted.org/packages/2d/71/9d72ff0f354fa783fe2ba913c8734c3b433b86406117a8db4ea2bf1c7a2f/regex-2025.11.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e43586ce5bd28f9f285a6e729466841368c4a0353f6fd08d4ce4630843d3648a", size = 812708, upload-time = "2025-11-03T21:32:34.305Z" },
- { url = "https://files.pythonhosted.org/packages/e7/19/ce4bf7f5575c97f82b6e804ffb5c4e940c62609ab2a0d9538d47a7fdf7d4/regex-2025.11.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0f9397d561a4c16829d4e6ff75202c1c08b68a3bdbfe29dbfcdb31c9830907c6", size = 795472, upload-time = "2025-11-03T21:32:36.364Z" },
- { url = "https://files.pythonhosted.org/packages/03/86/fd1063a176ffb7b2315f9a1b08d17b18118b28d9df163132615b835a26ee/regex-2025.11.3-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:dd16e78eb18ffdb25ee33a0682d17912e8cc8a770e885aeee95020046128f1ce", size = 868341, upload-time = "2025-11-03T21:32:38.042Z" },
- { url = "https://files.pythonhosted.org/packages/12/43/103fb2e9811205e7386366501bc866a164a0430c79dd59eac886a2822950/regex-2025.11.3-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:ffcca5b9efe948ba0661e9df0fa50d2bc4b097c70b9810212d6b62f05d83b2dd", size = 854666, upload-time = "2025-11-03T21:32:40.079Z" },
- { url = "https://files.pythonhosted.org/packages/7d/22/e392e53f3869b75804762c7c848bd2dd2abf2b70fb0e526f58724638bd35/regex-2025.11.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c56b4d162ca2b43318ac671c65bd4d563e841a694ac70e1a976ac38fcf4ca1d2", size = 799473, upload-time = "2025-11-03T21:32:42.148Z" },
- { url = "https://files.pythonhosted.org/packages/4f/f9/8bd6b656592f925b6845fcbb4d57603a3ac2fb2373344ffa1ed70aa6820a/regex-2025.11.3-cp313-cp313t-win32.whl", hash = "sha256:9ddc42e68114e161e51e272f667d640f97e84a2b9ef14b7477c53aac20c2d59a", size = 268792, upload-time = "2025-11-03T21:32:44.13Z" },
- { url = "https://files.pythonhosted.org/packages/e5/87/0e7d603467775ff65cd2aeabf1b5b50cc1c3708556a8b849a2fa4dd1542b/regex-2025.11.3-cp313-cp313t-win_amd64.whl", hash = "sha256:7a7c7fdf755032ffdd72c77e3d8096bdcb0eb92e89e17571a196f03d88b11b3c", size = 280214, upload-time = "2025-11-03T21:32:45.853Z" },
- { url = "https://files.pythonhosted.org/packages/8d/d0/2afc6f8e94e2b64bfb738a7c2b6387ac1699f09f032d363ed9447fd2bb57/regex-2025.11.3-cp313-cp313t-win_arm64.whl", hash = "sha256:df9eb838c44f570283712e7cff14c16329a9f0fb19ca492d21d4b7528ee6821e", size = 271469, upload-time = "2025-11-03T21:32:48.026Z" },
- { url = "https://files.pythonhosted.org/packages/31/e9/f6e13de7e0983837f7b6d238ad9458800a874bf37c264f7923e63409944c/regex-2025.11.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9697a52e57576c83139d7c6f213d64485d3df5bf84807c35fa409e6c970801c6", size = 489089, upload-time = "2025-11-03T21:32:50.027Z" },
- { url = "https://files.pythonhosted.org/packages/a3/5c/261f4a262f1fa65141c1b74b255988bd2fa020cc599e53b080667d591cfc/regex-2025.11.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e18bc3f73bd41243c9b38a6d9f2366cd0e0137a9aebe2d8ff76c5b67d4c0a3f4", size = 291059, upload-time = "2025-11-03T21:32:51.682Z" },
- { url = "https://files.pythonhosted.org/packages/8e/57/f14eeb7f072b0e9a5a090d1712741fd8f214ec193dba773cf5410108bb7d/regex-2025.11.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:61a08bcb0ec14ff4e0ed2044aad948d0659604f824cbd50b55e30b0ec6f09c73", size = 288900, upload-time = "2025-11-03T21:32:53.569Z" },
- { url = "https://files.pythonhosted.org/packages/3c/6b/1d650c45e99a9b327586739d926a1cd4e94666b1bd4af90428b36af66dc7/regex-2025.11.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9c30003b9347c24bcc210958c5d167b9e4f9be786cb380a7d32f14f9b84674f", size = 799010, upload-time = "2025-11-03T21:32:55.222Z" },
- { url = "https://files.pythonhosted.org/packages/99/ee/d66dcbc6b628ce4e3f7f0cbbb84603aa2fc0ffc878babc857726b8aab2e9/regex-2025.11.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4e1e592789704459900728d88d41a46fe3969b82ab62945560a31732ffc19a6d", size = 864893, upload-time = "2025-11-03T21:32:57.239Z" },
- { url = "https://files.pythonhosted.org/packages/bf/2d/f238229f1caba7ac87a6c4153d79947fb0261415827ae0f77c304260c7d3/regex-2025.11.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6538241f45eb5a25aa575dbba1069ad786f68a4f2773a29a2bd3dd1f9de787be", size = 911522, upload-time = "2025-11-03T21:32:59.274Z" },
- { url = "https://files.pythonhosted.org/packages/bd/3d/22a4eaba214a917c80e04f6025d26143690f0419511e0116508e24b11c9b/regex-2025.11.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce22519c989bb72a7e6b36a199384c53db7722fe669ba891da75907fe3587db", size = 803272, upload-time = "2025-11-03T21:33:01.393Z" },
- { url = "https://files.pythonhosted.org/packages/84/b1/03188f634a409353a84b5ef49754b97dbcc0c0f6fd6c8ede505a8960a0a4/regex-2025.11.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:66d559b21d3640203ab9075797a55165d79017520685fb407b9234d72ab63c62", size = 787958, upload-time = "2025-11-03T21:33:03.379Z" },
- { url = "https://files.pythonhosted.org/packages/99/6a/27d072f7fbf6fadd59c64d210305e1ff865cc3b78b526fd147db768c553b/regex-2025.11.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:669dcfb2e38f9e8c69507bace46f4889e3abbfd9b0c29719202883c0a603598f", size = 859289, upload-time = "2025-11-03T21:33:05.374Z" },
- { url = "https://files.pythonhosted.org/packages/9a/70/1b3878f648e0b6abe023172dacb02157e685564853cc363d9961bcccde4e/regex-2025.11.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:32f74f35ff0f25a5021373ac61442edcb150731fbaa28286bbc8bb1582c89d02", size = 850026, upload-time = "2025-11-03T21:33:07.131Z" },
- { url = "https://files.pythonhosted.org/packages/dd/d5/68e25559b526b8baab8e66839304ede68ff6727237a47727d240006bd0ff/regex-2025.11.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e6c7a21dffba883234baefe91bc3388e629779582038f75d2a5be918e250f0ed", size = 789499, upload-time = "2025-11-03T21:33:09.141Z" },
- { url = "https://files.pythonhosted.org/packages/fc/df/43971264857140a350910d4e33df725e8c94dd9dee8d2e4729fa0d63d49e/regex-2025.11.3-cp314-cp314-win32.whl", hash = "sha256:795ea137b1d809eb6836b43748b12634291c0ed55ad50a7d72d21edf1cd565c4", size = 271604, upload-time = "2025-11-03T21:33:10.9Z" },
- { url = "https://files.pythonhosted.org/packages/01/6f/9711b57dc6894a55faf80a4c1b5aa4f8649805cb9c7aef46f7d27e2b9206/regex-2025.11.3-cp314-cp314-win_amd64.whl", hash = "sha256:9f95fbaa0ee1610ec0fc6b26668e9917a582ba80c52cc6d9ada15e30aa9ab9ad", size = 280320, upload-time = "2025-11-03T21:33:12.572Z" },
- { url = "https://files.pythonhosted.org/packages/f1/7e/f6eaa207d4377481f5e1775cdeb5a443b5a59b392d0065f3417d31d80f87/regex-2025.11.3-cp314-cp314-win_arm64.whl", hash = "sha256:dfec44d532be4c07088c3de2876130ff0fbeeacaa89a137decbbb5f665855a0f", size = 273372, upload-time = "2025-11-03T21:33:14.219Z" },
- { url = "https://files.pythonhosted.org/packages/c3/06/49b198550ee0f5e4184271cee87ba4dfd9692c91ec55289e6282f0f86ccf/regex-2025.11.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ba0d8a5d7f04f73ee7d01d974d47c5834f8a1b0224390e4fe7c12a3a92a78ecc", size = 491985, upload-time = "2025-11-03T21:33:16.555Z" },
- { url = "https://files.pythonhosted.org/packages/ce/bf/abdafade008f0b1c9da10d934034cb670432d6cf6cbe38bbb53a1cfd6cf8/regex-2025.11.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:442d86cf1cfe4faabf97db7d901ef58347efd004934da045c745e7b5bd57ac49", size = 292669, upload-time = "2025-11-03T21:33:18.32Z" },
- { url = "https://files.pythonhosted.org/packages/f9/ef/0c357bb8edbd2ad8e273fcb9e1761bc37b8acbc6e1be050bebd6475f19c1/regex-2025.11.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fd0a5e563c756de210bb964789b5abe4f114dacae9104a47e1a649b910361536", size = 291030, upload-time = "2025-11-03T21:33:20.048Z" },
- { url = "https://files.pythonhosted.org/packages/79/06/edbb67257596649b8fb088d6aeacbcb248ac195714b18a65e018bf4c0b50/regex-2025.11.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf3490bcbb985a1ae97b2ce9ad1c0f06a852d5b19dde9b07bdf25bf224248c95", size = 807674, upload-time = "2025-11-03T21:33:21.797Z" },
- { url = "https://files.pythonhosted.org/packages/f4/d9/ad4deccfce0ea336296bd087f1a191543bb99ee1c53093dcd4c64d951d00/regex-2025.11.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3809988f0a8b8c9dcc0f92478d6501fac7200b9ec56aecf0ec21f4a2ec4b6009", size = 873451, upload-time = "2025-11-03T21:33:23.741Z" },
- { url = "https://files.pythonhosted.org/packages/13/75/a55a4724c56ef13e3e04acaab29df26582f6978c000ac9cd6810ad1f341f/regex-2025.11.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f4ff94e58e84aedb9c9fce66d4ef9f27a190285b451420f297c9a09f2b9abee9", size = 914980, upload-time = "2025-11-03T21:33:25.999Z" },
- { url = "https://files.pythonhosted.org/packages/67/1e/a1657ee15bd9116f70d4a530c736983eed997b361e20ecd8f5ca3759d5c5/regex-2025.11.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eb542fd347ce61e1321b0a6b945d5701528dca0cd9759c2e3bb8bd57e47964d", size = 812852, upload-time = "2025-11-03T21:33:27.852Z" },
- { url = "https://files.pythonhosted.org/packages/b8/6f/f7516dde5506a588a561d296b2d0044839de06035bb486b326065b4c101e/regex-2025.11.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2d5919075a1f2e413c00b056ea0c2f065b3f5fe83c3d07d325ab92dce51d6", size = 795566, upload-time = "2025-11-03T21:33:32.364Z" },
- { url = "https://files.pythonhosted.org/packages/d9/dd/3d10b9e170cc16fb34cb2cef91513cf3df65f440b3366030631b2984a264/regex-2025.11.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:3f8bf11a4827cc7ce5a53d4ef6cddd5ad25595d3c1435ef08f76825851343154", size = 868463, upload-time = "2025-11-03T21:33:34.459Z" },
- { url = "https://files.pythonhosted.org/packages/f5/8e/935e6beff1695aa9085ff83195daccd72acc82c81793df480f34569330de/regex-2025.11.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:22c12d837298651e5550ac1d964e4ff57c3f56965fc1812c90c9fb2028eaf267", size = 854694, upload-time = "2025-11-03T21:33:36.793Z" },
- { url = "https://files.pythonhosted.org/packages/92/12/10650181a040978b2f5720a6a74d44f841371a3d984c2083fc1752e4acf6/regex-2025.11.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ba394a3dda9ad41c7c780f60f6e4a70988741415ae96f6d1bf6c239cf01379", size = 799691, upload-time = "2025-11-03T21:33:39.079Z" },
- { url = "https://files.pythonhosted.org/packages/67/90/8f37138181c9a7690e7e4cb388debbd389342db3c7381d636d2875940752/regex-2025.11.3-cp314-cp314t-win32.whl", hash = "sha256:4bf146dca15cdd53224a1bf46d628bd7590e4a07fbb69e720d561aea43a32b38", size = 274583, upload-time = "2025-11-03T21:33:41.302Z" },
- { url = "https://files.pythonhosted.org/packages/8f/cd/867f5ec442d56beb56f5f854f40abcfc75e11d10b11fdb1869dd39c63aaf/regex-2025.11.3-cp314-cp314t-win_amd64.whl", hash = "sha256:adad1a1bcf1c9e76346e091d22d23ac54ef28e1365117d99521631078dfec9de", size = 284286, upload-time = "2025-11-03T21:33:43.324Z" },
- { url = "https://files.pythonhosted.org/packages/20/31/32c0c4610cbc070362bf1d2e4ea86d1ea29014d400a6d6c2486fcfd57766/regex-2025.11.3-cp314-cp314t-win_arm64.whl", hash = "sha256:c54f768482cef41e219720013cd05933b6f971d9562544d691c68699bf2b6801", size = 274741, upload-time = "2025-11-03T21:33:45.557Z" },
+version = "2026.1.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0b/86/07d5056945f9ec4590b518171c4254a5925832eb727b56d3c38a7476f316/regex-2026.1.15.tar.gz", hash = "sha256:164759aa25575cbc0651bef59a0b18353e54300d79ace8084c818ad8ac72b7d5", size = 414811, upload-time = "2026-01-14T23:18:02.775Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f8/2e/6870bb16e982669b674cce3ee9ff2d1d46ab80528ee6bcc20fb2292efb60/regex-2026.1.15-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e69d0deeb977ffe7ed3d2e4439360089f9c3f217ada608f0f88ebd67afb6385e", size = 489164, upload-time = "2026-01-14T23:15:13.962Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/67/9774542e203849b0286badf67199970a44ebdb0cc5fb739f06e47ada72f8/regex-2026.1.15-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3601ffb5375de85a16f407854d11cca8fe3f5febbe3ac78fb2866bb220c74d10", size = 291218, upload-time = "2026-01-14T23:15:15.647Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/87/b0cda79f22b8dee05f774922a214da109f9a4c0eca5da2c9d72d77ea062c/regex-2026.1.15-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4c5ef43b5c2d4114eb8ea424bb8c9cec01d5d17f242af88b2448f5ee81caadbc", size = 288895, upload-time = "2026-01-14T23:15:17.788Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/6a/0041f0a2170d32be01ab981d6346c83a8934277d82c780d60b127331f264/regex-2026.1.15-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:968c14d4f03e10b2fd960f1d5168c1f0ac969381d3c1fcc973bc45fb06346599", size = 798680, upload-time = "2026-01-14T23:15:19.342Z" },
+ { url = "https://files.pythonhosted.org/packages/58/de/30e1cfcdbe3e891324aa7568b7c968771f82190df5524fabc1138cb2d45a/regex-2026.1.15-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:56a5595d0f892f214609c9f76b41b7428bed439d98dc961efafdd1354d42baae", size = 864210, upload-time = "2026-01-14T23:15:22.005Z" },
+ { url = "https://files.pythonhosted.org/packages/64/44/4db2f5c5ca0ccd40ff052ae7b1e9731352fcdad946c2b812285a7505ca75/regex-2026.1.15-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf650f26087363434c4e560011f8e4e738f6f3e029b85d4904c50135b86cfa5", size = 912358, upload-time = "2026-01-14T23:15:24.569Z" },
+ { url = "https://files.pythonhosted.org/packages/79/b6/e6a5665d43a7c42467138c8a2549be432bad22cbd206f5ec87162de74bd7/regex-2026.1.15-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18388a62989c72ac24de75f1449d0fb0b04dfccd0a1a7c1c43af5eb503d890f6", size = 803583, upload-time = "2026-01-14T23:15:26.526Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/53/7cd478222169d85d74d7437e74750005e993f52f335f7c04ff7adfda3310/regex-2026.1.15-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d220a2517f5893f55daac983bfa9fe998a7dbcaee4f5d27a88500f8b7873788", size = 775782, upload-time = "2026-01-14T23:15:29.352Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/b5/75f9a9ee4b03a7c009fe60500fe550b45df94f0955ca29af16333ef557c5/regex-2026.1.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c9c08c2fbc6120e70abff5d7f28ffb4d969e14294fb2143b4b5c7d20e46d1714", size = 787978, upload-time = "2026-01-14T23:15:31.295Z" },
+ { url = "https://files.pythonhosted.org/packages/72/b3/79821c826245bbe9ccbb54f6eadb7879c722fd3e0248c17bfc90bf54e123/regex-2026.1.15-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7ef7d5d4bd49ec7364315167a4134a015f61e8266c6d446fc116a9ac4456e10d", size = 858550, upload-time = "2026-01-14T23:15:33.558Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/85/2ab5f77a1c465745bfbfcb3ad63178a58337ae8d5274315e2cc623a822fa/regex-2026.1.15-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:6e42844ad64194fa08d5ccb75fe6a459b9b08e6d7296bd704460168d58a388f3", size = 763747, upload-time = "2026-01-14T23:15:35.206Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/84/c27df502d4bfe2873a3e3a7cf1bdb2b9cc10284d1a44797cf38bed790470/regex-2026.1.15-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cfecdaa4b19f9ca534746eb3b55a5195d5c95b88cac32a205e981ec0a22b7d31", size = 850615, upload-time = "2026-01-14T23:15:37.523Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/b7/658a9782fb253680aa8ecb5ccbb51f69e088ed48142c46d9f0c99b46c575/regex-2026.1.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:08df9722d9b87834a3d701f3fca570b2be115654dbfd30179f30ab2f39d606d3", size = 789951, upload-time = "2026-01-14T23:15:39.582Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/2a/5928af114441e059f15b2f63e188bd00c6529b3051c974ade7444b85fcda/regex-2026.1.15-cp313-cp313-win32.whl", hash = "sha256:d426616dae0967ca225ab12c22274eb816558f2f99ccb4a1d52ca92e8baf180f", size = 266275, upload-time = "2026-01-14T23:15:42.108Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/16/5bfbb89e435897bff28cf0352a992ca719d9e55ebf8b629203c96b6ce4f7/regex-2026.1.15-cp313-cp313-win_amd64.whl", hash = "sha256:febd38857b09867d3ed3f4f1af7d241c5c50362e25ef43034995b77a50df494e", size = 277145, upload-time = "2026-01-14T23:15:44.244Z" },
+ { url = "https://files.pythonhosted.org/packages/56/c1/a09ff7392ef4233296e821aec5f78c51be5e91ffde0d163059e50fd75835/regex-2026.1.15-cp313-cp313-win_arm64.whl", hash = "sha256:8e32f7896f83774f91499d239e24cebfadbc07639c1494bb7213983842348337", size = 270411, upload-time = "2026-01-14T23:15:45.858Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/38/0cfd5a78e5c6db00e6782fdae70458f89850ce95baa5e8694ab91d89744f/regex-2026.1.15-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:ec94c04149b6a7b8120f9f44565722c7ae31b7a6d2275569d2eefa76b83da3be", size = 492068, upload-time = "2026-01-14T23:15:47.616Z" },
+ { url = "https://files.pythonhosted.org/packages/50/72/6c86acff16cb7c959c4355826bbf06aad670682d07c8f3998d9ef4fee7cd/regex-2026.1.15-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:40c86d8046915bb9aeb15d3f3f15b6fd500b8ea4485b30e1bbc799dab3fe29f8", size = 292756, upload-time = "2026-01-14T23:15:49.307Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/58/df7fb69eadfe76526ddfce28abdc0af09ffe65f20c2c90932e89d705153f/regex-2026.1.15-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:726ea4e727aba21643205edad8f2187ec682d3305d790f73b7a51c7587b64bdd", size = 291114, upload-time = "2026-01-14T23:15:51.484Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/6c/a4011cd1cf96b90d2cdc7e156f91efbd26531e822a7fbb82a43c1016678e/regex-2026.1.15-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1cb740d044aff31898804e7bf1181cc72c03d11dfd19932b9911ffc19a79070a", size = 807524, upload-time = "2026-01-14T23:15:53.102Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/25/a53ffb73183f69c3e9f4355c4922b76d2840aee160af6af5fac229b6201d/regex-2026.1.15-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:05d75a668e9ea16f832390d22131fe1e8acc8389a694c8febc3e340b0f810b93", size = 873455, upload-time = "2026-01-14T23:15:54.956Z" },
+ { url = "https://files.pythonhosted.org/packages/66/0b/8b47fc2e8f97d9b4a851736f3890a5f786443aa8901061c55f24c955f45b/regex-2026.1.15-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d991483606f3dbec93287b9f35596f41aa2e92b7c2ebbb935b63f409e243c9af", size = 915007, upload-time = "2026-01-14T23:15:57.041Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/fa/97de0d681e6d26fabe71968dbee06dd52819e9a22fdce5dac7256c31ed84/regex-2026.1.15-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:194312a14819d3e44628a44ed6fea6898fdbecb0550089d84c403475138d0a09", size = 812794, upload-time = "2026-01-14T23:15:58.916Z" },
+ { url = "https://files.pythonhosted.org/packages/22/38/e752f94e860d429654aa2b1c51880bff8dfe8f084268258adf9151cf1f53/regex-2026.1.15-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe2fda4110a3d0bc163c2e0664be44657431440722c5c5315c65155cab92f9e5", size = 781159, upload-time = "2026-01-14T23:16:00.817Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/a7/d739ffaef33c378fc888302a018d7f81080393d96c476b058b8c64fd2b0d/regex-2026.1.15-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:124dc36c85d34ef2d9164da41a53c1c8c122cfb1f6e1ec377a1f27ee81deb794", size = 795558, upload-time = "2026-01-14T23:16:03.267Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/c4/542876f9a0ac576100fc73e9c75b779f5c31e3527576cfc9cb3009dcc58a/regex-2026.1.15-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:a1774cd1981cd212506a23a14dba7fdeaee259f5deba2df6229966d9911e767a", size = 868427, upload-time = "2026-01-14T23:16:05.646Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/0f/d5655bea5b22069e32ae85a947aa564912f23758e112cdb74212848a1a1b/regex-2026.1.15-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:b5f7d8d2867152cdb625e72a530d2ccb48a3d199159144cbdd63870882fb6f80", size = 769939, upload-time = "2026-01-14T23:16:07.542Z" },
+ { url = "https://files.pythonhosted.org/packages/20/06/7e18a4fa9d326daeda46d471a44ef94201c46eaa26dbbb780b5d92cbfdda/regex-2026.1.15-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:492534a0ab925d1db998defc3c302dae3616a2fc3fe2e08db1472348f096ddf2", size = 854753, upload-time = "2026-01-14T23:16:10.395Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/67/dc8946ef3965e166f558ef3b47f492bc364e96a265eb4a2bb3ca765c8e46/regex-2026.1.15-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c661fc820cfb33e166bf2450d3dadbda47c8d8981898adb9b6fe24e5e582ba60", size = 799559, upload-time = "2026-01-14T23:16:12.347Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/61/1bba81ff6d50c86c65d9fd84ce9699dd106438ee4cdb105bf60374ee8412/regex-2026.1.15-cp313-cp313t-win32.whl", hash = "sha256:99ad739c3686085e614bf77a508e26954ff1b8f14da0e3765ff7abbf7799f952", size = 268879, upload-time = "2026-01-14T23:16:14.049Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/5e/cef7d4c5fb0ea3ac5c775fd37db5747f7378b29526cc83f572198924ff47/regex-2026.1.15-cp313-cp313t-win_amd64.whl", hash = "sha256:32655d17905e7ff8ba5c764c43cb124e34a9245e45b83c22e81041e1071aee10", size = 280317, upload-time = "2026-01-14T23:16:15.718Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/52/4317f7a5988544e34ab57b4bde0f04944c4786128c933fb09825924d3e82/regex-2026.1.15-cp313-cp313t-win_arm64.whl", hash = "sha256:b2a13dd6a95e95a489ca242319d18fc02e07ceb28fa9ad146385194d95b3c829", size = 271551, upload-time = "2026-01-14T23:16:17.533Z" },
+ { url = "https://files.pythonhosted.org/packages/52/0a/47fa888ec7cbbc7d62c5f2a6a888878e76169170ead271a35239edd8f0e8/regex-2026.1.15-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:d920392a6b1f353f4aa54328c867fec3320fa50657e25f64abf17af054fc97ac", size = 489170, upload-time = "2026-01-14T23:16:19.835Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/c4/d000e9b7296c15737c9301708e9e7fbdea009f8e93541b6b43bdb8219646/regex-2026.1.15-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b5a28980a926fa810dbbed059547b02783952e2efd9c636412345232ddb87ff6", size = 291146, upload-time = "2026-01-14T23:16:21.541Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/b6/921cc61982e538682bdf3bdf5b2c6ab6b34368da1f8e98a6c1ddc503c9cf/regex-2026.1.15-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:621f73a07595d83f28952d7bd1e91e9d1ed7625fb7af0064d3516674ec93a2a2", size = 288986, upload-time = "2026-01-14T23:16:23.381Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/33/eb7383dde0bbc93f4fb9d03453aab97e18ad4024ac7e26cef8d1f0a2cff0/regex-2026.1.15-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d7d92495f47567a9b1669c51fc8d6d809821849063d168121ef801bbc213846", size = 799098, upload-time = "2026-01-14T23:16:25.088Z" },
+ { url = "https://files.pythonhosted.org/packages/27/56/b664dccae898fc8d8b4c23accd853f723bde0f026c747b6f6262b688029c/regex-2026.1.15-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8dd16fba2758db7a3780a051f245539c4451ca20910f5a5e6ea1c08d06d4a76b", size = 864980, upload-time = "2026-01-14T23:16:27.297Z" },
+ { url = "https://files.pythonhosted.org/packages/16/40/0999e064a170eddd237bae9ccfcd8f28b3aa98a38bf727a086425542a4fc/regex-2026.1.15-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1e1808471fbe44c1a63e5f577a1d5f02fe5d66031dcbdf12f093ffc1305a858e", size = 911607, upload-time = "2026-01-14T23:16:29.235Z" },
+ { url = "https://files.pythonhosted.org/packages/07/78/c77f644b68ab054e5a674fb4da40ff7bffb2c88df58afa82dbf86573092d/regex-2026.1.15-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0751a26ad39d4f2ade8fe16c59b2bf5cb19eb3d2cd543e709e583d559bd9efde", size = 803358, upload-time = "2026-01-14T23:16:31.369Z" },
+ { url = "https://files.pythonhosted.org/packages/27/31/d4292ea8566eaa551fafc07797961c5963cf5235c797cc2ae19b85dfd04d/regex-2026.1.15-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0f0c7684c7f9ca241344ff95a1de964f257a5251968484270e91c25a755532c5", size = 775833, upload-time = "2026-01-14T23:16:33.141Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/b2/cff3bf2fea4133aa6fb0d1e370b37544d18c8350a2fa118c7e11d1db0e14/regex-2026.1.15-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:74f45d170a21df41508cb67165456538425185baaf686281fa210d7e729abc34", size = 788045, upload-time = "2026-01-14T23:16:35.005Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/99/2cb9b69045372ec877b6f5124bda4eb4253bc58b8fe5848c973f752bc52c/regex-2026.1.15-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f1862739a1ffb50615c0fde6bae6569b5efbe08d98e59ce009f68a336f64da75", size = 859374, upload-time = "2026-01-14T23:16:36.919Z" },
+ { url = "https://files.pythonhosted.org/packages/09/16/710b0a5abe8e077b1729a562d2f297224ad079f3a66dce46844c193416c8/regex-2026.1.15-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:453078802f1b9e2b7303fb79222c054cb18e76f7bdc220f7530fdc85d319f99e", size = 763940, upload-time = "2026-01-14T23:16:38.685Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/d1/7585c8e744e40eb3d32f119191969b91de04c073fca98ec14299041f6e7e/regex-2026.1.15-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:a30a68e89e5a218b8b23a52292924c1f4b245cb0c68d1cce9aec9bbda6e2c160", size = 850112, upload-time = "2026-01-14T23:16:40.646Z" },
+ { url = "https://files.pythonhosted.org/packages/af/d6/43e1dd85df86c49a347aa57c1f69d12c652c7b60e37ec162e3096194a278/regex-2026.1.15-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9479cae874c81bf610d72b85bb681a94c95722c127b55445285fb0e2c82db8e1", size = 789586, upload-time = "2026-01-14T23:16:42.799Z" },
+ { url = "https://files.pythonhosted.org/packages/93/38/77142422f631e013f316aaae83234c629555729a9fbc952b8a63ac91462a/regex-2026.1.15-cp314-cp314-win32.whl", hash = "sha256:d639a750223132afbfb8f429c60d9d318aeba03281a5f1ab49f877456448dcf1", size = 271691, upload-time = "2026-01-14T23:16:44.671Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/a9/ab16b4649524ca9e05213c1cdbb7faa85cc2aa90a0230d2f796cbaf22736/regex-2026.1.15-cp314-cp314-win_amd64.whl", hash = "sha256:4161d87f85fa831e31469bfd82c186923070fc970b9de75339b68f0c75b51903", size = 280422, upload-time = "2026-01-14T23:16:46.607Z" },
+ { url = "https://files.pythonhosted.org/packages/be/2a/20fd057bf3521cb4791f69f869635f73e0aaf2b9ad2d260f728144f9047c/regex-2026.1.15-cp314-cp314-win_arm64.whl", hash = "sha256:91c5036ebb62663a6b3999bdd2e559fd8456d17e2b485bf509784cd31a8b1705", size = 273467, upload-time = "2026-01-14T23:16:48.967Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/77/0b1e81857060b92b9cad239104c46507dd481b3ff1fa79f8e7f865aae38a/regex-2026.1.15-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ee6854c9000a10938c79238de2379bea30c82e4925a371711af45387df35cab8", size = 492073, upload-time = "2026-01-14T23:16:51.154Z" },
+ { url = "https://files.pythonhosted.org/packages/70/f3/f8302b0c208b22c1e4f423147e1913fd475ddd6230565b299925353de644/regex-2026.1.15-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2c2b80399a422348ce5de4fe40c418d6299a0fa2803dd61dc0b1a2f28e280fcf", size = 292757, upload-time = "2026-01-14T23:16:53.08Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/f0/ef55de2460f3b4a6da9d9e7daacd0cb79d4ef75c64a2af316e68447f0df0/regex-2026.1.15-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:dca3582bca82596609959ac39e12b7dad98385b4fefccb1151b937383cec547d", size = 291122, upload-time = "2026-01-14T23:16:55.383Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/55/bb8ccbacabbc3a11d863ee62a9f18b160a83084ea95cdfc5d207bfc3dd75/regex-2026.1.15-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef71d476caa6692eea743ae5ea23cde3260677f70122c4d258ca952e5c2d4e84", size = 807761, upload-time = "2026-01-14T23:16:57.251Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/84/f75d937f17f81e55679a0509e86176e29caa7298c38bd1db7ce9c0bf6075/regex-2026.1.15-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c243da3436354f4af6c3058a3f81a97d47ea52c9bd874b52fd30274853a1d5df", size = 873538, upload-time = "2026-01-14T23:16:59.349Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/d9/0da86327df70349aa8d86390da91171bd3ca4f0e7c1d1d453a9c10344da3/regex-2026.1.15-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8355ad842a7c7e9e5e55653eade3b7d1885ba86f124dd8ab1f722f9be6627434", size = 915066, upload-time = "2026-01-14T23:17:01.607Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/5e/f660fb23fc77baa2a61aa1f1fe3a4eea2bbb8a286ddec148030672e18834/regex-2026.1.15-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f192a831d9575271a22d804ff1a5355355723f94f31d9eef25f0d45a152fdc1a", size = 812938, upload-time = "2026-01-14T23:17:04.366Z" },
+ { url = "https://files.pythonhosted.org/packages/69/33/a47a29bfecebbbfd1e5cd3f26b28020a97e4820f1c5148e66e3b7d4b4992/regex-2026.1.15-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:166551807ec20d47ceaeec380081f843e88c8949780cd42c40f18d16168bed10", size = 781314, upload-time = "2026-01-14T23:17:06.378Z" },
+ { url = "https://files.pythonhosted.org/packages/65/ec/7ec2bbfd4c3f4e494a24dec4c6943a668e2030426b1b8b949a6462d2c17b/regex-2026.1.15-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f9ca1cbdc0fbfe5e6e6f8221ef2309988db5bcede52443aeaee9a4ad555e0dac", size = 795652, upload-time = "2026-01-14T23:17:08.521Z" },
+ { url = "https://files.pythonhosted.org/packages/46/79/a5d8651ae131fe27d7c521ad300aa7f1c7be1dbeee4d446498af5411b8a9/regex-2026.1.15-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b30bcbd1e1221783c721483953d9e4f3ab9c5d165aa709693d3f3946747b1aea", size = 868550, upload-time = "2026-01-14T23:17:10.573Z" },
+ { url = "https://files.pythonhosted.org/packages/06/b7/25635d2809664b79f183070786a5552dd4e627e5aedb0065f4e3cf8ee37d/regex-2026.1.15-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2a8d7b50c34578d0d3bf7ad58cde9652b7d683691876f83aedc002862a35dc5e", size = 769981, upload-time = "2026-01-14T23:17:12.871Z" },
+ { url = "https://files.pythonhosted.org/packages/16/8b/fc3fcbb2393dcfa4a6c5ffad92dc498e842df4581ea9d14309fcd3c55fb9/regex-2026.1.15-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9d787e3310c6a6425eb346be4ff2ccf6eece63017916fd77fe8328c57be83521", size = 854780, upload-time = "2026-01-14T23:17:14.837Z" },
+ { url = "https://files.pythonhosted.org/packages/d0/38/dde117c76c624713c8a2842530be9c93ca8b606c0f6102d86e8cd1ce8bea/regex-2026.1.15-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:619843841e220adca114118533a574a9cd183ed8a28b85627d2844c500a2b0db", size = 799778, upload-time = "2026-01-14T23:17:17.369Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/0d/3a6cfa9ae99606afb612d8fb7a66b245a9d5ff0f29bb347c8a30b6ad561b/regex-2026.1.15-cp314-cp314t-win32.whl", hash = "sha256:e90b8db97f6f2c97eb045b51a6b2c5ed69cedd8392459e0642d4199b94fabd7e", size = 274667, upload-time = "2026-01-14T23:17:19.301Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/b2/297293bb0742fd06b8d8e2572db41a855cdf1cae0bf009b1cb74fe07e196/regex-2026.1.15-cp314-cp314t-win_amd64.whl", hash = "sha256:5ef19071f4ac9f0834793af85bd04a920b4407715624e40cb7a0631a11137cdf", size = 284386, upload-time = "2026-01-14T23:17:21.231Z" },
+ { url = "https://files.pythonhosted.org/packages/95/e4/a3b9480c78cf8ee86626cb06f8d931d74d775897d44201ccb813097ae697/regex-2026.1.15-cp314-cp314t-win_arm64.whl", hash = "sha256:ca89c5e596fc05b015f27561b3793dc2fa0917ea0d7507eebb448efd35274a70", size = 274837, upload-time = "2026-01-14T23:17:23.146Z" },
]
[[package]]
@@ -1890,6 +2237,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
]
+[[package]]
+name = "requests-file"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3c/f8/5dc70102e4d337063452c82e1f0d95e39abfe67aa222ed8a5ddeb9df8de8/requests_file-3.0.1.tar.gz", hash = "sha256:f14243d7796c588f3521bd423c5dea2ee4cc730e54a3cac9574d78aca1272576", size = 6967, upload-time = "2025-10-20T18:56:42.279Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e1/d5/de8f089119205a09da657ed4784c584ede8381a0ce6821212a6d4ca47054/requests_file-3.0.1-py2.py3-none-any.whl", hash = "sha256:d0f5eb94353986d998f80ac63c7f146a307728be051d4d1cd390dbdb59c10fa2", size = 4514, upload-time = "2025-10-20T18:56:41.184Z" },
+]
+
[[package]]
name = "requests-oauthlib"
version = "2.0.0"
@@ -2009,14 +2368,56 @@ wheels = [
[[package]]
name = "s3transfer"
-version = "0.14.0"
+version = "0.16.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "botocore" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/62/74/8d69dcb7a9efe8baa2046891735e5dfe433ad558ae23d9e3c14c633d1d58/s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125", size = 151547, upload-time = "2025-09-09T19:23:31.089Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" },
+]
+
+[[package]]
+name = "scrapy"
+version = "2.14.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cryptography" },
+ { name = "cssselect" },
+ { name = "defusedxml" },
+ { name = "itemadapter" },
+ { name = "itemloaders" },
+ { name = "lxml" },
+ { name = "packaging" },
+ { name = "parsel" },
+ { name = "protego" },
+ { name = "pydispatcher", marker = "platform_python_implementation == 'CPython'" },
+ { name = "pyopenssl" },
+ { name = "pypydispatcher", marker = "platform_python_implementation == 'PyPy'" },
+ { name = "queuelib" },
+ { name = "service-identity" },
+ { name = "tldextract" },
+ { name = "twisted" },
+ { name = "w3lib" },
+ { name = "zope-interface" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/db/fb/0ccc11eaabdac1f210f27fb6b2ad4aa4ff8a5085cbc616102536fe2c56f4/scrapy-2.14.1.tar.gz", hash = "sha256:b2a4e61802e0a5518bc8293058adedbb6b0d51c08c125d1322b1af7c7cbca4c1", size = 1251898, upload-time = "2026-01-12T19:26:44.572Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" },
+ { url = "https://files.pythonhosted.org/packages/28/5d/e9e42968535589423a0831adf9daac7555758e5e723f99fa2d6a7e68f715/scrapy-2.14.1-py3-none-any.whl", hash = "sha256:e3a3c7969b7e692864f7de05e10ecb2d1bc813ed571361dc71142e5368ff92dc", size = 331706, upload-time = "2026-01-12T19:26:42.434Z" },
+]
+
+[[package]]
+name = "scrapy-playwright"
+version = "0.0.46"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "playwright" },
+ { name = "scrapy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cb/bc/12435db52dcb0cb33698e499b35411fc2553bcddf228291083a2019036d4/scrapy_playwright-0.0.46.tar.gz", hash = "sha256:2efe31155b2bbbd13fb011f3c189c21a6f34409c7f7b958881780509ce14a6bb", size = 48429, upload-time = "2026-01-21T12:11:16.653Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/1d/6f/33fb13206fa4b93450484592cb5dbd243516b4ca63cf2ef1f641b51c3ca0/scrapy_playwright-0.0.46-py3-none-any.whl", hash = "sha256:8b20c504d773ec744ae81f5383989d04ba01dbf5e6c1991c0816e2f877398c54", size = 27749, upload-time = "2026-01-21T12:11:15.508Z" },
]
[[package]]
@@ -2032,6 +2433,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a8/56/c16bda4d53012c71fa1b588edde603c6b455bc8206bf6de7b83388fcce75/sentry_sdk-2.44.0-py2.py3-none-any.whl", hash = "sha256:9e36a0372b881e8f92fdbff4564764ce6cec4b7f25424d0a3a8d609c9e4651a7", size = 402352, upload-time = "2025-11-11T09:35:54.1Z" },
]
+[[package]]
+name = "service-identity"
+version = "24.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "attrs" },
+ { name = "cryptography" },
+ { name = "pyasn1" },
+ { name = "pyasn1-modules" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/07/a5/dfc752b979067947261dbbf2543470c58efe735c3c1301dd870ef27830ee/service_identity-24.2.0.tar.gz", hash = "sha256:b8683ba13f0d39c6cd5d625d2c5f65421d6d707b013b375c355751557cbe8e09", size = 39245, upload-time = "2024-10-26T07:21:57.736Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/08/2c/ca6dd598b384bc1ce581e24aaae0f2bed4ccac57749d5c3befbb5e742081/service_identity-24.2.0-py3-none-any.whl", hash = "sha256:6b047fbd8a84fd0bb0d55ebce4031e400562b9196e1e0d3e0fe2b8a59f6d4a85", size = 11364, upload-time = "2024-10-26T07:21:56.302Z" },
+]
+
[[package]]
name = "shellingham"
version = "1.5.4"
@@ -2079,23 +2495,37 @@ wheels = [
[[package]]
name = "sqlalchemy"
-version = "2.0.44"
+version = "2.0.46"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/f0/f2/840d7b9496825333f532d2e3976b8eadbf52034178aac53630d09fe6e1ef/sqlalchemy-2.0.44.tar.gz", hash = "sha256:0ae7454e1ab1d780aee69fd2aae7d6b8670a581d8847f2d1e0f7ddfbf47e5a22", size = 9819830, upload-time = "2025-10-10T14:39:12.935Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/45/d3/c67077a2249fdb455246e6853166360054c331db4613cda3e31ab1cadbef/sqlalchemy-2.0.44-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ff486e183d151e51b1d694c7aa1695747599bb00b9f5f604092b54b74c64a8e1", size = 2135479, upload-time = "2025-10-10T16:03:37.671Z" },
- { url = "https://files.pythonhosted.org/packages/2b/91/eabd0688330d6fd114f5f12c4f89b0d02929f525e6bf7ff80aa17ca802af/sqlalchemy-2.0.44-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b1af8392eb27b372ddb783b317dea0f650241cea5bd29199b22235299ca2e45", size = 2123212, upload-time = "2025-10-10T16:03:41.755Z" },
- { url = "https://files.pythonhosted.org/packages/b0/bb/43e246cfe0e81c018076a16036d9b548c4cc649de241fa27d8d9ca6f85ab/sqlalchemy-2.0.44-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b61188657e3a2b9ac4e8f04d6cf8e51046e28175f79464c67f2fd35bceb0976", size = 3255353, upload-time = "2025-10-10T15:35:31.221Z" },
- { url = "https://files.pythonhosted.org/packages/b9/96/c6105ed9a880abe346b64d3b6ddef269ddfcab04f7f3d90a0bf3c5a88e82/sqlalchemy-2.0.44-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b87e7b91a5d5973dda5f00cd61ef72ad75a1db73a386b62877d4875a8840959c", size = 3260222, upload-time = "2025-10-10T15:43:50.124Z" },
- { url = "https://files.pythonhosted.org/packages/44/16/1857e35a47155b5ad927272fee81ae49d398959cb749edca6eaa399b582f/sqlalchemy-2.0.44-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:15f3326f7f0b2bfe406ee562e17f43f36e16167af99c4c0df61db668de20002d", size = 3189614, upload-time = "2025-10-10T15:35:32.578Z" },
- { url = "https://files.pythonhosted.org/packages/88/ee/4afb39a8ee4fc786e2d716c20ab87b5b1fb33d4ac4129a1aaa574ae8a585/sqlalchemy-2.0.44-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e77faf6ff919aa8cd63f1c4e561cac1d9a454a191bb864d5dd5e545935e5a40", size = 3226248, upload-time = "2025-10-10T15:43:51.862Z" },
- { url = "https://files.pythonhosted.org/packages/32/d5/0e66097fc64fa266f29a7963296b40a80d6a997b7ac13806183700676f86/sqlalchemy-2.0.44-cp313-cp313-win32.whl", hash = "sha256:ee51625c2d51f8baadf2829fae817ad0b66b140573939dd69284d2ba3553ae73", size = 2101275, upload-time = "2025-10-10T15:03:26.096Z" },
- { url = "https://files.pythonhosted.org/packages/03/51/665617fe4f8c6450f42a6d8d69243f9420f5677395572c2fe9d21b493b7b/sqlalchemy-2.0.44-cp313-cp313-win_amd64.whl", hash = "sha256:c1c80faaee1a6c3428cecf40d16a2365bcf56c424c92c2b6f0f9ad204b899e9e", size = 2127901, upload-time = "2025-10-10T15:03:27.548Z" },
- { url = "https://files.pythonhosted.org/packages/9c/5e/6a29fa884d9fb7ddadf6b69490a9d45fded3b38541713010dad16b77d015/sqlalchemy-2.0.44-py3-none-any.whl", hash = "sha256:19de7ca1246fbef9f9d1bff8f1ab25641569df226364a0e40457dc5457c54b05", size = 1928718, upload-time = "2025-10-10T15:29:45.32Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/06/aa/9ce0f3e7a9829ead5c8ce549392f33a12c4555a6c0609bb27d882e9c7ddf/sqlalchemy-2.0.46.tar.gz", hash = "sha256:cf36851ee7219c170bb0793dbc3da3e80c582e04a5437bc601bfe8c85c9216d7", size = 9865393, upload-time = "2026-01-21T18:03:45.119Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b3/4b/fa7838fe20bb752810feed60e45625a9a8b0102c0c09971e2d1d95362992/sqlalchemy-2.0.46-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:93a12da97cca70cea10d4b4fc602589c4511f96c1f8f6c11817620c021d21d00", size = 2150268, upload-time = "2026-01-21T19:05:56.621Z" },
+ { url = "https://files.pythonhosted.org/packages/46/c1/b34dccd712e8ea846edf396e00973dda82d598cb93762e55e43e6835eba9/sqlalchemy-2.0.46-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af865c18752d416798dae13f83f38927c52f085c52e2f32b8ab0fef46fdd02c2", size = 3276511, upload-time = "2026-01-21T18:46:49.022Z" },
+ { url = "https://files.pythonhosted.org/packages/96/48/a04d9c94753e5d5d096c628c82a98c4793b9c08ca0e7155c3eb7d7db9f24/sqlalchemy-2.0.46-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8d679b5f318423eacb61f933a9a0f75535bfca7056daeadbf6bd5bcee6183aee", size = 3292881, upload-time = "2026-01-21T18:40:13.089Z" },
+ { url = "https://files.pythonhosted.org/packages/be/f4/06eda6e91476f90a7d8058f74311cb65a2fb68d988171aced81707189131/sqlalchemy-2.0.46-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:64901e08c33462acc9ec3bad27fc7a5c2b6491665f2aa57564e57a4f5d7c52ad", size = 3224559, upload-time = "2026-01-21T18:46:50.974Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/a2/d2af04095412ca6345ac22b33b89fe8d6f32a481e613ffcb2377d931d8d0/sqlalchemy-2.0.46-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8ac45e8f4eaac0f9f8043ea0e224158855c6a4329fd4ee37c45c61e3beb518e", size = 3262728, upload-time = "2026-01-21T18:40:14.883Z" },
+ { url = "https://files.pythonhosted.org/packages/31/48/1980c7caa5978a3b8225b4d230e69a2a6538a3562b8b31cea679b6933c83/sqlalchemy-2.0.46-cp313-cp313-win32.whl", hash = "sha256:8d3b44b3d0ab2f1319d71d9863d76eeb46766f8cf9e921ac293511804d39813f", size = 2111295, upload-time = "2026-01-21T18:42:52.366Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/54/f8d65bbde3d877617c4720f3c9f60e99bb7266df0d5d78b6e25e7c149f35/sqlalchemy-2.0.46-cp313-cp313-win_amd64.whl", hash = "sha256:77f8071d8fbcbb2dd11b7fd40dedd04e8ebe2eb80497916efedba844298065ef", size = 2137076, upload-time = "2026-01-21T18:42:53.924Z" },
+ { url = "https://files.pythonhosted.org/packages/56/ba/9be4f97c7eb2b9d5544f2624adfc2853e796ed51d2bb8aec90bc94b7137e/sqlalchemy-2.0.46-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1e8cc6cc01da346dc92d9509a63033b9b1bda4fed7a7a7807ed385c7dccdc10", size = 3556533, upload-time = "2026-01-21T18:33:06.636Z" },
+ { url = "https://files.pythonhosted.org/packages/20/a6/b1fc6634564dbb4415b7ed6419cdfeaadefd2c39cdab1e3aa07a5f2474c2/sqlalchemy-2.0.46-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:96c7cca1a4babaaf3bfff3e4e606e38578856917e52f0384635a95b226c87764", size = 3523208, upload-time = "2026-01-21T18:45:08.436Z" },
+ { url = "https://files.pythonhosted.org/packages/a1/d8/41e0bdfc0f930ff236f86fccd12962d8fa03713f17ed57332d38af6a3782/sqlalchemy-2.0.46-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b2a9f9aee38039cf4755891a1e50e1effcc42ea6ba053743f452c372c3152b1b", size = 3464292, upload-time = "2026-01-21T18:33:08.208Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/8b/9dcbec62d95bea85f5ecad9b8d65b78cc30fb0ffceeb3597961f3712549b/sqlalchemy-2.0.46-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:db23b1bf8cfe1f7fda19018e7207b20cdb5168f83c437ff7e95d19e39289c447", size = 3473497, upload-time = "2026-01-21T18:45:10.552Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/f8/5ecdfc73383ec496de038ed1614de9e740a82db9ad67e6e4514ebc0708a3/sqlalchemy-2.0.46-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:56bdd261bfd0895452006d5316cbf35739c53b9bb71a170a331fa0ea560b2ada", size = 2152079, upload-time = "2026-01-21T19:05:58.477Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/bf/eba3036be7663ce4d9c050bc3d63794dc29fbe01691f2bf5ccb64e048d20/sqlalchemy-2.0.46-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:33e462154edb9493f6c3ad2125931e273bbd0be8ae53f3ecd1c161ea9a1dd366", size = 3272216, upload-time = "2026-01-21T18:46:52.634Z" },
+ { url = "https://files.pythonhosted.org/packages/05/45/1256fb597bb83b58a01ddb600c59fe6fdf0e5afe333f0456ed75c0f8d7bd/sqlalchemy-2.0.46-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9bcdce05f056622a632f1d44bb47dbdb677f58cad393612280406ce37530eb6d", size = 3277208, upload-time = "2026-01-21T18:40:16.38Z" },
+ { url = "https://files.pythonhosted.org/packages/d9/a0/2053b39e4e63b5d7ceb3372cface0859a067c1ddbd575ea7e9985716f771/sqlalchemy-2.0.46-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8e84b09a9b0f19accedcbeff5c2caf36e0dd537341a33aad8d680336152dc34e", size = 3221994, upload-time = "2026-01-21T18:46:54.622Z" },
+ { url = "https://files.pythonhosted.org/packages/1e/87/97713497d9502553c68f105a1cb62786ba1ee91dea3852ae4067ed956a50/sqlalchemy-2.0.46-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:4f52f7291a92381e9b4de9050b0a65ce5d6a763333406861e33906b8aa4906bf", size = 3243990, upload-time = "2026-01-21T18:40:18.253Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/87/5d1b23548f420ff823c236f8bea36b1a997250fd2f892e44a3838ca424f4/sqlalchemy-2.0.46-cp314-cp314-win32.whl", hash = "sha256:70ed2830b169a9960193f4d4322d22be5c0925357d82cbf485b3369893350908", size = 2114215, upload-time = "2026-01-21T18:42:55.232Z" },
+ { url = "https://files.pythonhosted.org/packages/3a/20/555f39cbcf0c10cf452988b6a93c2a12495035f68b3dbd1a408531049d31/sqlalchemy-2.0.46-cp314-cp314-win_amd64.whl", hash = "sha256:3c32e993bc57be6d177f7d5d31edb93f30726d798ad86ff9066d75d9bf2e0b6b", size = 2139867, upload-time = "2026-01-21T18:42:56.474Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/f0/f96c8057c982d9d8a7a68f45d69c674bc6f78cad401099692fe16521640a/sqlalchemy-2.0.46-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4dafb537740eef640c4d6a7c254611dca2df87eaf6d14d6a5fca9d1f4c3fc0fa", size = 3561202, upload-time = "2026-01-21T18:33:10.337Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/53/3b37dda0a5b137f21ef608d8dfc77b08477bab0fe2ac9d3e0a66eaeab6fc/sqlalchemy-2.0.46-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42a1643dc5427b69aca967dae540a90b0fbf57eaf248f13a90ea5930e0966863", size = 3526296, upload-time = "2026-01-21T18:45:12.657Z" },
+ { url = "https://files.pythonhosted.org/packages/33/75/f28622ba6dde79cd545055ea7bd4062dc934e0621f7b3be2891f8563f8de/sqlalchemy-2.0.46-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ff33c6e6ad006bbc0f34f5faf941cfc62c45841c64c0a058ac38c799f15b5ede", size = 3470008, upload-time = "2026-01-21T18:33:11.725Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/42/4afecbbc38d5e99b18acef446453c76eec6fbd03db0a457a12a056836e22/sqlalchemy-2.0.46-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:82ec52100ec1e6ec671563bbd02d7c7c8d0b9e71a0723c72f22ecf52d1755330", size = 3476137, upload-time = "2026-01-21T18:45:15.001Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/a1/9c4efa03300926601c19c18582531b45aededfb961ab3c3585f1e24f120b/sqlalchemy-2.0.46-py3-none-any.whl", hash = "sha256:f9c11766e7e7c0a2767dda5acb006a118640c9fc0a4104214b96269bfb78399e", size = 1937882, upload-time = "2026-01-21T18:22:10.456Z" },
]
[[package]]
@@ -2159,41 +2589,75 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
]
+[[package]]
+name = "tldextract"
+version = "5.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "filelock" },
+ { name = "idna" },
+ { name = "requests" },
+ { name = "requests-file" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/65/7b/644fbbb49564a6cb124a8582013315a41148dba2f72209bba14a84242bf0/tldextract-5.3.1.tar.gz", hash = "sha256:a72756ca170b2510315076383ea2993478f7da6f897eef1f4a5400735d5057fb", size = 126105, upload-time = "2025-12-28T23:58:05.532Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/6d/42/0e49d6d0aac449ca71952ec5bae764af009754fcb2e76a5cc097543747b3/tldextract-5.3.1-py3-none-any.whl", hash = "sha256:6bfe36d518de569c572062b788e16a659ccaceffc486d243af0484e8ecf432d9", size = 105886, upload-time = "2025-12-28T23:58:04.071Z" },
+]
+
[[package]]
name = "tokenizers"
-version = "0.22.1"
+version = "0.22.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" },
- { url = "https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" },
- { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" },
- { url = "https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" },
- { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" },
- { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" },
- { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" },
- { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" },
- { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" },
- { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" },
- { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = "2025-09-19T09:49:19.146Z" },
- { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = "2025-09-19T09:49:21.501Z" },
- { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" },
- { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
+ { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload-time = "2026-01-05T10:41:02.158Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload-time = "2026-01-05T10:41:00.276Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload-time = "2026-01-05T10:40:32.165Z" },
+ { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload-time = "2026-01-05T10:40:38.847Z" },
+ { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload-time = "2026-01-05T10:40:56.614Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload-time = "2026-01-05T10:40:44.507Z" },
+ { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload-time = "2026-01-05T10:40:51.139Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload-time = "2026-01-05T10:40:58.331Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload-time = "2026-01-05T10:41:04.053Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload-time = "2026-01-05T10:45:10.673Z" },
+ { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload-time = "2026-01-05T10:45:12.559Z" },
+ { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" },
+ { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" },
+ { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
]
[[package]]
name = "tqdm"
-version = "4.67.1"
+version = "4.67.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
+ { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
+]
+
+[[package]]
+name = "twisted"
+version = "25.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "attrs" },
+ { name = "automat" },
+ { name = "constantly" },
+ { name = "hyperlink" },
+ { name = "incremental" },
+ { name = "typing-extensions" },
+ { name = "zope-interface" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/13/0f/82716ed849bf7ea4984c21385597c949944f0f9b428b5710f79d0afc084d/twisted-25.5.0.tar.gz", hash = "sha256:1deb272358cb6be1e3e8fc6f9c8b36f78eb0fa7c2233d2dbe11ec6fee04ea316", size = 3545725, upload-time = "2025-06-07T09:52:24.858Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/eb/66/ab7efd8941f0bc7b2bd555b0f0471bff77df4c88e0cc31120c82737fec77/twisted-25.5.0-py3-none-any.whl", hash = "sha256:8559f654d01a54a8c3efe66d533d43f383531ebf8d81d9f9ab4769d91ca15df7", size = 3204767, upload-time = "2025-06-07T09:52:21.428Z" },
]
[[package]]
@@ -2222,14 +2686,14 @@ wheels = [
[[package]]
name = "types-requests"
-version = "2.32.4.20250913"
+version = "2.32.4.20260107"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "urllib3" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/36/27/489922f4505975b11de2b5ad07b4fe1dca0bca9be81a703f26c5f3acfce5/types_requests-2.32.4.20250913.tar.gz", hash = "sha256:abd6d4f9ce3a9383f269775a9835a4c24e5cd6b9f647d64f88aa4613c33def5d", size = 23113, upload-time = "2025-09-13T02:40:02.309Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/f3/a0663907082280664d745929205a89d41dffb29e89a50f753af7d57d0a96/types_requests-2.32.4.20260107.tar.gz", hash = "sha256:018a11ac158f801bfa84857ddec1650750e393df8a004a8a9ae2a9bec6fcb24f", size = 23165, upload-time = "2026-01-07T03:20:54.091Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/2a/20/9a227ea57c1285986c4cf78400d0a91615d25b24e257fd9e2969606bdfae/types_requests-2.32.4.20250913-py3-none-any.whl", hash = "sha256:78c9c1fffebbe0fa487a418e0fa5252017e9c60d1a2da394077f1780f655d7e1", size = 20658, upload-time = "2025-09-13T02:40:01.115Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/12/709ea261f2bf91ef0a26a9eed20f2623227a8ed85610c1e54c5805692ecb/types_requests-2.32.4.20260107-py3-none-any.whl", hash = "sha256:b703fe72f8ce5b31ef031264fe9395cac8f46a04661a79f7ed31a80fb308730d", size = 20676, upload-time = "2026-01-07T03:20:52.929Z" },
]
[[package]]
@@ -2275,6 +2739,28 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369, upload-time = "2024-12-22T07:47:28.074Z" },
]
+[[package]]
+name = "uuid-utils"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/7c/3a926e847516e67bc6838634f2e54e24381105b4e80f9338dc35cca0086b/uuid_utils-0.14.0.tar.gz", hash = "sha256:fc5bac21e9933ea6c590433c11aa54aaca599f690c08069e364eb13a12f670b4", size = 22072, upload-time = "2026-01-20T20:37:15.729Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a7/42/42d003f4a99ddc901eef2fd41acb3694163835e037fb6dde79ad68a72342/uuid_utils-0.14.0-cp39-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f6695c0bed8b18a904321e115afe73b34444bc8451d0ce3244a1ec3b84deb0e5", size = 601786, upload-time = "2026-01-20T20:37:09.843Z" },
+ { url = "https://files.pythonhosted.org/packages/96/e6/775dfb91f74b18f7207e3201eb31ee666d286579990dc69dd50db2d92813/uuid_utils-0.14.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:4f0a730bbf2d8bb2c11b93e1005e91769f2f533fa1125ed1f00fd15b6fcc732b", size = 303943, upload-time = "2026-01-20T20:37:18.767Z" },
+ { url = "https://files.pythonhosted.org/packages/17/82/ea5f5e85560b08a1f30cdc65f75e76494dc7aba9773f679e7eaa27370229/uuid_utils-0.14.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40ce3fd1a4fdedae618fc3edc8faf91897012469169d600133470f49fd699ed3", size = 340467, upload-time = "2026-01-20T20:37:11.794Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/33/54b06415767f4569882e99b6470c6c8eeb97422686a6d432464f9967fd91/uuid_utils-0.14.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:09ae4a98416a440e78f7d9543d11b11cae4bab538b7ed94ec5da5221481748f2", size = 346333, upload-time = "2026-01-20T20:37:12.818Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/10/a6bce636b8f95e65dc84bf4a58ce8205b8e0a2a300a38cdbc83a3f763d27/uuid_utils-0.14.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:971e8c26b90d8ae727e7f2ac3ee23e265971d448b3672882f2eb44828b2b8c3e", size = 470859, upload-time = "2026-01-20T20:37:01.512Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/27/84121c51ea72f013f0e03d0886bcdfa96b31c9b83c98300a7bd5cc4fa191/uuid_utils-0.14.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5cde1fa82804a8f9d2907b7aec2009d440062c63f04abbdb825fce717a5e860", size = 341988, upload-time = "2026-01-20T20:37:22.881Z" },
+ { url = "https://files.pythonhosted.org/packages/90/a4/01c1c7af5e6a44f20b40183e8dac37d6ed83e7dc9e8df85370a15959b804/uuid_utils-0.14.0-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c7343862a2359e0bd48a7f3dfb5105877a1728677818bb694d9f40703264a2db", size = 365784, upload-time = "2026-01-20T20:37:10.808Z" },
+ { url = "https://files.pythonhosted.org/packages/04/f0/65ee43ec617b8b6b1bf2a5aecd56a069a08cca3d9340c1de86024331bde3/uuid_utils-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c51e4818fdb08ccec12dc7083a01f49507b4608770a0ab22368001685d59381b", size = 523750, upload-time = "2026-01-20T20:37:06.152Z" },
+ { url = "https://files.pythonhosted.org/packages/95/d3/6bf503e3f135a5dfe705a65e6f89f19bccd55ac3fb16cb5d3ec5ba5388b8/uuid_utils-0.14.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:181bbcccb6f93d80a8504b5bd47b311a1c31395139596edbc47b154b0685b533", size = 615818, upload-time = "2026-01-20T20:37:21.816Z" },
+ { url = "https://files.pythonhosted.org/packages/df/6c/99937dd78d07f73bba831c8dc9469dfe4696539eba2fc269ae1b92752f9e/uuid_utils-0.14.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:5c8ae96101c3524ba8dbf762b6f05e9e9d896544786c503a727c5bf5cb9af1a7", size = 580831, upload-time = "2026-01-20T20:37:19.691Z" },
+ { url = "https://files.pythonhosted.org/packages/44/fa/bbc9e2c25abd09a293b9b097a0d8fc16acd6a92854f0ec080f1ea7ad8bb3/uuid_utils-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00ac3c6edfdaff7e1eed041f4800ae09a3361287be780d7610a90fdcde9befdc", size = 546333, upload-time = "2026-01-20T20:37:03.117Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/9b/e5e99b324b1b5f0c62882230455786df0bc66f67eff3b452447e703f45d2/uuid_utils-0.14.0-cp39-abi3-win32.whl", hash = "sha256:ec2fd80adf8e0e6589d40699e6f6df94c93edcc16dd999be0438dd007c77b151", size = 177319, upload-time = "2026-01-20T20:37:04.208Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/28/2c7d417ea483b6ff7820c948678fdf2ac98899dc7e43bb15852faa95acaf/uuid_utils-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:efe881eb43a5504fad922644cb93d725fd8a6a6d949bd5a4b4b7d1a1587c7fd1", size = 182566, upload-time = "2026-01-20T20:37:16.868Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/86/49e4bdda28e962fbd7266684171ee29b3d92019116971d58783e51770745/uuid_utils-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:32b372b8fd4ebd44d3a219e093fe981af4afdeda2994ee7db208ab065cfcd080", size = 182809, upload-time = "2026-01-20T20:37:05.139Z" },
+]
+
[[package]]
name = "uvicorn"
version = "0.38.0"
@@ -2325,6 +2811,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e4/16/c1fd27e9549f3c4baf1dc9c20c456cd2f822dbf8de9f463824b0c0357e06/uvloop-0.22.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6cde23eeda1a25c75b2e07d39970f3374105d5eafbaab2a4482be82f272d5a5e", size = 4296730, upload-time = "2025-10-16T22:17:00.744Z" },
]
+[[package]]
+name = "w3lib"
+version = "2.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f1/91/186665abf1a6d16c0c5ea1f0e681d9c852b45c3a750aa8657f8f956690a8/w3lib-2.4.0.tar.gz", hash = "sha256:e233ad21649b69d0e047a10f30181ae9677524a29f6f71f6f3c758dc0c8d2648", size = 48302, upload-time = "2026-01-29T07:05:07.504Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2e/f5/ce3ab627e0cb51591c9e3dc4b9b173f15d7f2bec1c0010420b15fc442940/w3lib-2.4.0-py3-none-any.whl", hash = "sha256:260b5a22aeb86ae73213857f69ed20829a45150f8d5b12050b1f02ada414db79", size = 21603, upload-time = "2026-01-29T07:05:05.841Z" },
+]
+
[[package]]
name = "watchfiles"
version = "1.1.1"
@@ -2557,6 +3052,26 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" },
]
+[[package]]
+name = "zope-interface"
+version = "8.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/86/a4/77daa5ba398996d16bb43fc721599d27d03eae68fe3c799de1963c72e228/zope_interface-8.2.tar.gz", hash = "sha256:afb20c371a601d261b4f6edb53c3c418c249db1a9717b0baafc9a9bb39ba1224", size = 254019, upload-time = "2026-01-09T07:51:07.253Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/66/47/45188fb101fa060b20e6090e500682398ab415e516a0c228fbb22bc7def2/zope_interface-8.2-cp313-cp313-macosx_10_9_x86_64.whl", hash = "sha256:6068322004a0158c80dfd4708dfb103a899635408c67c3b10e9acec4dbacefec", size = 209170, upload-time = "2026-01-09T08:05:26.616Z" },
+ { url = "https://files.pythonhosted.org/packages/09/03/f6b9336c03c2b48403c4eb73a1ec961d94dc2fb5354c583dfb5fa05fd41f/zope_interface-8.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2499de92e8275d0dd68f84425b3e19e9268cd1fa8507997900fa4175f157733c", size = 209229, upload-time = "2026-01-09T08:05:28.521Z" },
+ { url = "https://files.pythonhosted.org/packages/07/b1/65fe1dca708569f302ade02e6cdca309eab6752bc9f80105514f5b708651/zope_interface-8.2-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f777e68c76208503609c83ca021a6864902b646530a1a39abb9ed310d1100664", size = 259393, upload-time = "2026-01-09T08:05:29.897Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/a5/97b49cfceb6ed53d3dcfb3f3ebf24d83b5553194f0337fbbb3a9fec6cf78/zope_interface-8.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9b05a919fdb0ed6ea942e5a7800e09a8b6cdae6f98fee1bef1c9d1a3fc43aaa0", size = 264863, upload-time = "2026-01-09T08:05:31.501Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/02/0b7a77292810efe3a0586a505b077ebafd5114e10c6e6e659f0c8e387e1f/zope_interface-8.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ccc62b5712dd7bd64cfba3ee63089fb11e840f5914b990033beeae3b2180b6cb", size = 264369, upload-time = "2026-01-09T08:05:32.941Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/1d/0d1ff3846302ed1b5bbf659316d8084b30106770a5f346b7ff4e9f540f80/zope_interface-8.2-cp313-cp313-win_amd64.whl", hash = "sha256:34f877d1d3bb7565c494ed93828fa6417641ca26faf6e8f044e0d0d500807028", size = 212447, upload-time = "2026-01-09T08:05:35.064Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/da/3c89de3917751446728b8898b4d53318bc2f8f6bf8196e150a063c59905e/zope_interface-8.2-cp314-cp314-macosx_10_9_x86_64.whl", hash = "sha256:46c7e4e8cbc698398a67e56ca985d19cb92365b4aafbeb6a712e8c101090f4cb", size = 209223, upload-time = "2026-01-09T08:05:36.449Z" },
+ { url = "https://files.pythonhosted.org/packages/00/7f/62d00ec53f0a6e5df0c984781e6f3999ed265129c4c3413df8128d1e0207/zope_interface-8.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a87fc7517f825a97ff4a4ca4c8a950593c59e0f8e7bfe1b6f898a38d5ba9f9cf", size = 209366, upload-time = "2026-01-09T08:05:38.197Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/a2/f241986315174be8e00aabecfc2153cf8029c1327cab8ed53a9d979d7e08/zope_interface-8.2-cp314-cp314-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:ccf52f7d44d669203c2096c1a0c2c15d52e36b2e7a9413df50f48392c7d4d080", size = 261037, upload-time = "2026-01-09T08:05:39.568Z" },
+ { url = "https://files.pythonhosted.org/packages/02/cc/b321c51d6936ede296a1b8860cf173bee2928357fe1fff7f97234899173f/zope_interface-8.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:aae807efc7bd26302eb2fea05cd6de7d59269ed6ae23a6de1ee47add6de99b8c", size = 264219, upload-time = "2026-01-09T08:05:41.624Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/fb/5f5e7b40a2f4efd873fe173624795ca47eaa22e29051270c981361b45209/zope_interface-8.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:05a0e42d6d830f547e114de2e7cd15750dc6c0c78f8138e6c5035e51ddfff37c", size = 264390, upload-time = "2026-01-09T08:05:42.936Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/82/3f2bc594370bc3abd58e5f9085d263bf682a222f059ed46275cde0570810/zope_interface-8.2-cp314-cp314-win_amd64.whl", hash = "sha256:561ce42390bee90bae51cf1c012902a8033b2aaefbd0deed81e877562a116d48", size = 212585, upload-time = "2026-01-09T08:05:44.419Z" },
+]
+
[[package]]
name = "zstandard"
version = "0.25.0"
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/README.md b/ai_platform_engineering/knowledge_bases/rag/server/README.md
index 68d35f5dc..c1f4d1ae0 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/README.md
+++ b/ai_platform_engineering/knowledge_bases/rag/server/README.md
@@ -64,13 +64,33 @@ The server will be available at `http://localhost:9446`
# OIDC configuration for UI token validation
OIDC_ISSUER=https://your-keycloak.com/realms/production
OIDC_CLIENT_ID=rag-ui
-OIDC_GROUP_CLAIM=groups # Optional: auto-detects (memberOf, groups, roles, cognito:groups)
+
+# Optional: specify which claims to check for groups (comma-separated)
+# If not set, auto-detects from: members, memberOf, groups, group, roles, cognito:groups
+# All specified claims are checked and groups are combined (deduplicated)
+OIDC_GROUP_CLAIM=groups,members,roles
# OIDC configuration for ingestor token validation
INGESTOR_OIDC_ISSUER=https://your-keycloak.com/realms/production
INGESTOR_OIDC_CLIENT_ID=rag-ingestor
```
+**ID Token for Claims Extraction (Optional):**
+
+Some OIDC providers (Azure AD, Okta, Auth0, etc.) include user claims like `email` and `groups` only in the ID token, not the access token. The UI can pass the ID token in a separate header:
+
+```
+Authorization: Bearer <access-token>
+X-Identity-Token: <id-token>
+```
+
+The server will:
+1. Validate the **access token** for authentication (signature, expiry, audience, issuer)
+2. Validate the **ID token** signature only (skip audience/issuer checks)
+3. Extract email and groups from the **ID token** claims
+
+If the ID token is provided but invalid (bad signature, expired), the request is rejected with 401 Unauthorized. If no ID token is provided, claims are extracted from the access token (backwards compatible).
+
**Trusted Network (Development):**
```bash
# Enable IP-based trust for localhost/internal networks
@@ -165,22 +185,44 @@ SKIP_INIT_TESTS=false
### Embeddings Configuration
+The server supports multiple embedding providers. Most are API-based and work with the default image.
+
+**Supported Providers:**
+
+| Provider | Image Required | Environment Variables |
+|----------|---------------|----------------------|
+| `azure-openai` (default) | Default | `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_VERSION` |
+| `openai` | Default | `OPENAI_API_KEY` |
+| `aws-bedrock` | Default | AWS credentials via boto3 |
+| `cohere` | Default | `COHERE_API_KEY` |
+| `ollama` | Default | `OLLAMA_BASE_URL` |
+| `litellm` | Default | `LITELLM_API_BASE`, `LITELLM_API_KEY` |
+| `huggingface` | **`-hf` variant** | `HUGGINGFACEHUB_API_TOKEN` (optional), `EMBEDDINGS_DEVICE` |
+
```bash
-# Embeddings provider (azure_openai or openai)
-EMBEDDINGS_PROVIDER=azure_openai
+# Embeddings provider
+EMBEDDINGS_PROVIDER=azure-openai
# Model name
EMBEDDINGS_MODEL=text-embedding-3-small
-# Azure OpenAI (if using azure_openai provider)
+# Azure OpenAI (if using azure-openai provider)
AZURE_OPENAI_API_KEY=your-api-key
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
AZURE_OPENAI_API_VERSION=2024-02-15-preview
# OpenAI (if using openai provider)
OPENAI_API_KEY=your-api-key
+
+# HuggingFace (requires -hf image variant)
+# EMBEDDINGS_PROVIDER=huggingface
+# EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2
+# EMBEDDINGS_DEVICE=cpu # or cuda, mps
+# EMBEDDINGS_BATCH_SIZE=32
```
+> **Note:** Using `EMBEDDINGS_PROVIDER=huggingface` with the default image will result in an error prompting you to use the `-hf` image variant.
+
### Performance & Limits
```bash
@@ -278,10 +320,31 @@ For detailed architecture and tool descriptions, see [ARCHITECTURE.md](./ARCHITE
### Docker
-Build the server image:
+The server is available in two image variants:
+
+| Variant | Tag | Size | Use Case |
+|---------|-----|------|----------|
+| **Default (slim)** | `:latest`, `:0.2.x` | ~1.3 GB | API-based embeddings (Azure OpenAI, OpenAI, Bedrock, Cohere, LiteLLM, Ollama) |
+| **HuggingFace** | `:latest-hf`, `:0.2.x-hf` | ~2.3 GB | Local HuggingFace/sentence-transformers models (includes PyTorch) |
+
+**Pull the default image:**
+```bash
+docker pull ghcr.io/cnoe-io/caipe-rag-server:latest
+```
+**Pull the HuggingFace variant (if using local embeddings):**
```bash
+docker pull ghcr.io/cnoe-io/caipe-rag-server:latest-hf
+```
+
+Build the server image locally:
+
+```bash
+# Default (slim) variant
docker build -f build/Dockerfile.server -t rag-server .
+
+# HuggingFace variant (includes PyTorch)
+docker build -f build/Dockerfile.server --build-arg VARIANT=huggingface -t rag-server:hf .
```
Run with environment variables:
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/pyproject.toml b/ai_platform_engineering/knowledge_bases/rag/server/pyproject.toml
index d1da973fd..8f10363f4 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/pyproject.toml
+++ b/ai_platform_engineering/knowledge_bases/rag/server/pyproject.toml
@@ -41,8 +41,6 @@ dependencies = [
"fastmcp>=2.12.4",
"aiohttp-retry>=2.9.1",
"pyyaml>=6.0.3",
- "sentence-transformers>=5.2.0",
- "huggingface-hub>=0.36.0",
"common",
]
@@ -57,24 +55,12 @@ dev = [
"pytest-mock>=3.10.0",
"ruff>=0.4.0",
]
-# Optional embeddings providers - install based on your needs
-embeddings-aws = [
- "langchain-aws>=0.3.0", # For AWS Bedrock embeddings
-]
-embeddings-cohere = [
- "langchain-cohere>=0.3.0", # For Cohere embeddings
-]
-embeddings-huggingface = [
- "langchain-huggingface>=0.3.0", # For local HuggingFace models
-]
-embeddings-ollama = [
- "langchain-ollama>=0.3.0", # For local Ollama models
-]
-embeddings-all = [
- "langchain-aws>=0.3.0",
- "langchain-cohere>=0.3.0",
- "langchain-huggingface>=0.3.0",
- "langchain-ollama>=0.3.0",
+# HuggingFace local embeddings variant (~900MB additional due to PyTorch)
+# Use the -hf image variant or install with: pip install server[huggingface]
+huggingface = [
+ "sentence-transformers>=5.2.0",
+ "huggingface-hub>=0.36.0",
+ "common[huggingface]",
]
[build-system]
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/src/server/auth.py b/ai_platform_engineering/knowledge_bases/rag/server/src/server/auth.py
index ea33e7d46..036bc16ef 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/src/server/auth.py
+++ b/ai_platform_engineering/knowledge_bases/rag/server/src/server/auth.py
@@ -3,6 +3,7 @@
Supports multiple OIDC providers (UI and Ingestor) with JWKS-based validation.
"""
+
import os
import time
from typing import Dict, Any, Tuple, Optional
@@ -15,271 +16,349 @@
class OIDCProvider:
- """Represents an OIDC provider configuration with JWKS caching."""
-
- def __init__(self, issuer: str, audience: str, name: str, discovery_url: Optional[str] = None):
- """
- Initialize OIDC provider.
-
- Args:
- issuer: OIDC issuer URL (e.g., https://keycloak.example.com/realms/production)
- audience: Expected audience claim (typically client_id)
- name: Human-readable name for this provider (e.g., "ui", "ingestor")
- discovery_url: Optional explicit discovery URL (if not provided, constructs from issuer)
- """
- self.issuer = issuer
- self.audience = audience
- self.name = name
- self.discovery_url = discovery_url
- self.jwks_uri: Optional[str] = None
- self.jwks_cache: Dict[str, Any] = {}
- self.jwks_cache_time: float = 0
- self.jwks_cache_ttl: int = 3600 # Cache JWKS for 1 hour
-
- if discovery_url:
- logger.info(f"Initialized OIDC provider '{name}': issuer={issuer}, audience={audience}, discovery_url={discovery_url}")
- else:
- logger.info(f"Initialized OIDC provider '{name}': issuer={issuer}, audience={audience}")
-
- async def _fetch_jwks(self) -> Dict[str, Any]:
- """
- Fetch JWKS (JSON Web Key Set) from OIDC provider.
-
- Strategy:
- 1. Try explicit discovery URL if provided
- 2. Fallback to constructing from issuer if discovery URL fails or not set
-
- Returns:
- JWKS dictionary with keys
- """
- # Get JWKS URI from well-known configuration if not cached
- if not self.jwks_uri:
- discovery_attempts = []
- oidc_config = None
-
- # Attempt 1: Try explicit discovery URL if provided
- if self.discovery_url:
- try:
- logger.debug(f"Provider '{self.name}': Attempting discovery with explicit URL: {self.discovery_url}")
- oidc_config = await self._fetch_oidc_config(self.discovery_url)
- except Exception as e:
- logger.warning(f"Provider '{self.name}': Explicit discovery URL failed: {e}")
- discovery_attempts.append(f"Discovery URL: {e}")
-
- # Attempt 2: Construct from issuer if config not yet obtained
- if not oidc_config and self.issuer:
- try:
- constructed_url = f"{self.issuer}/.well-known/openid-configuration"
- logger.debug(f"Provider '{self.name}': Attempting discovery from issuer: {constructed_url}")
- oidc_config = await self._fetch_oidc_config(constructed_url)
- except Exception as e:
- logger.warning(f"Provider '{self.name}': Discovery from issuer failed: {e}")
- discovery_attempts.append(f"Constructed from issuer: {e}")
-
- if not oidc_config:
- error_msg = "All OIDC discovery attempts failed: " + "; ".join(discovery_attempts)
- raise ValueError(error_msg)
-
- # Extract issuer from discovery if not explicitly set and available
- if not self.issuer and oidc_config.get("issuer"):
- self.issuer = oidc_config.get("issuer")
- logger.info(f"OIDC provider '{self.name}': Extracted issuer from discovery: {self.issuer}")
-
- self.jwks_uri = oidc_config.get("jwks_uri")
-
- if not self.jwks_uri:
- raise ValueError(f"JWKS URI not found in OIDC configuration for {self.issuer}")
-
- logger.info(f"OIDC provider '{self.name}' JWKS URI: {self.jwks_uri}")
-
- # Fetch JWKS
- logger.debug(f"Fetching JWKS from {self.jwks_uri}")
- async with httpx.AsyncClient(follow_redirects=True) as client:
- response = await client.get(self.jwks_uri, timeout=10.0)
- response.raise_for_status()
- return response.json()
-
- async def _fetch_oidc_config(self, well_known_url: str) -> Dict[str, Any]:
- """
- Fetch OIDC configuration from discovery endpoint.
-
- Args:
- well_known_url: Discovery endpoint URL
-
- Returns:
- OIDC configuration dictionary
-
- Raises:
- Exception if fetch fails
- """
- async with httpx.AsyncClient(follow_redirects=True) as client:
- response = await client.get(well_known_url, timeout=10.0)
- response.raise_for_status()
- return response.json()
-
- async def get_jwks(self) -> Dict[str, Any]:
- """
- Get JWKS with caching.
-
- Returns:
- Cached or fresh JWKS
- """
- now = time.time()
-
- # Return cached JWKS if still valid
- if self.jwks_cache and (now - self.jwks_cache_time) < self.jwks_cache_ttl:
- logger.debug(f"Using cached JWKS for provider '{self.name}'")
- return self.jwks_cache
-
- # Fetch fresh JWKS
- logger.debug(f"Fetching fresh JWKS for provider '{self.name}'")
- self.jwks_cache = await self._fetch_jwks()
- self.jwks_cache_time = now
-
- return self.jwks_cache
-
- async def validate_token(self, token: str) -> Dict[str, Any]:
- """
- Validate JWT token against this provider.
-
- Args:
- token: JWT token string
-
- Returns:
- Decoded token claims
-
- Raises:
- JWTError: If token is invalid
- """
+ """Represents an OIDC provider configuration with JWKS caching."""
+
+ def __init__(self, issuer: str, audience: str, name: str, discovery_url: Optional[str] = None):
+ """
+ Initialize OIDC provider.
+
+ Args:
+ issuer: OIDC issuer URL (e.g., https://keycloak.example.com/realms/production)
+ audience: Expected audience claim (typically client_id)
+ name: Human-readable name for this provider (e.g., "ui", "ingestor")
+ discovery_url: Optional explicit discovery URL (if not provided, constructs from issuer)
+ """
+ self.issuer = issuer
+ self.audience = audience
+ self.name = name
+ self.discovery_url = discovery_url
+ self.jwks_uri: Optional[str] = None
+ self.jwks_cache: Dict[str, Any] = {}
+ self.jwks_cache_time: float = 0
+ self.jwks_cache_ttl: int = 3600 # Cache JWKS for 1 hour
+
+ if discovery_url:
+ logger.info(f"Initialized OIDC provider '{name}': issuer={issuer}, audience={audience}, discovery_url={discovery_url}")
+ else:
+ logger.info(f"Initialized OIDC provider '{name}': issuer={issuer}, audience={audience}")
+
+ async def _fetch_jwks(self) -> Dict[str, Any]:
+ """
+ Fetch JWKS (JSON Web Key Set) from OIDC provider.
+
+ Strategy:
+ 1. Try explicit discovery URL if provided
+ 2. Fallback to constructing from issuer if discovery URL fails or not set
+
+ Returns:
+ JWKS dictionary with keys
+ """
+ # Get JWKS URI from well-known configuration if not cached
+ if not self.jwks_uri:
+ discovery_attempts = []
+ oidc_config = None
+
+ # Attempt 1: Try explicit discovery URL if provided
+ if self.discovery_url:
+ try:
+ logger.debug(f"Provider '{self.name}': Attempting discovery with explicit URL: {self.discovery_url}")
+ oidc_config = await self._fetch_oidc_config(self.discovery_url)
+ except Exception as e:
+ logger.warning(f"Provider '{self.name}': Explicit discovery URL failed: {e}")
+ discovery_attempts.append(f"Discovery URL: {e}")
+
+ # Attempt 2: Construct from issuer if config not yet obtained
+ if not oidc_config and self.issuer:
try:
- # Get JWKS
- jwks = await self.get_jwks()
-
- # Decode token header to get key ID (kid)
- unverified_header = jwt.get_unverified_header(token)
- kid = unverified_header.get("kid")
-
- if not kid:
- raise JWTError("Token missing 'kid' (key ID) in header")
-
- # Find matching key in JWKS
- key_dict = None
- for key in jwks.get("keys", []):
- if key.get("kid") == kid:
- key_dict = key
- break
-
- if not key_dict:
- raise JWTError(f"Key ID '{kid}' not found in JWKS")
-
- # Validate and decode token
- claims = jwt.decode(
- token,
- key_dict,
- algorithms=["RS256", "RS384", "RS512"],
- audience=self.audience,
- issuer=self.issuer,
- options={
- "verify_signature": True,
- "verify_exp": True,
- "verify_nbf": True,
- "verify_iat": True,
- "verify_aud": True,
- "verify_iss": True,
- }
- )
-
- logger.debug(f"Token validated successfully for provider '{self.name}'")
- return claims
-
- except JWTError as e:
- logger.debug(f"Token validation failed for provider '{self.name}': {e}")
- raise
+ constructed_url = f"{self.issuer}/.well-known/openid-configuration"
+ logger.debug(f"Provider '{self.name}': Attempting discovery from issuer: {constructed_url}")
+ oidc_config = await self._fetch_oidc_config(constructed_url)
+ except Exception as e:
+ logger.warning(f"Provider '{self.name}': Discovery from issuer failed: {e}")
+ discovery_attempts.append(f"Constructed from issuer: {e}")
+
+ if not oidc_config:
+ error_msg = "All OIDC discovery attempts failed: " + "; ".join(discovery_attempts)
+ raise ValueError(error_msg)
+
+ # Extract issuer from discovery if not explicitly set and available
+ if not self.issuer and oidc_config.get("issuer"):
+ self.issuer = oidc_config.get("issuer")
+ logger.info(f"OIDC provider '{self.name}': Extracted issuer from discovery: {self.issuer}")
+
+ self.jwks_uri = oidc_config.get("jwks_uri")
+
+ if not self.jwks_uri:
+ raise ValueError(f"JWKS URI not found in OIDC configuration for {self.issuer}")
+
+ logger.info(f"OIDC provider '{self.name}' JWKS URI: {self.jwks_uri}")
+
+ # Fetch JWKS
+ logger.debug(f"Fetching JWKS from {self.jwks_uri}")
+ async with httpx.AsyncClient(follow_redirects=True) as client:
+ response = await client.get(self.jwks_uri, timeout=10.0)
+ response.raise_for_status()
+ return response.json()
+
+ async def _fetch_oidc_config(self, well_known_url: str) -> Dict[str, Any]:
+ """
+ Fetch OIDC configuration from discovery endpoint.
+
+ Args:
+ well_known_url: Discovery endpoint URL
+
+ Returns:
+ OIDC configuration dictionary
+
+ Raises:
+ Exception if fetch fails
+ """
+ async with httpx.AsyncClient(follow_redirects=True) as client:
+ response = await client.get(well_known_url, timeout=10.0)
+ response.raise_for_status()
+ return response.json()
+
+ async def get_jwks(self) -> Dict[str, Any]:
+ """
+ Get JWKS with caching.
+
+ Returns:
+ Cached or fresh JWKS
+ """
+ now = time.time()
+
+ # Return cached JWKS if still valid
+ if self.jwks_cache and (now - self.jwks_cache_time) < self.jwks_cache_ttl:
+ logger.debug(f"Using cached JWKS for provider '{self.name}'")
+ return self.jwks_cache
+
+ # Fetch fresh JWKS
+ logger.debug(f"Fetching fresh JWKS for provider '{self.name}'")
+ self.jwks_cache = await self._fetch_jwks()
+ self.jwks_cache_time = now
+
+ return self.jwks_cache
+
+ async def validate_token(self, token: str) -> Dict[str, Any]:
+ """
+ Validate JWT token against this provider.
+
+ Args:
+ token: JWT token string
+
+ Returns:
+ Decoded token claims
+
+ Raises:
+ JWTError: If token is invalid
+ """
+ try:
+ # Get JWKS
+ jwks = await self.get_jwks()
+
+ # Decode token header to get key ID (kid)
+ unverified_header = jwt.get_unverified_header(token)
+ kid = unverified_header.get("kid")
+
+ if not kid:
+ raise JWTError("Token missing 'kid' (key ID) in header")
+
+ # Find matching key in JWKS
+ key_dict = None
+ for key in jwks.get("keys", []):
+ if key.get("kid") == kid:
+ key_dict = key
+ break
+
+ if not key_dict:
+ raise JWTError(f"Key ID '{kid}' not found in JWKS")
+
+ # Validate and decode token
+ # Supported algorithms: RSA (RS*) and ECDSA (ES*) - covers most OIDC providers
+ claims = jwt.decode(
+ token,
+ key_dict,
+ algorithms=["RS256", "RS384", "RS512", "ES256", "ES384", "ES512"],
+ audience=self.audience,
+ issuer=self.issuer,
+ options={
+ "verify_signature": True,
+ "verify_exp": True,
+ "verify_nbf": True,
+ "verify_iat": True,
+ "verify_aud": True,
+ "verify_iss": True,
+ },
+ )
+
+ logger.debug(f"Token validated successfully for provider '{self.name}'")
+ return claims
+
+ except JWTError as e:
+ logger.debug(f"Token validation failed for provider '{self.name}': {e}")
+ raise
+
+ async def validate_id_token(self, token: str) -> Dict[str, Any]:
+ """
+ Validate ID token with relaxed checks (signature and expiry only).
+
+ ID tokens are used for identity claims extraction (email, groups), not authorization.
+ We validate the signature to ensure authenticity but skip audience/issuer
+ checks since ID tokens have different semantics than access tokens.
+
+ Args:
+ token: JWT ID token string
+
+ Returns:
+ Decoded token claims
+
+ Raises:
+ JWTError: If token signature is invalid or token is expired
+ """
+ try:
+ # Get JWKS
+ jwks = await self.get_jwks()
+
+ # Decode token header to get key ID (kid)
+ unverified_header = jwt.get_unverified_header(token)
+ kid = unverified_header.get("kid")
+
+ if not kid:
+ raise JWTError("ID token missing 'kid' (key ID) in header")
+
+ # Find matching key in JWKS
+ key_dict = None
+ for key in jwks.get("keys", []):
+ if key.get("kid") == kid:
+ key_dict = key
+ break
+
+ if not key_dict:
+ raise JWTError(f"Key ID '{kid}' not found in JWKS for ID token")
+
+ # Validate signature and expiry only (skip audience/issuer)
+ # Supported algorithms: RSA (RS*) and ECDSA (ES*) - covers most OIDC providers
+ claims = jwt.decode(
+ token,
+ key_dict,
+ algorithms=["RS256", "RS384", "RS512", "ES256", "ES384", "ES512"],
+ options={
+ "verify_signature": True,
+ "verify_exp": True,
+ "verify_nbf": True,
+ "verify_iat": True,
+ "verify_aud": False, # Skip - ID tokens have different audience semantics
+                    "verify_iss": False,  # Skip - ID token is verified against this provider's JWKS, which already pins the issuer's key material
+ },
+ )
+
+ logger.debug(f"ID token validated successfully for provider '{self.name}'")
+ return claims
+
+ except JWTError as e:
+ logger.debug(f"ID token validation failed for provider '{self.name}': {e}")
+ raise
class AuthManager:
- """Manages multiple OIDC providers and token validation."""
-
- def __init__(self):
- """Initialize auth manager and load OIDC providers from environment."""
- self.providers: Dict[str, OIDCProvider] = {}
- self._load_providers()
-
- def _load_providers(self):
- """Load OIDC provider configurations from environment variables."""
- # Load UI provider
- ui_issuer = os.getenv("OIDC_ISSUER")
- ui_client_id = os.getenv("OIDC_CLIENT_ID")
- ui_discovery_url = os.getenv("OIDC_DISCOVERY_URL")
-
- # Require either issuer or discovery URL, plus client_id
- if (ui_issuer or ui_discovery_url) and ui_client_id:
- self.providers["ui"] = OIDCProvider(
- issuer=ui_issuer.rstrip("/") if ui_issuer else "", # Empty string if only discovery URL
- audience=ui_client_id,
- name="ui",
- discovery_url=ui_discovery_url
- )
- logger.info("UI OIDC provider configured")
- else:
- logger.warning("UI OIDC provider not configured (need OIDC_CLIENT_ID and either OIDC_ISSUER or OIDC_DISCOVERY_URL)")
-
- # Load Ingestor provider
- ingestor_issuer = os.getenv("INGESTOR_OIDC_ISSUER")
- ingestor_client_id = os.getenv("INGESTOR_OIDC_CLIENT_ID")
- ingestor_discovery_url = os.getenv("INGESTOR_OIDC_DISCOVERY_URL")
-
- # Require either issuer or discovery URL, plus client_id
- if (ingestor_issuer or ingestor_discovery_url) and ingestor_client_id:
- self.providers["ingestor"] = OIDCProvider(
- issuer=ingestor_issuer.rstrip("/") if ingestor_issuer else "", # Empty string if only discovery URL
- audience=ingestor_client_id,
- name="ingestor",
- discovery_url=ingestor_discovery_url
- )
- logger.info("Ingestor OIDC provider configured")
- else:
- logger.info("Ingestor OIDC provider not configured (using trusted network or UI-only auth)")
-
- if not self.providers:
- logger.warning(
- "No OIDC providers configured! Either configure OIDC providers or enable "
- "trusted network access (ALLOW_TRUSTED_NETWORK=true)"
- )
-
- async def validate_token(self, token: str) -> Tuple[OIDCProvider, Dict[str, Any]]:
- """
- Validate token against all configured providers.
-
- Tries each provider in sequence until one succeeds.
-
- Args:
- token: JWT token string
-
- Returns:
- Tuple of (provider, claims) for the first successful validation
-
- Raises:
- JWTError: If token is invalid for all providers
- """
- if not self.providers:
- raise JWTError("No OIDC providers configured for token validation")
-
- errors = []
-
- for provider in self.providers.values():
- try:
- claims = await provider.validate_token(token)
- logger.info(f"Token validated successfully by provider '{provider.name}'")
- return provider, claims
- except JWTError as e:
- errors.append(f"{provider.name}: {str(e)}")
- continue
-
- # All providers failed
- error_msg = f"Token validation failed for all providers: {'; '.join(errors)}"
- logger.warning(error_msg)
- raise JWTError(error_msg)
+ """Manages multiple OIDC providers and token validation."""
+
+ def __init__(self):
+ """Initialize auth manager and load OIDC providers from environment."""
+ self.providers: Dict[str, OIDCProvider] = {}
+ self._load_providers()
+
+ def _load_providers(self):
+ """Load OIDC provider configurations from environment variables."""
+ # Load UI provider
+ ui_issuer = os.getenv("OIDC_ISSUER")
+ ui_client_id = os.getenv("OIDC_CLIENT_ID")
+ ui_discovery_url = os.getenv("OIDC_DISCOVERY_URL")
+
+ # Require either issuer or discovery URL, plus client_id
+ if (ui_issuer or ui_discovery_url) and ui_client_id:
+ self.providers["ui"] = OIDCProvider(
+ issuer=ui_issuer.rstrip("/") if ui_issuer else "", # Empty string if only discovery URL
+ audience=ui_client_id,
+ name="ui",
+ discovery_url=ui_discovery_url,
+ )
+ logger.info("UI OIDC provider configured")
+ else:
+ logger.warning("UI OIDC provider not configured (need OIDC_CLIENT_ID and either OIDC_ISSUER or OIDC_DISCOVERY_URL)")
+
+ # Load Ingestor provider
+ ingestor_issuer = os.getenv("INGESTOR_OIDC_ISSUER")
+ ingestor_client_id = os.getenv("INGESTOR_OIDC_CLIENT_ID")
+ ingestor_discovery_url = os.getenv("INGESTOR_OIDC_DISCOVERY_URL")
+
+ # Require either issuer or discovery URL, plus client_id
+ if (ingestor_issuer or ingestor_discovery_url) and ingestor_client_id:
+ self.providers["ingestor"] = OIDCProvider(
+ issuer=ingestor_issuer.rstrip("/") if ingestor_issuer else "", # Empty string if only discovery URL
+ audience=ingestor_client_id,
+ name="ingestor",
+ discovery_url=ingestor_discovery_url,
+ )
+ logger.info("Ingestor OIDC provider configured")
+ else:
+ logger.info("Ingestor OIDC provider not configured (using trusted network or UI-only auth)")
+
+ if not self.providers:
+ logger.warning("No OIDC providers configured! Either configure OIDC providers or enable trusted network access (ALLOW_TRUSTED_NETWORK=true)")
+
+ async def validate_token(self, token: str) -> Tuple[OIDCProvider, Dict[str, Any]]:
+ """
+ Validate token against all configured providers.
+
+ Tries each provider in sequence until one succeeds.
+
+ Args:
+ token: JWT token string
+
+ Returns:
+ Tuple of (provider, claims) for the first successful validation
+
+ Raises:
+ JWTError: If token is invalid for all providers
+ """
+ if not self.providers:
+ raise JWTError("No OIDC providers configured for token validation")
+
+ errors = []
+
+ for provider in self.providers.values():
+ try:
+ claims = await provider.validate_token(token)
+ logger.info(f"Token validated successfully by provider '{provider.name}'")
+ return provider, claims
+ except JWTError as e:
+ errors.append(f"{provider.name}: {str(e)}")
+ continue
+
+ # All providers failed
+ error_msg = f"Token validation failed for all providers: {'; '.join(errors)}"
+ logger.warning(error_msg)
+ raise JWTError(error_msg)
+
+ async def validate_id_token(self, token: str, provider: OIDCProvider) -> Dict[str, Any]:
+ """
+ Validate ID token using the specified provider.
+
+ The ID token should be validated using the same provider that validated
+ the access token, to ensure consistent key material.
+
+ Args:
+ token: JWT ID token string
+ provider: The OIDC provider that validated the access token
+
+ Returns:
+ Decoded ID token claims
+
+ Raises:
+ JWTError: If ID token is invalid
+ """
+ return await provider.validate_id_token(token)
# Global auth manager instance (initialized on first use)
@@ -288,15 +367,15 @@ async def validate_token(self, token: str) -> Tuple[OIDCProvider, Dict[str, Any]
@lru_cache(maxsize=1)
def get_auth_manager() -> AuthManager:
- """
- Get or create the global auth manager instance.
-
- Uses lru_cache to ensure singleton pattern.
-
- Returns:
- AuthManager instance
- """
- global _auth_manager
- if _auth_manager is None:
- _auth_manager = AuthManager()
- return _auth_manager
+ """
+ Get or create the global auth manager instance.
+
+ Uses lru_cache to ensure singleton pattern.
+
+ Returns:
+ AuthManager instance
+ """
+ global _auth_manager
+ if _auth_manager is None:
+ _auth_manager = AuthManager()
+ return _auth_manager
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/src/server/ingestion.py b/ai_platform_engineering/knowledge_bases/rag/server/src/server/ingestion.py
index dcabf2e0f..6f5969c3d 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/src/server/ingestion.py
+++ b/ai_platform_engineering/knowledge_bases/rag/server/src/server/ingestion.py
@@ -14,932 +14,831 @@
class DocumentProcessor:
- # Milvus varchar field limit (65535 bytes, using 60000 to be safe with UTF-8 encoding)
- MILVUS_MAX_VARCHAR_LENGTH = 60000
-
- def __init__(self, vstore: Milvus, job_manager: JobManager, graph_rag_enabled: bool, data_graph_db: Optional[GraphDB] = None, max_property_length: int = 250, batch_size: int = 1000):
- """
- Args:
- vstore: Milvus instance
- job_manager: JobManager instance
- graph_rag_enabled: Whether graph RAG is enabled
- data_graph_db: GraphDB instance
- max_property_length: Maximum length for property values for graph entities
- batch_size: Batch size for ingestion into vector database
- """
- self.vstore = vstore
- self.data_graph_db = data_graph_db
- self.graph_rag_enabled = graph_rag_enabled
- self.job_manager = job_manager
- self.max_property_length = max_property_length
- self.batch_size = batch_size
- self.logger = utils.get_logger("DocumentProcessor")
-
- @staticmethod
- def sanitize_entity_properties(entity: Entity, max_length: int = 250) -> None:
- """
- Sanitize entity properties by removing or filtering values that exceed max_length.
- - For string properties: remove the entire property if length > max_length
- - For list properties: remove individual elements that have length > max_length
-
- Args:
- entity: Entity to sanitize (modified in-place)
- max_length: Maximum allowed length for property values (default: 250)
- """
- properties_to_remove = []
- properties_to_update = {}
-
- for key, value in entity.all_properties.items():
- # Skip internal properties (starting with _)
- if key.startswith('_'):
- continue
-
- if isinstance(value, str):
- # Remove string properties that are too long
- if len(value) > max_length:
- properties_to_remove.append(key)
-
- elif isinstance(value, list):
- # Filter list elements that are too long
- filtered_list = []
- for item in value:
- # Only check string items in the list
- if isinstance(item, str):
- if len(item) <= max_length:
- filtered_list.append(item)
- else:
- # Keep non-string items as-is
- filtered_list.append(item)
-
- # Update the property with filtered list
- if len(filtered_list) != len(value):
- properties_to_update[key] = filtered_list
-
- # Remove properties that are too long
- for key in properties_to_remove:
- del entity.all_properties[key]
-
- # Update properties with filtered lists
- for key, value in properties_to_update.items():
- entity.all_properties[key] = value
-
- @staticmethod
- def format_entity_for_embedding(entity: Entity) -> str:
- """
- Format entity properties for embedding with emphasis on entity type and primary keys.
- Properties section is formatted as JSON for clean, readable output.
- """
- entity_properties = entity.get_external_properties()
-
- # Create formatted entity type using utility function
- formatted_type = utils.format_entity_type_for_display(entity.entity_type)
-
- # Extract primary key values
- primary_key_values = {}
- for pk_prop in entity.primary_key_properties:
- if pk_prop in entity_properties:
- primary_key_values[pk_prop] = entity_properties[pk_prop]
-
- # Build the formatted text with emphasis
- formatted_parts = []
-
- # Emphasize entity type at the top
- formatted_parts.append(f"=== ENTITY TYPE: {formatted_type} (Label: {entity.entity_type}) ===")
- formatted_parts.append("")
-
- # Emphasize primary key properties
- if primary_key_values:
- formatted_parts.append("=== PRIMARY KEY PROPERTIES ===")
- for pk_prop, pk_value in primary_key_values.items():
- formatted_parts.append(f" {pk_prop}: {pk_value}")
- formatted_parts.append("")
-
- # Show additional key properties with their values if present
- if entity.additional_key_properties:
- formatted_parts.append("=== ADDITIONAL KEY PROPERTIES ===")
- # Collect unique values only
- unique_values = set()
- for key_set in entity.additional_key_properties:
- for key in key_set:
- if key in entity_properties:
- value = entity_properties[key]
- unique_values.add(str(value))
- # Print only unique values
- for value in sorted(unique_values):
- formatted_parts.append(f" {value}")
- formatted_parts.append("")
-
- # Add all properties as JSON
- formatted_parts.append("=== ALL PROPERTIES ===")
- properties_json = utils.json_encode(entity_properties, indent=2)
- formatted_parts.append(properties_json)
-
- return "\n".join(formatted_parts)
-
- def _create_chunks_from_content(
- self,
- content: str,
- document_id: str,
- document_metadata: DocumentMetadata,
- max_chunk_size: int = 60000,
- ) -> Tuple[List[Document], List[str]]:
- """
- Create document chunks from text content with proper metadata.
- Splits content into chunks if it exceeds max_chunk_size.
-
- Args:
- content: The text content to chunk
- document_id: ID for the document
- document_metadata: Base metadata to inherit
- max_chunk_size: Maximum size per chunk
-
- Returns:
- Tuple of (list of Document chunks, list of chunk IDs)
- """
- chunks = []
- chunk_ids = []
-
- # Check if content needs chunking
- if len(content) > max_chunk_size:
- self.logger.debug(
- f"Content for {document_id} exceeds max size "
- f"({len(content)} > {max_chunk_size}), splitting into chunks"
- )
-
- # Use text splitter to chunk the content
- text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=max_chunk_size,
- chunk_overlap=min(200, max_chunk_size // 10),
- length_function=len,
- separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""]
- )
-
- temp_doc = Document(page_content=content)
- split_docs = text_splitter.split_documents([temp_doc])
-
- self.logger.info(f"Split {document_id} into {len(split_docs)} chunks")
-
- # Create chunks with metadata
- for i, chunk_doc in enumerate(split_docs):
- chunk_id = f"{document_id}_chunk_{i}"
-
- chunk_metadata = DocumentChunkMetadata(
- id=chunk_id,
- document_id=document_id,
- datasource_id=document_metadata.datasource_id,
- ingestor_id=document_metadata.ingestor_id,
- title=document_metadata.title,
- description=document_metadata.description,
- is_graph_entity=document_metadata.is_graph_entity,
- document_type=document_metadata.document_type,
- document_ingested_at=document_metadata.document_ingested_at,
- fresh_until=document_metadata.fresh_until,
- metadata=document_metadata.metadata,
- chunk_index=i,
- total_chunks=len(split_docs),
- )
-
- chunk_doc.metadata = chunk_metadata.model_dump()
- chunks.append(chunk_doc)
- chunk_ids.append(chunk_id)
- else:
- # Content is small enough, create single chunk
- chunk_id = f"{document_id}_chunk_0"
-
- chunk_metadata = DocumentChunkMetadata(
- id=chunk_id,
- document_id=document_id,
- datasource_id=document_metadata.datasource_id,
- ingestor_id=document_metadata.ingestor_id,
- title=document_metadata.title,
- description=document_metadata.description,
- is_graph_entity=document_metadata.is_graph_entity,
- document_type=document_metadata.document_type,
- document_ingested_at=document_metadata.document_ingested_at,
- fresh_until=document_metadata.fresh_until,
- metadata=document_metadata.metadata,
- chunk_index=0,
- total_chunks=1,
- )
-
- chunk_doc = Document(
- page_content=content,
- metadata=chunk_metadata.model_dump()
- )
-
- chunks.append(chunk_doc)
- chunk_ids.append(chunk_id)
-
- return chunks, chunk_ids
-
- def _process_graph_entity_document(
- self,
- doc: Document,
- document_metadata: DocumentMetadata,
- ingestor_id: str,
- datasource_id: str,
- fresh_until: int,
- ) -> Optional[Tuple[List[Entity], List[Document], List[str], List[str]]]:
- """
- Process a graph entity document by splitting nested entities for the graph database,
- but keeping the full entity text as a single document for vector search.
-
- Args:
- doc: The document containing the graph entity
- document_metadata: Metadata for the document
- ingestor_id: ID of the ingestor
- datasource_id: ID of the datasource
- fresh_until: Timestamp until which data is fresh
-
- Returns:
- Tuple of (entities list, chunks list, chunk_ids list, error_messages list), or None if entity parsing failed
- """
- # Initialize result containers
- all_entities = []
- all_chunks = []
- all_chunk_ids = []
- validation_errors = []
-
- # Parse the graph entity from document
- entity = self._parse_graph_entity(doc)
- if entity is None:
- # Validation failed, skip this entity
- return None
-
- entity_type = entity.entity_type
-
- self.logger.debug(f"Parsed graph entity of type {entity_type}")
-
- # Flatten properties BUT preserve arrays of dicts (they'll be split later)
- # This ensures primary_key_properties with dot notation work correctly
- entity.all_properties = utils.flatten_dict(entity.all_properties, wildcard_index=False, preserve_list_of_dicts=True)
-
- # Create the full entity text from the entity
- # This is what will be used for vector search embedding
- full_entity_text = self.format_entity_for_embedding(entity)
-
- # Split the entity into multiple entities for the graph database
- # Properties are already flattened except arrays of dicts
- entities = self.split_nested_graph_entity(entity)
- self.logger.debug(f"Split entity into {len(entities)} entities for graph DB (including parent)")
-
- # Log primary key after splitting
- if entities:
- parent_entity = entities[0] # First entity is always the parent
- self.logger.debug(f"Parent entity primary key: {parent_entity.generate_primary_key()}")
-
- # Process each split entity and add to graph database collection
- for split_entity in entities:
- # Validate entity after splitting and flattening (properties now have dot notation)
- all_props = split_entity.all_properties
-
- # Validate primary_key_properties - critical, must exist
- missing_primary_keys = [
- key for key in split_entity.primary_key_properties
- if key not in all_props
- ]
- if missing_primary_keys:
- error_msg = (
- f"Entity type '{split_entity.entity_type}': "
- f"missing primary key properties: {missing_primary_keys}. "
- f"Available properties: {list(all_props.keys())}"
- )
- self.logger.warning(f"Skipping entity - {error_msg}")
- validation_errors.append(error_msg)
- continue
-
- # Validate additional_key_properties - optional, just warn and remove invalid ones
- if split_entity.additional_key_properties:
- valid_additional_keys = []
- for id_keys in split_entity.additional_key_properties:
- missing_additional_keys = [
- key for key in id_keys
- if key not in all_props
- ]
- if not missing_additional_keys:
- # All keys exist, keep this additional key set
- valid_additional_keys.append(id_keys)
- else:
- warning_msg = (
- f"Entity type '{split_entity.entity_type}' with primary key '{split_entity.generate_primary_key()}': "
- f"additional_key_properties {id_keys} has missing properties: {missing_additional_keys}. "
- f"These additional keys will be ignored."
- )
- self.logger.warning(warning_msg)
- validation_errors.append(warning_msg)
-
- # Update with only valid additional keys
- split_entity.additional_key_properties = valid_additional_keys
-
- # Sanitize entity properties (remove values that exceed max_property_length)
- self.sanitize_entity_properties(split_entity, max_length=self.max_property_length)
-
- # Add document metadata fields to entity properties
- split_entity.all_properties.update({
- INGESTOR_ID_KEY: ingestor_id,
- DATASOURCE_ID_KEY: datasource_id,
- LAST_UPDATED_KEY: document_metadata.document_ingested_at,
- FRESH_UNTIL_KEY: fresh_until
- })
-
- # Add to entities list
- all_entities.append(split_entity)
-
- # Create ONE document with the full entity text for vector search
- # Use the original (parent) entity information for the document ID
- # Set graph entity metadata in the document metadata
- document_id = self.graph_document_id(entity.entity_type, entity.generate_primary_key())
- entity_primary_key = entity.generate_primary_key()
- if document_metadata.metadata is None:
- document_metadata.metadata = {}
- document_metadata.metadata.update({
- "graph_entity_type": entity.entity_type,
- "graph_entity_pk": entity_primary_key
- })
-
- # Prepare metadata for chunking - set graph entity metadata explicitly for root entity
- entity_doc_metadata = DocumentMetadata(
- document_id=document_id,
- datasource_id=document_metadata.datasource_id,
- ingestor_id=document_metadata.ingestor_id,
- title=document_metadata.title,
- description=document_metadata.description,
- is_graph_entity=True,
- document_type=self.graph_document_type(entity.entity_type),
- document_ingested_at=document_metadata.document_ingested_at,
- fresh_until=document_metadata.fresh_until,
- metadata=document_metadata.metadata,
- )
-
- # Use common chunking method for the full entity text
- chunks, chunk_ids = self._create_chunks_from_content(
- content=full_entity_text,
- document_id=document_id,
- document_metadata=entity_doc_metadata,
- )
-
- all_chunks.extend(chunks)
- all_chunk_ids.extend(chunk_ids)
-
- return all_entities, all_chunks, all_chunk_ids, validation_errors
-
- def split_nested_graph_entity(self, entity: Entity) -> List[Entity]:
- """
- Split a nested graph entity into a list of entities.
-
- This function recursively processes an entity's properties:
- - Homogeneous lists of primitives (str, bool, int, float) are kept as-is
- - Lists of dictionaries are split into separate entities
- - Sub-dictionaries are flattened into the parent with dot notation
- - Lists of dictionaries in flattened sub-dictionaries are split as usual
-
- For each property that contains a list of dictionaries:
- - Creates new entities with type: parent_entity_type + "_" + property_name (singularized/capitalized)
- - Adds SUB_ENTITY_INDEX_KEY: index in the list
- - Adds PARENT_ENTITY_PK_KEY: primary key of parent entity
- - Uses ENTITY_TYPE_KEY (already present in all entities) in primary key to avoid clashes
- - Sets primary_key_properties to [PARENT_ENTITY_PK_KEY, ENTITY_TYPE_KEY, SUB_ENTITY_INDEX_KEY]
- - Recursively processes nested structures
-
- Returns:
- List[Entity]: List containing the modified parent entity and all extracted nested entities
- """
- result_entities = []
-
- # Get all properties (including internal ones starting with _)
- all_properties = entity.all_properties.copy()
- parent_primary_key = entity.generate_primary_key()
- parent_entity_type = entity.entity_type
-
- # Separate internal properties (starting with _) from external ones
- internal_properties = {k: v for k, v in all_properties.items() if k.startswith("_")}
- external_properties = {k: v for k, v in all_properties.items() if not k.startswith("_")}
-
- # Process external properties: flatten dicts, split list of dicts, keep homogeneous lists
- processed_properties = self._process_entity_properties(
- external_properties,
- parent_entity_type,
- parent_primary_key,
- result_entities
+ # Milvus varchar field limit (65535 bytes, using 60000 to be safe with UTF-8 encoding)
+ MILVUS_MAX_VARCHAR_LENGTH = 60000
+
+ def __init__(self, vstore: Milvus, job_manager: JobManager, graph_rag_enabled: bool, data_graph_db: Optional[GraphDB] = None, max_property_length: int = 250, batch_size: int = 1000):
+ """
+ Args:
+ vstore: Milvus instance
+ job_manager: JobManager instance
+ graph_rag_enabled: Whether graph RAG is enabled
+ data_graph_db: GraphDB instance
+ max_property_length: Maximum length for property values for graph entities
+ batch_size: Batch size for ingestion into vector database
+ """
+ self.vstore = vstore
+ self.data_graph_db = data_graph_db
+ self.graph_rag_enabled = graph_rag_enabled
+ self.job_manager = job_manager
+ self.max_property_length = max_property_length
+ self.batch_size = batch_size
+ self.logger = utils.get_logger("DocumentProcessor")
+
+ @staticmethod
+ def sanitize_entity_properties(entity: Entity, max_length: int = 250) -> None:
+ """
+ Sanitize entity properties by removing or filtering values that exceed max_length.
+ - For string properties: remove the entire property if length > max_length
+ - For list properties: remove individual elements that have length > max_length
+
+ Args:
+ entity: Entity to sanitize (modified in-place)
+ max_length: Maximum allowed length for property values (default: 250)
+ """
+ properties_to_remove = []
+ properties_to_update = {}
+
+ for key, value in entity.all_properties.items():
+ # Skip internal properties (starting with _)
+ if key.startswith("_"):
+ continue
+
+ if isinstance(value, str):
+ # Remove string properties that are too long
+ if len(value) > max_length:
+ properties_to_remove.append(key)
+
+ elif isinstance(value, list):
+ # Filter list elements that are too long
+ filtered_list = []
+ for item in value:
+ # Only check string items in the list
+ if isinstance(item, str):
+ if len(item) <= max_length:
+ filtered_list.append(item)
+ else:
+ # Keep non-string items as-is
+ filtered_list.append(item)
+
+ # Update the property with filtered list
+ if len(filtered_list) != len(value):
+ properties_to_update[key] = filtered_list
+
+ # Remove properties that are too long
+ for key in properties_to_remove:
+ del entity.all_properties[key]
+
+ # Update properties with filtered lists
+ for key, value in properties_to_update.items():
+ entity.all_properties[key] = value
+
+ @staticmethod
+ def format_entity_for_embedding(entity: Entity) -> str:
+ """
+ Format entity properties for embedding with emphasis on entity type and primary keys.
+ Properties section is formatted as JSON for clean, readable output.
+ """
+ entity_properties = entity.get_external_properties()
+
+ # Create formatted entity type using utility function
+ formatted_type = utils.format_entity_type_for_display(entity.entity_type)
+
+ # Extract primary key values
+ primary_key_values = {}
+ for pk_prop in entity.primary_key_properties:
+ if pk_prop in entity_properties:
+ primary_key_values[pk_prop] = entity_properties[pk_prop]
+
+ # Build the formatted text with emphasis
+ formatted_parts = []
+
+ # Emphasize entity type at the top
+ formatted_parts.append(f"=== ENTITY TYPE: {formatted_type} (Label: {entity.entity_type}) ===")
+ formatted_parts.append("")
+
+ # Emphasize primary key properties
+ if primary_key_values:
+ formatted_parts.append("=== PRIMARY KEY PROPERTIES ===")
+ for pk_prop, pk_value in primary_key_values.items():
+ formatted_parts.append(f" {pk_prop}: {pk_value}")
+ formatted_parts.append("")
+
+ # Show additional key properties with their values if present
+ if entity.additional_key_properties:
+ formatted_parts.append("=== ADDITIONAL KEY PROPERTIES ===")
+ # Collect unique values only
+ unique_values = set()
+ for key_set in entity.additional_key_properties:
+ for key in key_set:
+ if key in entity_properties:
+ value = entity_properties[key]
+ unique_values.add(str(value))
+ # Print only unique values
+ for value in sorted(unique_values):
+ formatted_parts.append(f" {value}")
+ formatted_parts.append("")
+
+ # Add all properties as JSON
+ formatted_parts.append("=== ALL PROPERTIES ===")
+ properties_json = utils.json_encode(entity_properties, indent=2)
+ formatted_parts.append(properties_json)
+
+ return "\n".join(formatted_parts)
+
+ def _create_chunks_from_content(
+ self,
+ content: str,
+ document_id: str,
+ document_metadata: DocumentMetadata,
+ max_chunk_size: int = 60000,
+ ) -> Tuple[List[Document], List[str]]:
+ """
+ Create document chunks from text content with proper metadata.
+ Splits content into chunks if it exceeds max_chunk_size.
+
+ Args:
+ content: The text content to chunk
+ document_id: ID for the document
+ document_metadata: Base metadata to inherit
+ max_chunk_size: Maximum size per chunk
+
+ Returns:
+ Tuple of (list of Document chunks, list of chunk IDs)
+ """
+ chunks = []
+ chunk_ids = []
+
+ # Check if content needs chunking
+ if len(content) > max_chunk_size:
+ self.logger.debug(f"Content for {document_id} exceeds max size ({len(content)} > {max_chunk_size}), splitting into chunks")
+
+ # Use text splitter to chunk the content
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_size, chunk_overlap=min(200, max_chunk_size // 10), length_function=len, separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""])
+
+ temp_doc = Document(page_content=content)
+ split_docs = text_splitter.split_documents([temp_doc])
+
+ self.logger.info(f"Split {document_id} into {len(split_docs)} chunks")
+
+ # Create chunks with metadata
+ for i, chunk_doc in enumerate(split_docs):
+ chunk_id = f"{document_id}_chunk_{i}"
+
+ chunk_metadata = DocumentChunkMetadata(
+ id=chunk_id,
+ document_id=document_id,
+ datasource_id=document_metadata.datasource_id,
+ ingestor_id=document_metadata.ingestor_id,
+ title=document_metadata.title,
+ description=document_metadata.description,
+ is_graph_entity=document_metadata.is_graph_entity,
+ document_type=document_metadata.document_type,
+ document_ingested_at=document_metadata.document_ingested_at,
+ fresh_until=document_metadata.fresh_until,
+ metadata=document_metadata.metadata,
+ chunk_index=i,
+ total_chunks=len(split_docs),
)
-
- # Merge back internal properties (they should always be preserved)
- processed_properties.update(internal_properties)
-
- # Create the updated parent entity, preserving additional_labels and additional_key_properties
- updated_parent = Entity(
- entity_type=parent_entity_type,
- all_properties=processed_properties,
- primary_key_properties=entity.primary_key_properties,
- additional_labels=entity.additional_labels,
- additional_key_properties=entity.additional_key_properties,
+
+ chunk_doc.metadata = chunk_metadata.model_dump()
+ chunks.append(chunk_doc)
+ chunk_ids.append(chunk_id)
+ else:
+ # Content is small enough, create single chunk
+ chunk_id = f"{document_id}_chunk_0"
+
+ chunk_metadata = DocumentChunkMetadata(
+ id=chunk_id,
+ document_id=document_id,
+ datasource_id=document_metadata.datasource_id,
+ ingestor_id=document_metadata.ingestor_id,
+ title=document_metadata.title,
+ description=document_metadata.description,
+ is_graph_entity=document_metadata.is_graph_entity,
+ document_type=document_metadata.document_type,
+ document_ingested_at=document_metadata.document_ingested_at,
+ fresh_until=document_metadata.fresh_until,
+ metadata=document_metadata.metadata,
+ chunk_index=0,
+ total_chunks=1,
+ )
+
+ chunk_doc = Document(page_content=content, metadata=chunk_metadata.model_dump())
+
+ chunks.append(chunk_doc)
+ chunk_ids.append(chunk_id)
+
+ return chunks, chunk_ids
+
+ def _process_graph_entity_document(
+ self,
+ doc: Document,
+ document_metadata: DocumentMetadata,
+ ingestor_id: str,
+ datasource_id: str,
+ fresh_until: int,
+ ) -> Optional[Tuple[List[Entity], List[Document], List[str], List[str]]]:
+ """
+ Process a graph entity document by splitting nested entities for the graph database,
+ but keeping the full entity text as a single document for vector search.
+
+ Args:
+ doc: The document containing the graph entity
+ document_metadata: Metadata for the document
+ ingestor_id: ID of the ingestor
+ datasource_id: ID of the datasource
+ fresh_until: Timestamp until which data is fresh
+
+ Returns:
+ Tuple of (entities list, chunks list, chunk_ids list, error_messages list), or None if entity parsing failed
+ """
+ # Initialize result containers
+ all_entities = []
+ all_chunks = []
+ all_chunk_ids = []
+ validation_errors = []
+
+ # Parse the graph entity from document
+ entity = self._parse_graph_entity(doc)
+ if entity is None:
+ # Validation failed, skip this entity
+ return None
+
+ entity_type = entity.entity_type
+
+ self.logger.debug(f"Parsed graph entity of type {entity_type}")
+
+ # Flatten properties BUT preserve arrays of dicts (they'll be split later)
+ # This ensures primary_key_properties with dot notation work correctly
+ entity.all_properties = utils.flatten_dict(entity.all_properties, wildcard_index=False, preserve_list_of_dicts=True)
+
+ # Create the full entity text from the entity
+ # This is what will be used for vector search embedding
+ full_entity_text = self.format_entity_for_embedding(entity)
+
+ # Split the entity into multiple entities for the graph database
+ # Properties are already flattened except arrays of dicts
+ entities = self.split_nested_graph_entity(entity)
+ self.logger.debug(f"Split entity into {len(entities)} entities for graph DB (including parent)")
+
+ # Log primary key after splitting
+ if entities:
+ parent_entity = entities[0] # First entity is always the parent
+ self.logger.debug(f"Parent entity primary key: {parent_entity.generate_primary_key()}")
+
+ # Process each split entity and add to graph database collection
+ for split_entity in entities:
+ # Validate entity after splitting and flattening (properties now have dot notation)
+ all_props = split_entity.all_properties
+
+ # Validate primary_key_properties - critical, must exist
+ missing_primary_keys = [key for key in split_entity.primary_key_properties if key not in all_props]
+ if missing_primary_keys:
+ error_msg = f"Entity type '{split_entity.entity_type}': missing primary key properties: {missing_primary_keys}. Available properties: {list(all_props.keys())}"
+ self.logger.warning(f"Skipping entity - {error_msg}")
+ validation_errors.append(error_msg)
+ continue
+
+ # Validate additional_key_properties - optional, just warn and remove invalid ones
+ if split_entity.additional_key_properties:
+ valid_additional_keys = []
+ for id_keys in split_entity.additional_key_properties:
+ missing_additional_keys = [key for key in id_keys if key not in all_props]
+ if not missing_additional_keys:
+ # All keys exist, keep this additional key set
+ valid_additional_keys.append(id_keys)
+ else:
+ warning_msg = f"Entity type '{split_entity.entity_type}' with primary key '{split_entity.generate_primary_key()}': additional_key_properties {id_keys} has missing properties: {missing_additional_keys}. These additional keys will be ignored."
+ self.logger.warning(warning_msg)
+ validation_errors.append(warning_msg)
+
+ # Update with only valid additional keys
+ split_entity.additional_key_properties = valid_additional_keys
+
+ # Sanitize entity properties (remove values that exceed max_property_length)
+ self.sanitize_entity_properties(split_entity, max_length=self.max_property_length)
+
+ # Add document metadata fields to entity properties
+ split_entity.all_properties.update({INGESTOR_ID_KEY: ingestor_id, DATASOURCE_ID_KEY: datasource_id, LAST_UPDATED_KEY: document_metadata.document_ingested_at, FRESH_UNTIL_KEY: fresh_until})
+
+ # Add to entities list
+ all_entities.append(split_entity)
+
+ # Create ONE document with the full entity text for vector search
+ # Use the original (parent) entity information for the document ID
+ # Set graph entity metadata in the document metadata
+ document_id = self.graph_document_id(entity.entity_type, entity.generate_primary_key())
+ entity_primary_key = entity.generate_primary_key()
+ if document_metadata.metadata is None:
+ document_metadata.metadata = {}
+ document_metadata.metadata.update({"graph_entity_type": entity.entity_type, "graph_entity_pk": entity_primary_key})
+
+ # Prepare metadata for chunking - set graph entity metadata explicitly for root entity
+ entity_doc_metadata = DocumentMetadata(
+ document_id=document_id,
+ datasource_id=document_metadata.datasource_id,
+ ingestor_id=document_metadata.ingestor_id,
+ title=document_metadata.title,
+ description=document_metadata.description,
+ is_graph_entity=True,
+ document_type=self.graph_document_type(entity.entity_type),
+ document_ingested_at=document_metadata.document_ingested_at,
+ fresh_until=document_metadata.fresh_until,
+ metadata=document_metadata.metadata,
+ )
+
+ # Use common chunking method for the full entity text
+ chunks, chunk_ids = self._create_chunks_from_content(
+ content=full_entity_text,
+ document_id=document_id,
+ document_metadata=entity_doc_metadata,
+ )
+
+ all_chunks.extend(chunks)
+ all_chunk_ids.extend(chunk_ids)
+
+ return all_entities, all_chunks, all_chunk_ids, validation_errors
+
+ def split_nested_graph_entity(self, entity: Entity) -> List[Entity]:
+ """
+ Split a nested graph entity into a list of entities.
+
+ This function recursively processes an entity's properties:
+ - Homogeneous lists of primitives (str, bool, int, float) are kept as-is
+ - Lists of dictionaries are split into separate entities
+ - Sub-dictionaries are flattened into the parent with dot notation
+ - Lists of dictionaries in flattened sub-dictionaries are split as usual
+
+ For each property that contains a list of dictionaries:
+ - Creates new entities with type: parent_entity_type + "_" + property_name (singularized/capitalized)
+ - Adds SUB_ENTITY_INDEX_KEY: index in the list
+ - Adds PARENT_ENTITY_PK_KEY: primary key of parent entity
+ - Uses ENTITY_TYPE_KEY (already present in all entities) in primary key to avoid clashes
+ - Sets primary_key_properties to [PARENT_ENTITY_PK_KEY, ENTITY_TYPE_KEY, SUB_ENTITY_INDEX_KEY]
+ - Recursively processes nested structures
+
+ Returns:
+ List[Entity]: List containing the modified parent entity and all extracted nested entities
+ """
+ result_entities = []
+
+ # Get all properties (including internal ones starting with _)
+ all_properties = entity.all_properties.copy()
+ parent_primary_key = entity.generate_primary_key()
+ parent_entity_type = entity.entity_type
+
+ # Separate internal properties (starting with _) from external ones
+ internal_properties = {k: v for k, v in all_properties.items() if k.startswith("_")}
+ external_properties = {k: v for k, v in all_properties.items() if not k.startswith("_")}
+
+ # Process external properties: flatten dicts, split list of dicts, keep homogeneous lists
+ processed_properties = self._process_entity_properties(external_properties, parent_entity_type, parent_primary_key, result_entities)
+
+ # Merge back internal properties (they should always be preserved)
+ processed_properties.update(internal_properties)
+
+ # Create the updated parent entity, preserving additional_labels and additional_key_properties
+ updated_parent = Entity(
+ entity_type=parent_entity_type,
+ all_properties=processed_properties,
+ primary_key_properties=entity.primary_key_properties,
+ additional_labels=entity.additional_labels,
+ additional_key_properties=entity.additional_key_properties,
+ )
+
+ # Parent entity comes first in the result
+ return [updated_parent] + result_entities
+
+ def _process_entity_properties(self, properties: dict, parent_entity_type: str, parent_primary_key: str, result_entities: List[Entity], prefix: str = "") -> dict:
+ """
+ Process entity properties by splitting lists of dicts.
+ Properties should already be flattened (except arrays of dicts) before calling this.
+
+ Args:
+ properties: Properties to process (already flattened, except arrays of dicts)
+ parent_entity_type: Type of parent entity
+ parent_primary_key: Primary key of parent entity
+ result_entities: List to collect split entities
+ prefix: Prefix for flattened keys (for recursive calls within split entities)
+
+ Returns:
+ Processed properties dictionary
+ """
+ processed = {}
+
+ for key, value in properties.items():
+ # Properties are already flattened, just use the key
+ full_key = f"{prefix}.{key}" if prefix else key
+
+ if isinstance(value, list) and value and isinstance(value[0], dict):
+ # Split list of dictionaries into separate entities
+ self._split_list_of_dicts(full_key, value, parent_entity_type, parent_primary_key, result_entities)
+ # Don't add to processed properties (it's been split out)
+
+ elif isinstance(value, dict):
+ # This should only happen in recursive calls within split entities
+ # Flatten these sub-dictionaries
+ flattened = self._process_entity_properties(value, parent_entity_type, parent_primary_key, result_entities, full_key)
+ processed.update(flattened)
+
+ elif self._is_homogeneous_primitive_list(value) if isinstance(value, list) else True:
+ # Keep primitives and homogeneous lists of primitives as-is
+ processed[full_key] = value
+ else:
+ # Mixed or complex list - convert to string for safety
+ processed[full_key] = str(value) if isinstance(value, list) else value
+
+ return processed
+
+ def _is_homogeneous_primitive_list(self, lst: list) -> bool:
+ """
+ Check if a list contains only primitive types (str, bool, int, float, None).
+
+ Args:
+ lst: List to check
+
+ Returns:
+ True if all elements are primitives, False otherwise
+ """
+ if not lst:
+ return True
+
+ primitive_types = (str, bool, int, float, type(None))
+ return all(isinstance(item, primitive_types) for item in lst)
+
+ def _split_list_of_dicts(self, prop_key: str, prop_value: list, parent_entity_type: str, parent_primary_key: str, result_entities: List[Entity]) -> None:
+ """
+ Split a list of dictionaries into separate entities.
+
+ Args:
+ prop_key: Property key (may include dots from flattening)
+ prop_value: List of dictionaries
+ parent_entity_type: Type of parent entity
+ parent_primary_key: Primary key of parent entity
+ result_entities: List to collect split entities
+ """
+ # Generate the new entity type name
+ # Use more context to avoid collisions (e.g., nodeAffinity vs podAntiAffinity)
+ key_parts = prop_key.split(".")
+
+ # Use last 2 parts if available for better disambiguation
+ if len(key_parts) >= 2:
+ # e.g., "nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution"
+ # becomes "Nodeaffinity_Preferredduringschedulingignoredduringexecution"
+ type_parts = [self._generate_type_suffix(key_parts[-2]), self._generate_type_suffix(key_parts[-1])]
+ type_suffix = "_".join(type_parts)
+ else:
+ # Only one part, use it
+ type_suffix = self._generate_type_suffix(key_parts[-1])
+
+ new_entity_type = f"{parent_entity_type}_{type_suffix}"
+
+ # Create a new entity for each item in the list
+ for index, item_dict in enumerate(prop_value):
+ # Create properties for the new sub-entity
+ sub_entity_properties = item_dict.copy()
+ sub_entity_properties[PARENT_ENTITY_PK_KEY] = parent_primary_key
+ sub_entity_properties[PARENT_ENTITY_TYPE_KEY] = parent_entity_type
+ # Convert index to string to ensure homogeneous arrays in Neo4j
+ sub_entity_properties[SUB_ENTITY_INDEX_KEY] = str(index)
+ # Add entity type to properties so it can be used in primary key
+ sub_entity_properties[ENTITY_TYPE_KEY] = new_entity_type
+
+ # Create the new entity with SUB_ENTITY_LABEL to mark it as a sub-entity
+ # Include ENTITY_TYPE_KEY in primary key to avoid clashes between different sub-entity types
+ sub_entity = Entity(
+ entity_type=new_entity_type,
+ all_properties=sub_entity_properties,
+ primary_key_properties=[PARENT_ENTITY_PK_KEY, ENTITY_TYPE_KEY, SUB_ENTITY_INDEX_KEY],
+ additional_labels={SUB_ENTITY_LABEL},
+ )
+
+ self.logger.debug(f"Created sub-entity '{new_entity_type}' with additional_labels: {sub_entity.additional_labels}")
+
+ # Recursively process the sub-entity for nested structures
+ sub_entities = self.split_nested_graph_entity(sub_entity)
+
+ # Verify labels are preserved after recursive processing
+ for se in sub_entities:
+ self.logger.debug(f"After recursive processing: entity_type='{se.entity_type}', additional_labels={se.additional_labels}")
+
+ result_entities.extend(sub_entities)
+
+ def _generate_type_suffix(self, property_key: str) -> str:
+ """
+ Generate a type suffix from a property key. Based on common patterns and heuristics.
+
+ This performs basic singularization and capitalization:
+ - "containers" -> "Container"
+ - "items" -> "Item"
+ - "status" -> "Status"
+ - "ops" -> "Ops"
+
+ Args:
+ property_key: The property key to convert
+
+ Returns:
+ str: The formatted type suffix
+ """
+ # Words that should not be singularized - heuristic based on common patterns
+ NO_SINGULARIZE_SUFFIXES = {"prometheus", "aas", "ops", "status", "series", "species", "apparatus", "progress", "chassis", "redis", "jenkins"}
+
+ # Remove underscores and convert to title case
+ words = property_key.replace("_", " ").split()
+
+ # Simple singularization heuristic
+ result_words = []
+ for word in words:
+ word_lower = word.lower()
+
+ # If word is too short, don't singularize
+ if len(word) <= 4:
+ singular = word
+ # Check if word should not be singularized
+ elif any(word_lower.endswith(suffix) for suffix in NO_SINGULARIZE_SUFFIXES):
+ singular = word
+            # Handle common plurals
+ elif word_lower.endswith("ies"):
+ # "categories" -> "category"
+ singular = word[:-3] + "y"
+ elif word_lower.endswith("ses"):
+ # "addresses" -> "address"
+ singular = word[:-2]
+ elif word_lower.endswith("s") and not word_lower.endswith("ss"):
+ # "containers" -> "container"
+ singular = word[:-1]
+ else:
+ singular = word
+
+ result_words.append(singular.capitalize())
+
+ return "".join(result_words)
+
+ @staticmethod
+ def graph_document_type(entity_type: str) -> str:
+ return f"graph:{entity_type}"
+
+ @staticmethod
+ def graph_document_id(entity_type: str, entity_pk: str) -> str:
+ return f"graph:{entity_type}:{entity_pk}"
+
+ @staticmethod
+ def parse_graph_entity_from_document_id(document_id: str) -> Tuple[str, str]:
+ """
+ Parse entity_type and entity_pk from a graph document ID of the form "graph:entity_type:entity_pk"
+ """
+ parts = document_id.split(":")
+ if len(parts) < 3 or parts[0] != "graph":
+ raise ValueError(f"Invalid graph document ID format: {document_id}")
+ entity_type = parts[1]
+ entity_pk = ":".join(parts[2:]) # In case entity_pk contains ':'
+ return entity_type, entity_pk
+
+ def _parse_document_metadata(self, doc: Document) -> DocumentMetadata:
+ """
+ Parse document metadata from Document.metadata dict into DocumentMetadata model.
+ """
+ try:
+ return DocumentMetadata.model_validate(doc.metadata)
+ except Exception as e:
+ self.logger.error(f"Failed to parse document metadata: {e}")
+ raise ValueError(f"Invalid document metadata: {e}")
+
+ def _parse_graph_entity(self, doc: Document) -> Optional[Entity]:
+ """
+ Parse document page_content into a graph Entity if it's a graph entity document.
+        Returns None if parsing fails (entity will be skipped). NOTE(review): the except block below raises ValueError instead of returning None, so the caller's None-check is unreachable — confirm which behavior is intended.
+ Validation happens after splitting and flattening on server side.
+ Optimized for high-throughput ingestion.
+ """
+ try:
+ # Use Pydantic's model_validate_json to parse JSON string directly into Entity
+ entity = Entity.model_validate_json(doc.page_content)
+ return entity
+ except Exception as e:
+ self.logger.error(f"Failed to parse graph entity from document: {e}")
+ raise ValueError(f"Invalid graph entity document: {e}")
+
+ def _chunk_document(self, doc: Document, document_metadata: DocumentMetadata, chunk_size: int, chunk_overlap: int) -> List[Document]:
+ """
+ Chunk a document if it exceeds chunk_size, otherwise return as single chunk.
+ If chunk_size is 0, skip chunking entirely.
+ Chunk size is capped at MILVUS_MAX_VARCHAR_LENGTH to respect Milvus field limits.
+ """
+ content = doc.page_content
+ if not content:
+ self.logger.warning("Empty content, returning empty chunks")
+ return []
+
+ chunks = []
+
+ # Cap chunk size at Milvus limit
+ effective_chunk_size = min(chunk_size, self.MILVUS_MAX_VARCHAR_LENGTH) if chunk_size > 0 else self.MILVUS_MAX_VARCHAR_LENGTH
+
+ # Check if document needs chunking
+ if len(content) > effective_chunk_size and chunk_size > 0:
+ self.logger.debug(f"Document exceeds chunk size ({len(content)} > {effective_chunk_size}), splitting into chunks")
+
+ # Create document-specific text splitter
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=effective_chunk_size, chunk_overlap=chunk_overlap, length_function=len, separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""])
+
+ doc_chunks = text_splitter.split_documents([doc])
+ self.logger.debug(f"Split document into {len(doc_chunks)} chunks for: {document_metadata.document_id}")
+
+ for i, chunk_doc in enumerate(doc_chunks):
+ # Compute chunk id and add the metadata from document, as well as chunk-specific info
+ chunk_id = f"{document_metadata.document_id}_chunk_{i}"
+ chunk_metadata = DocumentChunkMetadata(
+ id=chunk_id,
+ document_id=document_metadata.document_id,
+ datasource_id=document_metadata.datasource_id,
+ ingestor_id=document_metadata.ingestor_id,
+ title=document_metadata.title,
+ description=document_metadata.description,
+ is_graph_entity=document_metadata.is_graph_entity,
+ document_type=document_metadata.document_type,
+ document_ingested_at=document_metadata.document_ingested_at,
+ fresh_until=document_metadata.fresh_until,
+ metadata=document_metadata.metadata,
+ chunk_index=i,
+ total_chunks=len(doc_chunks),
)
-
- # Parent entity comes first in the result
- return [updated_parent] + result_entities
-
- def _process_entity_properties(
- self,
- properties: dict,
- parent_entity_type: str,
- parent_primary_key: str,
- result_entities: List[Entity],
- prefix: str = ""
- ) -> dict:
- """
- Process entity properties by splitting lists of dicts.
- Properties should already be flattened (except arrays of dicts) before calling this.
-
- Args:
- properties: Properties to process (already flattened, except arrays of dicts)
- parent_entity_type: Type of parent entity
- parent_primary_key: Primary key of parent entity
- result_entities: List to collect split entities
- prefix: Prefix for flattened keys (for recursive calls within split entities)
-
- Returns:
- Processed properties dictionary
- """
- processed = {}
-
- for key, value in properties.items():
- # Properties are already flattened, just use the key
- full_key = f"{prefix}.{key}" if prefix else key
-
- if isinstance(value, list) and value and isinstance(value[0], dict):
- # Split list of dictionaries into separate entities
- self._split_list_of_dicts(
- full_key,
- value,
- parent_entity_type,
- parent_primary_key,
- result_entities
- )
- # Don't add to processed properties (it's been split out)
-
- elif isinstance(value, dict):
- # This should only happen in recursive calls within split entities
- # Flatten these sub-dictionaries
- flattened = self._process_entity_properties(
- value,
- parent_entity_type,
- parent_primary_key,
- result_entities,
- full_key
- )
- processed.update(flattened)
-
- elif self._is_homogeneous_primitive_list(value) if isinstance(value, list) else True:
- # Keep primitives and homogeneous lists of primitives as-is
- processed[full_key] = value
- else:
- # Mixed or complex list - convert to string for safety
- processed[full_key] = str(value) if isinstance(value, list) else value
-
- return processed
-
- def _is_homogeneous_primitive_list(self, lst: list) -> bool:
- """
- Check if a list contains only primitive types (str, bool, int, float, None).
-
- Args:
- lst: List to check
-
- Returns:
- True if all elements are primitives, False otherwise
- """
- if not lst:
- return True
-
- primitive_types = (str, bool, int, float, type(None))
- return all(isinstance(item, primitive_types) for item in lst)
-
- def _split_list_of_dicts(
- self,
- prop_key: str,
- prop_value: list,
- parent_entity_type: str,
- parent_primary_key: str,
- result_entities: List[Entity]
- ) -> None:
- """
- Split a list of dictionaries into separate entities.
-
- Args:
- prop_key: Property key (may include dots from flattening)
- prop_value: List of dictionaries
- parent_entity_type: Type of parent entity
- parent_primary_key: Primary key of parent entity
- result_entities: List to collect split entities
- """
- # Generate the new entity type name
- # Use more context to avoid collisions (e.g., nodeAffinity vs podAntiAffinity)
- key_parts = prop_key.split(".")
-
- # Use last 2 parts if available for better disambiguation
- if len(key_parts) >= 2:
- # e.g., "nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution"
- # becomes "Nodeaffinity_Preferredduringschedulingignoredduringexecution"
- type_parts = [self._generate_type_suffix(key_parts[-2]),
- self._generate_type_suffix(key_parts[-1])]
- type_suffix = "_".join(type_parts)
- else:
- # Only one part, use it
- type_suffix = self._generate_type_suffix(key_parts[-1])
-
- new_entity_type = f"{parent_entity_type}_{type_suffix}"
-
- # Create a new entity for each item in the list
- for index, item_dict in enumerate(prop_value):
- # Create properties for the new sub-entity
- sub_entity_properties = item_dict.copy()
- sub_entity_properties[PARENT_ENTITY_PK_KEY] = parent_primary_key
- sub_entity_properties[PARENT_ENTITY_TYPE_KEY] = parent_entity_type
- # Convert index to string to ensure homogeneous arrays in Neo4j
- sub_entity_properties[SUB_ENTITY_INDEX_KEY] = str(index)
- # Add entity type to properties so it can be used in primary key
- sub_entity_properties[ENTITY_TYPE_KEY] = new_entity_type
-
- # Create the new entity with SUB_ENTITY_LABEL to mark it as a sub-entity
- # Include ENTITY_TYPE_KEY in primary key to avoid clashes between different sub-entity types
- sub_entity = Entity(
- entity_type=new_entity_type,
- all_properties=sub_entity_properties,
- primary_key_properties=[PARENT_ENTITY_PK_KEY, ENTITY_TYPE_KEY, SUB_ENTITY_INDEX_KEY],
- additional_labels={SUB_ENTITY_LABEL},
- )
-
- self.logger.debug(
- f"Created sub-entity '{new_entity_type}' with additional_labels: {sub_entity.additional_labels}"
- )
-
- # Recursively process the sub-entity for nested structures
- sub_entities = self.split_nested_graph_entity(sub_entity)
-
- # Verify labels are preserved after recursive processing
- for se in sub_entities:
- self.logger.debug(
- f"After recursive processing: entity_type='{se.entity_type}', "
- f"additional_labels={se.additional_labels}"
- )
-
- result_entities.extend(sub_entities)
-
- def _generate_type_suffix(self, property_key: str) -> str:
- """
- Generate a type suffix from a property key. Based on common patterns and heuristics.
-
- This performs basic singularization and capitalization:
- - "containers" -> "Container"
- - "items" -> "Item"
- - "status" -> "Status"
- - "ops" -> "Ops"
-
- Args:
- property_key: The property key to convert
-
- Returns:
- str: The formatted type suffix
- """
- # Words that should not be singularized - heuristic based on common patterns
- NO_SINGULARIZE_SUFFIXES = {
- "prometheus", "aas", "ops", "status", "series", "species", "apparatus",
- "progress", "chassis", "redis", "jenkins"
- }
-
- # Remove underscores and convert to title case
- words = property_key.replace("_", " ").split()
-
- # Simple singularization heuristic
- result_words = []
- for word in words:
- word_lower = word.lower()
-
- # If word is too short, don't singularize
- if len(word) <= 4:
- singular = word
- # Check if word should not be singularized
- elif any(word_lower.endswith(suffix) for suffix in NO_SINGULARIZE_SUFFIXES):
- singular = word
- # Handle common plurals d
- elif word_lower.endswith("ies"):
- # "categories" -> "category"
- singular = word[:-3] + "y"
- elif word_lower.endswith("ses"):
- # "addresses" -> "address"
- singular = word[:-2]
- elif word_lower.endswith("s") and not word_lower.endswith("ss"):
- # "containers" -> "container"
- singular = word[:-1]
- else:
- singular = word
-
- result_words.append(singular.capitalize())
-
- return "".join(result_words)
-
- @staticmethod
- def graph_document_type(entity_type: str) -> str:
- return f"graph:{entity_type}"
-
- @staticmethod
- def graph_document_id(entity_type: str, entity_pk: str) -> str:
- return f"graph:{entity_type}:{entity_pk}"
-
- @staticmethod
- def parse_graph_entity_from_document_id(document_id: str) -> Tuple[str, str]:
- """
- Parse entity_type and entity_pk from a graph document ID of the form "graph:entity_type:entity_pk"
- """
- parts = document_id.split(":")
- if len(parts) < 3 or parts[0] != "graph":
- raise ValueError(f"Invalid graph document ID format: {document_id}")
- entity_type = parts[1]
- entity_pk = ":".join(parts[2:]) # In case entity_pk contains ':'
- return entity_type, entity_pk
-
- def _parse_document_metadata(self, doc: Document) -> DocumentMetadata:
- """
- Parse document metadata from Document.metadata dict into DocumentMetadata model.
- """
- try:
- return DocumentMetadata.model_validate(doc.metadata)
- except Exception as e:
- self.logger.error(f"Failed to parse document metadata: {e}")
- raise ValueError(f"Invalid document metadata: {e}")
-
- def _parse_graph_entity(self, doc: Document) -> Optional[Entity]:
- """
- Parse document page_content into a graph Entity if it's a graph entity document.
- Returns None if parsing fails (entity will be skipped).
- Validation happens after splitting and flattening on server side.
- Optimized for high-throughput ingestion.
- """
- try:
- # Use Pydantic's model_validate_json to parse JSON string directly into Entity
- entity = Entity.model_validate_json(doc.page_content)
- return entity
- except Exception as e:
- self.logger.error(f"Failed to parse graph entity from document: {e}")
- raise ValueError(f"Invalid graph entity document: {e}")
-
- def _chunk_document(self, doc: Document, document_metadata: DocumentMetadata,
- chunk_size: int, chunk_overlap: int) -> List[Document]:
- """
- Chunk a document if it exceeds chunk_size, otherwise return as single chunk.
- If chunk_size is 0, skip chunking entirely.
- Chunk size is capped at MILVUS_MAX_VARCHAR_LENGTH to respect Milvus field limits.
- """
- content = doc.page_content
- if not content:
- self.logger.warning("Empty content, returning empty chunks")
- return []
-
- chunks = []
-
- # Cap chunk size at Milvus limit
- effective_chunk_size = min(chunk_size, self.MILVUS_MAX_VARCHAR_LENGTH) if chunk_size > 0 else self.MILVUS_MAX_VARCHAR_LENGTH
-
- # Check if document needs chunking
- if len(content) > effective_chunk_size and chunk_size > 0:
- self.logger.debug(f"Document exceeds chunk size ({len(content)} > {effective_chunk_size}), splitting into chunks")
-
- # Create document-specific text splitter
- text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=effective_chunk_size,
- chunk_overlap=chunk_overlap,
- length_function=len,
- separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""]
+
+ chunk_doc.metadata = chunk_metadata.model_dump()
+ chunks.append(chunk_doc)
+ else:
+ if chunk_size == 0:
+ self.logger.debug(f"Chunk size is 0, skipping chunking for: {document_metadata.document_id}")
+ else:
+ self.logger.debug(f"Document is smaller than chunk size, processing as single chunk: {document_metadata.document_id}")
+
+ # Compute chunk id and add the metadata from document, as well as chunk-specific info
+ chunk_id = f"{document_metadata.document_id}_chunk_0"
+ chunk_metadata = DocumentChunkMetadata(
+ id=chunk_id,
+ document_id=document_metadata.document_id,
+ datasource_id=document_metadata.datasource_id,
+ ingestor_id=document_metadata.ingestor_id,
+ title=document_metadata.title,
+ description=document_metadata.description,
+ is_graph_entity=document_metadata.is_graph_entity,
+ document_type=document_metadata.document_type,
+ document_ingested_at=document_metadata.document_ingested_at,
+ fresh_until=document_metadata.fresh_until,
+ metadata=document_metadata.metadata,
+ chunk_index=0,
+ total_chunks=1,
+ )
+
+ doc.metadata = chunk_metadata.model_dump()
+ chunks.append(doc)
+
+ return chunks
+
+ async def ingest_documents(self, ingestor_id: str, datasource_id: str, job_id: str, documents: List[Document], fresh_until: int, chunk_size: int, chunk_overlap: int):
+ """
+ Process documents, splitting into chunks if necessary, and handle both regular documents and graph entities.
+
+ Args:
+ ingestor_id: ID of the ingestor ingesting the documents
+ datasource_id: ID of the datasource ingesting the documents
+ job_id: ID of the ingestion job
+ documents: List of documents to ingest
+ fresh_until: Timestamp until which this data is considered fresh (epoch seconds)
+ chunk_size: Maximum size for document chunks
+ chunk_overlap: Overlap between chunks
+ """
+ if not documents:
+ self.logger.warning("No documents provided for ingestion")
+ return
+
+ self.logger.info(f"Starting ingestion of {len(documents)} documents")
+
+ # Step 1: Process each document and collect chunks and graph entities
+ all_chunks = []
+ all_chunk_ids = []
+ all_entities = [] # Simple list of entities
+
+ for doc in documents:
+ try:
+ # Step 1a: Parse document metadata
+ document_metadata = self._parse_document_metadata(doc)
+
+ # Override metadata fields - this is to ensure correct association (even if doc.metadata has different values)
+ document_metadata.ingestor_id = ingestor_id
+ document_metadata.datasource_id = datasource_id
+ document_metadata.fresh_until = fresh_until
+ document_metadata.document_ingested_at = int(time.time())
+
+ # Step 1b: Check if it's a graph entity and parse if needed
+ if self.graph_rag_enabled and document_metadata.is_graph_entity:
+ try:
+ result = self._process_graph_entity_document(
+ doc=doc,
+ document_metadata=document_metadata,
+ ingestor_id=ingestor_id,
+ datasource_id=datasource_id,
+ fresh_until=fresh_until,
)
-
- doc_chunks = text_splitter.split_documents([doc])
- self.logger.debug(f"Split document into {len(doc_chunks)} chunks for: {document_metadata.document_id}")
-
- for i, chunk_doc in enumerate(doc_chunks):
-
- # Compute chunk id and add the metadata from document, as well as chunk-specific info
- chunk_id = f"{document_metadata.document_id}_chunk_{i}"
- chunk_metadata = DocumentChunkMetadata(
- id=chunk_id,
- document_id=document_metadata.document_id,
- datasource_id=document_metadata.datasource_id,
- ingestor_id=document_metadata.ingestor_id,
- title=document_metadata.title,
- description=document_metadata.description,
- is_graph_entity=document_metadata.is_graph_entity,
- document_type=document_metadata.document_type,
- document_ingested_at=document_metadata.document_ingested_at,
- fresh_until=document_metadata.fresh_until,
- metadata=document_metadata.metadata,
- chunk_index=i,
- total_chunks=len(doc_chunks),
- )
-
- chunk_doc.metadata = chunk_metadata.model_dump()
- chunks.append(chunk_doc)
+
+ # Skip if entity parsing failed
+ if result is None:
+ self.logger.warning("Skipping graph entity document due to parsing failure")
+ continue
+
+ entities, chunks, chunk_ids, validation_errors = result
+
+ # Add validation errors to job if any occurred
+ if validation_errors:
+ for error in validation_errors:
+ await self.job_manager.add_error_msg(job_id, error)
+
+ # Collect entities, chunks, and chunk IDs
+ all_entities.extend(entities)
+ all_chunks.extend(chunks)
+ all_chunk_ids.extend(chunk_ids)
+
+ except Exception as e:
+ error_msg = f"Failed to parse graph entity: {e}"
+ self.logger.error(f"{error_msg}, skipping")
+ self.logger.error(traceback.format_exc())
+ await self.job_manager.add_error_msg(job_id, error_msg)
+ continue
+
else:
+ # adding this debug log to clarify processing of regular documents when graph RAG is disabled but document is a graph entity
+ if not self.graph_rag_enabled and document_metadata.is_graph_entity:
+ self.logger.debug(f"Document marked as graph entity but graph RAG is disabled, treating as regular document: {document_metadata.document_id}")
- if chunk_size == 0:
- self.logger.debug(f"Chunk size is 0, skipping chunking for: {document_metadata.document_id}")
- else:
- self.logger.debug(f"Document is smaller than chunk size, processing as single chunk: {document_metadata.document_id}")
-
- # Compute chunk id and add the metadata from document, as well as chunk-specific info
- chunk_id = f"{document_metadata.document_id}_chunk_0"
- chunk_metadata = DocumentChunkMetadata(
- id=chunk_id,
- document_id=document_metadata.document_id,
- datasource_id=document_metadata.datasource_id,
- ingestor_id=document_metadata.ingestor_id,
- title=document_metadata.title,
- description=document_metadata.description,
- is_graph_entity=document_metadata.is_graph_entity,
- document_type=document_metadata.document_type,
- document_ingested_at=document_metadata.document_ingested_at,
- fresh_until=document_metadata.fresh_until,
- metadata=document_metadata.metadata,
- chunk_index=0,
- total_chunks=1,
- )
-
- doc.metadata = chunk_metadata.model_dump()
- chunks.append(doc)
-
- return chunks
-
- async def ingest_documents(self, ingestor_id: str, datasource_id: str, job_id: str,
- documents: List[Document], fresh_until: int, chunk_size: int, chunk_overlap: int):
- """
- Process documents, splitting into chunks if necessary, and handle both regular documents and graph entities.
-
- Args:
- ingestor_id: ID of the ingestor ingesting the documents
- datasource_id: ID of the datasource ingesting the documents
- job_id: ID of the ingestion job
- documents: List of documents to ingest
- fresh_until: Timestamp until which this data is considered fresh (epoch seconds)
- chunk_size: Maximum size for document chunks
- chunk_overlap: Overlap between chunks
- """
- if not documents:
- self.logger.warning("No documents provided for ingestion")
- return
-
- self.logger.info(f"Starting ingestion of {len(documents)} documents")
-
- # Step 1: Process each document and collect chunks and graph entities
- all_chunks = []
- all_chunk_ids = []
- all_entities = [] # Simple list of entities
-
- for doc in documents:
- try:
- # Step 1a: Parse document metadata
- document_metadata = self._parse_document_metadata(doc)
-
- # Override metadata fields - this is to ensure correct association (even if doc.metadata has different values)
- document_metadata.ingestor_id = ingestor_id
- document_metadata.datasource_id = datasource_id
- document_metadata.fresh_until = fresh_until
- document_metadata.document_ingested_at = int(time.time())
-
- # Step 1b: Check if it's a graph entity and parse if needed
- if self.graph_rag_enabled and document_metadata.is_graph_entity:
- try:
- result = self._process_graph_entity_document(
- doc=doc,
- document_metadata=document_metadata,
- ingestor_id=ingestor_id,
- datasource_id=datasource_id,
- fresh_until=fresh_until,
- )
-
- # Skip if entity parsing failed
- if result is None:
- self.logger.warning("Skipping graph entity document due to parsing failure")
- continue
-
- entities, chunks, chunk_ids, validation_errors = result
-
- # Add validation errors to job if any occurred
- if validation_errors:
- for error in validation_errors:
- await self.job_manager.add_error_msg(job_id, error)
-
- # Collect entities, chunks, and chunk IDs
- all_entities.extend(entities)
- all_chunks.extend(chunks)
- all_chunk_ids.extend(chunk_ids)
-
- except Exception as e:
- error_msg = f"Failed to parse graph entity: {e}"
- self.logger.error(f"{error_msg}, skipping")
- self.logger.error(traceback.format_exc())
- await self.job_manager.add_error_msg(job_id, error_msg)
- continue
-
- else:
- # adding this debug log to clarify processing of regular documents when graph RAG is disabled but document is a graph entity
- if not self.graph_rag_enabled and document_metadata.is_graph_entity:
- self.logger.debug(f"Document marked as graph entity but graph RAG is disabled, treating as regular document: {document_metadata.document_id}")
-
- # Step 2: Chunk regular documents
- chunks = self._chunk_document(doc, document_metadata, chunk_size, chunk_overlap)
-
- for chunk in chunks:
- chunk_metadata = DocumentChunkMetadata.model_validate(chunk.metadata)
- all_chunks.append(chunk)
- all_chunk_ids.append(chunk_metadata.id)
-
- except Exception as e:
- error_msg = f"Failed to parse and process document: {e}"
- self.logger.error(f"{error_msg}, skipping")
- self.logger.error(traceback.format_exc())
- await self.job_manager.add_error_msg(job_id, error_msg)
- continue
-
- # Step 3: Add all document chunks to vector database
- if all_chunks:
- self.logger.info(f"Adding {len(all_chunks)} document chunks to vector database")
-
- # Deduplicate chunks by ID (keep first occurrence)
- seen_ids = set()
- deduped_chunks = []
- deduped_chunk_ids = []
-
- for chunk, chunk_id in zip(all_chunks, all_chunk_ids):
- if chunk_id not in seen_ids:
- seen_ids.add(chunk_id)
- deduped_chunks.append(chunk)
- deduped_chunk_ids.append(chunk_id)
- else:
- self.logger.debug(f"Skipping duplicate chunk ID: {chunk_id}")
-
- if len(deduped_chunks) < len(all_chunks):
- self.logger.warning(
- f"Removed {len(all_chunks) - len(deduped_chunks)} duplicate chunks. "
- f"Original: {len(all_chunks)}, After dedup: {len(deduped_chunks)}"
- )
-
- try:
- # Update job message
- await self.job_manager.upsert_job(
- job_id=job_id,
- message=f"Adding {len(deduped_chunks)} document chunks to vector database"
- )
-
- await self.vstore.aupsert(documents=deduped_chunks, ids=deduped_chunk_ids, batch_size=self.batch_size)
- self.logger.info(f"Successfully added {len(deduped_chunks)} chunks to vector database")
-
- # Update job with success message
- await self.job_manager.upsert_job(
- job_id=job_id,
- message=f"Added {len(deduped_chunks)} document chunks to vector database"
- )
- except Exception as e:
- error_msg = f"Failed to add chunks to vector database: {e}"
- self.logger.error(error_msg)
- self.logger.error(traceback.format_exc())
- await self.job_manager.add_error_msg(job_id, error_msg)
- raise
-
- # Step 4: Add graph entities to graph database in one batch
- if all_entities and self.data_graph_db:
- total_entities = len(all_entities)
- self.logger.info(f"Adding {total_entities} graph entities to graph database in one batch")
-
- # Update job message
- await self.job_manager.upsert_job(
- job_id=job_id,
- message=f"Adding {total_entities} graph entities in one batch"
- )
-
- try:
- # Add all entities to graph database in ONE call
- await self.data_graph_db.update_entity_batch(
- entities=all_entities,
- batch_size=1000
- )
- self.logger.info(f"Successfully added {total_entities} entities to graph database in ONE batch")
-
- # Final success message for graph entities
- await self.job_manager.upsert_job(
- job_id=job_id,
- message=f"Successfully added {total_entities} graph entities to graph database in one batch"
- )
- except Exception as e:
- error_msg = f"Failed to add entities to graph database: {e}"
- self.logger.error(error_msg)
- self.logger.error(traceback.format_exc())
- await self.job_manager.add_error_msg(job_id, error_msg)
- # Continue with the rest of the processing
-
-
- # Final completion message
- total_entities = len(all_entities)
- completion_msg = f"Ingestion complete: {len(deduped_chunks) if all_chunks else 0} document chunks, {total_entities} graph entities"
- self.logger.info(completion_msg)
- await self.job_manager.upsert_job(
- job_id=job_id,
- message=completion_msg
- )
+ # Step 2: Chunk regular documents
+ chunks = self._chunk_document(doc, document_metadata, chunk_size, chunk_overlap)
+
+ for chunk in chunks:
+ chunk_metadata = DocumentChunkMetadata.model_validate(chunk.metadata)
+ all_chunks.append(chunk)
+ all_chunk_ids.append(chunk_metadata.id)
+
+ except Exception as e:
+ error_msg = f"Failed to parse and process document: {e}"
+ self.logger.error(f"{error_msg}, skipping")
+ self.logger.error(traceback.format_exc())
+ await self.job_manager.add_error_msg(job_id, error_msg)
+ continue
+
+ # Step 3: Add all document chunks to vector database
+ deduped_chunks = [] # Initialize for final message
+ if all_chunks:
+ self.logger.info(f"Adding {len(all_chunks)} document chunks to vector database")
+
+ # Deduplicate chunks by ID (keep first occurrence)
+ seen_ids = set()
+ deduped_chunk_ids = []
+
+ for chunk, chunk_id in zip(all_chunks, all_chunk_ids):
+ if chunk_id not in seen_ids:
+ seen_ids.add(chunk_id)
+ deduped_chunks.append(chunk)
+ deduped_chunk_ids.append(chunk_id)
+ else:
+ self.logger.debug(f"Skipping duplicate chunk ID: {chunk_id}")
+
+ if len(deduped_chunks) < len(all_chunks):
+ self.logger.warning(f"Removed {len(all_chunks) - len(deduped_chunks)} duplicate chunks. Original: {len(all_chunks)}, After dedup: {len(deduped_chunks)}")
+
+ try:
+ # Update job message
+ await self.job_manager.upsert_job(job_id=job_id, message=f"[Server] Adding {len(deduped_chunks)} document chunks to vector database")
+
+ await self.vstore.aupsert(documents=deduped_chunks, ids=deduped_chunk_ids, batch_size=self.batch_size)
+ self.logger.info(f"Successfully added {len(deduped_chunks)} chunks to vector database")
+
+ # Track chunk count
+ await self.job_manager.increment_chunk_count(job_id, len(deduped_chunks))
+
+ # Update job with success message
+ await self.job_manager.upsert_job(job_id=job_id, message=f"[Server] Added {len(deduped_chunks)} document chunks to vector database")
+ except Exception as e:
+ error_msg = f"Failed to add chunks to vector database: {e}"
+ self.logger.error(error_msg)
+ self.logger.error(traceback.format_exc())
+ await self.job_manager.add_error_msg(job_id, error_msg)
+ raise
+
+ # Step 4: Add graph entities to graph database in one batch
+ if all_entities and self.data_graph_db:
+ total_entities = len(all_entities)
+ self.logger.info(f"Adding {total_entities} graph entities to graph database in one batch")
+
+ # Update job message
+ await self.job_manager.upsert_job(job_id=job_id, message=f"[Server] Adding {total_entities} graph entities in one batch")
+
+ try:
+ # Add all entities to graph database in ONE call
+ await self.data_graph_db.update_entity_batch(entities=all_entities, batch_size=1000)
+ self.logger.info(f"Successfully added {total_entities} entities to graph database in ONE batch")
+
+ # Final success message for graph entities
+ await self.job_manager.upsert_job(job_id=job_id, message=f"[Server] Successfully added {total_entities} graph entities to graph database in one batch")
+ except Exception as e:
+ error_msg = f"Failed to add entities to graph database: {e}"
+ self.logger.error(error_msg)
+ self.logger.error(traceback.format_exc())
+ await self.job_manager.add_error_msg(job_id, error_msg)
+ # Continue with the rest of the processing
+
+ # Final completion message
+ total_entities = len(all_entities)
+ completion_msg = f"[Server] Ingestion complete: {len(deduped_chunks) if all_chunks else 0} document chunks, {total_entities} graph entities"
+ self.logger.info(completion_msg)
+ await self.job_manager.upsert_job(job_id=job_id, message=completion_msg)
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/src/server/rbac.py b/ai_platform_engineering/knowledge_bases/rag/server/src/server/rbac.py
index 98220e7ff..80b700290 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/src/server/rbac.py
+++ b/ai_platform_engineering/knowledge_bases/rag/server/src/server/rbac.py
@@ -311,40 +311,57 @@ def extract_groups_from_claims(claims: Dict[str, Any]) -> List[str]:
Extract groups from JWT claims with configurable claim name.
Mirrors the logic in ui/src/lib/auth-config.ts extractGroups()
- Uses OIDC_GROUP_CLAIM if set, otherwise tries common claim names.
+ Uses OIDC_GROUP_CLAIM if set (comma-separated for multiple claims),
+ otherwise checks ALL common claim names and combines groups from all
+ of them (using a set for deduplication).
Args:
claims: JWT token claims
Returns:
- List of group names
+ List of unique group names
"""
- # Default group claim names to try (in order)
- default_group_claims = ["memberOf", "groups", "group", "roles", "cognito:groups"]
+ # Default group claim names to check (in order of priority)
+ # Note: Duo SSO uses "members" for full group list, "groups" for limited set
+ default_group_claims = ["members", "memberOf", "groups", "group", "roles", "cognito:groups"]
- # If explicit group claim is configured, use only that
- if OIDC_GROUP_CLAIM:
- value = claims.get(OIDC_GROUP_CLAIM)
+ # Use a set to collect all groups and deduplicate
+ all_groups: set[str] = set()
+
+ def add_groups_from_value(value: Any) -> None:
+ """Helper to extract groups from a claim value and add to set."""
if isinstance(value, list):
- return [str(g) for g in value]
+ for g in value:
+ all_groups.add(str(g))
elif isinstance(value, str):
# Split on comma or whitespace
- return [g.strip() for g in re.split(r"[,\s]+", value) if g.strip()]
- else:
- logger.warning(f"Group claim '{OIDC_GROUP_CLAIM}' not found in token")
- return []
+ for g in re.split(r"[,\s]+", value):
+ if g.strip():
+ all_groups.add(g.strip())
- # Auto-detect: Try common group claim names in order
+ # If explicit group claim(s) configured, use only those
+ # Supports comma-separated list of claim names (e.g., "groups,members,roles")
+ if OIDC_GROUP_CLAIM:
+ configured_claims = [c.strip() for c in OIDC_GROUP_CLAIM.split(",") if c.strip()]
+ for claim_name in configured_claims:
+ value = claims.get(claim_name)
+ if value is not None:
+ add_groups_from_value(value)
+ if not all_groups:
+ logger.warning(f"No groups found in configured claims: {configured_claims}")
+ return list(all_groups)
+
+ # Auto-detect: check ALL common group claim names and combine them
+ # This is important for Duo SSO which uses both "groups" and "members"
for claim_name in default_group_claims:
value = claims.get(claim_name)
- if isinstance(value, list):
- return [str(g) for g in value]
- elif isinstance(value, str):
- return [g.strip() for g in re.split(r"[,\s]+", value) if g.strip()]
+ if value is not None:
+ add_groups_from_value(value)
- # No groups found
- logger.debug("No group claims found in token")
- return []
+ if not all_groups:
+ logger.debug("No group claims found in token")
+
+ return list(all_groups)
# ============================================================================
@@ -399,6 +416,10 @@ async def _authenticate_from_token(request: Request, auth_manager: AuthManager)
"""
Internal helper to authenticate user from JWT token.
+ Supports optional X-Identity-Token header for ID token with user claims.
+ If provided, the ID token is validated and used for email/groups extraction.
+ If not provided, falls back to extracting claims from the access token.
+
Returns:
UserContext if authentication successful, None if no auth or invalid
"""
@@ -409,19 +430,23 @@ async def _authenticate_from_token(request: Request, auth_manager: AuthManager)
token = auth_header[7:] # Remove "Bearer " prefix
+ # Extract optional ID token for claims (some OIDC providers only include
+ # user claims like email/groups in the ID token, not the access token)
+ id_token = request.headers.get("X-Identity-Token")
+
# Extract optional ingestor identification headers
ingestor_type = request.headers.get("X-Ingestor-Type")
ingestor_name = request.headers.get("X-Ingestor-Name")
# Validate token against configured providers
try:
- provider, claims = await auth_manager.validate_token(token)
- logger.debug(f"Token validated by provider '{provider.name}'")
- logger.debug(f"Token claims keys: {list(claims.keys())}")
+ provider, access_claims = await auth_manager.validate_token(token)
+ logger.debug(f"Access token validated by provider '{provider.name}'")
+ logger.debug(f"Access token claims keys: {list(access_claims.keys())}")
# Check if this is a client credentials token (machine-to-machine)
- if is_client_credentials_token(claims):
- client_id = extract_client_id_from_claims(claims)
+ if is_client_credentials_token(access_claims):
+ client_id = extract_client_id_from_claims(access_claims)
# Enrich logging with ingestor info if provided
if ingestor_type and ingestor_name:
@@ -443,9 +468,27 @@ async def _authenticate_from_token(request: Request, auth_manager: AuthManager)
else:
logger.debug("Regular user token detected (not client credentials)")
- # Regular user token - extract email and groups
- email = extract_email_from_claims(claims)
- groups = extract_groups_from_claims(claims)
+ # Determine which claims to use for email/groups extraction
+ claims_for_extraction = access_claims
+ claims_source = "access_token"
+
+ # If ID token provided, validate and use its claims for user identity
+ if id_token:
+ try:
+ id_claims = await auth_manager.validate_id_token(id_token, provider)
+ claims_for_extraction = id_claims
+ claims_source = "id_token"
+ logger.debug(f"Using ID token for claims extraction, keys: {list(id_claims.keys())}")
+ except JWTError as e:
+ # ID token provided but invalid - reject the request
+ logger.warning(f"ID token validation failed: {e}")
+ raise # Re-raise to trigger 401 response
+
+ # Regular user token - extract email and groups from appropriate claims
+ email = extract_email_from_claims(claims_for_extraction)
+ groups = extract_groups_from_claims(claims_for_extraction)
+
+ logger.debug(f"Extracted from {claims_source}: email={email}, groups={groups}")
# Validate email format
if email and email != "unknown" and not EMAIL_REGEX.match(email):
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/src/server/restapi.py b/ai_platform_engineering/knowledge_bases/rag/server/src/server/restapi.py
index af09e4874..d0b16dafc 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/src/server/restapi.py
+++ b/ai_platform_engineering/knowledge_bases/rag/server/src/server/restapi.py
@@ -18,42 +18,26 @@
from common.metadata_storage import MetadataStorage
from common.job_manager import JobManager, JobStatus
from common.models.server import (
- ExploreNeighborhoodRequest,
- QueryRequest,
- QueryResult,
- DocumentIngestRequest,
- IngestorPingRequest,
- IngestorPingResponse,
- UrlIngestRequest,
- IngestorRequest,
- WebIngestorCommand,
- ConfluenceIngestorCommand,
- UrlReloadRequest,
- ConfluenceIngestRequest,
- ConfluenceReloadRequest
+ ExploreNeighborhoodRequest,
+ QueryRequest,
+ QueryResult,
+ DocumentIngestRequest,
+ IngestorPingRequest,
+ IngestorPingResponse,
+ UrlIngestRequest,
+ IngestorRequest,
+ WebIngestorCommand,
+ ConfluenceIngestorCommand,
+ UrlReloadRequest,
+ ConfluenceIngestRequest,
+ ConfluenceReloadRequest,
)
from common.models.rag import DataSourceInfo, IngestorInfo, valid_metadata_keys
from common.models.rbac import Role, UserContext, UserInfoResponse
-from server.rbac import (
- get_user_or_anonymous,
- require_role,
- has_permission,
- get_permissions,
- is_trusted_request
-)
+from server.rbac import get_user_or_anonymous, require_role, has_permission, get_permissions, is_trusted_request
from common.graph_db.neo4j.graph_db import Neo4jDB
from common.graph_db.base import GraphDB
-from common.constants import (
- DATASOURCE_ID_KEY,
- WEBLOADER_INGESTOR_REDIS_QUEUE,
- WEBLOADER_INGESTOR_NAME,
- WEBLOADER_INGESTOR_TYPE,
- CONFLUENCE_INGESTOR_REDIS_QUEUE,
- CONFLUENCE_INGESTOR_NAME,
- CONFLUENCE_INGESTOR_TYPE,
- DEFAULT_DATA_LABEL,
- DEFAULT_SCHEMA_LABEL
-)
+from common.constants import DATASOURCE_ID_KEY, WEBLOADER_INGESTOR_REDIS_QUEUE, WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE, CONFLUENCE_INGESTOR_REDIS_QUEUE, CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE, DEFAULT_DATA_LABEL, DEFAULT_SCHEMA_LABEL
from common.embeddings_factory import EmbeddingsFactory
import redis.asyncio as redis
from langchain_milvus import BM25BuiltInFunction, Milvus
@@ -74,10 +58,10 @@
# Initialize logger
logger = utils.get_logger(__name__)
-logger.setLevel( os.getenv("LOG_LEVEL", "INFO").upper())
+logger.setLevel(os.getenv("LOG_LEVEL", "INFO").upper())
print(f"LOG LEVEL set to {logger.level}")
-if logger.level == logging.DEBUG: # enable langchain verbose logging
- set_langchain_verbose(True)
+if logger.level == logging.DEBUG: # enable langchain verbose logging
+ set_langchain_verbose(True)
# Read configuration from environment variables
clean_up_interval = int(os.getenv("CLEANUP_INTERVAL", 3 * 60 * 60)) # Default to 3 hours
@@ -87,14 +71,14 @@
milvus_uri = os.getenv("MILVUS_URI", "http://localhost:19530")
embeddings_model = os.getenv("EMBEDDINGS_MODEL", "text-embedding-3-small")
neo4j_addr = os.getenv("NEO4J_ADDR", "bolt://localhost:7687")
-skip_init_tests = os.getenv("SKIP_INIT_TESTS", "false").lower() in ("true", "1", "yes") # used when debugging to skip connection tests
-max_ingestion_concurrency = int(os.getenv("MAX_INGESTION_CONCURRENCY", 30)) # max concurrent tasks during ingestion for one datasource
+skip_init_tests = os.getenv("SKIP_INIT_TESTS", "false").lower() in ("true", "1", "yes") # used when debugging to skip connection tests
+max_ingestion_concurrency = int(os.getenv("MAX_INGESTION_CONCURRENCY", 30)) # max concurrent tasks during ingestion for one datasource
ui_url = os.getenv("UI_URL", "http://localhost:9447")
mcp_enabled = os.getenv("ENABLE_MCP", "true").lower() in ("true", "1", "yes")
-sleep_on_init_failure = int(os.getenv("SLEEP_ON_INIT_FAILURE_SECONDS", 180)) # seconds to sleep on init failure before shutdown
-max_documents_per_ingest = int(os.getenv("MAX_DOCUMENTS_PER_INGEST", 1000)) # max number of documents to ingest per ingestion request
-max_results_per_query = int(os.getenv("MAX_RESULTS_PER_QUERY", 100)) # max number of results to return per query
-confluence_url = os.getenv("CONFLUENCE_URL") # optional - base URL for Confluence instance (e.g., https://company.atlassian.net/wiki)
+sleep_on_init_failure = int(os.getenv("SLEEP_ON_INIT_FAILURE_SECONDS", 180)) # seconds to sleep on init failure before shutdown
+max_documents_per_ingest = int(os.getenv("MAX_DOCUMENTS_PER_INGEST", 1000)) # max number of documents to ingest per ingestion request
+max_results_per_query = int(os.getenv("MAX_RESULTS_PER_QUERY", 100)) # max number of results to return per query
+confluence_url = os.getenv("CONFLUENCE_URL") # optional - base URL for Confluence instance (e.g., https://company.atlassian.net/wiki)
default_collection_name_docs = "rag_default"
dense_index_params = {"index_type": "HNSW", "metric_type": "COSINE"}
@@ -103,177 +87,163 @@
milvus_connection_args = {"uri": milvus_uri}
if graph_rag_enabled:
- logger.warning("Graph RAG is ENABLED ✅")
+ logger.warning("Graph RAG is ENABLED ✅")
else:
- logger.warning("Graph RAG is DISABLED ❌")
+ logger.warning("Graph RAG is DISABLED ❌")
# Application lifespan management - initalization and cleanup
@asynccontextmanager
async def app_lifespan(app: FastAPI):
- """Manage application lifespan events"""
- # Startup
- logging.info("Starting up the app...")
- logging.info("setting up dbs")
-
- global metadata_storage
- global jobmanager
- global data_graph_db
- global ontology_graph_db
- global vector_db
- global redis_client
- global vector_db_query_service
- global ingestor
-
- redis_client = redis.from_url(redis_url, decode_responses=True)
- metadata_storage = MetadataStorage(redis_client=redis_client)
- jobmanager = JobManager(redis_client=redis_client)
-
- # Use EmbeddingsFactory to get embeddings based on EMBEDDINGS_PROVIDER env var
- embeddings = EmbeddingsFactory.get_embeddings()
-
-
- logger.info("SKIP_INIT_TESTS=" + str(skip_init_tests))
- if not skip_init_tests:
- try:
- # Do some inital tests to ensure the connections are all working
- await init_tests(
- logger=logger,
- redis_client=redis_client,
- embeddings=EmbeddingsFactory(),
- milvus_uri=milvus_uri
- )
- except Exception as e:
- logger.error(traceback.format_exc())
- logger.error("Initial connection tests failed, shutting down the app.")
- logger.error(f"Error in init test, sleeping {sleep_on_init_failure} seconds before shutdown...")
- logger.error("Press Ctrl+C to exit immediately...")
- try:
- for remaining in range(sleep_on_init_failure, 0, -1):
- logger.info(f"Shutting down in {remaining} seconds...")
- time.sleep(1)
- except KeyboardInterrupt:
- logger.info("Shutdown interrupted by user (Ctrl+C)")
- raise e
-
- # Setup vector db for document data
- vector_db = Milvus(
- embedding_function=embeddings,
- collection_name=default_collection_name_docs,
- connection_args=milvus_connection_args,
- index_params=[dense_index_params, sparse_index_params],
- builtin_function=BM25BuiltInFunction(output_field_names="sparse"),
- vector_field=["dense", "sparse"],
- enable_dynamic_field=True, # allow for dynamic metadata fields
- )
+ """Manage application lifespan events"""
+ # Startup
+ logging.info("Starting up the app...")
+ logging.info("setting up dbs")
+
+ global metadata_storage
+ global jobmanager
+ global data_graph_db
+ global ontology_graph_db
+ global vector_db
+ global redis_client
+ global vector_db_query_service
+ global ingestor
+
+ redis_client = redis.from_url(redis_url, decode_responses=True)
+ metadata_storage = MetadataStorage(redis_client=redis_client)
+ jobmanager = JobManager(redis_client=redis_client)
+
+ # Use EmbeddingsFactory to get embeddings based on EMBEDDINGS_PROVIDER env var
+ embeddings = EmbeddingsFactory.get_embeddings()
+
+ logger.info("SKIP_INIT_TESTS=" + str(skip_init_tests))
+ if not skip_init_tests:
+ try:
+            # Do some initial tests to ensure the connections are all working
+ await init_tests(logger=logger, redis_client=redis_client, embeddings=EmbeddingsFactory(), milvus_uri=milvus_uri)
+ except Exception as e:
+ logger.error(traceback.format_exc())
+ logger.error("Initial connection tests failed, shutting down the app.")
+ logger.error(f"Error in init test, sleeping {sleep_on_init_failure} seconds before shutdown...")
+ logger.error("Press Ctrl+C to exit immediately...")
+ try:
+ for remaining in range(sleep_on_init_failure, 0, -1):
+ logger.info(f"Shutting down in {remaining} seconds...")
+ time.sleep(1)
+ except KeyboardInterrupt:
+ logger.info("Shutdown interrupted by user (Ctrl+C)")
+ raise e
+
+ # Setup vector db for document data
+ vector_db = Milvus(
+ embedding_function=embeddings,
+ collection_name=default_collection_name_docs,
+ connection_args=milvus_connection_args,
+ index_params=[dense_index_params, sparse_index_params],
+ builtin_function=BM25BuiltInFunction(output_field_names="sparse"),
+ vector_field=["dense", "sparse"],
+ enable_dynamic_field=True, # allow for dynamic metadata fields
+ )
+
+ # Ensure the collection exists (required for upsert operations)
+ # The Milvus langchain wrapper only auto-creates collections on add_documents, not upsert
+ if not vector_db.client.has_collection(default_collection_name_docs):
+ logger.info(f"Collection {default_collection_name_docs} does not exist, creating it...")
+ # Add a dummy document to trigger collection creation with proper schema
+ dummy_doc = Document(page_content="__init__", metadata={"_init": True})
+ vector_db.add_documents(documents=[dummy_doc], ids=["__init_doc__"])
+ # Delete the dummy document
+ vector_db.delete(ids=["__init_doc__"])
+ logger.info(f"Collection {default_collection_name_docs} created successfully")
+ else:
+ logger.info(f"Collection {default_collection_name_docs} already exists")
+
+ vector_db_query_service = VectorDBQueryService(vector_db=vector_db)
+
+ if graph_rag_enabled:
+ # Setup graph dbs - both use the same Neo4j instance with different tenant labels
+ data_graph_db = Neo4jDB(tenant_label=DEFAULT_DATA_LABEL, uri=neo4j_addr)
+ await data_graph_db.setup()
+ ontology_graph_db = Neo4jDB(tenant_label=DEFAULT_SCHEMA_LABEL, uri=neo4j_addr)
+ await ontology_graph_db.setup()
+
+ # setup ingestor with graph db
+ ingestor = DocumentProcessor(vstore=vector_db, graph_rag_enabled=graph_rag_enabled, job_manager=jobmanager, data_graph_db=data_graph_db, batch_size=max_documents_per_ingest)
+ else:
+ # setup ingestor without graph db
+ ingestor = DocumentProcessor(vstore=vector_db, job_manager=jobmanager, graph_rag_enabled=graph_rag_enabled, batch_size=max_documents_per_ingest)
+
+ yield
+ # Shutdown
+ logging.info("Shutting down the app...")
- # Ensure the collection exists (required for upsert operations)
- # The Milvus langchain wrapper only auto-creates collections on add_documents, not upsert
- if not vector_db.client.has_collection(default_collection_name_docs):
- logger.info(f"Collection {default_collection_name_docs} does not exist, creating it...")
- # Add a dummy document to trigger collection creation with proper schema
- dummy_doc = Document(page_content="__init__", metadata={"_init": True})
- vector_db.add_documents(documents=[dummy_doc], ids=["__init_doc__"])
- # Delete the dummy document
- vector_db.delete(ids=["__init_doc__"])
- logger.info(f"Collection {default_collection_name_docs} created successfully")
- else:
- logger.info(f"Collection {default_collection_name_docs} already exists")
-
- vector_db_query_service = VectorDBQueryService(vector_db=vector_db)
-
- if graph_rag_enabled:
- # Setup graph dbs - both use the same Neo4j instance with different tenant labels
- data_graph_db = Neo4jDB(tenant_label=DEFAULT_DATA_LABEL, uri=neo4j_addr)
- await data_graph_db.setup()
- ontology_graph_db = Neo4jDB(tenant_label=DEFAULT_SCHEMA_LABEL, uri=neo4j_addr)
- await ontology_graph_db.setup()
-
- # setup ingestor with graph db
- ingestor = DocumentProcessor(
- vstore=vector_db,
- graph_rag_enabled=graph_rag_enabled,
- job_manager=jobmanager,
- data_graph_db=data_graph_db,
- batch_size=max_documents_per_ingest
- )
- else:
- # setup ingestor without graph db
- ingestor = DocumentProcessor(
- vstore=vector_db,
- job_manager=jobmanager,
- graph_rag_enabled=graph_rag_enabled,
- batch_size=max_documents_per_ingest
- )
-
- yield
- # Shutdown
- logging.info("Shutting down the app...")
if mcp_enabled:
- # Initialize MCP server
- mcp = FastMCP("RAG Tools")
- mcp_app = mcp.http_app(path='/mcp')
+ # Initialize MCP server
+ mcp = FastMCP("RAG Tools")
+ mcp_app = mcp.http_app(path="/mcp")
# Combine both lifespans - App and MCP (if enabled)
@asynccontextmanager
async def combined_lifespan(app: FastAPI):
- async with app_lifespan(app):
- if not mcp_enabled:
- yield # Skip MCP setup
- else:
- if not metadata_storage:
- raise HTTPException(status_code=500, detail="Cannot initialize MCP server - metadata storage not initialized")
- # Initialize MCP server tools
- agent_tools = AgentTools(
- vector_db_query_service=vector_db_query_service,
- redis_client=redis_client,
- metadata_storage=metadata_storage,
- data_graph_db=data_graph_db,
- ontology_graph_db=ontology_graph_db,
- )
-
- # Add all agent tools to the MCP app
- await agent_tools.register_tools(mcp, graph_rag_enabled=graph_rag_enabled)
-
- # Register MCP app lifespan
- async with mcp_app.lifespan(app):
- yield
+ async with app_lifespan(app):
+ if not mcp_enabled:
+ yield # Skip MCP setup
+ else:
+ if not metadata_storage:
+ raise HTTPException(status_code=500, detail="Cannot initialize MCP server - metadata storage not initialized")
+ # Initialize MCP server tools
+ agent_tools = AgentTools(
+ vector_db_query_service=vector_db_query_service,
+ redis_client=redis_client,
+ metadata_storage=metadata_storage,
+ data_graph_db=data_graph_db,
+ ontology_graph_db=ontology_graph_db,
+ )
+
+ # Add all agent tools to the MCP app
+ await agent_tools.register_tools(mcp, graph_rag_enabled=graph_rag_enabled)
+
+ # Register MCP app lifespan
+ async with mcp_app.lifespan(app):
+ yield
# Initialize FastAPI app
if mcp_enabled:
- app = FastAPI(
- title="CAIPE RAG API",
- description="API for indexing and querying knowledge base for CAIPE",
- version="2.0.0",
- lifespan=combined_lifespan,
- routes=[*mcp_app.routes] # Include MCP routes
- )
+ app = FastAPI(
+ title="CAIPE RAG API",
+ description="API for indexing and querying knowledge base for CAIPE",
+ version="2.0.0",
+ lifespan=combined_lifespan,
+ routes=[*mcp_app.routes], # Include MCP routes
+ )
else:
- app = FastAPI(
- title="CAIPE RAG API",
- description="API for indexing and querying knowledge base for CAIPE",
- version="2.0.0",
- lifespan=combined_lifespan,
- )
+ app = FastAPI(
+ title="CAIPE RAG API",
+ description="API for indexing and querying knowledge base for CAIPE",
+ version="2.0.0",
+ lifespan=combined_lifespan,
+ )
+
def generate_ingestor_id(ingestor_name: str, ingestor_type: str) -> str:
- """Generate a unique ingestor ID for webloader ingestor"""
- return f"{ingestor_type}:{ingestor_name}"
+ """Generate a unique ingestor ID for webloader ingestor"""
+ return f"{ingestor_type}:{ingestor_name}"
# ============================================================================
# User Info Endpoint
# ============================================================================
+
@app.get(
- "/v1/user/info",
- response_model=UserInfoResponse,
- tags=["Authentication"],
- summary="Get current user information",
- description="""
+ "/v1/user/info",
+ response_model=UserInfoResponse,
+ tags=["Authentication"],
+ summary="Get current user information",
+ description="""
Retrieve the current user's authentication status, role, and permissions.
This endpoint is used by the UI to:
@@ -291,1288 +261,1050 @@ def generate_ingestor_id(ingestor_name: str, ingestor_type: str) -> str:
- `ingest`: Can ingest new data and manage ingestion jobs (INGESTONLY, ADMIN)
- `delete`: Can delete resources and perform bulk operations (ADMIN only)
""",
- responses={
- 200: {
- "description": "Successfully retrieved user information",
- "content": {
- "application/json": {
- "examples": {
- "authenticated": {
- "summary": "Authenticated user",
- "value": {
- "email": "user@example.com",
- "role": "readonly",
- "is_authenticated": True,
- "groups": ["engineering", "platform-team"],
- "permissions": ["read"],
- "in_trusted_network": False
- }
- },
- "anonymous": {
- "summary": "Anonymous user",
- "value": {
- "email": "anonymous",
- "role": "anonymous",
- "is_authenticated": False,
- "groups": [],
- "permissions": [],
- "in_trusted_network": False
- }
- },
- "trusted_network": {
- "summary": "Trusted network user",
- "value": {
- "email": "trusted-network",
- "role": "admin",
- "is_authenticated": False,
- "groups": [],
- "permissions": ["read", "ingest", "delete"],
- "in_trusted_network": True
- }
- }
- }
- }
- }
+ responses={
+ 200: {
+ "description": "Successfully retrieved user information",
+ "content": {
+ "application/json": {
+ "examples": {
+ "authenticated": {"summary": "Authenticated user", "value": {"email": "user@example.com", "role": "readonly", "is_authenticated": True, "groups": ["engineering", "platform-team"], "permissions": ["read"], "in_trusted_network": False}},
+ "anonymous": {"summary": "Anonymous user", "value": {"email": "anonymous", "role": "anonymous", "is_authenticated": False, "groups": [], "permissions": [], "in_trusted_network": False}},
+ "trusted_network": {"summary": "Trusted network user", "value": {"email": "trusted-network", "role": "admin", "is_authenticated": False, "groups": [], "permissions": ["read", "ingest", "delete"], "in_trusted_network": True}},
+ }
}
+ },
}
+ },
)
-async def get_user_info(
- request: Request,
- user: UserContext = Depends(get_user_or_anonymous)
-):
- """Get current user's authentication and role information."""
- # Determine if request is from trusted network
- trusted = is_trusted_request(request)
-
- return UserInfoResponse(
- email=user.email,
- role=user.role,
- is_authenticated=user.is_authenticated,
- groups=user.groups,
- permissions=get_permissions(user.role),
- in_trusted_network=trusted
- )
+async def get_user_info(request: Request, user: UserContext = Depends(get_user_or_anonymous)):
+ """Get current user's authentication and role information."""
+ # Determine if request is from trusted network
+ trusted = is_trusted_request(request)
+
+ return UserInfoResponse(email=user.email, role=user.role, is_authenticated=user.is_authenticated, groups=user.groups, permissions=get_permissions(user.role), in_trusted_network=trusted)
+
# ============================================================================
# Ingestor Endpoints
# ============================================================================
+
@app.get("/v1/ingestors")
async def list_ingestors(user: UserContext = Depends(require_role(Role.READONLY))):
- """
- Lists all ingestors in the database
- """
- if not metadata_storage:
- raise HTTPException(status_code=500, detail="Server not initialized")
- logger.debug("Listing ingestors")
- ingestors = await metadata_storage.fetch_all_ingestor_info()
- return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(ingestors))
+ """
+ Lists all ingestors in the database
+ """
+ if not metadata_storage:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ logger.debug("Listing ingestors")
+ ingestors = await metadata_storage.fetch_all_ingestor_info()
+ return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(ingestors))
+
@app.post("/v1/ingestor/heartbeat", response_model=IngestorPingResponse, status_code=status.HTTP_200_OK)
-async def ping_ingestor(
- ingestor_ping: IngestorPingRequest,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """
- Registers a heartbeat from a ingestor, creating or updating its entry
- """
- if not metadata_storage:
- raise HTTPException(status_code=500, detail="Server not initialized")
- logger.info(f"Received heartbeat from ingestor: name={ingestor_ping.ingestor_name} type={ingestor_ping.ingestor_type} (by {user.email})")
- ingestor_id = generate_ingestor_id(ingestor_ping.ingestor_name, ingestor_ping.ingestor_type)
- ingestor_info = IngestorInfo(
- ingestor_id=ingestor_id,
- ingestor_type=ingestor_ping.ingestor_type,
- ingestor_name=ingestor_ping.ingestor_name,
- description=ingestor_ping.description,
- metadata=ingestor_ping.metadata,
- last_seen=int(time.time())
- )
- await metadata_storage.store_ingestor_info(ingestor_info=ingestor_info)
- return IngestorPingResponse(
- ingestor_id=ingestor_id,
- message="Ingestor heartbeat registered",
- max_documents_per_ingest=max_documents_per_ingest
- )
+async def ping_ingestor(ingestor_ping: IngestorPingRequest, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """
+    Registers a heartbeat from an ingestor, creating or updating its entry
+ """
+ if not metadata_storage:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ logger.info(f"Received heartbeat from ingestor: name={ingestor_ping.ingestor_name} type={ingestor_ping.ingestor_type} (by {user.email})")
+ ingestor_id = generate_ingestor_id(ingestor_ping.ingestor_name, ingestor_ping.ingestor_type)
+ ingestor_info = IngestorInfo(ingestor_id=ingestor_id, ingestor_type=ingestor_ping.ingestor_type, ingestor_name=ingestor_ping.ingestor_name, description=ingestor_ping.description, metadata=ingestor_ping.metadata, last_seen=int(time.time()))
+ await metadata_storage.store_ingestor_info(ingestor_info=ingestor_info)
+ return IngestorPingResponse(ingestor_id=ingestor_id, message="Ingestor heartbeat registered", max_documents_per_ingest=max_documents_per_ingest)
+
@app.delete("/v1/ingestor/delete")
-async def delete_ingestor(
- ingestor_id: str,
- user: UserContext = Depends(require_role(Role.ADMIN))
-):
- """
- Deletes an ingestor from metadata storage, does not delete any associated datasources or data
- """
- if not vector_db or not metadata_storage:
- raise HTTPException(status_code=500, detail="Server not initialized")
- if graph_rag_enabled and not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized")
+async def delete_ingestor(ingestor_id: str, user: UserContext = Depends(require_role(Role.ADMIN))):
+ """
+ Deletes an ingestor from metadata storage, does not delete any associated datasources or data
+ """
+ if not vector_db or not metadata_storage:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ if graph_rag_enabled and not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized")
- # Fetch ingestor info - check if it exists
- ingestor_info = await metadata_storage.get_ingestor_info(ingestor_id)
+ # Fetch ingestor info - check if it exists
+ ingestor_info = await metadata_storage.get_ingestor_info(ingestor_id)
- if not ingestor_info:
- raise HTTPException(status_code=404, detail="Ingestor not found")
+ if not ingestor_info:
+ raise HTTPException(status_code=404, detail="Ingestor not found")
+
+ logger.warning(f"Deleting ingestor: {ingestor_id} (by {user.email})")
+ await metadata_storage.delete_ingestor_info(ingestor_id) # remove metadata
- logger.warning(f"Deleting ingestor: {ingestor_id} (by {user.email})")
- await metadata_storage.delete_ingestor_info(ingestor_id) # remove metadata
# ============================================================================
# Datasources Endpoints
# ============================================================================
+
@app.post("/v1/datasource", status_code=status.HTTP_202_ACCEPTED)
-async def upsert_datasource(
- datasource_info: DataSourceInfo,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Create or update datasource metadata entry."""
- if not metadata_storage:
- raise HTTPException(status_code=500, detail="Server not initialized")
+async def upsert_datasource(datasource_info: DataSourceInfo, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Create or update datasource metadata entry."""
+ if not metadata_storage:
+ raise HTTPException(status_code=500, detail="Server not initialized")
- await metadata_storage.store_datasource_info(datasource_info)
+ await metadata_storage.store_datasource_info(datasource_info)
+
+ return status.HTTP_202_ACCEPTED
- return status.HTTP_202_ACCEPTED
@app.delete("/v1/datasource", status_code=status.HTTP_200_OK)
-async def delete_datasource(
- datasource_id: str,
- user: UserContext = Depends(require_role(Role.ADMIN))
-):
- """Delete datasource from vector storage and metadata."""
+async def delete_datasource(datasource_id: str, user: UserContext = Depends(require_role(Role.ADMIN))):
+ """Delete datasource from vector storage and metadata."""
- # Check initialization
- if not vector_db or not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
- if graph_rag_enabled and not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Fetch datasource info
- datasource_info = await metadata_storage.get_datasource_info(datasource_id)
- if not datasource_info:
- raise HTTPException(status_code=404, detail="Datasource not found")
-
- # Check if any jobs are running for this datasource
- jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
- if jobs and any(job.status == JobStatus.IN_PROGRESS for job in jobs):
- raise HTTPException(
- status_code=400,
- detail="Cannot delete datasource while ingestion job is in progress."
- )
-
- # remove all jobs for this datasource
- jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
- if jobs:
- for job in jobs:
- await jobmanager.delete_job(job.job_id)
+ # Check initialization
+ if not vector_db or not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ if graph_rag_enabled and not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized")
- await vector_db.adelete(expr=f"datasource_id == '{datasource_id}'")
- await metadata_storage.delete_datasource_info(datasource_id) # remove metadata
+ # Fetch datasource info
+ datasource_info = await metadata_storage.get_datasource_info(datasource_id)
+ if not datasource_info:
+ raise HTTPException(status_code=404, detail="Datasource not found")
- if graph_rag_enabled and data_graph_db:
- await data_graph_db.remove_entity(None, {DATASOURCE_ID_KEY: datasource_id}) # remove from graph db
+ # Check if any jobs are running for this datasource
+ jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
+ if jobs and any(job.status == JobStatus.IN_PROGRESS for job in jobs):
+ raise HTTPException(status_code=400, detail="Cannot delete datasource while ingestion job is in progress.")
+ # remove all jobs for this datasource
+ jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
+ if jobs:
+ for job in jobs:
+ await jobmanager.delete_job(job.job_id)
+
+ await vector_db.adelete(expr=f"datasource_id == '{datasource_id}'")
+ await metadata_storage.delete_datasource_info(datasource_id) # remove metadata
+
+ if graph_rag_enabled and data_graph_db:
+ await data_graph_db.remove_entity(None, {DATASOURCE_ID_KEY: datasource_id}) # remove from graph db
+
+ return status.HTTP_200_OK
- return status.HTTP_200_OK
@app.get("/v1/datasources")
-async def list_datasources(
- ingestor_id: Optional[str] = None,
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """List all stored datasources"""
- if not metadata_storage:
- raise HTTPException(status_code=500, detail="Server not initialized")
- try:
- datasources = await metadata_storage.fetch_all_datasource_info()
- if ingestor_id:
- datasources = [ds for ds in datasources if ds.ingestor_id == ingestor_id]
- return {
- "success": True,
- "datasources": datasources,
- "count": len(datasources)
- }
- except Exception as e:
- logger.error(f"Failed to list datasources: {e}")
- raise HTTPException(status_code=500, detail=str(e))
+async def list_datasources(ingestor_id: Optional[str] = None, user: UserContext = Depends(require_role(Role.READONLY))):
+ """List all stored datasources"""
+ if not metadata_storage:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ try:
+ datasources = await metadata_storage.fetch_all_datasource_info()
+ if ingestor_id:
+ datasources = [ds for ds in datasources if ds.ingestor_id == ingestor_id]
+ return {"success": True, "datasources": datasources, "count": len(datasources)}
+ except Exception as e:
+ logger.error(f"Failed to list datasources: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
# ============================================================================
# Job Endpoints
# ============================================================================
@app.get("/v1/job/{job_id}")
-async def get_job(
- job_id: str,
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """Get the status of an ingestion job."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
- job_info = await jobmanager.get_job(job_id)
- if not job_info:
- raise HTTPException(status_code=404, detail="Job not found")
+async def get_job(job_id: str, user: UserContext = Depends(require_role(Role.READONLY))):
+ """Get the status of an ingestion job."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ job_info = await jobmanager.get_job(job_id)
+ if not job_info:
+ raise HTTPException(status_code=404, detail="Job not found")
+
+ logger.info(f"Returning job {job_info}")
+ return job_info
- logger.info(f"Returning job {job_info}")
- return job_info
@app.get("/v1/jobs/datasource/{datasource_id}")
-async def get_jobs_by_datasource(
- datasource_id: str,
- status_filter: Optional[JobStatus] = None,
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """Get all jobs for a specific datasource, optionally filtered by status."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
- jobs = await jobmanager.get_jobs_by_datasource(datasource_id, status_filter=status_filter)
- if jobs is None:
- raise HTTPException(status_code=404, detail="No jobs found for the specified datasource")
+async def get_jobs_by_datasource(datasource_id: str, status_filter: Optional[JobStatus] = None, user: UserContext = Depends(require_role(Role.READONLY))):
+ """Get all jobs for a specific datasource, optionally filtered by status."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ jobs = await jobmanager.get_jobs_by_datasource(datasource_id, status_filter=status_filter)
+ if jobs is None:
+ raise HTTPException(status_code=404, detail="No jobs found for the specified datasource")
+
+ logger.info(f"Returning {len(jobs)} jobs for datasource {datasource_id}")
+ return jobs
- logger.info(f"Returning {len(jobs)} jobs for datasource {datasource_id}")
- return jobs
@app.post("/v1/job", status_code=status.HTTP_201_CREATED)
-async def create_job(
- datasource_id: str,
- job_status: Optional[JobStatus] = None,
- message: Optional[str] = None,
- total: Optional[int] = None,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Create a new job for a datasource."""
- if not jobmanager or not metadata_storage:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Check if datasource exists
- datasource_info = await metadata_storage.get_datasource_info(datasource_id)
- if not datasource_info:
- raise HTTPException(status_code=404, detail="Datasource not found")
-
- # Generate new job ID
- job_id = str(uuid.uuid4())
-
- # Create job with datasource_id
- success = await jobmanager.upsert_job(
- job_id,
- status=job_status or JobStatus.PENDING,
- message=message or "Job created",
- total=total,
- datasource_id=datasource_id
- )
-
- if not success:
- raise HTTPException(status_code=400, detail="Failed to create job")
-
- logger.info(f"Created job {job_id} for datasource {datasource_id}")
- return {"job_id": job_id, "datasource_id": datasource_id}
+async def create_job(datasource_id: str, job_status: Optional[JobStatus] = None, message: Optional[str] = None, total: Optional[int] = None, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Create a new job for a datasource."""
+ if not jobmanager or not metadata_storage:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ # Check if datasource exists
+ datasource_info = await metadata_storage.get_datasource_info(datasource_id)
+ if not datasource_info:
+ raise HTTPException(status_code=404, detail="Datasource not found")
+
+ # Generate new job ID
+ job_id = str(uuid.uuid4())
+
+ # Create job with datasource_id
+ success = await jobmanager.upsert_job(job_id, status=job_status or JobStatus.PENDING, message=message or "Job created", total=total, datasource_id=datasource_id)
+
+ if not success:
+ raise HTTPException(status_code=400, detail="Failed to create job")
+
+ logger.info(f"Created job {job_id} for datasource {datasource_id}")
+ return {"job_id": job_id, "datasource_id": datasource_id}
+
@app.patch("/v1/job/{job_id}", status_code=status.HTTP_200_OK)
-async def update_job(
- job_id: str,
- job_status: Optional[JobStatus] = None,
- message: Optional[str] = None,
- total: Optional[int] = None,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Update an existing job."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Check if job exists
- existing_job = await jobmanager.get_job(job_id)
- if not existing_job:
- raise HTTPException(status_code=404, detail="Job not found")
-
- # Update job
- success = await jobmanager.upsert_job(
- job_id,
- status=job_status,
- message=message,
- total=total,
- datasource_id=existing_job.datasource_id
- )
-
- if not success:
- raise HTTPException(status_code=400, detail="Failed to update job (job may be terminated)")
-
- logger.info(f"Updated job {job_id}")
- return {"job_id": job_id, "datasource_id": existing_job.datasource_id}
+async def update_job(job_id: str, job_status: Optional[JobStatus] = None, message: Optional[str] = None, total: Optional[int] = None, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Update an existing job."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ # Check if job exists
+ existing_job = await jobmanager.get_job(job_id)
+ if not existing_job:
+ raise HTTPException(status_code=404, detail="Job not found")
+
+ # Update job
+ success = await jobmanager.upsert_job(job_id, status=job_status, message=message, total=total, datasource_id=existing_job.datasource_id)
+
+ if not success:
+ raise HTTPException(status_code=400, detail="Failed to update job (job may be terminated)")
+
+ logger.info(f"Updated job {job_id}")
+ return {"job_id": job_id, "datasource_id": existing_job.datasource_id}
+
@app.post("/v1/job/{job_id}/terminate", status_code=status.HTTP_200_OK)
-async def terminate_job_endpoint(
- job_id: str,
- user: UserContext = Depends(require_role(Role.ADMIN))
-):
- """Terminate an ingestion job."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- job_info = await jobmanager.get_job(job_id)
- if not job_info:
- raise HTTPException(status_code=404, detail="Job not found")
+async def terminate_job_endpoint(job_id: str, user: UserContext = Depends(require_role(Role.ADMIN))):
+ """Terminate an ingestion job."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ job_info = await jobmanager.get_job(job_id)
+ if not job_info:
+ raise HTTPException(status_code=404, detail="Job not found")
+
+ success = await jobmanager.terminate_job(job_id)
+ if not success:
+ raise HTTPException(status_code=500, detail="Failed to terminate job")
+
+ logger.info(f"Job {job_id} has been terminated.")
+ return {"message": f"Job {job_id} has been terminated."}
- success = await jobmanager.terminate_job(job_id)
- if not success:
- raise HTTPException(status_code=500, detail="Failed to terminate job")
-
- logger.info(f"Job {job_id} has been terminated.")
- return {"message": f"Job {job_id} has been terminated."}
@app.post("/v1/job/{job_id}/increment-progress")
-async def increment_job_progress(
- job_id: str,
- increment: int = 1,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Increment the progress counter for a job."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- new_value = await jobmanager.increment_progress(job_id, increment)
- if new_value == -1:
- raise HTTPException(status_code=400, detail="Cannot increment progress - job is terminated")
-
- logger.debug(f"Incremented progress for job {job_id} by {increment}, new value: {new_value}")
- return {"job_id": job_id, "progress_counter": new_value}
+async def increment_job_progress(job_id: str, increment: int = 1, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Increment the progress counter for a job."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ new_value = await jobmanager.increment_progress(job_id, increment)
+ if new_value == -1:
+ raise HTTPException(status_code=400, detail="Cannot increment progress - job is terminated")
+
+ logger.debug(f"Incremented progress for job {job_id} by {increment}, new value: {new_value}")
+ return {"job_id": job_id, "progress_counter": new_value}
+
@app.post("/v1/job/{job_id}/increment-failure")
-async def increment_job_failure(
- job_id: str,
- increment: int = 1,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Increment the failure counter for a job."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- new_value = await jobmanager.increment_failure(job_id, increment)
- if new_value == -1:
- raise HTTPException(status_code=400, detail="Cannot increment failure - job is terminated")
-
- logger.debug(f"Incremented failure for job {job_id} by {increment}, new value: {new_value}")
- return {"job_id": job_id, "failed_counter": new_value}
+async def increment_job_failure(job_id: str, increment: int = 1, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Increment the failure counter for a job."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ new_value = await jobmanager.increment_failure(job_id, increment)
+ if new_value == -1:
+ raise HTTPException(status_code=400, detail="Cannot increment failure - job is terminated")
+
+ logger.debug(f"Incremented failure for job {job_id} by {increment}, new value: {new_value}")
+ return {"job_id": job_id, "failed_counter": new_value}
+
@app.post("/v1/job/{job_id}/add-errors")
-async def add_job_errors(
- job_id: str,
- error_messages: List[str],
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Add error messages to a job."""
- if not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- if not error_messages:
- raise HTTPException(status_code=400, detail="Error messages list cannot be empty")
-
- results = []
- for error_msg in error_messages:
- new_length = await jobmanager.add_error_msg(job_id, error_msg)
- if new_length == -1:
- raise HTTPException(status_code=400, detail="Cannot add error messages - job is terminated")
- results.append(new_length)
-
- final_length = results[-1] if results else 0
- logger.debug(f"Added {len(error_messages)} error messages to job {job_id}, total errors: {final_length}")
- return {"job_id": job_id, "errors_added": len(error_messages), "total_errors": final_length}
+async def add_job_errors(job_id: str, error_messages: List[str], user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Add error messages to a job."""
+ if not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ if not error_messages:
+ raise HTTPException(status_code=400, detail="Error messages list cannot be empty")
+
+ results = []
+ for error_msg in error_messages:
+ new_length = await jobmanager.add_error_msg(job_id, error_msg)
+ if new_length == -1:
+ raise HTTPException(status_code=400, detail="Cannot add error messages - job is terminated")
+ results.append(new_length)
+
+ final_length = results[-1] if results else 0
+ logger.debug(f"Added {len(error_messages)} error messages to job {job_id}, total errors: {final_length}")
+ return {"job_id": job_id, "errors_added": len(error_messages), "total_errors": final_length}
+
# ============================================================================
# Query Endpoint
# ============================================================================
+
@app.post("/v1/query", response_model=List[QueryResult])
-async def query_documents(
- query_request: QueryRequest,
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """Query for relevant documents using semantic search in the unified collection."""
+async def query_documents(query_request: QueryRequest, user: UserContext = Depends(require_role(Role.READONLY))):
+ """Query for relevant documents using semantic search in the unified collection."""
- # Enforce max results limit
- if query_request.limit > max_results_per_query:
- raise HTTPException(status_code=400, detail=f"Query limit exceeds maximum allowed of {max_results_per_query} results.")
+ # Enforce max results limit
+ if query_request.limit > max_results_per_query:
+ raise HTTPException(status_code=400, detail=f"Query limit exceeds maximum allowed of {max_results_per_query} results.")
- # If weighted ranker specified but no weights then use default weights
- if query_request.ranker_type == "weighted":
- if query_request.ranker_params is None:
- query_request.ranker_params = {"weights": [0.7, 0.3]} # More weight to dense (semantic) score
+ # If weighted ranker specified but no weights then use default weights
+ if query_request.ranker_type == "weighted":
+ if query_request.ranker_params is None:
+ query_request.ranker_params = {"weights": [0.7, 0.3]} # More weight to dense (semantic) score
+ # If no ranker specified then set ranker params to None
+ if not query_request.ranker_type or query_request.ranker_type == "":
+ query_request.ranker_params = None
- # If no ranker specified then set ranker params to None
- if not query_request.ranker_type or query_request.ranker_type == "":
- query_request.ranker_params = None
+ results = await vector_db_query_service.query(
+ query=query_request.query,
+ filters=query_request.filters,
+ limit=query_request.limit,
+ ranker=query_request.ranker_type,
+ ranker_params=query_request.ranker_params,
+ )
+ return results
- results = await vector_db_query_service.query(
- query=query_request.query,
- filters=query_request.filters,
- limit=query_request.limit,
- ranker=query_request.ranker_type,
- ranker_params=query_request.ranker_params,
- )
- return results
# ============================================================================
# Ingestion Endpoints
# ============================================================================
-@app.post("/v1/ingest/webloader/url", status_code=status.HTTP_202_ACCEPTED)
-async def ingest_url(
- url_request: UrlIngestRequest,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Queue a URL for ingestion by the webloader ingestor."""
- if not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- logger.info(f"Received URL ingestion request: {url_request.url}")
-
- # Sanitize URL
- sanitized_url = sanitize_url(url_request.url)
- url_request.url = sanitized_url
-
- # Generate datasource ID and create datasource
- datasource_id = utils.generate_datasource_id_from_url(url_request.url)
-
- # Check if datasource already exists (for web, each URL is unique)
- existing_datasource = await metadata_storage.get_datasource_info(datasource_id)
- if existing_datasource:
- logger.info(f"Datasource already exists for URL {url_request.url}, datasource ID: {datasource_id}")
- raise HTTPException(status_code=400, detail="URL already ingested, please delete existing datasource before re-ingesting")
-
- # Check if there is already a job for this datasource in progress or pending
- existing_jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
- if existing_jobs:
- existing_pending_jobs = [job for job in existing_jobs if job.status in (JobStatus.IN_PROGRESS, JobStatus.PENDING)]
- if existing_pending_jobs:
- logger.info(f"An ingestion job is already in progress or pending for datasource {datasource_id}, job ID: {existing_pending_jobs[0].job_id}")
- raise HTTPException(status_code=400, detail=f"An ingestion job is already in progress or pending for this URL (job ID: {existing_pending_jobs[0].job_id})")
-
- # Create job with PENDING status first
- job_id = str(uuid.uuid4())
- success = await jobmanager.upsert_job(
- job_id,
- status=JobStatus.PENDING,
- message="Waiting for ingestor to process...",
- total=0, # Unknown until sitemap is checked
- datasource_id=datasource_id
- )
- if not success:
- raise HTTPException(status_code=500, detail="Failed to create job")
+@app.post("/v1/ingest/webloader/url", status_code=status.HTTP_202_ACCEPTED)
+async def ingest_url(url_request: UrlIngestRequest, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Queue a URL for ingestion by the webloader ingestor."""
+ if not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ logger.info(f"Received URL ingestion request: {url_request.url}")
+
+ # Sanitize URL
+ sanitized_url = sanitize_url(url_request.url)
+ url_request.url = sanitized_url
+
+ # Generate datasource ID and create datasource
+ datasource_id = utils.generate_datasource_id_from_url(url_request.url)
+
+ # Check if datasource already exists (for web, each URL is unique)
+ existing_datasource = await metadata_storage.get_datasource_info(datasource_id)
+ if existing_datasource:
+ logger.info(f"Datasource already exists for URL {url_request.url}, datasource ID: {datasource_id}")
+ raise HTTPException(status_code=400, detail="URL already ingested, please delete existing datasource before re-ingesting")
+
+ # Check if there is already a job for this datasource in progress or pending
+ existing_jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
+ if existing_jobs:
+ existing_pending_jobs = [job for job in existing_jobs if job.status in (JobStatus.IN_PROGRESS, JobStatus.PENDING)]
+ if existing_pending_jobs:
+ logger.info(f"An ingestion job is already in progress or pending for datasource {datasource_id}, job ID: {existing_pending_jobs[0].job_id}")
+ raise HTTPException(status_code=400, detail=f"An ingestion job is already in progress or pending for this URL (job ID: {existing_pending_jobs[0].job_id})")
+
+ # Create job with PENDING status first
+ job_id = str(uuid.uuid4())
+ success = await jobmanager.upsert_job(
+ job_id,
+ status=JobStatus.PENDING,
+ message="Waiting for ingestor to process...",
+ total=0, # Unknown until sitemap is checked
+ datasource_id=datasource_id,
+ )
+
+ if not success:
+ raise HTTPException(status_code=500, detail="Failed to create job")
+
+ logger.info(f"Created job {job_id} for datasource {datasource_id}")
+
+ if not url_request.description:
+ url_request.description = f"Web content from {url_request.url}"
+
+ # Create datasource
+ # Metadata schema for source_type="web": {"url_ingest_request": UrlIngestRequest}
+ datasource_info = DataSourceInfo(
+ datasource_id=datasource_id,
+ ingestor_id=generate_ingestor_id(WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE),
+ description=url_request.description,
+ source_type="web",
+ last_updated=int(time.time()),
+ default_chunk_size=url_request.settings.chunk_size,
+ default_chunk_overlap=url_request.settings.chunk_overlap,
+ metadata={"url_ingest_request": url_request.model_dump()},
+ )
+
+ await metadata_storage.store_datasource_info(datasource_info)
+ logger.info(f"Created datasource: {datasource_id}")
+
+ # Queue the request for the ingestor
+ ingestor_request = IngestorRequest(ingestor_id=generate_ingestor_id(WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE), command=WebIngestorCommand.INGEST_URL, payload=url_request.model_dump())
+
+ # Push to Redis queue
+ await redis_client.rpush(WEBLOADER_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
+ logger.info(f"Queued URL ingestion request for {url_request.url} to {WEBLOADER_INGESTOR_REDIS_QUEUE}")
+
+ return {"datasource_id": datasource_id, "job_id": job_id, "message": "URL ingestion request queued"}
- logger.info(f"Created job {job_id} for datasource {datasource_id}")
- if not url_request.description:
- url_request.description = f"Web content from {url_request.url}"
+@app.post("/v1/ingest/webloader/reload", status_code=status.HTTP_202_ACCEPTED)
+async def reload_url(reload_request: UrlReloadRequest, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Reloads a previously ingested URL by re-queuing it for ingestion."""
+ if not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
- # Create datasource
- # Metadata schema for source_type="web": {"url_ingest_request": UrlIngestRequest}
- datasource_info = DataSourceInfo(
- datasource_id=datasource_id,
- ingestor_id=generate_ingestor_id(WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE),
- description=url_request.description,
- source_type="web",
- last_updated=int(time.time()),
- default_chunk_size=1000,
- default_chunk_overlap=200,
- metadata={"url_ingest_request": url_request.model_dump()}
- )
+ # Fetch existing datasource
+ datasource_info = await metadata_storage.get_datasource_info(reload_request.datasource_id)
+ if not datasource_info:
+ raise HTTPException(status_code=404, detail="Datasource not found")
- await metadata_storage.store_datasource_info(datasource_info)
- logger.info(f"Created datasource: {datasource_id}")
+ # Queue the request for the ingestor
+ ingestor_request = IngestorRequest(ingestor_id=datasource_info.ingestor_id, command=WebIngestorCommand.RELOAD_DATASOURCE, payload=reload_request.model_dump())
- # Queue the request for the ingestor
- ingestor_request = IngestorRequest(
- ingestor_id=generate_ingestor_id(WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE),
- command=WebIngestorCommand.INGEST_URL,
- payload=url_request.model_dump()
- )
+ # Push to Redis queue
+ await redis_client.rpush(WEBLOADER_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
+ logger.info(f"Re-queued URL ingestion request for {reload_request.datasource_id}")
+ return {"datasource_id": reload_request.datasource_id, "message": "URL reload ingestion request queued"}
- # Push to Redis queue
- await redis_client.rpush(WEBLOADER_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
- logger.info(f"Queued URL ingestion request for {url_request.url} to {WEBLOADER_INGESTOR_REDIS_QUEUE}")
- return {
- "datasource_id": datasource_id,
- "job_id": job_id,
- "message": "URL ingestion request queued"
- }
+@app.post("/v1/ingest/webloader/reload-all", status_code=status.HTTP_202_ACCEPTED)
+async def reload_all_urls(user: UserContext = Depends(require_role(Role.ADMIN))):
+ """Reloads all previously ingested URLs by re-queuing them for ingestion."""
+ if not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ # Queue the request for the ingestor
+ ingestor_request = IngestorRequest(ingestor_id=generate_ingestor_id(WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE), command=WebIngestorCommand.RELOAD_ALL, payload={})
-@app.post("/v1/ingest/webloader/reload", status_code=status.HTTP_202_ACCEPTED)
-async def reload_url(
- reload_request: UrlReloadRequest,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Reloads a previously ingested URL by re-queuing it for ingestion."""
- if not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Fetch existing datasource
- datasource_info = await metadata_storage.get_datasource_info(reload_request.datasource_id)
- if not datasource_info:
- raise HTTPException(status_code=404, detail="Datasource not found")
-
- # Queue the request for the ingestor
- ingestor_request = IngestorRequest(
- ingestor_id=datasource_info.ingestor_id,
- command=WebIngestorCommand.RELOAD_DATASOURCE,
- payload=reload_request.model_dump()
- )
-
- # Push to Redis queue
- await redis_client.rpush(WEBLOADER_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
- logger.info(f"Re-queued URL ingestion request for {reload_request.datasource_id}")
- return {"datasource_id": reload_request.datasource_id, "message": "URL reload ingestion request queued"}
+ # Push to Redis queue
+ await redis_client.rpush(WEBLOADER_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
+ logger.info("Re-queued URL ingestion request for all datasources")
-@app.post("/v1/ingest/webloader/reload-all", status_code=status.HTTP_202_ACCEPTED)
-async def reload_all_urls(user: UserContext = Depends(require_role(Role.ADMIN))):
- """Reloads all previously ingested URLs by re-queuing them for ingestion."""
- if not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Queue the request for the ingestor
- ingestor_request = IngestorRequest(
- ingestor_id=generate_ingestor_id(WEBLOADER_INGESTOR_NAME, WEBLOADER_INGESTOR_TYPE),
- command=WebIngestorCommand.RELOAD_ALL,
- payload={}
- )
+ return {"message": "Reload all URLs request queued"}
- # Push to Redis queue
- await redis_client.rpush(WEBLOADER_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
- logger.info("Re-queued URL ingestion request for all datasources")
-
- return {"message": "Reload all URLs request queued"}
@app.post("/v1/ingest/confluence/page", status_code=status.HTTP_202_ACCEPTED)
-async def ingest_confluence_page(
- confluence_request: ConfluenceIngestRequest,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Queue a Confluence page for ingestion by the confluence ingestor."""
- if not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- logger.info(f"Received Confluence page ingestion request: {confluence_request.url}")
- logger.info(f" get_child_pages: {confluence_request.get_child_pages}")
-
- # Parse Confluence URL to extract space_key and page_id
- confluence_match = re.search(r'/spaces/([^/]+)/pages/(\d+)', confluence_request.url)
- if not confluence_match:
- raise HTTPException(
- status_code=400,
- detail="Invalid Confluence URL format. Expected: https://domain.atlassian.net/wiki/spaces/SPACE/pages/PAGE_ID/Title"
- )
-
- space_key = confluence_match.group(1)
- page_id = confluence_match.group(2)
-
- # Validate that submitted URL matches configured Confluence instance
- if confluence_url:
- submitted_parsed = urlparse(confluence_request.url)
- configured_parsed = urlparse(confluence_url)
-
- # Compare scheme and netloc (domain)
- if submitted_parsed.scheme != configured_parsed.scheme or submitted_parsed.netloc != configured_parsed.netloc:
- raise HTTPException(
- status_code=400,
- detail=f"URL must be from configured Confluence instance: {configured_parsed.scheme}://{configured_parsed.netloc}"
- )
-
- # Generate space-level datasource ID
- domain = urlparse(confluence_request.url).netloc.replace(".", "_").replace("-", "_")
- datasource_id = f"src_confluence___{domain}__{space_key}"
-
- # Build page config for this ingestion
- page_config = {
- "page_id": page_id,
- "source": confluence_request.url,
- "get_child_pages": confluence_request.get_child_pages
- }
-
- # Check if datasource already exists
- existing_datasource = await metadata_storage.get_datasource_info(datasource_id)
- if existing_datasource:
- if not existing_datasource.metadata:
- existing_datasource.metadata = {}
- page_configs = existing_datasource.metadata.get("page_configs", [])
-
- # Check if page already exists in configs
- existing_page_config = next(
- (c for c in page_configs if c.get("page_id") == page_id),
- None
- )
-
- if existing_page_config:
- # Update the get_child_pages flag
- existing_page_config["get_child_pages"] = confluence_request.get_child_pages
- existing_page_config["source"] = confluence_request.url
- logger.info(f"Updated page {page_id} config in {datasource_id}")
- else:
- # Add new page config
- page_configs.append(page_config)
- logger.info(f"Added page {page_id} to {datasource_id}")
-
- existing_datasource.metadata["page_configs"] = page_configs
- await metadata_storage.store_datasource_info(existing_datasource)
+async def ingest_confluence_page(confluence_request: ConfluenceIngestRequest, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Queue a Confluence page for ingestion by the confluence ingestor."""
+ if not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ logger.info(f"Received Confluence page ingestion request: {confluence_request.url}")
+ logger.info(f" get_child_pages: {confluence_request.get_child_pages}")
+
+ # Parse Confluence URL to extract space_key and page_id
+ confluence_match = re.search(r"/spaces/([^/]+)/pages/(\d+)", confluence_request.url)
+ if not confluence_match:
+ raise HTTPException(status_code=400, detail="Invalid Confluence URL format. Expected: https://domain.atlassian.net/wiki/spaces/SPACE/pages/PAGE_ID/Title")
+
+ space_key = confluence_match.group(1)
+ page_id = confluence_match.group(2)
+
+ # Validate that submitted URL matches configured Confluence instance
+ if confluence_url:
+ submitted_parsed = urlparse(confluence_request.url)
+ configured_parsed = urlparse(confluence_url)
+
+ # Compare scheme and netloc (domain)
+ if submitted_parsed.scheme != configured_parsed.scheme or submitted_parsed.netloc != configured_parsed.netloc:
+ raise HTTPException(status_code=400, detail=f"URL must be from configured Confluence instance: {configured_parsed.scheme}://{configured_parsed.netloc}")
+
+ # Generate space-level datasource ID
+ domain = urlparse(confluence_request.url).netloc.replace(".", "_").replace("-", "_")
+ datasource_id = f"src_confluence___{domain}__{space_key}"
+
+ # Build page config for this ingestion
+ page_config = {"page_id": page_id, "source": confluence_request.url, "get_child_pages": confluence_request.get_child_pages}
+
+ # Check if datasource already exists
+ existing_datasource = await metadata_storage.get_datasource_info(datasource_id)
+ if existing_datasource:
+ if not existing_datasource.metadata:
+ existing_datasource.metadata = {}
+ page_configs = existing_datasource.metadata.get("page_configs", [])
+
+ # Check if page already exists in configs
+ existing_page_config = next((c for c in page_configs if c.get("page_id") == page_id), None)
+
+ if existing_page_config:
+ # Update the get_child_pages flag
+ existing_page_config["get_child_pages"] = confluence_request.get_child_pages
+ existing_page_config["source"] = confluence_request.url
+ logger.info(f"Updated page {page_id} config in {datasource_id}")
else:
- # Create new datasource
- if not confluence_request.description:
- confluence_request.description = f"Confluence space {space_key}"
-
- confluence_url_base = confluence_request.url.split('/wiki/')[0] + '/wiki' if '/wiki/' in confluence_request.url else confluence_request.url
-
- datasource_info = DataSourceInfo(
- datasource_id=datasource_id,
- ingestor_id=generate_ingestor_id(CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE),
- description=confluence_request.description,
- source_type="confluence",
- last_updated=int(time.time()),
- default_chunk_size=1000,
- default_chunk_overlap=200,
- metadata={
- "confluence_ingest_request": confluence_request.model_dump(),
- "space_key": space_key,
- "page_configs": [page_config],
- "confluence_url": confluence_url_base,
- }
- )
-
- await metadata_storage.store_datasource_info(datasource_info)
- logger.info(f"Created datasource: {datasource_id}")
-
- # Check if there is already a job for this datasource in progress or pending
- existing_jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
- if existing_jobs:
- existing_pending_jobs = [job for job in existing_jobs if job.status in (JobStatus.IN_PROGRESS, JobStatus.PENDING)]
- if existing_pending_jobs:
- logger.info(f"An ingestion job is already in progress or pending for datasource {datasource_id}, job ID: {existing_pending_jobs[0].job_id}")
- raise HTTPException(
- status_code=400,
- detail=f"An ingestion job is already in progress or pending for this Confluence space (job ID: {existing_pending_jobs[0].job_id})"
- )
-
- # Create job with PENDING status
- job_id = str(uuid.uuid4())
- success = await jobmanager.upsert_job(
- job_id,
- status=JobStatus.PENDING,
- message="Waiting for ingestor to process...",
- total=1, # Single page ingestion
- datasource_id=datasource_id
- )
+ # Add new page config
+ page_configs.append(page_config)
+ logger.info(f"Added page {page_id} to {datasource_id}")
- if not success:
- raise HTTPException(status_code=500, detail="Failed to create job")
+ existing_datasource.metadata["page_configs"] = page_configs
+ await metadata_storage.store_datasource_info(existing_datasource)
+ else:
+ # Create new datasource
+ if not confluence_request.description:
+ confluence_request.description = f"Confluence space {space_key}"
- logger.info(f"Created job {job_id} for datasource {datasource_id}")
+ confluence_url_base = confluence_request.url.split("/wiki/")[0] + "/wiki" if "/wiki/" in confluence_request.url else confluence_request.url
- # Queue the request for the ingestor
- ingestor_request = IngestorRequest(
- ingestor_id=generate_ingestor_id(CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE),
- command=ConfluenceIngestorCommand.INGEST_PAGE,
- payload=confluence_request.model_dump()
+ datasource_info = DataSourceInfo(
+ datasource_id=datasource_id,
+ ingestor_id=generate_ingestor_id(CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE),
+ description=confluence_request.description,
+ source_type="confluence",
+ last_updated=int(time.time()),
+ default_chunk_size=1000,
+ default_chunk_overlap=200,
+ metadata={
+ "confluence_ingest_request": confluence_request.model_dump(),
+ "space_key": space_key,
+ "page_configs": [page_config],
+ "confluence_url": confluence_url_base,
+ },
)
- # Push to Redis queue
- await redis_client.rpush(CONFLUENCE_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
- logger.info(f"Queued Confluence page ingestion request for {confluence_request.url} to {CONFLUENCE_INGESTOR_REDIS_QUEUE}")
+ await metadata_storage.store_datasource_info(datasource_info)
+ logger.info(f"Created datasource: {datasource_id}")
- return {
- "datasource_id": datasource_id,
- "job_id": job_id,
- "message": "Confluence page ingestion request queued"
- }
+ # Check if there is already a job for this datasource in progress or pending
+ existing_jobs = await jobmanager.get_jobs_by_datasource(datasource_id)
+ if existing_jobs:
+ existing_pending_jobs = [job for job in existing_jobs if job.status in (JobStatus.IN_PROGRESS, JobStatus.PENDING)]
+ if existing_pending_jobs:
+ logger.info(f"An ingestion job is already in progress or pending for datasource {datasource_id}, job ID: {existing_pending_jobs[0].job_id}")
+ raise HTTPException(status_code=400, detail=f"An ingestion job is already in progress or pending for this Confluence space (job ID: {existing_pending_jobs[0].job_id})")
+ # Create job with PENDING status
+ job_id = str(uuid.uuid4())
+ success = await jobmanager.upsert_job(
+ job_id,
+ status=JobStatus.PENDING,
+ message="Waiting for ingestor to process...",
+ total=1, # Single page ingestion
+ datasource_id=datasource_id,
+ )
-@app.post("/v1/ingest/confluence/reload", status_code=status.HTTP_202_ACCEPTED)
-async def reload_confluence_page(
- reload_request: ConfluenceReloadRequest,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Reloads a previously ingested Confluence page by re-queuing it for ingestion."""
- if not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Fetch existing datasource
- datasource_info = await metadata_storage.get_datasource_info(reload_request.datasource_id)
- if not datasource_info:
- raise HTTPException(status_code=404, detail="Datasource not found")
-
- # Queue the request for the ingestor
- ingestor_request = IngestorRequest(
- ingestor_id=datasource_info.ingestor_id,
- command=ConfluenceIngestorCommand.RELOAD_DATASOURCE,
- payload=reload_request.model_dump()
- )
+ if not success:
+ raise HTTPException(status_code=500, detail="Failed to create job")
- # Push to Redis queue
- await redis_client.rpush(CONFLUENCE_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
- logger.info(f"Re-queued Confluence page ingestion request for {reload_request.datasource_id}")
- return {"datasource_id": reload_request.datasource_id, "message": "Confluence page reload request queued"}
+ logger.info(f"Created job {job_id} for datasource {datasource_id}")
+ # Queue the request for the ingestor
+ ingestor_request = IngestorRequest(ingestor_id=generate_ingestor_id(CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE), command=ConfluenceIngestorCommand.INGEST_PAGE, payload=confluence_request.model_dump())
-@app.post("/v1/ingest/confluence/reload-all", status_code=status.HTTP_202_ACCEPTED)
-async def reload_all_confluence_pages(user: UserContext = Depends(require_role(Role.ADMIN))):
- """Reloads all previously ingested Confluence pages by re-queuing them for ingestion."""
- if not metadata_storage or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
-
- # Queue the request for the ingestor
- ingestor_request = IngestorRequest(
- ingestor_id=generate_ingestor_id(CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE),
- command=ConfluenceIngestorCommand.RELOAD_ALL,
- payload={}
- )
+ # Push to Redis queue
+ await redis_client.rpush(CONFLUENCE_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
+ logger.info(f"Queued Confluence page ingestion request for {confluence_request.url} to {CONFLUENCE_INGESTOR_REDIS_QUEUE}")
- # Push to Redis queue
- await redis_client.rpush(CONFLUENCE_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
- logger.info("Re-queued Confluence ingestion request for all datasources")
+ return {"datasource_id": datasource_id, "job_id": job_id, "message": "Confluence page ingestion request queued"}
- return {"message": "Reload all Confluence pages request queued"}
+@app.post("/v1/ingest/confluence/reload", status_code=status.HTTP_202_ACCEPTED)
+async def reload_confluence_page(reload_request: ConfluenceReloadRequest, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Reloads a previously ingested Confluence page by re-queuing it for ingestion."""
+ if not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
-@app.post("/v1/ingest")
-async def ingest_documents(
- ingest_request: DocumentIngestRequest,
- user: UserContext = Depends(require_role(Role.INGESTONLY))
-):
- """Updates/Ingests text and graph data to the appropriate databases"""
+ # Fetch existing datasource
+ datasource_info = await metadata_storage.get_datasource_info(reload_request.datasource_id)
+ if not datasource_info:
+ raise HTTPException(status_code=404, detail="Datasource not found")
- if not vector_db or not metadata_storage or not ingestor or not jobmanager:
- raise HTTPException(status_code=500, detail="Server not initialized")
- logger.info(f"Starting data ingestion for datasource: {ingest_request.datasource_id}")
+ # Queue the request for the ingestor
+ ingestor_request = IngestorRequest(ingestor_id=datasource_info.ingestor_id, command=ConfluenceIngestorCommand.RELOAD_DATASOURCE, payload=reload_request.model_dump())
- # Check if datasource exists
- datasource_info = await metadata_storage.get_datasource_info(ingest_request.datasource_id)
- if not datasource_info:
- raise HTTPException(status_code=404, detail="Datasource not found")
-
- # Find the current job for this datasource is IN_PROGRESS
- job_info = await jobmanager.get_job(ingest_request.job_id)
- if not job_info:
- raise HTTPException(status_code=404, detail="Job not found")
-
- if job_info.status != JobStatus.IN_PROGRESS:
- raise HTTPException(status_code=400, detail="Ingestion can only be started for jobs in IN_PROGRESS status")
+ # Push to Redis queue
+ await redis_client.rpush(CONFLUENCE_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
+ logger.info(f"Re-queued Confluence page ingestion request for {reload_request.datasource_id}")
+ return {"datasource_id": reload_request.datasource_id, "message": "Confluence page reload request queued"}
- # Check max documents limit
- if len(ingest_request.documents) > max_documents_per_ingest:
- return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content={"message": f"Number of documents exceeds the maximum limit of {max_documents_per_ingest} per ingestion request."})
-
- if ingest_request.fresh_until is None:
- ingest_request.fresh_until = get_default_fresh_until()
- if datasource_info.default_chunk_overlap is None:
- datasource_info.default_chunk_overlap = 0
-
- if datasource_info.default_chunk_size is None:
- datasource_info.default_chunk_size = 0 # Don't chunk if chunk size is not set
+@app.post("/v1/ingest/confluence/reload-all", status_code=status.HTTP_202_ACCEPTED)
+async def reload_all_confluence_pages(user: UserContext = Depends(require_role(Role.ADMIN))):
+ """Reloads all previously ingested Confluence pages by re-queuing them for ingestion."""
+ if not metadata_storage or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+
+ # Queue the request for the ingestor
+ ingestor_request = IngestorRequest(ingestor_id=generate_ingestor_id(CONFLUENCE_INGESTOR_NAME, CONFLUENCE_INGESTOR_TYPE), command=ConfluenceIngestorCommand.RELOAD_ALL, payload={})
+
+ # Push to Redis queue
+ await redis_client.rpush(CONFLUENCE_INGESTOR_REDIS_QUEUE, ingestor_request.model_dump_json()) # type: ignore
+ logger.info("Re-queued Confluence ingestion request for all datasources")
+
+ return {"message": "Reload all Confluence pages request queued"}
+
+
+@app.post("/v1/ingest")
+async def ingest_documents(ingest_request: DocumentIngestRequest, user: UserContext = Depends(require_role(Role.INGESTONLY))):
+ """Updates/Ingests text and graph data to the appropriate databases"""
+
+ if not vector_db or not metadata_storage or not ingestor or not jobmanager:
+ raise HTTPException(status_code=500, detail="Server not initialized")
+ logger.info(f"Starting data ingestion for datasource: {ingest_request.datasource_id}")
+
+ # Check if datasource exists
+ datasource_info = await metadata_storage.get_datasource_info(ingest_request.datasource_id)
+ if not datasource_info:
+ raise HTTPException(status_code=404, detail="Datasource not found")
+
+    # Verify that the job referenced by this request exists and is IN_PROGRESS
+ job_info = await jobmanager.get_job(ingest_request.job_id)
+ if not job_info:
+ raise HTTPException(status_code=404, detail="Job not found")
+
+ if job_info.status != JobStatus.IN_PROGRESS:
+ raise HTTPException(status_code=400, detail="Ingestion can only be started for jobs in IN_PROGRESS status")
+
+ # Check max documents limit
+ if len(ingest_request.documents) > max_documents_per_ingest:
+ return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content={"message": f"Number of documents exceeds the maximum limit of {max_documents_per_ingest} per ingestion request."})
+
+ if ingest_request.fresh_until is None:
+ ingest_request.fresh_until = get_default_fresh_until()
+
+ if datasource_info.default_chunk_overlap is None:
+ datasource_info.default_chunk_overlap = 0
+
+ if datasource_info.default_chunk_size is None:
+ datasource_info.default_chunk_size = 0 # Don't chunk if chunk size is not set
+
+ try:
+ await ingestor.ingest_documents(
+ ingestor_id=ingest_request.ingestor_id,
+ datasource_id=ingest_request.datasource_id,
+ job_id=job_info.job_id,
+ documents=ingest_request.documents,
+ fresh_until=ingest_request.fresh_until,
+ chunk_overlap=datasource_info.default_chunk_overlap,
+ chunk_size=datasource_info.default_chunk_size,
+ )
+ except ValueError as ve:
+ return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content={"message": str(ve)})
+ return JSONResponse(status_code=status.HTTP_202_ACCEPTED, content={"message": "Text data ingestion started successfully"})
- try:
- await ingestor.ingest_documents(
- ingestor_id=ingest_request.ingestor_id,
- datasource_id=ingest_request.datasource_id,
- job_id=job_info.job_id,
- documents=ingest_request.documents,
- fresh_until=ingest_request.fresh_until,
- chunk_overlap=datasource_info.default_chunk_overlap,
- chunk_size=datasource_info.default_chunk_size,
- )
- except ValueError as ve:
- return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content={"message": str(ve)})
- return JSONResponse(status_code=status.HTTP_202_ACCEPTED, content={"message": "Text data ingestion started successfully"})
# ============================================================================
# Knowledge Graph Endpoints
# ============================================================================
+
@app.get("/v1/graph/explore/entity_type")
async def list_entity_types(user: UserContext = Depends(require_role(Role.READONLY))):
- """
- Lists all entity types in the database
- """
- if not ontology_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
- logger.debug("Listing entity types")
- e = await ontology_graph_db.get_all_entity_types()
- return JSONResponse(status_code=status.HTTP_200_OK, content=e)
+ """
+ Lists all entity types in the database
+ """
+ if not ontology_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+ logger.debug("Listing entity types")
+ e = await ontology_graph_db.get_all_entity_types()
+ return JSONResponse(status_code=status.HTTP_200_OK, content=e)
+
# ====
# Data Graph Endpoints
# ====
@app.get("/v1/graph/explore/data/entities/batch")
async def fetch_data_entities_batch(
- offset: int = Query(0, description="Number of entities to skip (for pagination)", ge=0),
- limit: int = Query(100, description="Maximum number of entities to return", ge=1, le=1000),
- entity_type: Optional[str] = Query(None, description="Optional filter by entity type"),
- user: UserContext = Depends(require_role(Role.READONLY))
+ offset: int = Query(0, description="Number of entities to skip (for pagination)", ge=0),
+ limit: int = Query(100, description="Maximum number of entities to return", ge=1, le=1000),
+ entity_type: Optional[str] = Query(None, description="Optional filter by entity type"),
+ user: UserContext = Depends(require_role(Role.READONLY)),
):
- """
- Fetch entities from the data graph in batches for efficient bulk processing.
- Useful for pagination and bulk export of graph data.
- Maximum limit is 1000 entities per request.
- """
- if not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- # Enforce max limit of 1000
- if limit > 1000:
- raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 entities per request")
-
- logger.debug(f"Fetching data entities batch: offset={offset}, limit={limit}, entity_type={entity_type}")
-
- entities = await data_graph_db.fetch_entities_batch(offset=offset, limit=limit, entity_type=entity_type)
-
- return JSONResponse(
- status_code=status.HTTP_200_OK,
- content={
- "entities": jsonable_encoder(entities),
- "count": len(entities),
- "offset": offset,
- "limit": limit
- }
- )
+ """
+ Fetch entities from the data graph in batches for efficient bulk processing.
+ Useful for pagination and bulk export of graph data.
+ Maximum limit is 1000 entities per request.
+ """
+ if not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ # Enforce max limit of 1000
+ if limit > 1000:
+ raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 entities per request")
+
+ logger.debug(f"Fetching data entities batch: offset={offset}, limit={limit}, entity_type={entity_type}")
+
+ entities = await data_graph_db.fetch_entities_batch(offset=offset, limit=limit, entity_type=entity_type)
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content={"entities": jsonable_encoder(entities), "count": len(entities), "offset": offset, "limit": limit})
+
@app.get("/v1/graph/explore/data/relations/batch")
async def fetch_data_relations_batch(
- offset: int = Query(0, description="Number of relations to skip (for pagination)", ge=0),
- limit: int = Query(100, description="Maximum number of relations to return", ge=1, le=1000),
- relation_name: Optional[str] = Query(None, description="Optional filter by relation name"),
- user: UserContext = Depends(require_role(Role.READONLY))
+ offset: int = Query(0, description="Number of relations to skip (for pagination)", ge=0),
+ limit: int = Query(100, description="Maximum number of relations to return", ge=1, le=1000),
+ relation_name: Optional[str] = Query(None, description="Optional filter by relation name"),
+ user: UserContext = Depends(require_role(Role.READONLY)),
):
- """
- Fetch relations from the data graph in batches for efficient bulk processing.
- Useful for pagination and bulk export of graph relations.
- Maximum limit is 1000 relations per request.
- """
- if not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- # Enforce max limit of 1000
- if limit > 1000:
- raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 relations per request")
-
- logger.debug(f"Fetching data relations batch: offset={offset}, limit={limit}, relation_name={relation_name}")
-
- relations = await data_graph_db.fetch_relations_batch(offset=offset, limit=limit, relation_name=relation_name)
-
- return JSONResponse(
- status_code=status.HTTP_200_OK,
- content={
- "relations": jsonable_encoder(relations),
- "count": len(relations),
- "offset": offset,
- "limit": limit
- }
- )
+ """
+ Fetch relations from the data graph in batches for efficient bulk processing.
+ Useful for pagination and bulk export of graph relations.
+ Maximum limit is 1000 relations per request.
+ """
+ if not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ # Enforce max limit of 1000
+ if limit > 1000:
+ raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 relations per request")
+
+ logger.debug(f"Fetching data relations batch: offset={offset}, limit={limit}, relation_name={relation_name}")
+
+ relations = await data_graph_db.fetch_relations_batch(offset=offset, limit=limit, relation_name=relation_name)
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content={"relations": jsonable_encoder(relations), "count": len(relations), "offset": offset, "limit": limit})
+
@app.post("/v1/graph/explore/data/entity/neighborhood")
-async def explore_data_entity_neighborhood(
- request: ExploreNeighborhoodRequest,
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """
- Explore an entity and its neighborhood in the data graph up to a specified depth.
- Depth 0 returns just the entity, depth 1 includes direct neighbors, etc.
- """
- if not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- logger.debug(f"Exploring data neighborhood for entity_type={request.entity_type}, entity_pk={request.entity_pk}, depth={request.depth}")
-
- result = await data_graph_db.explore_neighborhood(entity_type=request.entity_type, entity_pk=request.entity_pk, depth=request.depth, max_results=1000)
-
- if result["entity"] is None:
- return JSONResponse(status_code=status.HTTP_404_NOT_FOUND, content={"message": "Entity not found"})
-
- return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(result))
+async def explore_data_entity_neighborhood(request: ExploreNeighborhoodRequest, user: UserContext = Depends(require_role(Role.READONLY))):
+ """
+ Explore an entity and its neighborhood in the data graph up to a specified depth.
+ Depth 0 returns just the entity, depth 1 includes direct neighbors, etc.
+ """
+ if not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ logger.debug(f"Exploring data neighborhood for entity_type={request.entity_type}, entity_pk={request.entity_pk}, depth={request.depth}")
+
+ result = await data_graph_db.explore_neighborhood(entity_type=request.entity_type, entity_pk=request.entity_pk, depth=request.depth, max_results=1000)
+
+ if result["entity"] is None:
+ return JSONResponse(status_code=status.HTTP_404_NOT_FOUND, content={"message": "Entity not found"})
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(result))
+
@app.get("/v1/graph/explore/data/entity/start")
-async def get_random_start_nodes(
- n: int = Query(10, description="Number of random nodes to fetch", ge=1, le=100),
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """
- Fetch random starting nodes from the data graph.
- Useful for initializing graph visualization or exploration.
- """
- if not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- logger.debug(f"Fetching {n} random nodes from data graph")
-
- entities = await data_graph_db.fetch_random_entities(count=n)
-
- return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(entities))
+async def get_random_start_nodes(n: int = Query(10, description="Number of random nodes to fetch", ge=1, le=100), user: UserContext = Depends(require_role(Role.READONLY))):
+ """
+ Fetch random starting nodes from the data graph.
+ Useful for initializing graph visualization or exploration.
+ """
+ if not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ logger.debug(f"Fetching {n} random nodes from data graph")
+
+ entities = await data_graph_db.fetch_random_entities(count=n)
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(entities))
+
@app.get("/v1/graph/explore/data/stats")
async def get_data_graph_stats(user: UserContext = Depends(require_role(Role.READONLY))):
- """
- Get statistics about the data graph (node count, relation count).
- """
- if not data_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- logger.debug("Fetching data graph statistics")
-
- stats = await data_graph_db.get_graph_stats()
-
- return JSONResponse(status_code=status.HTTP_200_OK, content=stats)
+ """
+ Get statistics about the data graph (node count, relation count).
+ """
+ if not data_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ logger.debug("Fetching data graph statistics")
+
+ stats = await data_graph_db.get_graph_stats()
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content=stats)
+
# ====
# Ontology Graph Endpoints
# ====
+
@app.get("/v1/graph/explore/ontology/entities/batch")
async def fetch_ontology_entities_batch(
- offset: int = Query(0, description="Number of entities to skip (for pagination)", ge=0),
- limit: int = Query(100, description="Maximum number of entities to return", ge=1, le=1000),
- entity_type: Optional[str] = Query(None, description="Optional filter by entity type"),
- user: UserContext = Depends(require_role(Role.READONLY))
+ offset: int = Query(0, description="Number of entities to skip (for pagination)", ge=0),
+ limit: int = Query(100, description="Maximum number of entities to return", ge=1, le=1000),
+ entity_type: Optional[str] = Query(None, description="Optional filter by entity type"),
+ user: UserContext = Depends(require_role(Role.READONLY)),
):
- """
- Fetch entities from the ontology graph in batches for efficient bulk processing.
- Useful for pagination and bulk export of ontology data.
- Maximum limit is 1000 entities per request.
- """
- if not ontology_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- # Enforce max limit of 1000
- if limit > 1000:
- raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 entities per request")
-
- logger.debug(f"Fetching ontology entities batch: offset={offset}, limit={limit}, entity_type={entity_type}")
-
- entities = await ontology_graph_db.fetch_entities_batch(offset=offset, limit=limit, entity_type=entity_type)
-
- return JSONResponse(
- status_code=status.HTTP_200_OK,
- content={
- "entities": jsonable_encoder(entities),
- "count": len(entities),
- "offset": offset,
- "limit": limit
- }
- )
+ """
+ Fetch entities from the ontology graph in batches for efficient bulk processing.
+ Useful for pagination and bulk export of ontology data.
+ Maximum limit is 1000 entities per request.
+ """
+ if not ontology_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ # Enforce max limit of 1000
+ if limit > 1000:
+ raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 entities per request")
+
+ logger.debug(f"Fetching ontology entities batch: offset={offset}, limit={limit}, entity_type={entity_type}")
+
+ entities = await ontology_graph_db.fetch_entities_batch(offset=offset, limit=limit, entity_type=entity_type)
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content={"entities": jsonable_encoder(entities), "count": len(entities), "offset": offset, "limit": limit})
+
@app.get("/v1/graph/explore/ontology/relations/batch")
async def fetch_ontology_relations_batch(
- offset: int = Query(0, description="Number of relations to skip (for pagination)", ge=0),
- limit: int = Query(100, description="Maximum number of relations to return", ge=1, le=1000),
- relation_name: Optional[str] = Query(None, description="Optional filter by relation name"),
- user: UserContext = Depends(require_role(Role.READONLY))
+ offset: int = Query(0, description="Number of relations to skip (for pagination)", ge=0),
+ limit: int = Query(100, description="Maximum number of relations to return", ge=1, le=1000),
+ relation_name: Optional[str] = Query(None, description="Optional filter by relation name"),
+ user: UserContext = Depends(require_role(Role.READONLY)),
):
- """
- Fetch relations from the ontology graph in batches for efficient bulk processing.
- Useful for pagination and bulk export of ontology relations.
- Maximum limit is 1000 relations per request.
- """
- if not ontology_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- # Enforce max limit of 1000
- if limit > 1000:
- raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 relations per request")
-
- logger.debug(f"Fetching ontology relations batch: offset={offset}, limit={limit}, relation_name={relation_name}")
-
- relations = await ontology_graph_db.fetch_relations_batch(offset=offset, limit=limit, relation_name=relation_name)
-
- return JSONResponse(
- status_code=status.HTTP_200_OK,
- content={
- "relations": jsonable_encoder(relations),
- "count": len(relations),
- "offset": offset,
- "limit": limit
- }
- )
+ """
+ Fetch relations from the ontology graph in batches for efficient bulk processing.
+ Useful for pagination and bulk export of ontology relations.
+ Maximum limit is 1000 relations per request.
+ """
+ if not ontology_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ # Enforce max limit of 1000
+ if limit > 1000:
+ raise HTTPException(status_code=400, detail="Limit cannot exceed 1000 relations per request")
+
+ logger.debug(f"Fetching ontology relations batch: offset={offset}, limit={limit}, relation_name={relation_name}")
+
+ relations = await ontology_graph_db.fetch_relations_batch(offset=offset, limit=limit, relation_name=relation_name)
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content={"relations": jsonable_encoder(relations), "count": len(relations), "offset": offset, "limit": limit})
+
@app.post("/v1/graph/explore/ontology/entity/neighborhood")
-async def explore_ontology_entity_neighborhood(
- request: ExploreNeighborhoodRequest,
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """
- Explore an entity and its neighborhood in the ontology graph up to a specified depth.
- Depth 0 returns just the entity, depth 1 includes direct neighbors, etc.
- """
- if not ontology_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- logger.debug(f"Exploring ontology neighborhood for entity_type={request.entity_type}, entity_pk={request.entity_pk}, depth={request.depth}")
-
- result = await ontology_graph_db.explore_neighborhood(entity_type=request.entity_type, entity_pk=request.entity_pk, depth=request.depth, max_results=1000)
-
- if result["entity"] is None:
- return JSONResponse(status_code=status.HTTP_404_NOT_FOUND, content={"message": "Entity not found"})
-
- return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(result))
+async def explore_ontology_entity_neighborhood(request: ExploreNeighborhoodRequest, user: UserContext = Depends(require_role(Role.READONLY))):
+ """
+ Explore an entity and its neighborhood in the ontology graph up to a specified depth.
+ Depth 0 returns just the entity, depth 1 includes direct neighbors, etc.
+ """
+ if not ontology_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ logger.debug(f"Exploring ontology neighborhood for entity_type={request.entity_type}, entity_pk={request.entity_pk}, depth={request.depth}")
+
+ result = await ontology_graph_db.explore_neighborhood(entity_type=request.entity_type, entity_pk=request.entity_pk, depth=request.depth, max_results=1000)
+
+ if result["entity"] is None:
+ return JSONResponse(status_code=status.HTTP_404_NOT_FOUND, content={"message": "Entity not found"})
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(result))
+
@app.get("/v1/graph/explore/ontology/entity/start")
-async def get_random_ontology_start_nodes(
- n: int = Query(10, description="Number of random nodes to fetch", ge=1, le=100),
- user: UserContext = Depends(require_role(Role.READONLY))
-):
- """
- Fetch random starting nodes from the ontology graph.
- Useful for initializing graph visualization or exploration.
- """
- if not ontology_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- logger.debug(f"Fetching {n} random nodes from ontology graph")
-
- entities = await ontology_graph_db.fetch_random_entities(count=n)
-
- return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(entities))
+async def get_random_ontology_start_nodes(n: int = Query(10, description="Number of random nodes to fetch", ge=1, le=100), user: UserContext = Depends(require_role(Role.READONLY))):
+ """
+ Fetch random starting nodes from the ontology graph.
+ Useful for initializing graph visualization or exploration.
+ """
+ if not ontology_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ logger.debug(f"Fetching {n} random nodes from ontology graph")
+
+ entities = await ontology_graph_db.fetch_random_entities(count=n)
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder(entities))
+
@app.get("/v1/graph/explore/ontology/stats")
async def get_ontology_graph_stats(user: UserContext = Depends(require_role(Role.READONLY))):
- """
- Get statistics about the ontology graph (node count, relation count).
- """
- if not ontology_graph_db:
- raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
-
- logger.debug("Fetching ontology graph statistics")
-
- stats = await ontology_graph_db.get_graph_stats()
-
- return JSONResponse(status_code=status.HTTP_200_OK, content=stats)
+ """
+ Get statistics about the ontology graph (node count, relation count).
+ """
+ if not ontology_graph_db:
+ raise HTTPException(status_code=500, detail="Server not initialized, or graph RAG is disabled")
+
+ logger.debug("Fetching ontology graph statistics")
+
+ stats = await ontology_graph_db.get_graph_stats()
+
+ return JSONResponse(status_code=status.HTTP_200_OK, content=stats)
+
# ====
# Ontology Agent Reverse Proxy
# ====
async def _reverse_proxy(request: Request):
- """
- Reverse proxy to ontology agent service, which runs a separate FastAPI instance,
- and is responsible for handling ontology related requests.
-
- Read-only operations (GET /status) require READONLY role.
- Write operations (POST/DELETE) require ADMIN role.
-
- This acts as a security gateway - the ontology agent service doesn't need
- its own RBAC implementation since it's only accessible through this proxy.
- """
- # Manually invoke the RBAC check since app.add_route doesn't support Depends()
- user = await get_user_or_anonymous(request)
-
- # Determine required role based on method and path
- # GET /status endpoints are read-only, allow READONLY access
- # All other operations (POST/DELETE) require ADMIN
- is_status_endpoint = request.url.path.endswith('/status')
- is_read_only = request.method == 'GET' and is_status_endpoint
-
- required_role = Role.READONLY if is_read_only else Role.ADMIN
-
- if not has_permission(user.role, required_role):
- raise HTTPException(
- status_code=403,
- detail=f"Insufficient permissions. Required role: {required_role}, your role: {user.role}"
- )
-
- logger.info(f"Ontology agent request by {user.email} to {request.url.path}")
-
- url = httpx.URL(path=request.url.path,
- query=request.url.query.encode("utf-8"))
- rp_req = ontology_agent_client.build_request(request.method, url,
- headers=request.headers.raw,
- content=request.stream(), timeout=30.0)
- rp_resp = await ontology_agent_client.send(rp_req, stream=True)
- return StreamingResponse(
- rp_resp.aiter_raw(),
- status_code=rp_resp.status_code,
- headers=rp_resp.headers,
- background=BackgroundTask(rp_resp.aclose),
- )
+ """
+ Reverse proxy to ontology agent service, which runs a separate FastAPI instance,
+ and is responsible for handling ontology related requests.
+
+ Read-only operations (GET /status) require READONLY role.
+ Write operations (POST/DELETE) require ADMIN role.
+
+ This acts as a security gateway - the ontology agent service doesn't need
+ its own RBAC implementation since it's only accessible through this proxy.
+ """
+ # Manually invoke the RBAC check since app.add_route doesn't support Depends()
+ user = await get_user_or_anonymous(request)
+
+ # Determine required role based on method and path
+ # GET /status endpoints are read-only, allow READONLY access
+ # All other operations (POST/DELETE) require ADMIN
+ is_status_endpoint = request.url.path.endswith("/status")
+ is_read_only = request.method == "GET" and is_status_endpoint
+
+ required_role = Role.READONLY if is_read_only else Role.ADMIN
+
+ if not has_permission(user.role, required_role):
+ raise HTTPException(status_code=403, detail=f"Insufficient permissions. Required role: {required_role}, your role: {user.role}")
+
+ logger.info(f"Ontology agent request by {user.email} to {request.url.path}")
+
+ url = httpx.URL(path=request.url.path, query=request.url.query.encode("utf-8"))
+ rp_req = ontology_agent_client.build_request(request.method, url, headers=request.headers.raw, content=request.stream(), timeout=30.0)
+ rp_resp = await ontology_agent_client.send(rp_req, stream=True)
+ return StreamingResponse(
+ rp_resp.aiter_raw(),
+ status_code=rp_resp.status_code,
+ headers=rp_resp.headers,
+ background=BackgroundTask(rp_resp.aclose),
+ )
+
-if graph_rag_enabled: # Only add reverse proxy if graph RAG is enabled
- app.add_route("/v1/graph/ontology/agent/{path:path}",
- _reverse_proxy, ["GET", "POST", "DELETE"])
+if graph_rag_enabled: # Only add reverse proxy if graph RAG is enabled
+ app.add_route("/v1/graph/ontology/agent/{path:path}", _reverse_proxy, ["GET", "POST", "DELETE"])
# ============================================================================
# Health Check and Configuration Endpoint
# ============================================================================
+
@app.get("/healthz")
async def health_check():
- """Health check endpoint."""
- health_status = "healthy"
- health_details = {}
-
- # Check if services are initialized
- if not metadata_storage or \
- not vector_db or \
- not jobmanager or \
- not redis_client or \
- (graph_rag_enabled and (not data_graph_db or not ontology_graph_db)):
- health_status = "unhealthy"
- health_details["error"] = "One or more services are not initialized"
- logger.error("healthz: One or more services are not initialized")
-
- config = {
- "graph_rag_enabled": graph_rag_enabled,
- "search" : {
- "keys": valid_metadata_keys(),
- },
- "vector_db": {
- "milvus": {
- "uri": milvus_uri,
- "collections": [default_collection_name_docs],
- "index_params": {"dense": dense_index_params, "sparse": sparse_index_params}
- }
- },
- "embeddings": {
- "model": embeddings_model
- },
- "metadata_storage": {
- "redis": {
- "url": redis_url
- }
- },
- "ui_url": ui_url,
- "datasources": await metadata_storage.fetch_all_datasource_info() if metadata_storage else []
- }
+ """Health check endpoint."""
+ health_status = "healthy"
+ health_details = {}
+
+ # Check if services are initialized
+ if not metadata_storage or not vector_db or not jobmanager or not redis_client or (graph_rag_enabled and (not data_graph_db or not ontology_graph_db)):
+ health_status = "unhealthy"
+ health_details["error"] = "One or more services are not initialized"
+ logger.error("healthz: One or more services are not initialized")
+
+ config = {
+ "graph_rag_enabled": graph_rag_enabled,
+ "search": {
+ "keys": valid_metadata_keys(),
+ },
+ "vector_db": {"milvus": {"uri": milvus_uri, "collections": [default_collection_name_docs], "index_params": {"dense": dense_index_params, "sparse": sparse_index_params}}},
+ "embeddings": {"model": embeddings_model},
+ "metadata_storage": {"redis": {"url": redis_url}},
+ "ui_url": ui_url,
+ "datasources": await metadata_storage.fetch_all_datasource_info() if metadata_storage else [],
+ }
+
+ if graph_rag_enabled:
+ if data_graph_db and ontology_graph_db:
+ config["graph_db"] = {
+ "data_graph": {"type": data_graph_db.database_type, "query_language": data_graph_db.query_language, "uri": neo4j_addr, "tenant_label": data_graph_db.tenant_label},
+ "ontology_graph": {"type": ontology_graph_db.database_type, "query_language": ontology_graph_db.query_language, "uri": neo4j_addr, "tenant_label": ontology_graph_db.tenant_label},
+ "graph_entity_types": await data_graph_db.get_all_entity_types() if data_graph_db else [],
+ }
+
+ response = {"status": health_status, "timestamp": int(time.time()), "details": health_details, "config": config}
+ return response
+
+
+async def init_tests(logger: logging.Logger, redis_client: redis.Redis, embeddings: EmbeddingsFactory, milvus_uri: str):
+ """
+    Run initial tests to verify that dependency connections are working.
+    Note: This does not check the graph db connection as it's done in the init of the class.
+ """
+ logger.info("====== Running initialization tests ======")
+ logger.info(f"1. Testing connections to Redis: URI [{redis_url}]...")
+ resp = await redis_client.ping()
+ logger.info(f"Redis ping response: {resp}")
+
+ # Test embeddings endpoint
+ logger.info(f"2. Testing connections to [{embeddings_model}]...")
+ resp = embeddings.get_embeddings().embed_documents(["Test document"])
+ logger.info(f"Embeddings response: {resp}")
+
+ # Test vector DB connections
+ logger.info(f"3. Testing connections to Milvus: [{milvus_uri}]...")
+ client = MilvusClient(uri=milvus_uri)
+ logger.info("4. Listing Milvus collections")
+ collections = client.list_collections()
+ logger.info(f"Milvus collections: {collections}")
+
+ test_collection_name = "test_collection"
+
+ # Setup vector db for graph data
+ vector_db_test = Milvus(
+ embedding_function=embeddings.get_embeddings(), collection_name=test_collection_name, connection_args=milvus_connection_args, index_params=[dense_index_params, sparse_index_params], builtin_function=BM25BuiltInFunction(output_field_names="sparse"), vector_field=["dense", "sparse"]
+ )
+
+ doc = Document(page_content="Test document", metadata={"source": "test"})
+ logger.info(f"5. Adding test document to Milvus {doc}")
+ resp = vector_db_test.add_documents(documents=[doc], ids=["test_doc_1"])
+ logger.info(f"Milvus add response: {resp}")
+
+ logger.info("6. Searching test document in Milvus")
+ docs_with_score = vector_db_test.similarity_search_with_score("Test", k=1)
+ logger.info(f"Milvus similarity search response: {docs_with_score}")
+
+ logger.info(f"7. Listing Milvus collections (again, should see {test_collection_name})")
+ collections = client.list_collections()
+ logger.info(f"Milvus collections: {collections}")
+
+ logger.info(f"8. Dropping {test_collection_name} collection in Milvus")
+ resp = client.drop_collection(collection_name=test_collection_name)
+ logger.info(f"Milvus drop collection response: {resp}")
+
+ logger.info(f"9. Listing Milvus collections (final - should not see {test_collection_name})")
+ collections = client.list_collections()
+ logger.info(f"Milvus collections: {collections}")
+
+ # Enhanced health checks for collections
+ logger.info("10. Running enhanced health checks on collections...")
+
+ # Get embedding dimensions for validation
+ test_embedding = embeddings.get_embeddings().embed_documents(["test"])
+ expected_dim = len(test_embedding[0])
+ logger.info(f"Expected embedding dimension: {expected_dim}")
+
+ collections_to_check = [default_collection_name_docs]
+
+ for collection_name in collections_to_check:
+ logger.info(f"11. Validating collection {collection_name} in Milvus")
+
+ # Check if collection exists
+ if collection_name not in client.list_collections():
+ logger.warning(f"Collection {collection_name} does not exist in Milvus, it should be created upon first ingestion.")
+ continue
+
+ # Get collection schema
+ collection_info = client.describe_collection(collection_name=collection_name)
+ logger.info(f"Collection {collection_name} info: {collection_info}")
+
+ # Extract field information
+ fields = collection_info.get("fields", [])
+ field_names = {field["name"] for field in fields}
+
+ # Check 1: Validate embedding dimensions
+ logger.info(f"11a. Validating embedding dimensions for collection {collection_name}...")
+ dense_field = next((field for field in fields if field["name"] == "dense"), None)
+ if dense_field:
+ actual_dim = dense_field["params"].get("dim")
+ if actual_dim != expected_dim:
+ raise Exception(f"Collection {collection_name}: Dense vector dimension mismatch. Expected: {expected_dim}, Actual: {actual_dim}, Have you changed the embeddings model? Please delete and re-ingest the collection.")
+ logger.info(f"✓ Collection {collection_name}: Dense vector dimension correct ({actual_dim})")
+ else:
+ raise Exception(f"Collection {collection_name}: Dense vector field not found, please delete and re-ingest the collection.")
- if graph_rag_enabled:
- if data_graph_db and ontology_graph_db:
- config["graph_db"] = {
- "data_graph": {
- "type": data_graph_db.database_type,
- "query_language": data_graph_db.query_language,
- "uri": neo4j_addr,
- "tenant_label": data_graph_db.tenant_label
- },
- "ontology_graph": {
- "type": ontology_graph_db.database_type,
- "query_language": ontology_graph_db.query_language,
- "uri": neo4j_addr,
- "tenant_label": ontology_graph_db.tenant_label
- },
- "graph_entity_types": await data_graph_db.get_all_entity_types() if data_graph_db else []
- }
-
- response = {
- "status": health_status,
- "timestamp": int(time.time()),
- "details": health_details,
- "config": config
- }
- return response
-
-async def init_tests(logger: logging.Logger,
- redis_client: redis.Redis,
- embeddings: EmbeddingsFactory,
- milvus_uri: str):
- """
- Run initial tests to ensure connections to check if deps are working.
- Note: This does not check the graph db connection as its done in the init of the class.
- """
- logger.info("====== Running initialization tests ======")
- logger.info(f"1. Testing connections to Redis: URI [{redis_url}]...")
- resp = await redis_client.ping()
- logger.info(f"Redis ping response: {resp}")
-
- # Test embeddings endpoint
- logger.info(f"2. Testing connections to [{embeddings_model}]...")
- resp = embeddings.get_embeddings().embed_documents(["Test document"])
- logger.info(f"Embeddings response: {resp}")
-
- # Test vector DB connections
- logger.info(f"3. Testing connections to Milvus: [{milvus_uri}]...")
- client = MilvusClient(uri=milvus_uri)
- logger.info("4. Listing Milvus collections")
- collections = client.list_collections()
- logger.info(f"Milvus collections: {collections}")
-
- test_collection_name = "test_collection"
-
- # Setup vector db for graph data
- vector_db_test = Milvus(
- embedding_function=embeddings.get_embeddings(),
- collection_name=test_collection_name,
- connection_args=milvus_connection_args,
- index_params=[dense_index_params, sparse_index_params],
- builtin_function=BM25BuiltInFunction(output_field_names="sparse"),
- vector_field=["dense", "sparse"]
- )
+        # Check 2: Validate vector fields exist
+ logger.info(f"11b. Validating vector fields for collection {collection_name}...")
+ sparse_field = next((field for field in fields if field["name"] == "sparse"), None)
+ if not sparse_field:
+ raise Exception(f"Collection {collection_name}: Sparse vector field not found")
+
+ # Validate required vector fields exist
+ if "dense" not in field_names or "sparse" not in field_names:
+ raise Exception(f"Collection {collection_name}: Missing required vector fields (dense, sparse), please delete and re-ingest the collection.")
+ logger.info(f"✓ Collection {collection_name}: Vector fields present")
+
+ if not collection_info.get("enable_dynamic_field"):
+ raise Exception(f"Collection {collection_name}: Dynamic fields not enabled, please delete and re-ingest the collection.")
+
+ logger.info(f"✓ Collection {collection_name}: Dynamic fields enabled")
+ logger.info(f"✓ Collection {collection_name}: Metadata fields will be stored dynamically")
- doc = Document(page_content="Test document", metadata={"source": "test"})
- logger.info(f"5. Adding test document to Milvus {doc}")
- resp = vector_db_test.add_documents(documents=[doc], ids=["test_doc_1"])
- logger.info(f"Milvus add response: {resp}")
-
- logger.info("6. Searching test document in Milvus")
- docs_with_score = vector_db_test.similarity_search_with_score("Test", k=1)
- logger.info(f"Milvus similarity search response: {docs_with_score}")
-
- logger.info(f"7. Listing Milvus collections (again, should see {test_collection_name})")
- collections = client.list_collections()
- logger.info(f"Milvus collections: {collections}")
-
- logger.info(f"8. Dropping {test_collection_name} collection in Milvus")
- resp = client.drop_collection(collection_name=test_collection_name)
- logger.info(f"Milvus drop collection response: {resp}")
-
- logger.info(f"9. Listing Milvus collections (final - should not see {test_collection_name})")
- collections = client.list_collections()
- logger.info(f"Milvus collections: {collections}")
-
- # Enhanced health checks for collections
- logger.info("10. Running enhanced health checks on collections...")
-
- # Get embedding dimensions for validation
- test_embedding = embeddings.get_embeddings().embed_documents(["test"])
- expected_dim = len(test_embedding[0])
- logger.info(f"Expected embedding dimension: {expected_dim}")
-
- collections_to_check = [default_collection_name_docs]
-
- for collection_name in collections_to_check:
- logger.info(f"11. Validating collection {collection_name} in Milvus")
-
- # Check if collection exists
- if collection_name not in client.list_collections():
- logger.warning(f"Collection {collection_name} does not exist in Milvus, it should be created upon first ingestion.")
- continue
-
- # Get collection schema
- collection_info = client.describe_collection(collection_name=collection_name)
- logger.info(f"Collection {collection_name} info: {collection_info}")
-
- # Extract field information
- fields = collection_info.get('fields', [])
- field_names = {field['name'] for field in fields}
-
- # Check 1: Validate embedding dimensions
- logger.info(f"11a. Validating embedding dimensions for collection {collection_name}...")
- dense_field = next((field for field in fields if field['name'] == 'dense'), None)
- if dense_field:
- actual_dim = dense_field['params'].get('dim')
- if actual_dim != expected_dim:
- raise Exception(f"Collection {collection_name}: Dense vector dimension mismatch. Expected: {expected_dim}, Actual: {actual_dim}, Have you changed the embeddings model? Please delete and re-ingest the collection.")
- logger.info(f"✓ Collection {collection_name}: Dense vector dimension correct ({actual_dim})")
- else:
- raise Exception(f"Collection {collection_name}: Dense vector field not found, please delete and re-ingest the collection.")
-
- # Check 2: Validate vector fields exists
- logger.info(f"11b. Validating vector fields for collection {collection_name}...")
- sparse_field = next((field for field in fields if field['name'] == 'sparse'), None)
- if not sparse_field:
- raise Exception(f"Collection {collection_name}: Sparse vector field not found")
-
- # Validate required vector fields exist
- if 'dense' not in field_names or 'sparse' not in field_names:
- raise Exception(f"Collection {collection_name}: Missing required vector fields (dense, sparse), please delete and re-ingest the collection.")
- logger.info(f"✓ Collection {collection_name}: Vector fields present")
-
- if not collection_info.get("enable_dynamic_field"):
- raise Exception(f"Collection {collection_name}: Dynamic fields not enabled, please delete and re-ingest the collection.")
-
- logger.info(f"✓ Collection {collection_name}: Dynamic fields enabled")
- logger.info(f"✓ Collection {collection_name}: Metadata fields will be stored dynamically")
-
- logger.info("====== Initialization tests completed successfully ======")
- return
\ No newline at end of file
+ logger.info("====== Initialization tests completed successfully ======")
+ return
diff --git a/ai_platform_engineering/knowledge_bases/rag/server/uv.lock b/ai_platform_engineering/knowledge_bases/rag/server/uv.lock
index 5f50e644b..075094725 100644
--- a/ai_platform_engineering/knowledge_bases/rag/server/uv.lock
+++ b/ai_platform_engineering/knowledge_bases/rag/server/uv.lock
@@ -453,7 +453,6 @@ dependencies = [
{ name = "cymple" },
{ name = "langchain-aws" },
{ name = "langchain-cohere" },
- { name = "langchain-huggingface" },
{ name = "langchain-ollama" },
{ name = "langchain-openai" },
{ name = "neo4j" },
@@ -461,18 +460,24 @@ dependencies = [
{ name = "redis" },
]
+[package.optional-dependencies]
+huggingface = [
+ { name = "langchain-huggingface" },
+]
+
[package.metadata]
requires-dist = [
{ name = "cymple", specifier = ">=0.12.0" },
{ name = "langchain-aws", specifier = ">=0.2.24" },
{ name = "langchain-cohere", specifier = ">=0.3.0" },
- { name = "langchain-huggingface", specifier = ">=0.3.0" },
+ { name = "langchain-huggingface", marker = "extra == 'huggingface'", specifier = ">=0.3.0" },
{ name = "langchain-ollama", specifier = ">=0.3.0" },
{ name = "langchain-openai", specifier = ">=0.3.18" },
{ name = "neo4j", specifier = ">=5.28.1" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "redis", specifier = ">=6.2.0" },
]
+provides-extras = ["huggingface"]
[[package]]
name = "constantly"
@@ -2410,7 +2415,7 @@ wheels = [
[[package]]
name = "openai"
-version = "2.0.1"
+version = "2.19.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@@ -2422,9 +2427,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/a5/a3/ebc9cef5bb5ae144d2bd42059661362c5f769bd15a4c335cb64f06e3cb14/openai-2.0.1.tar.gz", hash = "sha256:ff00c0f6f75fb04f5696bfddf52b6280f9bc2157cfc45cce011af00d8dd385e1", size = 566088, upload-time = "2025-10-01T19:49:09.583Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/86/e3/9161d27c725ff69ff8feb1c97834b9bd993d54f41e5aa04de1cc6b998ad7/openai-2.19.0.tar.gz", hash = "sha256:4ba78da821b44e0ea38fd182f252a45c237c340f303a9fbf9da600701fa16c75", size = 642278, upload-time = "2026-02-10T18:21:39.902Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/4d/5d/7b8dc822de474a283a190fe222d9a074e2fdecfbcb4a14ff49ad4d555404/openai-2.0.1-py3-none-any.whl", hash = "sha256:f0671423666cfd24c15010fd4732738f89f1b6d4f21c47f5c82db411cc2648d5", size = 956304, upload-time = "2025-10-01T19:49:07.497Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/10/7c60f66c74e02eb0375cda9c2b28c7cb43e8f95cab7338124449fae1bde7/openai-2.19.0-py3-none-any.whl", hash = "sha256:425d657ce4dcc9d6294b78f0f41dc36aa43230750e61f49c551fa490c8575803", size = 1098416, upload-time = "2026-02-10T18:21:37.928Z" },
]
[[package]]
@@ -3723,7 +3728,6 @@ dependencies = [
{ name = "fastapi", extra = ["standard"] },
{ name = "fastmcp" },
{ name = "httpx" },
- { name = "huggingface-hub" },
{ name = "langchain" },
{ name = "langchain-community" },
{ name = "langchain-core" },
@@ -3737,7 +3741,6 @@ dependencies = [
{ name = "python-jose", extra = ["cryptography"] },
{ name = "pyyaml" },
{ name = "redis" },
- { name = "sentence-transformers" },
{ name = "uvicorn" },
]
@@ -3749,23 +3752,10 @@ dev = [
{ name = "pytest-mock" },
{ name = "ruff" },
]
-embeddings-all = [
- { name = "langchain-aws" },
- { name = "langchain-cohere" },
- { name = "langchain-huggingface" },
- { name = "langchain-ollama" },
-]
-embeddings-aws = [
- { name = "langchain-aws" },
-]
-embeddings-cohere = [
- { name = "langchain-cohere" },
-]
-embeddings-huggingface = [
- { name = "langchain-huggingface" },
-]
-embeddings-ollama = [
- { name = "langchain-ollama" },
+huggingface = [
+ { name = "common", extra = ["huggingface"] },
+ { name = "huggingface-hub" },
+ { name = "sentence-transformers" },
]
[package.dev-dependencies]
@@ -3793,23 +3783,16 @@ requires-dist = [
{ name = "click", specifier = ">=8.1.7" },
{ name = "cnoe-agent-utils", specifier = "==0.3.9" },
{ name = "common", directory = "../common" },
+ { name = "common", extras = ["huggingface"], marker = "extra == 'huggingface'", directory = "../common" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.116.1" },
{ name = "fastmcp", specifier = ">=2.12.4" },
{ name = "httpx", specifier = ">=0.28.1" },
- { name = "huggingface-hub", specifier = ">=0.36.0" },
+ { name = "huggingface-hub", marker = "extra == 'huggingface'", specifier = ">=0.36.0" },
{ name = "langchain", specifier = ">=0.1.0" },
- { name = "langchain-aws", marker = "extra == 'embeddings-all'", specifier = ">=0.3.0" },
- { name = "langchain-aws", marker = "extra == 'embeddings-aws'", specifier = ">=0.3.0" },
- { name = "langchain-cohere", marker = "extra == 'embeddings-all'", specifier = ">=0.3.0" },
- { name = "langchain-cohere", marker = "extra == 'embeddings-cohere'", specifier = ">=0.3.0" },
{ name = "langchain-community" },
{ name = "langchain-core", specifier = "==1.1.2" },
- { name = "langchain-huggingface", marker = "extra == 'embeddings-all'", specifier = ">=0.3.0" },
- { name = "langchain-huggingface", marker = "extra == 'embeddings-huggingface'", specifier = ">=0.3.0" },
{ name = "langchain-milvus", specifier = ">=0.1.10" },
{ name = "langchain-milvus", specifier = ">=0.2.1" },
- { name = "langchain-ollama", marker = "extra == 'embeddings-all'", specifier = ">=0.3.0" },
- { name = "langchain-ollama", marker = "extra == 'embeddings-ollama'", specifier = ">=0.3.0" },
{ name = "langchain-openai", specifier = ">=0.3.18" },
{ name = "lxml", specifier = ">=6.0.1" },
{ name = "openai", specifier = ">=1.0.0" },
@@ -3825,10 +3808,10 @@ requires-dist = [
{ name = "pyyaml", specifier = ">=6.0.3" },
{ name = "redis", specifier = ">=5.0.0" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" },
- { name = "sentence-transformers", specifier = ">=5.2.0" },
+ { name = "sentence-transformers", marker = "extra == 'huggingface'", specifier = ">=5.2.0" },
{ name = "uvicorn", specifier = ">=0.35.0" },
]
-provides-extras = ["dev", "embeddings-aws", "embeddings-cohere", "embeddings-huggingface", "embeddings-ollama", "embeddings-all"]
+provides-extras = ["dev", "huggingface"]
[package.metadata.requires-dev]
dev = [
diff --git a/charts/ai-platform-engineering/Chart.yaml b/charts/ai-platform-engineering/Chart.yaml
index 2ccd53f23..d1d2ef612 100644
--- a/charts/ai-platform-engineering/Chart.yaml
+++ b/charts/ai-platform-engineering/Chart.yaml
@@ -6,7 +6,7 @@ description: Parent chart to deploy multiple agent subcharts as different platfo
sources:
- https://github.com/cnoe-io/ai-platform-engineering/charts
# Chart version for ai-platform-engineering parent chart
-version: 0.2.15-rc.helm.2 # Do NOT bump this. It will be updated automatically using the PR or release workflow
+version: 0.2.15-rc.helm.3 # Do NOT bump this. It will be updated automatically using the PR or release workflow
dependencies:
# AI Platform Engineer Multi-Agent
- name: supervisor-agent
@@ -134,7 +134,7 @@ dependencies:
repository: oci://ghcr.io/agntcy/slim/helm
condition: global.slim.enabled
- name: rag-stack
- version: 0.2.15 # Do NOT bump this. It will be updated automatically using the PR or release workflow
+ version: 0.2.15-rc.helm.1 # Do NOT bump this. It will be updated automatically using the PR or release workflow
repository: file://../rag-stack # TODO: might be changed to be separate
tags:
- rag-stack
diff --git a/charts/rag-stack/Chart.yaml b/charts/rag-stack/Chart.yaml
index f87b60126..7ee8a18c3 100644
--- a/charts/rag-stack/Chart.yaml
+++ b/charts/rag-stack/Chart.yaml
@@ -2,11 +2,11 @@ apiVersion: v2
name: rag-stack
description: A complete RAG stack including web UI, server, agents, Redis, Neo4j and Milvus
type: application
-version: 0.2.15 # Do NOT bump this. It will be updated automatically using the PR or release workflow
+version: 0.2.15-rc.helm.1 # Do NOT bump this. It will be updated automatically using the PR or release workflow
appVersion: 0.2.15 # Do NOT modify. It will be updated automatically using the release workflow
dependencies:
- name: rag-server
- version: 0.2.15 # Do NOT bump this. It will be updated automatically using the PR or release workflow
+ version: 0.2.15-rc.helm.1 # Do NOT bump this. It will be updated automatically using the PR or release workflow
repository: "file://./charts/rag-server"
condition: rag-server.enabled
- name: agent-ontology
diff --git a/charts/rag-stack/README.md b/charts/rag-stack/README.md
index c9d3f9264..783a22a55 100644
--- a/charts/rag-stack/README.md
+++ b/charts/rag-stack/README.md
@@ -30,22 +30,91 @@ helm install rag-stack ./charts/rag-stack -f custom-values.yaml
## Configuration
-All components are fully configurable via `values.yaml`. See the file for:
+RAG server and web ingestor configuration is done via environment variables using the `env:` map in values.yaml.
-- **RAG Server**: Feature flags, embeddings config, performance limits, web ingestor settings
-- **Agent Ontology**: Sync intervals, evaluation thresholds, LLM worker configuration
-- **Ingestors**: Per-ingestor deployment with type-specific environment variables
-- **Databases**: Resource limits, persistence, connection settings
+### RAG Server Configuration
-Refer to `values.yaml` for detailed configuration options and commented examples for each component.
+```yaml
+rag-server:
+ # Feature flag with global fallback
+ enableGraphRag: true
+
+ # All other config via env map
+ env:
+ ENABLE_MCP: "true"
+ EMBEDDINGS_PROVIDER: "azure-openai"
+ EMBEDDINGS_MODEL: "text-embedding-3-small"
+ LOG_LEVEL: "INFO"
+ # ... see values.yaml for all options
+```
+
+### RAG Server Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `ENABLE_MCP` | `true` | Enable/disable MCP tools for AI agents |
+| `SKIP_INIT_TESTS` | `false` | Skip connection tests on startup |
+| `EMBEDDINGS_PROVIDER` | `azure-openai` | Provider: `azure-openai`, `openai`, `litellm` |
+| `EMBEDDINGS_MODEL` | `text-embedding-3-small` | Embeddings model name |
+| `LITELLM_API_BASE` | - | LiteLLM proxy URL (required when using `litellm` provider) |
+| `LOG_LEVEL` | `INFO` | Logging level: DEBUG, INFO, WARNING, ERROR |
+| `MAX_DOCUMENTS_PER_INGEST` | `1000` | Max documents per ingestion request |
+| `MAX_RESULTS_PER_QUERY` | `100` | Max results per query |
+| `ALLOW_UNAUTHENTICATED` | `true` | Allow access without authentication |
+| `RBAC_ADMIN_GROUPS` | - | Comma-separated group names with admin access |
+| `RBAC_READONLY_GROUPS` | - | Comma-separated group names with read-only access |
+| `RBAC_DEFAULT_ROLE` | `readonly` | Default role when user doesn't match any group |
+
+### Web Ingestor Configuration
+
+```yaml
+rag-server:
+ webIngestor:
+ enabled: true
+ env:
+ LOG_LEVEL: "INFO"
+ WEBLOADER_MAX_CONCURRENCY: "10"
+ # Scrapy settings (optional)
+ SCRAPY_CONCURRENT_REQUESTS: "16"
+ SCRAPY_JAVASCRIPT_ENABLED: "true"
+```
-## Environment Variables
+### Web Ingestor Environment Variables
-Each component's environment variables are documented in `values.yaml` with:
-- Variable names (e.g., `ENABLE_GRAPH_RAG`)
-- Values.yaml keys (e.g., `enableGraphRag`)
-- Default values
-- Descriptions
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LOG_LEVEL` | `INFO` | Logging level |
+| `WEBLOADER_MAX_CONCURRENCY` | `10` | Max concurrent HTTP requests per ingestion |
+| `WEBLOADER_MAX_INGESTION_TASKS` | `5` | Max concurrent ingestion tasks |
+| `WEBLOADER_RELOAD_INTERVAL` | `86400` | Auto-reload interval in seconds (24 hours) |
+| `SCRAPY_CONCURRENT_REQUESTS` | `16` | Scrapy concurrent requests |
+| `SCRAPY_DOWNLOAD_DELAY` | `0` | Delay between requests in seconds |
+| `SCRAPY_DEPTH_LIMIT` | `0` | Max crawl depth (0 = unlimited) |
+| `SCRAPY_JAVASCRIPT_ENABLED` | `false` | Enable JavaScript rendering via Playwright |
+
+### LiteLLM Embeddings Example
+
+To use LiteLLM proxy for embeddings:
+
+```yaml
+rag-server:
+ env:
+ EMBEDDINGS_PROVIDER: "litellm"
+ EMBEDDINGS_MODEL: "azure/text-embedding-3-small"
+ LITELLM_API_BASE: "http://litellm-proxy:4000"
+ # LITELLM_API_KEY: "sk-..." # or use envFrom with a secret
+```
+
+### Using Secrets
+
+For sensitive values, use `envFrom` to reference a Kubernetes Secret:
+
+```yaml
+rag-server:
+ envFrom:
+ - secretRef:
+ name: rag-server-secrets
+```
## Secrets Required
@@ -129,11 +198,11 @@ oauth2-proxy:
- host: rag-webui.example.com
rag-server:
- rbac:
- allowUnauthenticated: false
- readonlyGroups: "viewers,engineers"
- adminGroups: "admins"
- defaultRole: "readonly"
+ env:
+ ALLOW_UNAUTHENTICATED: "false"
+ RBAC_READONLY_GROUPS: "viewers,engineers"
+ RBAC_ADMIN_GROUPS: "admins"
+ RBAC_DEFAULT_ROLE: "readonly"
rag-webui:
ingress:
@@ -175,11 +244,73 @@ kubectl logs -f deployment/rag-ingestors-
kubectl exec deployment/rag-server -- curl http://localhost:9446/healthz
```
-## Notes
+## Migration Guide
+
+If upgrading from a previous version that used individual values.yaml keys, migrate to the new `env:` map format:
+
+### RAG Server Settings
+
+| Old values.yaml Key | New env: Key |
+|---------------------|--------------|
+| `enableMcp` | `ENABLE_MCP` |
+| `skipInitTests` | `SKIP_INIT_TESTS` |
+| `embeddingsProvider` | `EMBEDDINGS_PROVIDER` |
+| `embeddingsModel` | `EMBEDDINGS_MODEL` |
+| `maxDocumentsPerIngest` | `MAX_DOCUMENTS_PER_INGEST` |
+| `maxResultsPerQuery` | `MAX_RESULTS_PER_QUERY` |
+| `maxIngestionConcurrency` | `MAX_INGESTION_CONCURRENCY` |
+| `maxGraphRawQueryResults` | `MAX_GRAPH_RAW_QUERY_RESULTS` |
+| `maxGraphRawQueryTokens` | `MAX_GRAPH_RAW_QUERY_TOKENS` |
+| `searchResultTruncateLength` | `SEARCH_RESULT_TRUNCATE_LENGTH` |
+| `logLevel` | `LOG_LEVEL` |
+| `uiUrl` | `UI_URL` |
+| `sleepOnInitFailureSeconds` | `SLEEP_ON_INIT_FAILURE_SECONDS` |
+| `cleanupInterval` | `CLEANUP_INTERVAL` |
+| `rbac.allowUnauthenticated` | `ALLOW_UNAUTHENTICATED` |
+| `rbac.readonlyGroups` | `RBAC_READONLY_GROUPS` |
+| `rbac.ingestonlyGroups` | `RBAC_INGESTONLY_GROUPS` |
+| `rbac.adminGroups` | `RBAC_ADMIN_GROUPS` |
+| `rbac.defaultRole` | `RBAC_DEFAULT_ROLE` |
+
+### Web Ingestor Settings
+
+| Old values.yaml Key | New env: Key |
+|---------------------|--------------|
+| `webIngestor.logLevel` | `LOG_LEVEL` |
+| `webIngestor.maxConcurrency` | `WEBLOADER_MAX_CONCURRENCY` |
+| `webIngestor.maxIngestionTasks` | `WEBLOADER_MAX_INGESTION_TASKS` |
+| `webIngestor.reloadInterval` | `WEBLOADER_RELOAD_INTERVAL` |
+
+### Example Migration
+
+**Before (old format):**
+```yaml
+rag-server:
+ enableMcp: true
+ embeddingsProvider: azure-openai
+ logLevel: INFO
+ rbac:
+ allowUnauthenticated: true
+ adminGroups: "admins"
+```
+
+**After (new format):**
+```yaml
+rag-server:
+ env:
+ ENABLE_MCP: "true"
+ EMBEDDINGS_PROVIDER: "azure-openai"
+ LOG_LEVEL: "INFO"
+ ALLOW_UNAUTHENTICATED: "true"
+ RBAC_ADMIN_GROUPS: "admins"
+```
-- ⚠️ Change Neo4j password from `dummy_password` in production
-- 📝 All configuration options are documented in `values.yaml`
-- 🔐 Store sensitive credentials in Kubernetes Secrets
-- 📊 Default sync intervals: 24 hours for ingestors, 72 hours for ontology agent
-- 💾 Default resource limits are suitable for development; increase for production
+**Note:** `enableGraphRag` remains unchanged as it has global fallback support via `global.enableGraphRag`.
+
+## Notes
+- Change Neo4j password from `dummy_password` in production
+- All configuration options are documented in `values.yaml`
+- Store sensitive credentials in Kubernetes Secrets
+- Default sync intervals: 24 hours for ingestors, 72 hours for ontology agent
+- Default resource limits are suitable for development; increase for production
diff --git a/charts/rag-stack/charts/rag-server/Chart.yaml b/charts/rag-stack/charts/rag-server/Chart.yaml
index b8b54f5ed..bd34b0756 100644
--- a/charts/rag-stack/charts/rag-server/Chart.yaml
+++ b/charts/rag-stack/charts/rag-server/Chart.yaml
@@ -2,5 +2,5 @@ apiVersion: v2
name: rag-server
description: RAG server
type: application
-version: 0.2.15 # Do NOT bump this. It will be updated automatically using the PR or release workflow
+version: 0.2.15-rc.helm.1 # Do NOT bump this. It will be updated automatically using the PR or release workflow
appVersion: "0.2.15" # Do NOT modify. It will be updated automatically using the release workflow
diff --git a/charts/rag-stack/charts/rag-server/templates/deployment.yaml b/charts/rag-stack/charts/rag-server/templates/deployment.yaml
index 4acc0a761..8077fe8ec 100644
--- a/charts/rag-stack/charts/rag-server/templates/deployment.yaml
+++ b/charts/rag-stack/charts/rag-server/templates/deployment.yaml
@@ -44,7 +44,7 @@ spec:
name: {{ $llmSecretName }}
{{- end }}
env:
- # Core Connection Settings
+ # Computed connection settings (from global values)
- name: REDIS_URL
value: {{ include "rag-server.redisUrl" . | quote }}
- name: NEO4J_ADDR
@@ -57,65 +57,14 @@ spec:
value: {{ include "rag-server.milvusUri" . | quote }}
- name: ONTOLOGY_AGENT_RESTAPI_ADDR
value: {{ include "rag-server.ontologyAgentRestapiAddr" . | quote }}
-
- # Feature Flags
+ # Feature flag with global fallback
- name: ENABLE_GRAPH_RAG
- value: {{ .Values.enableGraphRag | quote }}
- - name: ENABLE_MCP
- value: {{ .Values.enableMcp | quote }}
- - name: SKIP_INIT_TESTS
- value: {{ .Values.skipInitTests | quote }}
-
- # Embeddings Configuration
- - name: EMBEDDINGS_PROVIDER
- value: {{ .Values.embeddingsProvider | quote }}
- - name: EMBEDDINGS_MODEL
- value: {{ .Values.embeddingsModel | quote }}
-
- # Performance & Limits
- - name: MAX_DOCUMENTS_PER_INGEST
- value: {{ .Values.maxDocumentsPerIngest | quote }}
- - name: MAX_RESULTS_PER_QUERY
- value: {{ .Values.maxResultsPerQuery | quote }}
- - name: MAX_INGESTION_CONCURRENCY
- value: {{ .Values.maxIngestionConcurrency | quote }}
- - name: MAX_GRAPH_RAW_QUERY_RESULTS
- value: {{ .Values.maxGraphRawQueryResults | quote }}
- - name: MAX_GRAPH_RAW_QUERY_TOKENS
- value: {{ .Values.maxGraphRawQueryTokens | quote }}
- - name: SEARCH_RESULT_TRUNCATE_LENGTH
- value: {{ .Values.searchResultTruncateLength | quote }}
-
- # Other Settings
- - name: LOG_LEVEL
- value: {{ .Values.logLevel | quote }}
- - name: UI_URL
- value: {{ .Values.uiUrl | quote }}
- - name: SLEEP_ON_INIT_FAILURE_SECONDS
- value: {{ .Values.sleepOnInitFailureSeconds | quote }}
- - name: CLEANUP_INTERVAL
- value: {{ .Values.cleanupInterval | quote }}
-
- # RBAC/Authentication Configuration
- {{- if .Values.rbac }}
- - name: ALLOW_UNAUTHENTICATED
- value: {{ .Values.rbac.allowUnauthenticated | quote }}
- - name: RBAC_READONLY_GROUPS
- value: {{ .Values.rbac.readonlyGroups | quote }}
- - name: RBAC_INGESTONLY_GROUPS
- value: {{ .Values.rbac.ingestonlyGroups | quote }}
- - name: RBAC_ADMIN_GROUPS
- value: {{ .Values.rbac.adminGroups | quote }}
- - name: RBAC_DEFAULT_ROLE
- value: {{ .Values.rbac.defaultRole | quote }}
- {{- end }}
-
- {{- if .Values.env }}
+ value: {{ include "rag-server.enableGraphRag" . | quote }}
+ # All other configuration via env map
{{- range $key, $value := .Values.env }}
- name: {{ $key }}
value: {{ $value | quote }}
{{- end }}
- {{- end }}
{{- with .Values.livenessProbe }}
livenessProbe:
{{- toYaml . | nindent 12 }}
@@ -133,21 +82,23 @@ spec:
- name: web-ingestor
image: "{{ .Values.webIngestor.image.repository }}:{{ .Values.webIngestor.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.webIngestor.image.pullPolicy }}
+ {{- with .Values.webIngestor.envFrom }}
+ envFrom:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
env:
+ # Fixed/computed values
- name: INGESTOR_TYPE
value: "webloader"
- - name: LOG_LEVEL
- value: {{ .Values.webIngestor.logLevel | quote }}
- name: REDIS_URL
value: {{ include "rag-server.redisUrl" . | quote }}
- name: RAG_SERVER_URL
- value: {{ printf "http://localhost:9446" | quote }}
- - name: WEBLOADER_MAX_CONCURRENCY
- value: {{ .Values.webIngestor.maxConcurrency | quote }}
- - name: WEBLOADER_MAX_INGESTION_TASKS
- value: {{ .Values.webIngestor.maxIngestionTasks | quote }}
- - name: WEBLOADER_RELOAD_INTERVAL
- value: {{ .Values.webIngestor.reloadInterval | quote }}
+ value: "http://localhost:9446"
+ # All other configuration via env map
+ {{- range $key, $value := .Values.webIngestor.env }}
+ - name: {{ $key }}
+ value: {{ $value | quote }}
+ {{- end }}
{{- with .Values.webIngestor.resources }}
resources:
{{- toYaml . | nindent 12 }}
diff --git a/charts/rag-stack/charts/rag-server/values.yaml b/charts/rag-stack/charts/rag-server/values.yaml
index 980adafb6..a06856b4d 100644
--- a/charts/rag-stack/charts/rag-server/values.yaml
+++ b/charts/rag-stack/charts/rag-server/values.yaml
@@ -1,41 +1,10 @@
nameOverride: ""
fullnameOverride: "rag-server"
-# Feature Flags
-enableGraphRag: true # ENABLE_GRAPH_RAG - enable/disable graph RAG features
-enableMcp: true # ENABLE_MCP - enable/disable MCP tools for AI agents
-skipInitTests: false # SKIP_INIT_TESTS - skip connection tests on startup
-
-# Embeddings Configuration
-embeddingsProvider: "azure-openai" # EMBEDDINGS_PROVIDER - azure-openai or openai
-embeddingsModel: "text-embedding-3-small" # EMBEDDINGS_MODEL - model name
-
-# Performance & Limits
-maxDocumentsPerIngest: 1000 # MAX_DOCUMENTS_PER_INGEST - max documents per ingestion request
-maxResultsPerQuery: 100 # MAX_RESULTS_PER_QUERY - max results per query
-maxIngestionConcurrency: 30 # MAX_INGESTION_CONCURRENCY - max concurrent tasks during ingestion
-maxGraphRawQueryResults: 100 # MAX_GRAPH_RAW_QUERY_RESULTS - max results for raw graph queries
-maxGraphRawQueryTokens: 80000 # MAX_GRAPH_RAW_QUERY_TOKENS - max tokens in raw query results before truncation
-searchResultTruncateLength: 500 # SEARCH_RESULT_TRUNCATE_LENGTH - truncate search results to N characters (for MCP tools)
-
-# Other Settings
-logLevel: INFO # LOG_LEVEL - logging level (DEBUG, INFO, WARNING, ERROR)
-uiUrl: "http://localhost:9447" # UI_URL - WebUI URL for health check response
-sleepOnInitFailureSeconds: 180 # SLEEP_ON_INIT_FAILURE_SECONDS - sleep duration on init failure before shutdown
-cleanupInterval: 86400 # CLEANUP_INTERVAL - cleanup interval for stale data (24 hours)
-
-# RBAC/Authentication Configuration
-# Configure when using OAuth2 Proxy for authentication
-# Server enforces RBAC based on headers passed from OAuth2 Proxy (X-Forwarded-Groups)
-rbac:
- allowUnauthenticated: true # ALLOW_UNAUTHENTICATED - allow access without authentication
- readonlyGroups: "" # RBAC_READONLY_GROUPS - comma-separated group names with read-only access
- ingestonlyGroups: "" # RBAC_INGESTONLY_GROUPS - comma-separated group names with ingest-only access
- adminGroups: "" # RBAC_ADMIN_GROUPS - comma-separated group names with admin access
- defaultRole: "readonly" # RBAC_DEFAULT_ROLE - default role when user doesn't match any group
+# Feature flag with global fallback (global.rag.enableGraphRag)
+enableGraphRag: true
image:
- # Prefer the prebuilt image; if building in-cluster is needed, adjust CI
repository: "ghcr.io/cnoe-io/caipe-rag-server"
# tag defaults to .Chart.AppVersion when not specified
tag: ""
@@ -47,6 +16,42 @@ replicaCount: 1
# Kubernetes will automatically clean up ReplicaSets beyond this limit
revisionHistoryLimit: 3
+# Environment variables for rag-server container
+# All application configuration goes here
+env: {}
+ # Feature Flags
+ # ENABLE_MCP: "true"
+ # SKIP_INIT_TESTS: "false"
+ #
+ # Embeddings Configuration
+ # EMBEDDINGS_PROVIDER: "azure-openai" # Options: azure-openai, openai, litellm
+ # EMBEDDINGS_MODEL: "text-embedding-3-small"
+ # LITELLM_API_BASE: "http://litellm-proxy:4000" # Required when using litellm provider
+ #
+ # Performance & Limits
+ # MAX_DOCUMENTS_PER_INGEST: "1000"
+ # MAX_RESULTS_PER_QUERY: "100"
+ # MAX_INGESTION_CONCURRENCY: "30"
+ # MAX_GRAPH_RAW_QUERY_RESULTS: "100"
+ # MAX_GRAPH_RAW_QUERY_TOKENS: "80000"
+ # SEARCH_RESULT_TRUNCATE_LENGTH: "500"
+ #
+ # General Settings
+ # LOG_LEVEL: "INFO" # Options: DEBUG, INFO, WARNING, ERROR
+ # UI_URL: "http://localhost:9447"
+ # SLEEP_ON_INIT_FAILURE_SECONDS: "180"
+ # CLEANUP_INTERVAL: "86400"
+ #
+ # RBAC/Authentication (when using OAuth2 Proxy)
+ # ALLOW_UNAUTHENTICATED: "true"
+ # RBAC_READONLY_GROUPS: ""
+ # RBAC_INGESTONLY_GROUPS: ""
+ # RBAC_ADMIN_GROUPS: ""
+ # RBAC_DEFAULT_ROLE: "readonly"
+
+# Additional envFrom references (secrets, configmaps)
+envFrom: []
+
# Web Ingestor configuration (runs as sidecar container)
webIngestor:
enabled: true
@@ -56,11 +61,21 @@ webIngestor:
tag: ""
pullPolicy: IfNotPresent
- # Ingestor settings
- logLevel: INFO
- maxConcurrency: 10 # WEBLOADER_MAX_CONCURRENCY - max concurrent HTTP requests per ingestion
- maxIngestionTasks: 5 # WEBLOADER_MAX_INGESTION_TASKS - max concurrent ingestion tasks
- reloadInterval: 86400 # WEBLOADER_RELOAD_INTERVAL - auto-reload interval in seconds (24 hours)
+ # Environment variables for web-ingestor container
+ env: {}
+ # LOG_LEVEL: "INFO"
+ # WEBLOADER_MAX_CONCURRENCY: "10"
+ # WEBLOADER_MAX_INGESTION_TASKS: "5"
+ # WEBLOADER_RELOAD_INTERVAL: "86400"
+ #
+ # Scrapy settings (optional)
+ # SCRAPY_CONCURRENT_REQUESTS: "16"
+ # SCRAPY_DOWNLOAD_DELAY: "0"
+ # SCRAPY_DEPTH_LIMIT: "0"
+ # SCRAPY_JAVASCRIPT_ENABLED: "false"
+
+ # Additional envFrom references (secrets, configmaps)
+ envFrom: []
# Resource limits for the web ingestor sidecar
resources:
@@ -96,8 +111,6 @@ podLabels: {}
imagePullSecrets: []
-env: {}
-
llmSecrets:
secretName: "llm-secret"
diff --git a/charts/rag-stack/values.yaml b/charts/rag-stack/values.yaml
index d6edc70bf..218b4ee3d 100644
--- a/charts/rag-stack/values.yaml
+++ b/charts/rag-stack/values.yaml
@@ -58,38 +58,38 @@ rag-server:
type: ClusterIP
port: 9446
- # Feature Flags
- enableGraphRag: true # ENABLE_GRAPH_RAG - enable/disable graph RAG features
- enableMcp: true # ENABLE_MCP - enable/disable MCP tools for AI agents
- skipInitTests: false # SKIP_INIT_TESTS - skip connection tests on startup
-
- # Embeddings Configuration
- embeddingsProvider: "azure-openai" # EMBEDDINGS_PROVIDER - azure-openai or openai
- embeddingsModel: "text-embedding-3-small" # EMBEDDINGS_MODEL - model name
-
- # Performance & Limits
- maxDocumentsPerIngest: 1000 # MAX_DOCUMENTS_PER_INGEST
- maxResultsPerQuery: 100 # MAX_RESULTS_PER_QUERY
- maxIngestionConcurrency: 30 # MAX_INGESTION_CONCURRENCY
- maxGraphRawQueryResults: 100 # MAX_GRAPH_RAW_QUERY_RESULTS
- maxGraphRawQueryTokens: 80000 # MAX_GRAPH_RAW_QUERY_TOKENS
- searchResultTruncateLength: 500 # SEARCH_RESULT_TRUNCATE_LENGTH
-
- # Other Settings
- logLevel: DEBUG # LOG_LEVEL
- uiUrl: "http://localhost:9447" # UI_URL - WebUI URL for health check
- sleepOnInitFailureSeconds: 180 # SLEEP_ON_INIT_FAILURE_SECONDS - how long to wait if init fails
- cleanupInterval: 86400 # CLEANUP_INTERVAL (24 hours) - cleans up stale data in databases
-
- # RBAC/Authentication Configuration
- # Configure when using OAuth2 Proxy for authentication
- # Server enforces RBAC based on headers passed from OAuth2 Proxy (X-Forwarded-Groups)
- rbac:
- allowUnauthenticated: true # ALLOW_UNAUTHENTICATED - allow access without authentication
- readonlyGroups: "" # RBAC_READONLY_GROUPS - comma-separated group names with read-only access
- ingestonlyGroups: "" # RBAC_INGESTONLY_GROUPS - comma-separated group names with ingest-only access
- adminGroups: "" # RBAC_ADMIN_GROUPS - comma-separated group names with admin access
- defaultRole: "readonly" # RBAC_DEFAULT_ROLE - default role when user doesn't match any group
+ # Feature flag with global fallback (global.rag.enableGraphRag)
+ enableGraphRag: true
+
+ # All application configuration via environment variables
+ env:
+ # Feature Flags
+ ENABLE_MCP: "true"
+ SKIP_INIT_TESTS: "false"
+ # Embeddings Configuration
+ EMBEDDINGS_PROVIDER: "azure-openai"
+ EMBEDDINGS_MODEL: "text-embedding-3-small"
+ # Performance & Limits
+ MAX_DOCUMENTS_PER_INGEST: "1000"
+ MAX_RESULTS_PER_QUERY: "100"
+ MAX_INGESTION_CONCURRENCY: "30"
+ MAX_GRAPH_RAW_QUERY_RESULTS: "100"
+ MAX_GRAPH_RAW_QUERY_TOKENS: "80000"
+ SEARCH_RESULT_TRUNCATE_LENGTH: "500"
+ # General Settings
+ LOG_LEVEL: "DEBUG"
+ UI_URL: "http://localhost:9447"
+ SLEEP_ON_INIT_FAILURE_SECONDS: "180"
+ CLEANUP_INTERVAL: "86400"
+ # RBAC/Authentication (when using OAuth2 Proxy)
+ ALLOW_UNAUTHENTICATED: "true"
+ RBAC_READONLY_GROUPS: ""
+ RBAC_INGESTONLY_GROUPS: ""
+ RBAC_ADMIN_GROUPS: ""
+ RBAC_DEFAULT_ROLE: "readonly"
+
+ # Additional envFrom references (secrets, configmaps)
+ envFrom: []
# Web Ingestor sidecar configuration
webIngestor:
@@ -100,10 +100,20 @@ rag-server:
tag: ""
pullPolicy: Always
- logLevel: INFO
- maxConcurrency: 10 # WEBLOADER_MAX_CONCURRENCY - max concurrent HTTP requests per ingestion
- maxIngestionTasks: 5 # WEBLOADER_MAX_INGESTION_TASKS - max concurrent ingestion tasks
- reloadInterval: 86400 # WEBLOADER_RELOAD_INTERVAL - auto-reload interval in seconds (24 hours)
+ # Environment variables for web-ingestor container
+ env:
+ LOG_LEVEL: "INFO"
+ WEBLOADER_MAX_CONCURRENCY: "10"
+ WEBLOADER_MAX_INGESTION_TASKS: "5"
+ WEBLOADER_RELOAD_INTERVAL: "86400"
+ # Scrapy settings (optional):
+ # SCRAPY_CONCURRENT_REQUESTS: "16"
+ # SCRAPY_DOWNLOAD_DELAY: "0"
+ # SCRAPY_DEPTH_LIMIT: "0"
+ # SCRAPY_JAVASCRIPT_ENABLED: "false"
+
+ # Additional envFrom references (secrets, configmaps)
+ envFrom: []
resources:
requests:
diff --git a/ui/src/app/api/rag/[...path]/route.ts b/ui/src/app/api/rag/[...path]/route.ts
index b8f9f32a3..8f6be8c89 100644
--- a/ui/src/app/api/rag/[...path]/route.ts
+++ b/ui/src/app/api/rag/[...path]/route.ts
@@ -9,7 +9,12 @@ import { authOptions } from '@/lib/auth-config';
* The RAG server validates the JWT token and extracts user identity/groups/role.
*
* Authentication:
- * - Authorization: Bearer {access_token} (OIDC JWT token)
+ * - Authorization: Bearer {access_token} (OIDC JWT access token)
+ * - X-Identity-Token: {id_token} (OIDC JWT ID token for claims extraction)
+ *
+ * Some OIDC providers only include user claims (email, groups) in the ID token,
+ * not the access token. The X-Identity-Token header allows the RAG server to
+ * extract these claims from the ID token while using the access token for auth.
*
* The RAG server ONLY supports JWT Bearer tokens, not OAuth2Proxy headers.
* If no JWT is available and trusted network is enabled on RAG server,
@@ -29,9 +34,11 @@ function getRagServerUrl(): string {
/**
* Get auth headers from the current session
*
- * Extracts JWT access token from session and sends as Bearer token to RAG server.
+ * Extracts JWT access token and ID token from session and sends to RAG server.
+ * - Access token: Used for authentication (Bearer token)
+ * - ID token: Used for claims extraction (email, groups) via X-Identity-Token header
*
- * @returns Headers object with Authorization Bearer token for RAG server
+ * @returns Headers object with Authorization Bearer token and optional ID token for RAG server
*/
async function getRbacHeaders(): Promise<Record<string, string>> {
const headers: Record<string, string> = {
@@ -42,10 +49,16 @@ async function getRbacHeaders(): Promise> {
const session = await getServerSession(authOptions);
// Pass JWT access token as Bearer token
- // RAG server validates JWT and extracts email, groups, and determines role
+ // RAG server validates JWT and uses it for authentication
if (session?.accessToken) {
headers['Authorization'] = `Bearer ${session.accessToken}`;
}
+
+ // Pass ID token for claims extraction (email, groups)
+ // Some OIDC providers only include user claims in the ID token, not the access token
+ if (session?.idToken) {
+ headers['X-Identity-Token'] = session.idToken;
+ }
} catch (error) {
// If session retrieval fails, continue without auth headers
// RAG server may still allow access from trusted networks or anonymous users
diff --git a/ui/src/app/api/user/info/route.ts b/ui/src/app/api/user/info/route.ts
index b1a0dd9e4..661b100f5 100644
--- a/ui/src/app/api/user/info/route.ts
+++ b/ui/src/app/api/user/info/route.ts
@@ -8,7 +8,11 @@ import { authOptions } from '@/lib/auth-config';
* This endpoint proxies to the RAG server's /v1/user/info endpoint.
* The RAG server determines role and permissions based on JWT Bearer token.
*
- * Authentication: JWT Bearer token only (RAG server does not support OAuth2Proxy headers)
+ * Authentication:
+ * - Authorization: Bearer {access_token} (OIDC JWT access token)
+ * - X-Identity-Token: {id_token} (OIDC JWT ID token for claims extraction)
+ *
+ * The RAG server does not support OAuth2Proxy headers.
*/
function getRagServerUrl(): string {
@@ -31,17 +35,24 @@ async function getRbacHeaders(): Promise<Record<string, string>> {
hasUser: !!session?.user,
userEmail: session?.user?.email,
hasAccessToken: !!session?.accessToken,
+ hasIdToken: !!session?.idToken,
accessTokenPrefix: session?.accessToken ? session.accessToken.substring(0, 20) + '...' : 'MISSING',
expiresAt: session?.expiresAt ? new Date((session.expiresAt as number) * 1000).toISOString() : 'N/A'
});
// Pass JWT access token as Bearer token
- // RAG server validates JWT and extracts email, groups, and determines role
+ // RAG server validates JWT and uses it for authentication
if (session?.accessToken) {
headers['Authorization'] = `Bearer ${session.accessToken}`;
} else {
console.warn('[User Info] ⚠️ No accessToken in session - RAG server will use trusted network or anonymous');
}
+
+ // Pass ID token for claims extraction (email, groups)
+ // Some OIDC providers only include user claims in the ID token, not the access token
+ if (session?.idToken) {
+ headers['X-Identity-Token'] = session.idToken;
+ }
} catch (error) {
console.error('[User Info] Error retrieving session:', error);
}
diff --git a/ui/src/components/rag/IngestView.tsx b/ui/src/components/rag/IngestView.tsx
index 95ab18e81..14f5364a0 100644
--- a/ui/src/components/rag/IngestView.tsx
+++ b/ui/src/components/rag/IngestView.tsx
@@ -185,11 +185,24 @@ export default function IngestView() {
// Ingestion state
const [url, setUrl] = useState('')
const [ingestType, setIngestType] = useState('web')
- const [checkForSiteMap, setCheckForSiteMap] = useState(true)
- const [sitemapMaxUrls, setSitemapMaxUrls] = useState(2000)
const [description, setDescription] = useState('')
const [includeSubPages, setIncludeSubPages] = useState(false)
const [showAdvancedOptions, setShowAdvancedOptions] = useState(false)
+
+ // Scrapy settings state (for web ingest type)
+ const [crawlMode, setCrawlMode] = useState<'single' | 'sitemap' | 'recursive'>('sitemap')
+ const [maxDepth, setMaxDepth] = useState(2)
+ const [maxPages, setMaxPages] = useState(2000)
+ const [renderJavascript, setRenderJavascript] = useState(false)
+ const [waitForSelector, setWaitForSelector] = useState('')
+ const [downloadDelay, setDownloadDelay] = useState(0.05)
+ const [concurrentRequests, setConcurrentRequests] = useState(30)
+ const [respectRobotsTxt, setRespectRobotsTxt] = useState(true)
+ const [followExternalLinks, setFollowExternalLinks] = useState(false)
+ const [allowedUrlPatterns, setAllowedUrlPatterns] = useState('')
+ const [deniedUrlPatterns, setDeniedUrlPatterns] = useState('')
+ const [chunkSize, setChunkSize] = useState(10000)
+ const [chunkOverlap, setChunkOverlap] = useState(2000)
// DataSources state
const [dataSources, setDataSources] = useState([])
@@ -447,11 +460,25 @@ export default function IngestView() {
try {
const response = await ingestUrl({
url,
- check_for_sitemaps: checkForSiteMap,
- sitemap_max_urls: sitemapMaxUrls,
description: description,
ingest_type: ingestType,
get_child_pages: ingestType === 'confluence' ? includeSubPages : undefined,
+ // ScrapySettings for web ingest type
+ settings: ingestType === 'web' ? {
+ crawl_mode: crawlMode,
+ max_depth: maxDepth,
+ max_pages: maxPages,
+ render_javascript: renderJavascript,
+ wait_for_selector: waitForSelector || null,
+ download_delay: downloadDelay,
+ concurrent_requests: concurrentRequests,
+ respect_robots_txt: respectRobotsTxt,
+ follow_external_links: followExternalLinks,
+ allowed_url_patterns: allowedUrlPatterns ? allowedUrlPatterns.split('\n').filter(p => p.trim()) : null,
+ denied_url_patterns: deniedUrlPatterns ? deniedUrlPatterns.split('\n').filter(p => p.trim()) : null,
+ chunk_size: chunkSize,
+ chunk_overlap: chunkOverlap,
+ } : undefined,
})
const { datasource_id, job_id, message } = response
await fetchDataSources()
@@ -648,17 +675,30 @@ export default function IngestView() {
- {/* Quick options */}
+ {/* Quick options - Crawl Mode for web */}
{ingestType === 'web' && (
-
+
+
Crawl mode:
+
+ {[
+ { value: 'single', label: 'Single Page' },
+ { value: 'sitemap', label: 'Sitemap' },
+ { value: 'recursive', label: 'Follow Links' },
+ ].map((mode) => (
+
+ ))}
+
+
)}
{ingestType === 'confluence' && (
)}
+
+ {/* Description - outside advanced options */}
+
+ setDescription(e.target.value)}
+ className="w-full"
+ />
+
{/* Advanced Options - Animated Collapsible */}
@@ -699,35 +749,221 @@ export default function IngestView() {
className="overflow-hidden"
>
-
-
-
- {ingestType === 'web' && checkForSiteMap && (
-
-
-
setSitemapMaxUrls(Number(e.target.value))}
- className="w-48"
- />
-
- Maximum URLs to fetch from sitemap (0 = no limit)
-
-
+ {/* Web-specific Scrapy settings */}
+ {ingestType === 'web' && (
+ <>
+ {/* Crawl Limits */}
+
+
+
+
setMaxPages(Number(e.target.value))}
+ className="w-full"
+ />
+
+ Maximum pages to crawl
+
+
+ {crawlMode === 'recursive' && (
+
+
+
setMaxDepth(Number(e.target.value))}
+ className="w-full"
+ />
+
+ How deep to follow links (1-10)
+
+
+ )}
+
+
+ {/* JavaScript Rendering */}
+
+
+ {renderJavascript && (
+
+
+
setWaitForSelector(e.target.value)}
+ className="w-full"
+ />
+
+ CSS selector to wait for before extracting content
+
+
+ )}
+
+
+ {/* Rate Limiting */}
+
+
+
+
setDownloadDelay(Number(e.target.value))}
+ className="w-full"
+ />
+
+ Delay between requests to avoid rate limiting
+
+
+
+
+
setConcurrentRequests(Number(e.target.value))}
+ className="w-full"
+ />
+
+ Number of parallel requests (1-50)
+
+
+
+
+ {/* Crawl Behavior */}
+
+
+ {crawlMode === 'recursive' && (
+
+ )}
+
+
+ {/* URL Patterns */}
+ {crawlMode === 'recursive' && (
+
+
+
+
+
+
+
+
+ )}
+
+ {/* Separator before Chunk Settings */}
+
+
+ {/* Chunk Settings */}
+
+
+
+
setChunkSize(Number(e.target.value))}
+ className="w-full"
+ />
+
+ Max characters per chunk (default: 10000)
+
+
+
+
+
setChunkOverlap(Number(e.target.value))}
+ className="w-full"
+ />
+
+ Overlap between chunks (default: 2000)
+
+
+
+ >
)}
@@ -853,6 +1089,10 @@ export default function IngestView() {
const isConfluenceDatasource = ds.ingestor_id === CONFLUENCE_INGESTOR_ID
const supportsReload = isWebloaderDatasource || isConfluenceDatasource
const icon = getIconForType(ds.source_type)
+
+ // Find latest completed job for metrics display
+ const completedJob = jobs.find(j => j.status === 'completed' || j.status === 'completed_with_errors')
+ const hasMetrics = completedJob && ((completedJob.document_count ?? 0) > 0 || (completedJob.chunk_count ?? 0) > 0)
return (
+ {/* Metrics from latest completed job */}
+ {hasMetrics && (
+
+ {completedJob.document_count} documents, {completedJob.chunk_count} chunks
+
+ )}
+
{latestJob ? (
) : (
@@ -1008,8 +1255,9 @@ export default function IngestView() {
{jobs.map((job) => {
const isJobExpanded = expandedJobs.has(job.job_id)
const isJobActive = job.status === 'in_progress' || job.status === 'pending'
- const progress = (job.total > 0 && job.progress_counter >= 0)
- ? Math.min(100, (job.progress_counter / job.total) * 100)
+ const jobTotal = job.total ?? 0
+ const progress = (jobTotal > 0 && job.progress_counter >= 0)
+ ? Math.min(100, (job.progress_counter / jobTotal) * 100)
: 0
return (
@@ -1036,10 +1284,10 @@ export default function IngestView() {
- {isJobActive && job.total > 0 && (
+ {isJobActive && jobTotal > 0 && (
)}
@@ -1097,22 +1345,46 @@ export default function IngestView() {
{job.failed_counter}
+
+
Documents:
+
{job.document_count ?? 0}
+
+
+
Chunks:
+
{job.chunk_count ?? 0}
+
-
Message:
-
{job.message}
+
Status:
+
+ {job.message}
+ {isJobActive && (
+
+ .
+ .
+ .
+
+ )}
+
{job.error_msgs && job.error_msgs.length > 0 && (
-
-
- Errors ({job.error_msgs.length})
+
+
+ ✗
+ {job.error_msgs.length} error{job.error_msgs.length !== 1 ? 's' : ''}
-
+
{job.error_msgs.map((error: string, index: number) => (
-
- {error}
+
+ ›
+ {error}
))}
diff --git a/ui/src/components/rag/Models.ts b/ui/src/components/rag/Models.ts
index bcdf51cef..f6aceb791 100644
--- a/ui/src/components/rag/Models.ts
+++ b/ui/src/components/rag/Models.ts
@@ -20,6 +20,8 @@ export type IngestionJob = {
created_at: string
completed_at?: string
error_msgs?: string[]
+ document_count?: number
+ chunk_count?: number
}
export type IngestorInfo = {
diff --git a/ui/src/components/rag/api/index.ts b/ui/src/components/rag/api/index.ts
index 095ced643..d25047693 100644
--- a/ui/src/components/rag/api/index.ts
+++ b/ui/src/components/rag/api/index.ts
@@ -89,13 +89,31 @@ export const deleteDataSource = async (datasourceId: string): Promise
=> {
return apiDelete('/v1/datasource', { datasource_id: datasourceId });
};
+// ScrapySettings interface for web scraping configuration
+export interface ScrapySettings {
+ crawl_mode: 'single' | 'sitemap' | 'recursive';
+ max_depth?: number;
+ max_pages?: number;
+ render_javascript?: boolean;
+ wait_for_selector?: string | null;
+ page_load_timeout?: number;
+ follow_external_links?: boolean;
+ allowed_url_patterns?: string[] | null;
+ denied_url_patterns?: string[] | null;
+ download_delay?: number;
+ concurrent_requests?: number;
+ respect_robots_txt?: boolean;
+ chunk_size?: number;
+ chunk_overlap?: number;
+ user_agent?: string | null;
+}
+
export const ingestUrl = async (params: {
url: string;
- check_for_sitemaps?: boolean;
- sitemap_max_urls?: number;
description?: string;
ingest_type?: string;
get_child_pages?: boolean;
+ settings?: ScrapySettings;
}): Promise<{ datasource_id: string | null; job_id: string | null; message: string }> => {
// Route to appropriate endpoint based on ingest_type
if (params.ingest_type === 'confluence') {
@@ -105,7 +123,12 @@ export const ingestUrl = async (params: {
get_child_pages: params.get_child_pages || false
});
} else {
- return apiPost('/v1/ingest/webloader/url', params);
+ // Web ingestion with ScrapySettings
+ return apiPost('/v1/ingest/webloader/url', {
+ url: params.url,
+ description: params.description || '',
+ settings: params.settings || { crawl_mode: 'single' }
+ });
}
};