Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
34c9bfa
feat(ingestors): add Scrapy-based web loader infrastructure
subbaksh Feb 10, 2026
bdf40a4
feat(common): add ScrapySettings model and job metrics tracking
subbaksh Feb 10, 2026
38ba6eb
feat(ingestors): integrate Scrapy loader into web ingestor
subbaksh Feb 10, 2026
f0bc38e
feat(server): add chunk count tracking and cleanup utility
subbaksh Feb 10, 2026
a092c87
feat(ui): display document and chunk metrics in ingest view
subbaksh Feb 10, 2026
6ebd4d6
feat(ingestors): add backwards compatibility for deprecated settings …
subbaksh Feb 10, 2026
abfeb3e
fix(ingestors): correct metadata structure for source URL in documents
subbaksh Feb 10, 2026
86bcb5a
feat(build): add Playwright/Chromium support to ingestors image
subbaksh Feb 10, 2026
9ec3bca
feat(ingestors): add status messages for JavaScript rendering mode
subbaksh Feb 10, 2026
b080b6d
feat(auth): add X-Identity-Token header support for ID token claims e…
subbaksh Feb 10, 2026
94277af
feat(embeddings): add LiteLLM proxy support for embeddings
subbaksh Feb 10, 2026
d4b783f
chore(deps): update openai package in lock files
subbaksh Feb 10, 2026
2a3290e
fix(lint): remove unused imports and delete scripts folder
subbaksh Feb 10, 2026
866590b
feat(helm): simplify rag-server config to use generic env map
subbaksh Feb 11, 2026
2d06959
chore: bump chart versions for rag-stack rag-server ai-platform-engin…
github-actions[bot] Feb 11, 2026
18d73c2
fix(lint): remove unnecessary f-string prefix
subbaksh Feb 11, 2026
6583718
feat(rag-server): reduce Docker image size by making HuggingFace opti…
subbaksh Feb 12, 2026
9440ec2
Merge branch 'main' into rag-server-scrapy
subbaksh Feb 12, 2026
b6ebcc7
chore: bump chart versions for ai-platform-engineering
github-actions[bot] Feb 12, 2026
8b2df86
feat(rag): add slim ingestors variant and consolidate CI matrix
subbaksh Feb 12, 2026
87fc240
fix: uv lock
subbaksh Feb 12, 2026
a97c1ed
fix(web-ingestor): report HTTP errors for sitemap, robots.txt, and ba…
subbaksh Feb 12, 2026
19a13ed
fix(web-ingestor): use consistent HTTP error formatting for sitemap a…
subbaksh Feb 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .beads/daemon-error
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

LEGACY DATABASE DETECTED!

This database was created before version 0.17.5 and lacks a repository fingerprint.
To continue using this database, you must explicitly set its repository ID:

bd migrate --update-repo-id

This ensures the database is bound to this repository and prevents accidental
database sharing between different repositories.

If this is a fresh clone, run:
rm -rf .beads && bd init

Note: Auto-claiming legacy databases is intentionally disabled to prevent
silent corruption when databases are copied between repositories.
47 changes: 40 additions & 7 deletions .github/workflows/ci-a2a-rag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,19 @@ jobs:
strategy:
matrix:
component: ${{ fromJson(needs.load-config.outputs.rag_components) }}
variant: [default]
include:
# Add slim variant for ingestors (no Playwright, ~1.5GB smaller)
- component: ingestors
variant: slim
# Add HuggingFace variant for server (with PyTorch, ~900MB larger)
- component: server
variant: huggingface
fail-fast: false

env:
REGISTRY: ghcr.io
# Non-default variants (slim, huggingface) share this image name; the variant is appended to each tag as a suffix (e.g. -slim), not to the image name
IMAGE_NAME: ${{ github.repository_owner }}/caipe-rag-${{ matrix.component }}
DOCKERFILE: ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.${{ matrix.component }}

Expand Down Expand Up @@ -194,6 +203,9 @@ jobs:
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
# For non-default variants, append variant name as suffix to all tags
flavor: |
suffix=${{ matrix.variant != 'default' && format('-{0}', matrix.variant) || '' }}
tags: |
type=raw,value=latest,enable=${{ github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' }}
type=raw,value=${{ needs.determine-changes.outputs.tag_version }},enable=${{ needs.determine-changes.outputs.tag_version != '' }}
Expand Down Expand Up @@ -223,6 +235,7 @@ jobs:
cache-from: type=gha
cache-to: ${{ matrix.component != 'server' && 'type=gha,mode=min' || '' }}
build-args: |
VARIANT=${{ matrix.variant }}
BUILDKIT_INLINE_CACHE=1
provenance: false
sbom: false
Expand All @@ -240,6 +253,14 @@ jobs:
strategy:
matrix:
component: ${{ fromJson(needs.load-config.outputs.rag_components) }}
variant: [default]
include:
# Add slim variant for ingestors
- component: ingestors
variant: slim
# Add HuggingFace variant for server
- component: server
variant: huggingface
fail-fast: false

env:
Expand Down Expand Up @@ -269,9 +290,18 @@ jobs:
id: retag-or-build
env:
TAG_VERSION: ${{ needs.determine-changes.outputs.tag_version }}
VARIANT: ${{ matrix.variant }}
run: |
FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}"
echo "🏷️ Processing rag-${{ matrix.component }}..."

# Determine variant suffix for tags
if [[ "$VARIANT" != "default" ]]; then
VARIANT_SUFFIX="-${VARIANT}"
else
VARIANT_SUFFIX=""
fi

echo "🏷️ Processing rag-${{ matrix.component }}${VARIANT_SUFFIX}..."

# Determine source tag (previous version)
if [[ "$TAG_VERSION" =~ ^(.+)-rc\.([0-9]+)$ ]]; then
Expand All @@ -280,21 +310,23 @@ jobs:

if [[ "$RC_NUM" -gt 1 ]]; then
PREV_RC=$((RC_NUM - 1))
SOURCE_TAG="${BASE_VERSION}-rc.${PREV_RC}"
SOURCE_TAG="${BASE_VERSION}-rc.${PREV_RC}${VARIANT_SUFFIX}"
else
SOURCE_TAG="${BASE_VERSION}"
SOURCE_TAG="${BASE_VERSION}${VARIANT_SUFFIX}"
fi
else
SOURCE_TAG="latest"
SOURCE_TAG="latest${VARIANT_SUFFIX}"
fi

TARGET_TAG="${TAG_VERSION}${VARIANT_SUFFIX}"

echo " Source: ${SOURCE_TAG}"
echo " Target: ${TAG_VERSION}"
echo " Target: ${TARGET_TAG}"

# Check if source image exists
if crane manifest "${FULL_IMAGE}:${SOURCE_TAG}" >/dev/null 2>&1; then
echo " ✅ Source image exists, retagging..."
if crane tag "${FULL_IMAGE}:${SOURCE_TAG}" "${TAG_VERSION}"; then
if crane tag "${FULL_IMAGE}:${SOURCE_TAG}" "${TARGET_TAG}"; then
echo " ✅ Successfully retagged from ${SOURCE_TAG}"
echo "needs_build=false" >> $GITHUB_OUTPUT
else
Expand Down Expand Up @@ -373,11 +405,12 @@ jobs:
file: ${{ env.DOCKERFILE }}
push: true
tags: |
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ needs.determine-changes.outputs.tag_version }}
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ needs.determine-changes.outputs.tag_version }}${{ matrix.variant != 'default' && format('-{0}', matrix.variant) || '' }}
platforms: linux/amd64,linux/arm64
cache-from: type=gha
cache-to: ${{ matrix.component != 'server' && 'type=gha,mode=min' || '' }}
build-args: |
VARIANT=${{ matrix.variant }}
BUILDKIT_INLINE_CACHE=1
provenance: false
sbom: false
Expand Down
59 changes: 59 additions & 0 deletions PR_DESCRIPTION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
## Helm Chart Simplification

Replaced individual `values.yaml` keys with a generic `env:` map pattern. This reduces template complexity and makes adding new environment variables easier without chart changes.

### What Changed

**Kept as computed values** (from global config):
- `REDIS_URL`, `NEO4J_*`, `MILVUS_URI`, `ONTOLOGY_AGENT_RESTAPI_ADDR`
- `enableGraphRag` (has global fallback)

**Everything else** is now set through the `env:` map, with every value given as a string.

### Migration Table

#### RAG Server

| Removed Key | Use Instead |
|-------------|-------------|
| `enableMcp` | `env.ENABLE_MCP` |
| `skipInitTests` | `env.SKIP_INIT_TESTS` |
| `embeddingsProvider` | `env.EMBEDDINGS_PROVIDER` |
| `embeddingsModel` | `env.EMBEDDINGS_MODEL` |
| `maxDocumentsPerIngest` | `env.MAX_DOCUMENTS_PER_INGEST` |
| `maxResultsPerQuery` | `env.MAX_RESULTS_PER_QUERY` |
| `maxIngestionConcurrency` | `env.MAX_INGESTION_CONCURRENCY` |
| `logLevel` | `env.LOG_LEVEL` |
| `rbac.allowUnauthenticated` | `env.ALLOW_UNAUTHENTICATED` |
| `rbac.adminGroups` | `env.RBAC_ADMIN_GROUPS` |
| `rbac.readonlyGroups` | `env.RBAC_READONLY_GROUPS` |
| `rbac.defaultRole` | `env.RBAC_DEFAULT_ROLE` |

#### Web Ingestor

| Removed Key | Use Instead |
|-------------|-------------|
| `webIngestor.logLevel` | `webIngestor.env.LOG_LEVEL` |
| `webIngestor.maxConcurrency` | `webIngestor.env.WEBLOADER_MAX_CONCURRENCY` |
| `webIngestor.maxIngestionTasks` | `webIngestor.env.WEBLOADER_MAX_INGESTION_TASKS` |
| `webIngestor.reloadInterval` | `webIngestor.env.WEBLOADER_RELOAD_INTERVAL` |

### Example

**Before:**
```yaml
rag-server:
  enableMcp: true
  logLevel: INFO
  rbac:
    adminGroups: "admins"
```

**After:**
```yaml
rag-server:
  env:
    ENABLE_MCP: "true"
    LOG_LEVEL: "INFO"
    RBAC_ADMIN_GROUPS: "admins"
```

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,40 @@
# for an example.
ENV UV_PYTHON_DOWNLOADS=0

# Copy over the local dependencies
# Build variant: "default" (with Playwright/Chromium for JS rendering) or "slim" (no Playwright, smaller image)
ARG VARIANT=default

# Copy over the local dependencies (COPY itself does not filter; any .venv that comes along is deleted right after)
COPY common /app/common
# Remove any .venv from common that shouldn't be in the image
RUN rm -rf /app/common/.venv

WORKDIR /app/ingestors
# Install dependencies based on variant
# Default includes Playwright for JS rendering; slim variant excludes it for smaller image
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=ingestors/uv.lock,target=uv.lock \
--mount=type=bind,source=ingestors/pyproject.toml,target=pyproject.toml \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev
if [ "$VARIANT" = "slim" ]; then \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev; \
else \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev --extra playwright; \
fi

COPY ingestors .
RUN --mount=type=cache,target=/root/.cache/uv \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev
if [ "$VARIANT" = "slim" ]; then \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev; \
else \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev --extra playwright; \
fi

# Clean up .venv to reduce image size (remove ~100MB of unnecessary files)
RUN find /app/ingestors/.venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
find /app/ingestors/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
find /app/ingestors/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
find /app/ingestors/.venv -type f -name "*.pyi" -delete 2>/dev/null || true && \
rm -rf /app/ingestors/.venv/include 2>/dev/null || true


# Then, use a final image without uv
Expand All @@ -29,13 +51,34 @@
# Python executable must be the same, e.g., using `python:3.13-slim-bookworm`
# will fail.

# Install AWS CLI v2 for EKS authentication (This is for k8s ingestor)
# Re-declare ARG after FROM to make it available in this stage
ARG VARIANT=default

# Install system dependencies:
# - AWS CLI v2 for EKS authentication (k8s ingestor)
# - Playwright/Chromium dependencies (default variant only, not slim)
RUN apt-get update && \
apt-get install -y --no-install-recommends curl unzip && \
curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip" && \
apt-get install -y --no-install-recommends \
# For AWS CLI installation
curl unzip && \
# Conditionally install Playwright dependencies (default variant)
if [ "$VARIANT" != "slim" ]; then \
apt-get install -y --no-install-recommends \
libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
libgbm1 libasound2 libpango-1.0-0 libcairo2 libatspi2.0-0; \
fi && \
# Install AWS CLI v2 (detect architecture)
ARCH=$(uname -m) && \
if [ "$ARCH" = "aarch64" ]; then \
curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
else \
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
fi && \
unzip awscliv2.zip && \
./aws/install && \
rm -rf awscliv2.zip aws && \
# Cleanup
apt-get remove -y curl unzip && \
apt-get autoremove -y && \
apt-get clean && \
Expand All @@ -53,8 +96,17 @@
# Place executables in the environment at the front of the path
ENV PATH="/app/ingestors/.venv/bin:$PATH"

# Install Playwright browsers (Chromium only) - default variant only
# This must run before switching to the non-root user: it creates and populates /opt/playwright (PLAYWRIGHT_BROWSERS_PATH)
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright
RUN if [ "$VARIANT" != "slim" ]; then \
mkdir -p /opt/playwright && \
playwright install chromium && \
chmod -R 755 /opt/playwright; \
fi

# Use a non-root user to run the application
USER app

# Run the application by default - use shell form to enable variable expansion
CMD python3 src/ingestors/${INGESTOR_TYPE}/ingestor.py
CMD python3 src/ingestors/${INGESTOR_TYPE}/ingestor.py

Check warning on line 112 in ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors

View workflow job for this annotation

GitHub Actions / build-and-push (ingestors, slim)

JSON arguments recommended for ENTRYPOINT/CMD to prevent unintended behavior related to OS signals

JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals More info: https://docs.docker.com/go/dockerfile/rule/json-args-recommended/

Check warning on line 112 in ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors

View workflow job for this annotation

GitHub Actions / build-and-push (ingestors, default)

JSON arguments recommended for ENTRYPOINT/CMD to prevent unintended behavior related to OS signals

JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals More info: https://docs.docker.com/go/dockerfile/rule/json-args-recommended/
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,44 @@ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
# for an example.
ENV UV_PYTHON_DOWNLOADS=0

# Build argument for variant: "default" or "huggingface"
# - default: Slim image (~1.4GB) with API-based embedding providers
# - huggingface: Full image (~2.3GB) with PyTorch for local HuggingFace models
ARG VARIANT=default

# Copy over the local dependencies
COPY common /app/common

WORKDIR /app/server
# Increase timeout for large packages (e.g., pyarrow 39MB)
# Install dependencies - conditionally include huggingface extra
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=server/uv.lock,target=uv.lock \
--mount=type=bind,source=server/pyproject.toml,target=pyproject.toml \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev
if [ "$VARIANT" = "huggingface" ]; then \
echo "Installing with huggingface extra (includes PyTorch)..." && \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev --extra huggingface; \
else \
echo "Installing default (slim) variant..." && \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev; \
fi

COPY server .
RUN --mount=type=cache,target=/root/.cache/uv \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev
if [ "$VARIANT" = "huggingface" ]; then \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev --extra huggingface; \
else \
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev; \
fi

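# Cleanup step - remove unnecessary files from .venv to reduce image size
# Saves ~325MB by removing test files, caches, type stubs, and C headers
# NOTE(review): UV_COMPILE_BYTECODE=1 is set earlier in this Dockerfile, but the __pycache__/*.pyc removal below discards that compiled bytecode - confirm this trade-off is intended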
RUN find /app/server/.venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
find /app/server/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
find /app/server/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
find /app/server/.venv -name "*.pyc" -delete 2>/dev/null || true && \
find /app/server/.venv -name "*.pyi" -delete 2>/dev/null || true && \
find /app/server/.venv -type d -name "include" -path "*/site-packages/*" -exec rm -rf {} + 2>/dev/null || true


# Then, use a final image without uv
Expand Down
Loading
Loading