Skip to content

Commit 09eab8a

Browse files
authored
Merge pull request #790 from cnoe-io/rag-server-scrapy
feat(rag): scrapy web loader + litellm embeddings
2 parents 90f5a09 + 19a13ed commit 09eab8a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+9587
-4636
lines changed

.beads/daemon-error

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
LEGACY DATABASE DETECTED!
3+
4+
This database was created before version 0.17.5 and lacks a repository fingerprint.
5+
To continue using this database, you must explicitly set its repository ID:
6+
7+
bd migrate --update-repo-id
8+
9+
This ensures the database is bound to this repository and prevents accidental
10+
database sharing between different repositories.
11+
12+
If this is a fresh clone, run:
13+
rm -rf .beads && bd init
14+
15+
Note: Auto-claiming legacy databases is intentionally disabled to prevent
16+
silent corruption when databases are copied between repositories.

.github/workflows/ci-a2a-rag.yml

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,19 @@ jobs:
135135
strategy:
136136
matrix:
137137
component: ${{ fromJson(needs.load-config.outputs.rag_components) }}
138+
variant: [default]
139+
include:
140+
# Add slim variant for ingestors (no Playwright, ~1.5GB smaller)
141+
- component: ingestors
142+
variant: slim
143+
# Add HuggingFace variant for server (with PyTorch, ~900MB larger)
144+
- component: server
145+
variant: huggingface
138146
fail-fast: false
139147

140148
env:
141149
REGISTRY: ghcr.io
150+
# For non-default variants (slim, huggingface), the variant is appended as a suffix to the image tags, not to the image name itself
142151
IMAGE_NAME: ${{ github.repository_owner }}/caipe-rag-${{ matrix.component }}
143152
DOCKERFILE: ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.${{ matrix.component }}
144153

@@ -194,6 +203,9 @@ jobs:
194203
uses: docker/metadata-action@v5
195204
with:
196205
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
206+
# For non-default variants, append variant name as suffix to all tags
207+
flavor: |
208+
suffix=${{ matrix.variant != 'default' && format('-{0}', matrix.variant) || '' }}
197209
tags: |
198210
type=raw,value=latest,enable=${{ github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' }}
199211
type=raw,value=${{ needs.determine-changes.outputs.tag_version }},enable=${{ needs.determine-changes.outputs.tag_version != '' }}
@@ -223,6 +235,7 @@ jobs:
223235
cache-from: type=gha
224236
cache-to: ${{ matrix.component != 'server' && 'type=gha,mode=min' || '' }}
225237
build-args: |
238+
VARIANT=${{ matrix.variant }}
226239
BUILDKIT_INLINE_CACHE=1
227240
provenance: false
228241
sbom: false
@@ -240,6 +253,14 @@ jobs:
240253
strategy:
241254
matrix:
242255
component: ${{ fromJson(needs.load-config.outputs.rag_components) }}
256+
variant: [default]
257+
include:
258+
# Add slim variant for ingestors
259+
- component: ingestors
260+
variant: slim
261+
# Add HuggingFace variant for server
262+
- component: server
263+
variant: huggingface
243264
fail-fast: false
244265

245266
env:
@@ -269,9 +290,18 @@ jobs:
269290
id: retag-or-build
270291
env:
271292
TAG_VERSION: ${{ needs.determine-changes.outputs.tag_version }}
293+
VARIANT: ${{ matrix.variant }}
272294
run: |
273295
FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}"
274-
echo "🏷️ Processing rag-${{ matrix.component }}..."
296+
297+
# Determine variant suffix for tags
298+
if [[ "$VARIANT" != "default" ]]; then
299+
VARIANT_SUFFIX="-${VARIANT}"
300+
else
301+
VARIANT_SUFFIX=""
302+
fi
303+
304+
echo "🏷️ Processing rag-${{ matrix.component }}${VARIANT_SUFFIX}..."
275305
276306
# Determine source tag (previous version)
277307
if [[ "$TAG_VERSION" =~ ^(.+)-rc\.([0-9]+)$ ]]; then
@@ -280,21 +310,23 @@ jobs:
280310
281311
if [[ "$RC_NUM" -gt 1 ]]; then
282312
PREV_RC=$((RC_NUM - 1))
283-
SOURCE_TAG="${BASE_VERSION}-rc.${PREV_RC}"
313+
SOURCE_TAG="${BASE_VERSION}-rc.${PREV_RC}${VARIANT_SUFFIX}"
284314
else
285-
SOURCE_TAG="${BASE_VERSION}"
315+
SOURCE_TAG="${BASE_VERSION}${VARIANT_SUFFIX}"
286316
fi
287317
else
288-
SOURCE_TAG="latest"
318+
SOURCE_TAG="latest${VARIANT_SUFFIX}"
289319
fi
320+
321+
TARGET_TAG="${TAG_VERSION}${VARIANT_SUFFIX}"
290322
291323
echo " Source: ${SOURCE_TAG}"
292-
echo " Target: ${TAG_VERSION}"
324+
echo " Target: ${TARGET_TAG}"
293325
294326
# Check if source image exists
295327
if crane manifest "${FULL_IMAGE}:${SOURCE_TAG}" >/dev/null 2>&1; then
296328
echo " ✅ Source image exists, retagging..."
297-
if crane tag "${FULL_IMAGE}:${SOURCE_TAG}" "${TAG_VERSION}"; then
329+
if crane tag "${FULL_IMAGE}:${SOURCE_TAG}" "${TARGET_TAG}"; then
298330
echo " ✅ Successfully retagged from ${SOURCE_TAG}"
299331
echo "needs_build=false" >> $GITHUB_OUTPUT
300332
else
@@ -373,11 +405,12 @@ jobs:
373405
file: ${{ env.DOCKERFILE }}
374406
push: true
375407
tags: |
376-
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ needs.determine-changes.outputs.tag_version }}
408+
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ needs.determine-changes.outputs.tag_version }}${{ matrix.variant != 'default' && format('-{0}', matrix.variant) || '' }}
377409
platforms: linux/amd64,linux/arm64
378410
cache-from: type=gha
379411
cache-to: ${{ matrix.component != 'server' && 'type=gha,mode=min' || '' }}
380412
build-args: |
413+
VARIANT=${{ matrix.variant }}
381414
BUILDKIT_INLINE_CACHE=1
382415
provenance: false
383416
sbom: false

PR_DESCRIPTION.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
## Helm Chart Simplification
2+
3+
Replaced individual `values.yaml` keys with a generic `env:` map pattern. This reduces template complexity and makes adding new environment variables easier without chart changes.
4+
5+
### What Changed
6+
7+
**Kept as computed values** (from global config):
8+
- `REDIS_URL`, `NEO4J_*`, `MILVUS_URI`, `ONTOLOGY_AGENT_RESTAPI_ADDR`
9+
- `enableGraphRag` (has global fallback)
10+
11+
**Everything else** now uses the `env:` map with string values.
12+
13+
### Migration Table
14+
15+
#### RAG Server
16+
17+
| Removed Key | Use Instead |
18+
|-------------|-------------|
19+
| `enableMcp` | `env.ENABLE_MCP` |
20+
| `skipInitTests` | `env.SKIP_INIT_TESTS` |
21+
| `embeddingsProvider` | `env.EMBEDDINGS_PROVIDER` |
22+
| `embeddingsModel` | `env.EMBEDDINGS_MODEL` |
23+
| `maxDocumentsPerIngest` | `env.MAX_DOCUMENTS_PER_INGEST` |
24+
| `maxResultsPerQuery` | `env.MAX_RESULTS_PER_QUERY` |
25+
| `maxIngestionConcurrency` | `env.MAX_INGESTION_CONCURRENCY` |
26+
| `logLevel` | `env.LOG_LEVEL` |
27+
| `rbac.allowUnauthenticated` | `env.ALLOW_UNAUTHENTICATED` |
28+
| `rbac.adminGroups` | `env.RBAC_ADMIN_GROUPS` |
29+
| `rbac.readonlyGroups` | `env.RBAC_READONLY_GROUPS` |
30+
| `rbac.defaultRole` | `env.RBAC_DEFAULT_ROLE` |
31+
32+
#### Web Ingestor
33+
34+
| Removed Key | Use Instead |
35+
|-------------|-------------|
36+
| `webIngestor.logLevel` | `webIngestor.env.LOG_LEVEL` |
37+
| `webIngestor.maxConcurrency` | `webIngestor.env.WEBLOADER_MAX_CONCURRENCY` |
38+
| `webIngestor.maxIngestionTasks` | `webIngestor.env.WEBLOADER_MAX_INGESTION_TASKS` |
39+
| `webIngestor.reloadInterval` | `webIngestor.env.WEBLOADER_RELOAD_INTERVAL` |
40+
41+
### Example
42+
43+
**Before:**
44+
```yaml
45+
rag-server:
46+
enableMcp: true
47+
logLevel: INFO
48+
rbac:
49+
adminGroups: "admins"
50+
```
51+
52+
**After:**
53+
```yaml
54+
rag-server:
55+
env:
56+
ENABLE_MCP: "true"
57+
LOG_LEVEL: "INFO"
58+
RBAC_ADMIN_GROUPS: "admins"
59+
```

ai_platform_engineering/knowledge_bases/rag/agent_ontology/uv.lock

Lines changed: 2 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.ingestors

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,40 @@ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
99
# for an example.
1010
ENV UV_PYTHON_DOWNLOADS=0
1111

12-
# Copy over the local dependencies
12+
# Build variant: "default" (with Playwright/Chromium for JS rendering) or "slim" (no Playwright, smaller image)
13+
ARG VARIANT=default
14+
15+
# Copy over the local dependencies (COPY itself does not filter; any .venv that comes along is removed right after)
1316
COPY common /app/common
17+
# Remove any .venv from common that shouldn't be in the image
18+
RUN rm -rf /app/common/.venv
1419

1520
WORKDIR /app/ingestors
21+
# Install dependencies based on variant
22+
# Default includes Playwright for JS rendering; slim variant excludes it for smaller image
1623
RUN --mount=type=cache,target=/root/.cache/uv \
1724
--mount=type=bind,source=ingestors/uv.lock,target=uv.lock \
1825
--mount=type=bind,source=ingestors/pyproject.toml,target=pyproject.toml \
19-
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev
26+
if [ "$VARIANT" = "slim" ]; then \
27+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev; \
28+
else \
29+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev --extra playwright; \
30+
fi
2031

2132
COPY ingestors .
2233
RUN --mount=type=cache,target=/root/.cache/uv \
23-
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev
34+
if [ "$VARIANT" = "slim" ]; then \
35+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev; \
36+
else \
37+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev --extra playwright; \
38+
fi
39+
40+
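# Clean up .venv to reduce image size (remove ~100MB of unnecessary files); NOTE(review): this also deletes the __pycache__ bytecode produced by UV_COMPILE_BYTECODE=1 - confirm the startup-time trade-off is intended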
41+
RUN find /app/ingestors/.venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
42+
find /app/ingestors/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
43+
find /app/ingestors/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
44+
find /app/ingestors/.venv -type f -name "*.pyi" -delete 2>/dev/null || true && \
45+
rm -rf /app/ingestors/.venv/include 2>/dev/null || true
2446

2547

2648
# Then, use a final image without uv
@@ -29,13 +51,34 @@ FROM python:3.13-slim-bookworm
2951
# Python executable must be the same, e.g., using a base image with a different Python version
3052
# will fail.
3153

32-
# Install AWS CLI v2 for EKS authentication (This is for k8s ingestor)
54+
# Re-declare ARG after FROM to make it available in this stage
55+
ARG VARIANT=default
56+
57+
# Install system dependencies:
58+
# - AWS CLI v2 for EKS authentication (k8s ingestor)
59+
# - Playwright/Chromium dependencies (default variant only, not slim)
3360
RUN apt-get update && \
34-
apt-get install -y --no-install-recommends curl unzip && \
35-
curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip" && \
61+
apt-get install -y --no-install-recommends \
62+
# For AWS CLI installation
63+
curl unzip && \
64+
# Conditionally install Playwright dependencies (default variant)
65+
if [ "$VARIANT" != "slim" ]; then \
66+
apt-get install -y --no-install-recommends \
67+
libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
68+
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
69+
libgbm1 libasound2 libpango-1.0-0 libcairo2 libatspi2.0-0; \
70+
fi && \
71+
# Install AWS CLI v2 (detect architecture)
72+
ARCH=$(uname -m) && \
73+
if [ "$ARCH" = "aarch64" ]; then \
74+
curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
75+
else \
76+
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
77+
fi && \
3678
unzip awscliv2.zip && \
3779
./aws/install && \
3880
rm -rf awscliv2.zip aws && \
81+
# Cleanup
3982
apt-get remove -y curl unzip && \
4083
apt-get autoremove -y && \
4184
apt-get clean && \
@@ -53,8 +96,17 @@ WORKDIR /app/ingestors
5396
# Place executables in the environment at the front of the path
5497
ENV PATH="/app/ingestors/.venv/bin:$PATH"
5598

99+
# Install Playwright browsers (Chromium only) - default variant only
100+
# This needs to run before switching to the non-root user since it downloads the browsers into /opt/playwright (PLAYWRIGHT_BROWSERS_PATH)
101+
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright
102+
RUN if [ "$VARIANT" != "slim" ]; then \
103+
mkdir -p /opt/playwright && \
104+
playwright install chromium && \
105+
chmod -R 755 /opt/playwright; \
106+
fi
107+
56108
# Use a non-root user to run the application
57109
USER app
58110

59111
# Run the application by default - use shell form to enable variable expansion
60-
CMD python3 src/ingestors/${INGESTOR_TYPE}/ingestor.py
112+
CMD python3 src/ingestors/${INGESTOR_TYPE}/ingestor.py

ai_platform_engineering/knowledge_bases/rag/build/Dockerfile.server

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,44 @@ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
99
# for an example.
1010
ENV UV_PYTHON_DOWNLOADS=0
1111

12+
# Build argument for variant: "default" or "huggingface"
13+
# - default: Slim image (~1.4GB) with API-based embedding providers
14+
# - huggingface: Full image (~2.3GB) with PyTorch for local HuggingFace models
15+
ARG VARIANT=default
16+
1217
# Copy over the local dependencies
1318
COPY common /app/common
1419

1520
WORKDIR /app/server
1621
# Increase timeout for large packages (e.g., pyarrow 39MB)
22+
# Install dependencies - conditionally include huggingface extra
1723
RUN --mount=type=cache,target=/root/.cache/uv \
1824
--mount=type=bind,source=server/uv.lock,target=uv.lock \
1925
--mount=type=bind,source=server/pyproject.toml,target=pyproject.toml \
20-
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev
26+
if [ "$VARIANT" = "huggingface" ]; then \
27+
echo "Installing with huggingface extra (includes PyTorch)..." && \
28+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev --extra huggingface; \
29+
else \
30+
echo "Installing default (slim) variant..." && \
31+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-install-project --no-dev; \
32+
fi
2133

2234
COPY server .
2335
RUN --mount=type=cache,target=/root/.cache/uv \
24-
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev
36+
if [ "$VARIANT" = "huggingface" ]; then \
37+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev --extra huggingface; \
38+
else \
39+
UV_HTTP_TIMEOUT=300 uv sync --locked --no-dev; \
40+
fi
41+
42+
# Cleanup step - remove unnecessary files from .venv to reduce image size
43+
# Saves ~325MB by removing test files, bytecode caches (including those compiled via UV_COMPILE_BYTECODE=1), type stubs, and C headers
44+
RUN find /app/server/.venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
45+
find /app/server/.venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
46+
find /app/server/.venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
47+
find /app/server/.venv -name "*.pyc" -delete 2>/dev/null || true && \
48+
find /app/server/.venv -name "*.pyi" -delete 2>/dev/null || true && \
49+
find /app/server/.venv -type d -name "include" -path "*/site-packages/*" -exec rm -rf {} + 2>/dev/null || true
2550

2651

2752
# Then, use a final image without uv

0 commit comments

Comments
 (0)