Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,35 @@ TEMP_DIR=/tmp
LOG_LEVEL=info

# === Git Repo Document Sources ===
REPO_SOURCES=[{"repo": "https://github.com/RHEcosystemAppEng/llm-on-openshift.git", "globs": ["examples/notebooks/langchain/rhods-doc/*.pdf"]}]
REPO_SOURCES='[{"repo": "https://github.com/RHEcosystemAppEng/llm-on-openshift.git", "globs": ["examples/notebooks/langchain/rhods-doc/*.pdf"]}]'

# === Web Document Sources ===
WEB_SOURCES=["https://ai-on-openshift.io/getting-started/openshift/", "https://ai-on-openshift.io/getting-started/opendatahub/", "https://ai-on-openshift.io/getting-started/openshift-ai/", "https://ai-on-openshift.io/odh-rhoai/configuration/", "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/", "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/", "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/", "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/", "https://ai-on-openshift.io/tools-and-applications/minio/minio/"]
WEB_SOURCES='["https://ai-on-openshift.io/getting-started/openshift/", "https://ai-on-openshift.io/getting-started/opendatahub/", "https://ai-on-openshift.io/getting-started/openshift-ai/", "https://ai-on-openshift.io/odh-rhoai/configuration/", "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/", "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/", "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/", "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/", "https://ai-on-openshift.io/tools-and-applications/minio/minio/"]'

# === General Embedding Config ===
CHUNK_SIZE=1024
CHUNK_OVERLAP=40
CHUNK_SIZE=1024
DB_TYPE=DRYRUN
EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2

# === Redis ===
REDIS_URL=redis://localhost:6379
REDIS_INDEX=docs
REDIS_URL=redis://localhost:6379

# === Elasticsearch ===
ELASTIC_URL=http://localhost:9200
ELASTIC_INDEX=docs
ELASTIC_USER=elastic
ELASTIC_PASSWORD=changeme
ELASTIC_URL=http://localhost:9200
ELASTIC_USER=elastic

# === PGVector ===
PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb
PGVECTOR_COLLECTION_NAME=documents
PGVECTOR_URL=postgresql+psycopg://user:pass@localhost:5432/mydb

# === SQL Server ===
MSSQL_CONNECTION_STRING="Driver={ODBC Driver 18 for SQL Server}; Server=localhost,1433; Database=embeddings; UID=sa; PWD=StrongPassword!; TrustServerCertificate=yes; Encrypt=no;"
MSSQL_TABLE=docs

# === Qdrant ===
QDRANT_URL=http://localhost:6333
QDRANT_COLLECTION=embedded_docs
QDRANT_URL=http://localhost:6333
9 changes: 9 additions & 0 deletions .github/linters/.hadolint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Hadolint rules that are switched off for this repository. The note above
# each ID summarises what that rule would otherwise enforce.
ignored:
  # DL3006: image references must carry an explicit version tag.
  - DL3006
  # DL3007: the `latest` tag must not be used; pin a release tag instead.
  - DL3007
  # DL3013: pip installs must pin versions (`<package>==<version>`) or use
  # `--requirement <requirements file>`.
  - DL3013
  # DL3041: dnf installs must pin versions (`<package>-<version>`).
  - DL3041
2 changes: 2 additions & 0 deletions .github/linters/.isort.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# isort configuration stored alongside the other linter configs.
[settings]
# INI values are taken literally: wrapping the profile name in double quotes
# makes the quotes part of the name, which does not match any built-in
# profile. The name must therefore be written bare.
profile = black
3 changes: 3 additions & 0 deletions .github/linters/.jscpd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"ignore": ["**/.github/**"]
}
5 changes: 5 additions & 0 deletions .github/linters/.python-lint
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Pylint checks that are switched off for this repository.
[MESSAGES CONTROL]
disable=too-few-public-methods,broad-exception-caught,import-error
188 changes: 112 additions & 76 deletions .github/workflows/ci-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -1,78 +1,81 @@
name: CI Pipeline
name: Build and push to quay

on:
pull_request:
branches: [main]
push:
branches: [main]
tags:
- "v*"
- "v*.*.*"

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- run: pip install black isort ruff
- run: black --check .
- run: isort --check-only .
- run: ruff check .
# Default token scope for the workflow: read-only access to repository contents.
permissions:
contents: read

# Values shared by the jobs below.
env:
# Prefix used together with NAME and TAG when the test job runs the built image.
REGISTRY: localhost
NAME: vector-embedder
# Resolves to `pr-<number>` for pull requests, `latest` when the ref name is
# `main`, and to the ref name itself otherwise (for example a pushed tag).
TAG: ${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || (github.ref_name == 'main' && 'latest' || github.ref_name) }}

build:
jobs:
build-container:
runs-on: ubuntu-latest
needs: lint
outputs:
image_tag: ${{ steps.meta.outputs.sha_tag }}
steps:
- uses: actions/checkout@v4

- name: Generate tag
id: meta
run: echo "sha_tag=sha-${GITHUB_SHA::7}" >> $GITHUB_OUTPUT
permissions:
contents: read

- name: Build Docker image
uses: docker/build-push-action@v5
steps:
- name: Checkout code
uses: actions/checkout@v5
with:
context: .
file: ./Containerfile
load: true
tags: test-image:${{ steps.meta.outputs.sha_tag }}
fetch-depth: 0
persist-credentials: false

- name: Save image as artifact
run: docker save test-image:${{ steps.meta.outputs.sha_tag }} -o image.tar
- name: Build container and push to local registry
env:
CONTAINER: ${{ env.NAME }}:${{ env.TAG }}
run: |
make build
podman push "${CONTAINER}" "docker-archive:/tmp/image.tar:${CONTAINER}"

- name: Upload image artifact
uses: actions/upload-artifact@v4
with:
name: test-image
path: image.tar
name: image-${{ github.run_id }}
path: /tmp/image.tar
retention-days: 1

test:
needs: [lint, build]
runs-on: ubuntu-latest
needs: [build-container]
if: github.event_name == 'pull_request'
strategy:
fail-fast: false
matrix:
db: [pgvector, redis, elastic, qdrant, mssql]

runs-on: ubuntu-latest
permissions:
contents: read

steps:
- uses: actions/checkout@v4
- name: Checkout code
uses: actions/checkout@v5
with:
fetch-depth: 0
persist-credentials: false

- name: Download image artifact
uses: actions/download-artifact@v4
- name: Download image
uses: actions/download-artifact@v5
with:
name: test-image
path: .
name: image-${{ github.run_id }}
path: /tmp

- name: Load Docker image
run: docker load -i image.tar
- name: Load image into local containers-storage
run: podman pull docker-archive:/tmp/image.tar

- name: Start MSSQL
if: matrix.db == 'mssql'
run: |
docker run -d --name mssql-vector-test \
podman run -d --name mssql-vector-test \
-e "ACCEPT_EULA=Y" \
-e "SA_PASSWORD=StrongPassword!" \
-p 1433:1433 \
Expand All @@ -81,7 +84,7 @@ jobs:
- name: Start PGVector
if: matrix.db == 'pgvector'
run: |
docker run -d --name pgvector-test \
podman run -d --name pgvector-test \
-e POSTGRES_USER=user \
-e POSTGRES_PASSWORD=pass \
-e POSTGRES_DB=mydb \
Expand All @@ -91,14 +94,14 @@ jobs:
- name: Start Redis
if: matrix.db == 'redis'
run: |
docker run -d --name redis-test \
podman run -d --name redis-test \
-p 6379:6379 \
redis/redis-stack-server:6.2.6-v19

- name: Start Elasticsearch
if: matrix.db == 'elastic'
run: |
docker run -d --name es-test \
podman run -d --name es-test \
-e "discovery.type=single-node" \
-e "xpack.security.enabled=true" \
-e "ELASTIC_PASSWORD=changeme" \
Expand All @@ -109,53 +112,86 @@ jobs:
- name: Start Qdrant
if: matrix.db == 'qdrant'
run: |
docker run -d --name qdrant-test \
podman run -d --name qdrant-test \
-p 6333:6333 \
qdrant/qdrant

- name: Wait for DB to start
run: sleep 30

- name: Run embed job
env:
CONTAINER: ${{ env.NAME }}:${{ env.TAG }}
DB_TYPE: ${{ matrix.db }}
run: |
docker run --rm --network host \
podman run --rm --network host \
-e LOG_LEVEL=debug \
-e DB_TYPE=${{ matrix.db }} \
test-image:${{ needs.build.outputs.image_tag }}
-e DB_TYPE="${DB_TYPE}" \
"${REGISTRY}/${CONTAINER}"

push-container:
needs: [build-container]
if: github.event_name != 'pull_request'
strategy:
matrix:
include:
- upload_registry: quay.io/validatedpatterns
legacy: false
- upload_registry: quay.io/hybridcloudpatterns
legacy: true

release:
if: (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) && github.event.repository.fork == false
runs-on: ubuntu-latest
needs: [lint, build, test]
steps:
- uses: actions/checkout@v4
permissions:
contents: read
# This is used to complete the identity challenge
# with sigstore/fulcio when running outside of PRs.
id-token: write

- name: Log in to Quay.io
uses: docker/login-action@v3
steps:
- name: Checkout code
uses: actions/checkout@v5
with:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
fetch-depth: 0
persist-credentials: false

- name: Download image artifact
uses: actions/download-artifact@v4
- name: Download image
uses: actions/download-artifact@v5
with:
name: test-image
path: .
name: image-${{ github.run_id }}
path: /tmp

- name: Load Docker image
run: docker load -i image.tar
- name: Load image into local containers-storage
run: podman pull docker-archive:/tmp/image.tar

- name: Tag and push image
- name: Log into Quay
env:
USERNAME: ${{ matrix.legacy && secrets.LEGACY_QUAY_USERNAME || secrets.QUAY_USERNAME }}
PASSWORD: ${{ matrix.legacy && secrets.LEGACY_QUAY_PASSWORD || secrets.QUAY_PASSWORD }}
run: |
docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:${{ needs.build.outputs.image_tag }}
podman login -u "${USERNAME}" -p "${PASSWORD}" quay.io

if [[ $GITHUB_REF == refs/tags/* ]]; then
docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:${GITHUB_REF#refs/tags/}
docker push quay.io/hybridcloudpatterns/vector-embedder:${GITHUB_REF#refs/tags/}
elif [[ $GITHUB_REF == refs/heads/main ]]; then
docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/hybridcloudpatterns/vector-embedder:latest
docker push quay.io/hybridcloudpatterns/vector-embedder:latest
fi
- name: Push image to Quay
id: image-push
env:
UPLOADREGISTRY: ${{ matrix.upload_registry }}
CONTAINER: ${{ env.NAME }}:${{ env.TAG }}
run: |
make upload
DIGEST=$(skopeo inspect --format "{{.Digest}}" "docker://${UPLOADREGISTRY}/${CONTAINER}")
echo "digest=$DIGEST" >> "$GITHUB_OUTPUT"

docker push quay.io/hybridcloudpatterns/vector-embedder:${{ needs.build.outputs.image_tag }}
- name: Install cosign
uses: sigstore/cosign-installer@d58896d6a1865668819e1d91763c7751a165e159 # v3.9.2
with:
cosign-release: "v2.2.4"

# Cosign reads registry credentials from the Docker config file, so the auth
# file that the earlier `podman login` step is expected to have written is
# copied into place before signing.
- name: Sign the published Docker image
  env:
    CONTAINER: ${{ env.NAME }}:${{ env.TAG }}
    DIGEST: ${{ steps.image-push.outputs.digest }}
    UPLOADREGISTRY: ${{ matrix.upload_registry }}
  run: |
    # The redirect below fails if ~/.docker does not exist yet, so create it
    # first instead of relying on the runner image to provide it.
    mkdir -p ~/.docker
    cat "${XDG_RUNTIME_DIR}/containers/auth.json" > ~/.docker/config.json
    # The digest from the push step pins the signature to the exact image
    # that was uploaded, not to whatever the tag points at later.
    cosign sign --yes "${UPLOADREGISTRY}/${CONTAINER}@${DIGEST}"
16 changes: 16 additions & 0 deletions .github/workflows/super-linter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Runs the shared super-linter reusable workflow for pull requests that
# target main.
name: Super linter

on:
  pull_request:
    branches:
      - main

# Read-only access to repository contents.
permissions:
  contents: read

jobs:
  lint:
    uses: validatedpatterns/github-actions-library/.github/workflows/superlinter.yml@v1
    with:
      # Extra settings passed to the reusable workflow; both listed
      # validators are turned off.
      sl_env: |
        VALIDATE_TRIVY=false
        VALIDATE_PYTHON_ISORT=false
18 changes: 11 additions & 7 deletions Containerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
FROM registry.access.redhat.com/ubi9/python-312:9.5
FROM registry.access.redhat.com/ubi10/python-312-minimal:10.0

USER root

WORKDIR /app

RUN dnf install -y \
RUN microdnf install -y git \
unixODBC \
unixODBC-devel && \
curl -sSL https://packages.microsoft.com/config/rhel/9/prod.repo -o /etc/yum.repos.d/mssql-release.repo && \
ACCEPT_EULA=Y dnf install -y msodbcsql18 && \
dnf clean all
ACCEPT_EULA=Y microdnf install -y msodbcsql18 && \
microdnf clean all

COPY requirements.txt .
RUN pip install --upgrade pip && \
pip install -r requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install \
--no-cache-dir \
--compile \
-r requirements.txt

COPY vector_db ./vector_db
COPY loaders ./loaders
Expand All @@ -24,4 +28,4 @@ RUN chown -R 1001:0 .

USER 1001

CMD ./embed_documents.py
CMD ["python", "./embed_documents.py"]
Loading