Skip to content

Commit cdc365e

Browse files
authored
Web PDFs use PDF loader (#2)

* Web sources that are PDFs are downloaded and chunked with the PDF loader
* Git loader will try to pull latest if the repo already exists, and will not pass directories to the text loader
* Create a more robust CI pipeline that tests DB providers before pushing the image to Quay
* Fix isort issue in the git loader
* Use a SHA-tagged test image and make sure all jobs run to completion
* Use the build image as an artifact between stages
* Only spin up the DB container used by each job
* Make sure failures log AND raise exceptions
* Make sure jobs are actually using the right DB_TYPE
* Add psycopg2 to requirements and fix the elastic db_type
* Log upon successful connection to the DB
* Initialize the logger in config before anything else
1 parent 5cdc469 commit cdc365e

File tree

12 files changed: +356 −154 lines changed

.github/workflows/ci-pipeline.yaml

Lines changed: 152 additions & 0 deletions
New file (152 additions, 0 deletions):

```yaml
name: CI Pipeline

on:
  pull_request:
  push:
    branches: [main]
    tags:
      - "v*"

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install black isort ruff
      - run: black --check .
      - run: isort --check-only .
      - run: ruff check .

  build:
    runs-on: ubuntu-latest
    needs: lint
    outputs:
      image_tag: ${{ steps.meta.outputs.sha_tag }}
    steps:
      - uses: actions/checkout@v4

      - name: Generate tag
        id: meta
        run: echo "sha_tag=sha-${GITHUB_SHA::7}" >> $GITHUB_OUTPUT

      - name: Build Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Containerfile
          load: true
          tags: test-image:${{ steps.meta.outputs.sha_tag }}

      - name: Save image as artifact
        run: docker save test-image:${{ steps.meta.outputs.sha_tag }} -o image.tar

      - name: Upload image artifact
        uses: actions/upload-artifact@v4
        with:
          name: test-image
          path: image.tar

  test:
    needs: [lint, build]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        db: [pgvector, redis, elastic, qdrant]

    steps:
      - uses: actions/checkout@v4

      - name: Download image artifact
        uses: actions/download-artifact@v4
        with:
          name: test-image
          path: .

      - name: Load Docker image
        run: docker load -i image.tar

      - name: Start PGVector
        if: matrix.db == 'pgvector'
        run: |
          docker run -d --name pgvector-test \
            -e POSTGRES_USER=user \
            -e POSTGRES_PASSWORD=pass \
            -e POSTGRES_DB=mydb \
            -p 5432:5432 \
            ankane/pgvector

      - name: Start Redis
        if: matrix.db == 'redis'
        run: |
          docker run -d --name redis-test \
            -p 6379:6379 \
            redis/redis-stack-server:6.2.6-v19

      - name: Start Elasticsearch
        if: matrix.db == 'elastic'
        run: |
          docker run -d --name es-test \
            -e "discovery.type=single-node" \
            -e "xpack.security.enabled=true" \
            -e "ELASTIC_PASSWORD=changeme" \
            -e "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
            -p 9200:9200 \
            elasticsearch:8.11.1

      - name: Start Qdrant
        if: matrix.db == 'qdrant'
        run: |
          docker run -d --name qdrant-test \
            -p 6333:6333 \
            qdrant/qdrant

      - name: Wait for DB to start
        run: sleep 30

      - name: Run embed job
        run: |
          docker run --rm --network host \
            -e LOG_LEVEL=debug \
            -e DB_TYPE=${{ matrix.db }} \
            test-image:${{ needs.build.outputs.image_tag }}

  release:
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')
    runs-on: ubuntu-latest
    needs: [lint, build, test]
    steps:
      - uses: actions/checkout@v4

      - name: Log in to Quay.io
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_PASSWORD }}

      - name: Download image artifact
        uses: actions/download-artifact@v4
        with:
          name: test-image
          path: .

      - name: Load Docker image
        run: docker load -i image.tar

      - name: Tag and push image
        run: |
          docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/dminnear/vector-embedder:${{ needs.build.outputs.image_tag }}

          if [[ $GITHUB_REF == refs/tags/* ]]; then
            docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/dminnear/vector-embedder:${GITHUB_REF#refs/tags/}
            docker push quay.io/dminnear/vector-embedder:${GITHUB_REF#refs/tags/}
          elif [[ $GITHUB_REF == refs/heads/main ]]; then
            docker tag test-image:${{ needs.build.outputs.image_tag }} quay.io/dminnear/vector-embedder:latest
            docker push quay.io/dminnear/vector-embedder:latest
          fi

          docker push quay.io/dminnear/vector-embedder:${{ needs.build.outputs.image_tag }}
```

(Note: YAML indentation reconstructed conventionally — the original scrape lost all leading whitespace.)

.github/workflows/lint.yaml

Lines changed: 0 additions & 28 deletions
This file was deleted.

.github/workflows/push-to-quay.yaml

Lines changed: 0 additions & 51 deletions
This file was deleted.

config.py

Lines changed: 24 additions & 18 deletions
```diff
@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 from dataclasses import dataclass
 from typing import Dict, List
@@ -97,13 +98,28 @@ def load() -> "Config":
         load_dotenv()
         get = Config._get_required_env_var

+        # Initialize logger
+        log_level_name = get("LOG_LEVEL").lower()
+        log_levels = {
+            "debug": 10,
+            "info": 20,
+            "warning": 30,
+            "error": 40,
+            "critical": 50,
+        }
+        if log_level_name not in log_levels:
+            raise ValueError(
+                f"Invalid LOG_LEVEL: '{log_level_name}'. Must be one of: {', '.join(log_levels)}"
+            )
+        log_level = log_levels[log_level_name]
+        logging.basicConfig(level=log_level)
+        logger = logging.getLogger(__name__)
+        logger.debug("Logging initialized at level: %s", log_level_name.upper())
+
+        # Initialize db
         db_type = get("DB_TYPE")
         db_provider = Config._init_db_provider(db_type)

-        chunk_size = int(get("CHUNK_SIZE"))
-        chunk_overlap = int(get("CHUNK_OVERLAP"))
-        temp_dir = get("TEMP_DIR")
-
         # Web URLs
         web_sources_raw = get("WEB_SOURCES")
         try:
@@ -118,20 +134,10 @@ def load() -> "Config":
         except json.JSONDecodeError as e:
             raise ValueError(f"Invalid REPO_SOURCES JSON: {e}") from e

-        # Logging
-        log_level_name = get("LOG_LEVEL").lower()
-        log_levels = {
-            "debug": 10,
-            "info": 20,
-            "warning": 30,
-            "error": 40,
-            "critical": 50,
-        }
-        if log_level_name not in log_levels:
-            raise ValueError(
-                f"Invalid LOG_LEVEL: '{log_level_name}'. Must be one of: {', '.join(log_levels)}"
-            )
-        log_level = log_levels[log_level_name]
+        # Misc
+        chunk_size = int(get("CHUNK_SIZE"))
+        chunk_overlap = int(get("CHUNK_OVERLAP"))
+        temp_dir = get("TEMP_DIR")

         return Config(
             db_provider=db_provider,
```

(Note: leading indentation reconstructed — the scrape lost it; body lines assume `load()` is a method inside a class, per the `Config._get_required_env_var` / `Config._init_db_provider` calls. Confirm against the repository.)

Comments: 0