Skip to content

Commit ca44131

Browse files
Merge pull request #334 from syedriko/syedriko-byok-upgrade-pip
Fix for the "--require-hashes mode" pip issue during BYOK tool build
2 parents e0e6c0d + 95bf103 commit ca44131

9 files changed

+108
-53
lines changed

byok/Containerfile.output

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@ ARG BYOK_TOOL_IMAGE=quay.io/$USERNAME/tool:latest
22
ARG UBI_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:latest
33
FROM ${BYOK_TOOL_IMAGE} as tool
44
ARG UBI_BASE_IMAGE
5+
ARG VECTOR_DB_INDEX=vector_db_index
56

67
USER 0
78
WORKDIR /workdir
89

10+
ENV VECTOR_DB_INDEX=$VECTOR_DB_INDEX
911
RUN python3.11 generate_embeddings_tool.py -i /markdown -emd embeddings_model \
10-
-emn sentence-transformers/all-mpnet-base-v2 -o vector_db -id vector_db_index
12+
-emn sentence-transformers/all-mpnet-base-v2 -o vector_db -id $VECTOR_DB_INDEX
1113

1214
FROM ${UBI_BASE_IMAGE}
1315
COPY --from=tool /workdir/vector_db /rag/vector_db

byok/Containerfile.tool

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,21 @@ ARG UBI_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:latest
33
FROM ${UBI_BASE_IMAGE}
44
ARG LOG_LEVEL=info
55
ARG OUT_IMAGE_TAG=byok-image
6+
ARG VECTOR_DB_INDEX=vector_db_index
67
ARG BYOK_TOOL_IMAGE
78
ARG UBI_BASE_IMAGE
8-
RUN dnf install -y buildah python3.11 python3.11-pip
9+
RUN dnf install -y buildah python3.11 python3.11-pip wget
910

1011
USER 0
1112
WORKDIR /workdir
1213

1314
COPY requirements.cpu.txt .
14-
RUN pip3.11 install --no-cache-dir -r requirements.cpu.txt
15+
RUN pip3.11 install --upgrade pip && pip3.11 install --no-cache-dir -r requirements.cpu.txt
1516

1617
COPY embeddings_model ./embeddings_model
18+
RUN cd embeddings_model && if [ ! -f embeddings_model/model.safetensors ]; then \
19+
wget -q https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/9a3225965996d404b775526de6dbfe85d3368642/model.safetensors; \
20+
fi
1721
COPY byok/generate_embeddings_tool.py byok/Containerfile.output .
1822

1923
ENV _BUILDAH_STARTED_IN_USERNS=""
@@ -22,7 +26,9 @@ ENV OUT_IMAGE_TAG=$OUT_IMAGE_TAG
2226
ENV BYOK_TOOL_IMAGE=$BYOK_TOOL_IMAGE
2327
ENV UBI_BASE_IMAGE=$UBI_BASE_IMAGE
2428
ENV LOG_LEVEL=$LOG_LEVEL
29+
ENV VECTOR_DB_INDEX=$VECTOR_DB_INDEX
2530
CMD buildah --log-level $LOG_LEVEL build --build-arg BYOK_TOOL_IMAGE=$BYOK_TOOL_IMAGE \
26-
--build-arg UBI_BASE_IMAGE=$UBI_BASE_IMAGE -t $OUT_IMAGE_TAG -f Containerfile.output \
31+
--build-arg UBI_BASE_IMAGE=$UBI_BASE_IMAGE --env VECTOR_DB_INDEX=$VECTOR_DB_INDEX \
32+
-t $OUT_IMAGE_TAG -f Containerfile.output \
2733
-v /markdown:/markdown:Z . && rm -f /output/$OUT_IMAGE_TAG.tar && \
2834
buildah push $OUT_IMAGE_TAG docker-archive:/output/$OUT_IMAGE_TAG.tar

byok/README.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@ $ podman push $MY_BYOK_TOOL_IMAGE
2727

2828
```bash
2929
$ MY_BYOK_TOOL_IMAGE=quay.io/$USERNAME/byok_tool:0.0.1
30-
$ podman run -e OUT_IMAGE_TAG=my-byok-image -it --rm --device=/dev/fuse -v <dir_tree_with_markdown_files>:/markdown:Z -v <dir_for_image_tar>:/output:Z $MY_BYOK_TOOL_IMAGE
30+
$ podman run -e OUT_IMAGE_TAG=my-byok-image -it --rm --device=/dev/fuse \
31+
-v <dir_tree_with_markdown_files>:/markdown:Z \
32+
-v <dir_for_image_tar>:/output:Z \
33+
$MY_BYOK_TOOL_IMAGE
3134
```
3235

3336
The tool runs on CPUs, not GPUs.
@@ -71,4 +74,8 @@ total 1408
7174
$ podman run --rm localhost/byok-image-foobar:latest cat /rag/vector_db/metadata.json
7275
{"execution-time": 82.21773672103882, "llm": "None", "embedding-model-name": "sentence-transformers/all-mpnet-base-v2", "index-id": "vector_db_index", "vector-db": "faiss.IndexFlatIP", "embedding-dimension": 768, "chunk": 380, "overlap": 0, "total-embedded-files": 29}
7376
```
74-
The database is located at /rag/vector_db and the Faiss index id is "vector_db_index".
77+
The database is located at /rag/vector_db and by default the Faiss index id is "vector_db_index". The VECTOR_DB_INDEX environment variable can be used to override this default:
78+
79+
```bash
80+
$ podman run -e VECTOR_DB_INDEX=acme_openshift_sop ...
81+
```

byok/generate_embeddings_tool.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Callable, Dict
88

99
import faiss
10+
import frontmatter
1011
import requests
1112
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
1213
from llama_index.core.llms.utils import resolve_llm
@@ -17,25 +18,27 @@
1718
from llama_index.readers.file.flat.base import FlatReader
1819
from llama_index.vector_stores.faiss import FaissVectorStore
1920

20-
def get_file_title(file_path: str) -> str:
21-
"""Extract title from the plaintext doc file."""
22-
title = ""
23-
try:
24-
with open(file_path, "r") as file:
25-
title = file.readline().rstrip("\n").lstrip("# ")
26-
except Exception: # noqa: S110
27-
pass
28-
return title
29-
30-
3121
def file_metadata_func(file_path: str) -> Dict:
3222
"""Populate the docs_url and title metadata elements with docs URL and the page's title.
3323
3424
Args:
3525
file_path: str: file path in str
3626
"""
27+
title = file_path
3728
docs_url = file_path
38-
title = get_file_title(file_path)
29+
try:
30+
with open(file_path, "r") as file:
31+
first_line = file.readline()
32+
if first_line.startswith("#"):
33+
title = first_line.rstrip("\n").lstrip("# ")
34+
docs_url = file_path
35+
elif first_line.startswith("---"):
36+
file.close()
37+
post = frontmatter.load(file_path)
38+
title = post['title']
39+
docs_url = post['url']
40+
except Exception: # noqa: S110
41+
pass
3942
msg = f"file_path: {file_path}, title: {title}, docs_url: {docs_url}"
4043
print(msg)
4144
return {"file_path": file_path, "title": title, "docs_url": docs_url}
@@ -92,7 +95,11 @@ def file_metadata_func(file_path: str) -> Dict:
9295

9396
# Load documents
9497
documents = SimpleDirectoryReader(
95-
args.input_dir, recursive=True, file_metadata=file_metadata_func
98+
args.input_dir,
99+
recursive=True,
100+
required_exts=[".md"],
101+
file_extractor={".md": FlatReader()},
102+
file_metadata=file_metadata_func
96103
).load_data()
97104

98105
# Create chunks/nodes

pdm.lock.cpu

Lines changed: 24 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pdm.lock.gpu

Lines changed: 24 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ dependencies = [
5353
"llama-index-embeddings-huggingface",
5454
"llama-index-readers-file",
5555
"faiss-cpu",
56+
"python-frontmatter",
5657
"aiohttp>=3.8.0",
5758
"beautifulsoup4>=4.10.0",
5859
]

requirements.cpu.txt

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,9 @@ httpcore==1.0.9 \
158158
httpx==0.28.1 \
159159
--hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \
160160
--hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad
161-
huggingface-hub[inference]==0.31.1 \
162-
--hash=sha256:43f73124819b48b42d140cbc0d7a2e6bd15b2853b1b9d728d4d55ad1750cac5b \
163-
--hash=sha256:492bb5f545337aa9e2f59b75ef4c5f535a371e8958a6ce90af056387e67f1180
161+
huggingface-hub[inference]==0.31.2 \
162+
--hash=sha256:7053561376ed7f6ffdaecf09cc54d70dc784ac6315fa4bb9b93e19662b029675 \
163+
--hash=sha256:8138cd52aa2326b4429bb00a4a1ba8538346b7b8a808cdce30acb6f1f1bdaeec
164164
idna==3.10 \
165165
--hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
166166
--hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
@@ -306,9 +306,9 @@ numpy==2.2.5 \
306306
--hash=sha256:b13f04968b46ad705f7c8a80122a42ae8f620536ea38cf4bdd374302926424dd \
307307
--hash=sha256:c42365005c7a6c42436a54d28c43fe0e01ca11eb2ac3cefe796c25a5f98e5e9b \
308308
--hash=sha256:f5045039100ed58fa817a6227a356240ea1b9a1bc141018864c306c1a16d4175
309-
openai==1.78.0 \
310-
--hash=sha256:1ade6a48cd323ad8a7715e7e1669bb97a17e1a5b8a916644261aaef4bf284778 \
311-
--hash=sha256:254aef4980688468e96cbddb1f348ed01d274d02c64c6c69b0334bf001fb62b3
309+
openai==1.78.1 \
310+
--hash=sha256:7368bf147ca499804cc408fe68cdb6866a060f38dec961bbc97b04f9d917907e \
311+
--hash=sha256:8b26b364531b100df1b961d03560042e5f5be11301d7d49a6cd1a2b9af824dca
312312
packaging==25.0 \
313313
--hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
314314
--hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
@@ -412,6 +412,9 @@ python-dateutil==2.9.0.post0 \
412412
python-dotenv==1.1.0 \
413413
--hash=sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5 \
414414
--hash=sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d
415+
python-frontmatter==1.1.0 \
416+
--hash=sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1 \
417+
--hash=sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d
415418
pytz==2025.2 \
416419
--hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
417420
--hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00

requirements.gpu.txt

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,9 @@ httpcore==1.0.9 \
158158
httpx==0.28.1 \
159159
--hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \
160160
--hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad
161-
huggingface-hub[inference]==0.31.1 \
162-
--hash=sha256:43f73124819b48b42d140cbc0d7a2e6bd15b2853b1b9d728d4d55ad1750cac5b \
163-
--hash=sha256:492bb5f545337aa9e2f59b75ef4c5f535a371e8958a6ce90af056387e67f1180
161+
huggingface-hub[inference]==0.31.2 \
162+
--hash=sha256:7053561376ed7f6ffdaecf09cc54d70dc784ac6315fa4bb9b93e19662b029675 \
163+
--hash=sha256:8138cd52aa2326b4429bb00a4a1ba8538346b7b8a808cdce30acb6f1f1bdaeec
164164
idna==3.10 \
165165
--hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
166166
--hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
@@ -355,9 +355,9 @@ nvidia-nvtx-cu12==12.4.127; platform_system == "Linux" and platform_machine == "
355355
--hash=sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485 \
356356
--hash=sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a \
357357
--hash=sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3
358-
openai==1.78.0 \
359-
--hash=sha256:1ade6a48cd323ad8a7715e7e1669bb97a17e1a5b8a916644261aaef4bf284778 \
360-
--hash=sha256:254aef4980688468e96cbddb1f348ed01d274d02c64c6c69b0334bf001fb62b3
358+
openai==1.78.1 \
359+
--hash=sha256:7368bf147ca499804cc408fe68cdb6866a060f38dec961bbc97b04f9d917907e \
360+
--hash=sha256:8b26b364531b100df1b961d03560042e5f5be11301d7d49a6cd1a2b9af824dca
361361
packaging==25.0 \
362362
--hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
363363
--hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
@@ -461,6 +461,9 @@ python-dateutil==2.9.0.post0 \
461461
python-dotenv==1.1.0 \
462462
--hash=sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5 \
463463
--hash=sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d
464+
python-frontmatter==1.1.0 \
465+
--hash=sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1 \
466+
--hash=sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d
464467
pytz==2025.2 \
465468
--hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
466469
--hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00

0 commit comments

Comments
 (0)