Merge pull request #334 from syedriko/syedriko-byok-upgrade-pip

openshift-merge-bot[bot] · web-flow · commit ca4413142013 · 2025-05-15T19:48:37.000Z
Fix for the "--require-hashes mode" pip issue during BYOK tool build
diff --git a/byok/Containerfile.output b/byok/Containerfile.output
@@ -2,12 +2,14 @@ ARG BYOK_TOOL_IMAGE=quay.io/$USERNAME/tool:latest
 ARG UBI_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:latest
 FROM ${BYOK_TOOL_IMAGE} as tool
 ARG UBI_BASE_IMAGE
+ARG VECTOR_DB_INDEX=vector_db_index
 
 USER 0
 WORKDIR /workdir
 
+ENV VECTOR_DB_INDEX=$VECTOR_DB_INDEX
 RUN python3.11 generate_embeddings_tool.py -i /markdown -emd embeddings_model \
-    -emn sentence-transformers/all-mpnet-base-v2 -o vector_db -id vector_db_index
+    -emn sentence-transformers/all-mpnet-base-v2 -o vector_db -id $VECTOR_DB_INDEX
 
 FROM ${UBI_BASE_IMAGE}
 COPY --from=tool /workdir/vector_db /rag/vector_db
diff --git a/byok/Containerfile.tool b/byok/Containerfile.tool
@@ -3,17 +3,21 @@ ARG UBI_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:latest
 FROM ${UBI_BASE_IMAGE}
 ARG LOG_LEVEL=info
 ARG OUT_IMAGE_TAG=byok-image
+ARG VECTOR_DB_INDEX=vector_db_index
 ARG BYOK_TOOL_IMAGE
 ARG UBI_BASE_IMAGE
-RUN dnf install -y buildah python3.11 python3.11-pip
+RUN dnf install -y buildah python3.11 python3.11-pip wget
 
 USER 0
 WORKDIR /workdir
 
 COPY requirements.cpu.txt .
-RUN pip3.11 install --no-cache-dir -r requirements.cpu.txt
+RUN pip3.11 install --upgrade pip && pip3.11 install --no-cache-dir -r requirements.cpu.txt
 
 COPY embeddings_model ./embeddings_model
+RUN cd embeddings_model && if [ ! -f model.safetensors ]; then \
+        wget -q https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/9a3225965996d404b775526de6dbfe85d3368642/model.safetensors; \
+    fi
 COPY byok/generate_embeddings_tool.py byok/Containerfile.output .
 
 ENV _BUILDAH_STARTED_IN_USERNS=""
@@ -22,7 +26,9 @@ ENV OUT_IMAGE_TAG=$OUT_IMAGE_TAG
 ENV BYOK_TOOL_IMAGE=$BYOK_TOOL_IMAGE
 ENV UBI_BASE_IMAGE=$UBI_BASE_IMAGE
 ENV LOG_LEVEL=$LOG_LEVEL
+ENV VECTOR_DB_INDEX=$VECTOR_DB_INDEX
 CMD buildah --log-level $LOG_LEVEL build --build-arg BYOK_TOOL_IMAGE=$BYOK_TOOL_IMAGE \
-    --build-arg UBI_BASE_IMAGE=$UBI_BASE_IMAGE -t $OUT_IMAGE_TAG -f Containerfile.output \
+    --build-arg UBI_BASE_IMAGE=$UBI_BASE_IMAGE --env VECTOR_DB_INDEX=$VECTOR_DB_INDEX \
+    -t $OUT_IMAGE_TAG -f Containerfile.output \
     -v /markdown:/markdown:Z . && rm -f /output/$OUT_IMAGE_TAG.tar && \
     buildah push $OUT_IMAGE_TAG docker-archive:/output/$OUT_IMAGE_TAG.tar
diff --git a/byok/README.md b/byok/README.md
@@ -27,7 +27,10 @@ $ podman push $MY_BYOK_TOOL_IMAGE
 
 ```bash
 $ MY_BYOK_TOOL_IMAGE=quay.io/$USERNAME/byok_tool:0.0.1
-$ podman run -e OUT_IMAGE_TAG=my-byok-image -it --rm --device=/dev/fuse -v <dir_tree_with_markdown_files>:/markdown:Z -v <dir_for_image_tar>:/output:Z $MY_BYOK_TOOL_IMAGE
+$ podman run -e OUT_IMAGE_TAG=my-byok-image -it --rm --device=/dev/fuse \
+  -v <dir_tree_with_markdown_files>:/markdown:Z \
+  -v <dir_for_image_tar>:/output:Z \
+  $MY_BYOK_TOOL_IMAGE
 ```
 
 The tool runs on CPUs, not GPUs.
@@ -71,4 +74,8 @@ total 1408
 $ podman run --rm localhost/byok-image-foobar:latest cat /rag/vector_db/metadata.json
 {"execution-time": 82.21773672103882, "llm": "None", "embedding-model-name": "sentence-transformers/all-mpnet-base-v2", "index-id": "vector_db_index", "vector-db": "faiss.IndexFlatIP", "embedding-dimension": 768, "chunk": 380, "overlap": 0, "total-embedded-files": 29}
 ```
-The database is located at /rag/vector_db and the Faiss index id is "vector_db_index".
+The database is located at /rag/vector_db and by default the Faiss index id is "vector_db_index". The VECTOR_DB_INDEX environment variable can be used to override this default:
+
+```bash
+$ podman run -e VECTOR_DB_INDEX=acme_openshift_sop ...
+```
diff --git a/byok/generate_embeddings_tool.py b/byok/generate_embeddings_tool.py
@@ -7,6 +7,7 @@
 from typing import Callable, Dict
 
 import faiss
+import frontmatter
 import requests
 from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
 from llama_index.core.llms.utils import resolve_llm
@@ -17,25 +18,27 @@
 from llama_index.readers.file.flat.base import FlatReader
 from llama_index.vector_stores.faiss import FaissVectorStore
 
-def get_file_title(file_path: str) -> str:
-    """Extract title from the plaintext doc file."""
-    title = ""
-    try:
-        with open(file_path, "r") as file:
-            title = file.readline().rstrip("\n").lstrip("# ")
-    except Exception:  # noqa: S110
-        pass
-    return title
-
-
 def file_metadata_func(file_path: str) -> Dict:
     """Populate the docs_url and title metadata elements with docs URL and the page's title.
 
     Args:
         file_path: str: file path in str
     """
+    title = file_path
     docs_url = file_path
-    title = get_file_title(file_path)
+    try:
+        with open(file_path, "r") as file:
+            first_line = file.readline()
+            if first_line.startswith("#"):
+                title = first_line.rstrip("\n").lstrip("# ")
+                docs_url = file_path
+            elif first_line.startswith("---"):
+                file.close()
+                post = frontmatter.load(file_path)
+                title = post['title']
+                docs_url = post['url']
+    except Exception:  # noqa: S110
+        pass
     msg = f"file_path: {file_path}, title: {title}, docs_url: {docs_url}"
     print(msg)
     return {"file_path": file_path, "title": title, "docs_url": docs_url}
@@ -92,7 +95,11 @@ def file_metadata_func(file_path: str) -> Dict:
 
     # Load documents
     documents = SimpleDirectoryReader(
-        args.input_dir, recursive=True, file_metadata=file_metadata_func
+        args.input_dir,
+        recursive=True,
+        required_exts=[".md"],
+        file_extractor={".md": FlatReader()},
+        file_metadata=file_metadata_func
     ).load_data()
 
     # Create chunks/nodes
diff --git a/pdm.lock.cpu b/pdm.lock.cpu
diff --git a/pdm.lock.gpu b/pdm.lock.gpu
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,6 +53,7 @@ dependencies = [
     "llama-index-embeddings-huggingface",
     "llama-index-readers-file",
     "faiss-cpu",
+    "python-frontmatter",
     "aiohttp>=3.8.0",
     "beautifulsoup4>=4.10.0",
 ]
diff --git a/requirements.cpu.txt b/requirements.cpu.txt
@@ -158,9 +158,9 @@ httpcore==1.0.9 \
 httpx==0.28.1 \
     --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \
     --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad
-huggingface-hub[inference]==0.31.1 \
-    --hash=sha256:43f73124819b48b42d140cbc0d7a2e6bd15b2853b1b9d728d4d55ad1750cac5b \
-    --hash=sha256:492bb5f545337aa9e2f59b75ef4c5f535a371e8958a6ce90af056387e67f1180
+huggingface-hub[inference]==0.31.2 \
+    --hash=sha256:7053561376ed7f6ffdaecf09cc54d70dc784ac6315fa4bb9b93e19662b029675 \
+    --hash=sha256:8138cd52aa2326b4429bb00a4a1ba8538346b7b8a808cdce30acb6f1f1bdaeec
 idna==3.10 \
     --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
     --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
@@ -306,9 +306,9 @@ numpy==2.2.5 \
     --hash=sha256:b13f04968b46ad705f7c8a80122a42ae8f620536ea38cf4bdd374302926424dd \
     --hash=sha256:c42365005c7a6c42436a54d28c43fe0e01ca11eb2ac3cefe796c25a5f98e5e9b \
     --hash=sha256:f5045039100ed58fa817a6227a356240ea1b9a1bc141018864c306c1a16d4175
-openai==1.78.0 \
-    --hash=sha256:1ade6a48cd323ad8a7715e7e1669bb97a17e1a5b8a916644261aaef4bf284778 \
-    --hash=sha256:254aef4980688468e96cbddb1f348ed01d274d02c64c6c69b0334bf001fb62b3
+openai==1.78.1 \
+    --hash=sha256:7368bf147ca499804cc408fe68cdb6866a060f38dec961bbc97b04f9d917907e \
+    --hash=sha256:8b26b364531b100df1b961d03560042e5f5be11301d7d49a6cd1a2b9af824dca
 packaging==25.0 \
     --hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
     --hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
@@ -412,6 +412,9 @@ python-dateutil==2.9.0.post0 \
 python-dotenv==1.1.0 \
     --hash=sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5 \
     --hash=sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d
+python-frontmatter==1.1.0 \
+    --hash=sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1 \
+    --hash=sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d
 pytz==2025.2 \
     --hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
     --hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00
diff --git a/requirements.gpu.txt b/requirements.gpu.txt
@@ -158,9 +158,9 @@ httpcore==1.0.9 \
 httpx==0.28.1 \
     --hash=sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc \
     --hash=sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad
-huggingface-hub[inference]==0.31.1 \
-    --hash=sha256:43f73124819b48b42d140cbc0d7a2e6bd15b2853b1b9d728d4d55ad1750cac5b \
-    --hash=sha256:492bb5f545337aa9e2f59b75ef4c5f535a371e8958a6ce90af056387e67f1180
+huggingface-hub[inference]==0.31.2 \
+    --hash=sha256:7053561376ed7f6ffdaecf09cc54d70dc784ac6315fa4bb9b93e19662b029675 \
+    --hash=sha256:8138cd52aa2326b4429bb00a4a1ba8538346b7b8a808cdce30acb6f1f1bdaeec
 idna==3.10 \
     --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
     --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
@@ -355,9 +355,9 @@ nvidia-nvtx-cu12==12.4.127; platform_system == "Linux" and platform_machine == "
     --hash=sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485 \
     --hash=sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a \
     --hash=sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3
-openai==1.78.0 \
-    --hash=sha256:1ade6a48cd323ad8a7715e7e1669bb97a17e1a5b8a916644261aaef4bf284778 \
-    --hash=sha256:254aef4980688468e96cbddb1f348ed01d274d02c64c6c69b0334bf001fb62b3
+openai==1.78.1 \
+    --hash=sha256:7368bf147ca499804cc408fe68cdb6866a060f38dec961bbc97b04f9d917907e \
+    --hash=sha256:8b26b364531b100df1b961d03560042e5f5be11301d7d49a6cd1a2b9af824dca
 packaging==25.0 \
     --hash=sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 \
     --hash=sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f
@@ -461,6 +461,9 @@ python-dateutil==2.9.0.post0 \
 python-dotenv==1.1.0 \
     --hash=sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5 \
     --hash=sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d
+python-frontmatter==1.1.0 \
+    --hash=sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1 \
+    --hash=sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d
 pytz==2025.2 \
     --hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
     --hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,7 @@ dependencies = [`
`53`	`53`	`"llama-index-embeddings-huggingface",`
`54`	`54`	`"llama-index-readers-file",`
`55`	`55`	`"faiss-cpu",`
	`56`	`+ "python-frontmatter",`
`56`	`57`	`"aiohttp>=3.8.0",`
`57`	`58`	`"beautifulsoup4>=4.10.0",`
`58`	`59`	`]`