
Commit fc630f4

Merge pull request #25 from Bobbins228/add-pre-commit
Add pre-commit
2 parents 38256fa + 8493c54 commit fc630f4


48 files changed: +1561 additions, −821 deletions

.github/workflows/pre-commit.yaml

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
name: Pre-commit

on:
  pull_request:
  push:
    branches: [main]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  pre-commit:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: '3.12'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml

      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github

      - name: Verify if there are any diff files after pre-commit
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

      - name: Verify if there are any new files after pre-commit
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
            echo "There are uncommitted new files, run pre-commit locally and commit again"
            echo "$unstaged_files"
            exit 1
          fi
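For contributors who want to reproduce the workflow's two verification steps locally before pushing, a minimal sketch (assuming pre-commit is already installed) could look like this:

``` bash
# Run every configured hook against the whole tree
pre-commit run --all-files

# Fail if the hooks modified any tracked file, mirroring the first CI check
git diff --exit-code || echo "Hooks changed tracked files; commit the fixes"

# Fail if the hooks created any new, untracked file, mirroring the second check
untracked=$(git ls-files --others --exclude-standard)
[ -z "$untracked" ] || { echo "Hooks created new files:"; echo "$untracked"; }
```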

.pre-commit-config.yaml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
exclude: 'build/'

default_language_version:
  python: python3.12

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0  # Latest stable version
    hooks:
      - id: check-merge-conflict
        args: ['--assume-in-merge']
      - id: trailing-whitespace
        exclude: '\.py$'  # Exclude Python files as Ruff already handles them
      - id: check-added-large-files
        args: ['--maxkb=1000']
      - id: end-of-file-fixer
        exclude: '^(.*\.svg|.*\.md)$'
      - id: no-commit-to-branch
      - id: check-yaml
        args: ["--unsafe"]
      - id: detect-private-key
      - id: requirements-txt-fixer
      - id: mixed-line-ending
        args: [--fix=lf]  # Force line endings to LF (line feed)
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-shebang-scripts-are-executable

  - repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.4
    hooks:
      - id: insert-license
        files: \.py$|\.sh$
        args:
          - --license-filepath
          - docs/license_header.txt

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.9.4
    hooks:
      - id: ruff
        args: [ --fix ]
        exclude: ^llama_stack/strong_typing/.*$
      - id: ruff-format

  - repo: https://github.com/adamchainz/blacken-docs
    rev: 1.19.0
    hooks:
      - id: blacken-docs
        additional_dependencies:
          - black==24.3.0

ci:
  autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
  autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
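With this config in place, individual hooks can also be run on demand; for example (standard pre-commit CLI usage, with hook ids taken from the config above):

``` bash
# Lint the whole repository with only the Ruff hook
pre-commit run ruff --all-files

# Validate YAML files only
pre-commit run check-yaml --all-files
```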

CONTRIBUTING.md

Lines changed: 19 additions & 0 deletions
@@ -92,3 +92,22 @@ When contributing to the `stack/` directory:
 - **Document any new overlays** or configurations in the [DEPLOYMENT.md](DEPLOYMENT.md) guide
 - **Test deployments** on both OpenShift and Kubernetes when possible
 - **Include resource requirements** in documentation
+
+### pre-commit
+
+This project is configured to run pre-commit on every new PR.
+You can find instructions for installing pre-commit [here](https://pre-commit.com/#installation).
+
+### Setup pre-commit for the RAG project
+
+Run the following command to allow pre-commit to run before each commit:
+
+``` bash
+pre-commit install
+```
+
+To run pre-commit without committing, run:
+
+``` bash
+pre-commit run --all-files
+```
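To keep the pinned hook versions current over time, the standard pre-commit update flow applies (ordinary CLI usage, not part of this PR):

``` bash
# Bump each hook's rev to its latest tagged release
pre-commit autoupdate

# Re-run all hooks to confirm the updated versions still pass
pre-commit run --all-files
```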

DEPLOYMENT.md

Lines changed: 5 additions & 5 deletions
All four hunks below are whitespace-only fixes from the new hooks; each removed line differs from its replacement only by trailing spaces.

@@ -47,7 +47,7 @@ oc patch secret hf-token-secret --type='merge' -p='{"data":{"HF_TOKEN":"'$(echo

 ```bash
 # Create secret llama-stack-inference-model-secret providing model info
-# Important: 
+# Important:
 # - Make sure that the value for INFERENCE_MODEL is correct (it doesn't have points)
 # - In VLLM_URL you can use internal or external endpoints for the model. Add /v1 at the end
 # - NEVER set VLLM_TLS_VERIFY=false in production
@@ -60,8 +60,8 @@ oc create secret generic llama-stack-inference-model-secret \
   --from-literal INFERENCE_MODEL="$INFERENCE_MODEL" \
   --from-literal VLLM_URL="$VLLM_URL" \
   --from-literal VLLM_TLS_VERIFY="$VLLM_TLS_VERIFY" \
-  --from-literal VLLM_API_TOKEN="$VLLM_API_TOKEN" 
-
+  --from-literal VLLM_API_TOKEN="$VLLM_API_TOKEN"
+
 # Deploy the LlamaStackDistribution
 oc apply -k stack/overlays/vllm-remote-inference-model
 ```
@@ -267,7 +267,7 @@ To completely remove the project and all its resources from OpenShift, follow th
 ```bash
 # Check for processes using port 8080
 lsof -i :8080
-
+
 # Kill the process if found (replace PID with the actual process ID)
 kill <PID>
 ```
@@ -278,4 +278,4 @@ After completing these steps, all resources associated with the RAG stack will b

 - [OpenShift Documentation](https://docs.openshift.com/)
 - [KServe Documentation](https://kserve.github.io/website/)
-- [vLLM Documentation](https://vllm.readthedocs.io/)
+- [vLLM Documentation](https://vllm.readthedocs.io/)
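As a quick sanity check after creating the inference-model secret shown in the first hunk, the stored values can be decoded with standard oc and base64 usage (a sketch, not part of this diff):

``` bash
# Inspect one key of the secret to confirm it was stored correctly
oc get secret llama-stack-inference-model-secret \
  -o jsonpath='{.data.INFERENCE_MODEL}' | base64 -d
```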

README.md

Lines changed: 1 addition & 1 deletion
A whitespace-only change; the overview sentence is otherwise identical:

@@ -1,7 +1,7 @@
 # RAG

 ## Project Overview
-The RAG project serves as a repository of comprehensive demonstrations, benchmarking scripts, and deployment guides for the RAG Stack on Kubernetes/OpenShift. 
+The RAG project serves as a repository of comprehensive demonstrations, benchmarking scripts, and deployment guides for the RAG Stack on Kubernetes/OpenShift.

 ## Getting Started
 ### Deployment

benchmarks/embedding-models-with-beir/benchmark_beir_embedding_models.py

Lines changed: 28 additions & 4 deletions
@@ -1,3 +1,17 @@
+# Copyright 2025 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import os
 import uuid
@@ -92,7 +106,11 @@ def load_beir_dataset(dataset_name: str, custom_datasets_pairs: dict):


 def inject_documents(
-    llama_stack_client: LlamaStackAsLibraryClient, corpus: dict, batch_size: int, vector_db_provider_id: str, embedding_model: str
+    llama_stack_client: LlamaStackAsLibraryClient,
+    corpus: dict,
+    batch_size: int,
+    vector_db_provider_id: str,
+    embedding_model: str,
 ) -> str:
     vector_db_id = f"beir-rag-eval-{embedding_model}-{uuid.uuid4().hex}"

@@ -137,7 +155,9 @@ def inject_documents(

 # LlamaStack RAG Retriever
 class LlamaStackRAGRetriever:
-    def __init__(self, vector_db_id: str, query_config: RAGQueryConfig, top_k: int = 10):
+    def __init__(
+        self, vector_db_id: str, query_config: RAGQueryConfig, top_k: int = 10
+    ):
         self.llama_stack_client = llama_stack_client
         self.vector_db_id = vector_db_id
         self.query_config = query_config
@@ -163,7 +183,9 @@ def retrieve(self, queries, top_k=None):


 # Adapted from https://github.com/opendatahub-io/llama-stack-demos/blob/main/demos/rag_eval/Agentic_RAG_with_reference_eval.ipynb
-def permutation_test_for_paired_samples(scores_a: list, scores_b: list, iterations: int = 10_000):
+def permutation_test_for_paired_samples(
+    scores_a: list, scores_b: list, iterations: int = 10_000
+):
     """
     Performs a permutation test of a given statistic on provided data.
     """
@@ -184,7 +206,9 @@ def _statistic(x, y, axis):


 # Adapted from https://github.com/opendatahub-io/llama-stack-demos/blob/main/demos/rag_eval/Agentic_RAG_with_reference_eval.ipynb
-def print_stats_significance(scores_a: list, scores_b: list, overview_label: str, label_a: str, label_b: str):
+def print_stats_significance(
+    scores_a: list, scores_b: list, overview_label: str, label_a: str, label_b: str
+):
     mean_score_a = np.mean(scores_a)
     mean_score_b = np.mean(scores_b)

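The added header and re-wrapped signatures above are consistent with what the new insert-license and ruff-format hooks produce. To apply just those hooks to a single file locally, something like this should work (standard pre-commit CLI usage, a sketch):

``` bash
# Insert the license header, then format, for one file only
pre-commit run insert-license --files benchmarks/embedding-models-with-beir/benchmark_beir_embedding_models.py
pre-commit run ruff-format --files benchmarks/embedding-models-with-beir/benchmark_beir_embedding_models.py
```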

benchmarks/llama-stack-rag-with-beir/README.md

Lines changed: 2 additions & 2 deletions
Both hunks are trailing-whitespace fixes; the removed and added lines are otherwise identical.

@@ -25,7 +25,7 @@ llama stack build --template ollama --image-type venv
 ```

 ### About the run.yaml file
-* The run.yaml file makes use of Milvus inline as its vector database. 
+* The run.yaml file makes use of Milvus inline as its vector database.
 * There are 3 default embedding models: `ibm-granite/granite-embedding-125m-english`, `ibm-granite/granite-embedding-30m-english`, and `all-MiniLM-L6-v2`.

 To add your own embedding models you can update the `models` section of the `run.yaml` file.
@@ -116,7 +116,7 @@ INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" uv run python benchmark_beir_

 ``` text
 dataset-name.zip/
-├── qrels/ 
+├── qrels/
 │   └── test.tsv        # Relevance judgments mapping query IDs to document IDs with relevance scores
 ├── corpus.jsonl        # Document collection with document IDs, titles, and text content
 └── queries.jsonl       # Test queries with query IDs and question text for retrieval evaluation
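Given that layout, a custom dataset archive can be assembled so the three entries sit at the zip root (the my_*.tsv and my_*.jsonl input names below are hypothetical, shown only to illustrate the expected structure):

``` bash
# Stage the expected layout, then zip it with qrels/ at the archive root
mkdir -p dataset-name/qrels
cp my_qrels.tsv dataset-name/qrels/test.tsv
cp my_corpus.jsonl dataset-name/corpus.jsonl
cp my_queries.jsonl dataset-name/queries.jsonl
(cd dataset-name && zip -r ../dataset-name.zip qrels corpus.jsonl queries.jsonl)
```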
