@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-
+import argparse
 import itertools
 import logging
 import os
@@ -57,6 +57,35 @@
 # makes this assessment faster. Running on more datasets would make it more robust. So it is a tricky trade-off.
 # See a full list of available datasets at https://github.com/beir-cellar/beir?tab=readme-ov-file#beers-available-datasets
 DATASETS = ["scifact"]
+DEFAULT_CUSTOM_DATASETS_URLS = []
+DEFAULT_BATCH_SIZE = 150
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Benchmark embedding models with BEIR datasets")
+
+    parser.add_argument(
+        "--dataset-names",
+        nargs="+",
+        default=DATASETS,
+        help=f"List of BEIR datasets to evaluate (default: {DATASETS})"
+    )
+
+    parser.add_argument(
+        "--custom-datasets-urls",
+        nargs="+",
+        default=DEFAULT_CUSTOM_DATASETS_URLS,
+        help=f"Custom URLs for datasets (default: {DEFAULT_CUSTOM_DATASETS_URLS})"
+    )
+
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=DEFAULT_BATCH_SIZE,
+        help=f"Batch size for injecting documents (default: {DEFAULT_BATCH_SIZE})"
+    )
+
+    return parser.parse_args()
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -68,12 +97,15 @@
 
 
 # Load BEIR dataset
-def load_beir_dataset(dataset_name: str):
-    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
+def load_beir_dataset(dataset_name: str, custom_datasets_pairs: dict):
+    if custom_datasets_pairs == {}:
+        url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
+    else:
+        url = custom_datasets_pairs[dataset_name]
+
     out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
-
     data_path = os.path.join(out_dir, dataset_name)
-    print(data_path)
+
     if not os.path.isdir(data_path):
         data_path = util.download_and_unzip(url, out_dir)
 
@@ -83,7 +115,7 @@ def load_beir_dataset(dataset_name: str):
 
 # Inject documents into LlamaStack vector database
 def inject_documents_llama_stack(
-    llama_stack_client, corpus, vector_db_provider_id, embedding_model_id, chunk_size_in_tokens
+    llama_stack_client, corpus, vector_db_provider_id, embedding_model_id, chunk_size_in_tokens, batch_size
 ):
 
     vector_db_id = f"beir-rag-eval-{uuid.uuid4().hex}"
@@ -100,20 +132,27 @@ def inject_documents_llama_stack(
         embedding_dimension=embedding_dimension,
     )
 
-    # Convert corpus into Documents
-    documents = [
-        Document(
-            document_id=doc_id,
-            content=data["title"] + " " + data["text"],
-            mime_type="text/plain",
-            metadata={},
-        )
-        for doc_id, data in corpus.items()
-    ]
+    # Convert corpus into Documents and process in batches
+    corpus_items = list(corpus.items())
+    total_docs = len(corpus_items)
+
+    for i in range(0, total_docs, batch_size):
+        batch_items = corpus_items[i:i + batch_size]
+        documents_batch = [
+            Document(
+                document_id=doc_id,
+                content=data["title"] + " " + data["text"],
+                mime_type="text/plain",
+                metadata={},
+            )
+            for doc_id, data in batch_items
+        ]
 
-    llama_stack_client.tool_runtime.rag_tool.insert(
-        documents=documents, vector_db_id=vector_db_id, chunk_size_in_tokens=chunk_size_in_tokens, timeout=36000
-    )
+        print(f"Inserting batch {i // batch_size + 1}/{(total_docs + batch_size - 1) // batch_size} ({len(documents_batch)} docs)")
+
+        llama_stack_client.tool_runtime.rag_tool.insert(
+            documents=documents_batch, vector_db_id=vector_db_id, chunk_size_in_tokens=chunk_size_in_tokens, timeout=36000
+        )
 
     return vector_db_id
 
@@ -182,7 +221,7 @@ def make_overlapped_chunks(
 
 
 # Inject documents directly into a Milvus lite vector database using the Milvus APIs
-def inject_documents_milvus(corpus, embedding_model_id, chunk_size_in_tokens):
+def inject_documents_milvus(corpus, embedding_model_id, chunk_size_in_tokens, batch_size):
     collection_name = f"beir_eval_{uuid.uuid4().hex}"
 
     embedding_model = model.dense.SentenceTransformerEmbeddingFunction(model_name=embedding_model_id, device="mps")
@@ -192,14 +231,23 @@ def inject_documents_milvus(corpus, embedding_model_id, chunk_size_in_tokens):
     milvus_client = MilvusClient(db_file)
     milvus_client.create_collection(collection_name=collection_name, dimension=int(embedding_dimension), auto_id=True)
 
-    documents = []
-    for doc_id, data in corpus.items():
-        full_text = data["title"] + " " + data["text"]
-        chunks = llama_stack_style_chunker(full_text, chunk_size_in_tokens)
-        for chunk in chunks:
-            documents.append({"doc_id": doc_id, "vector": embedding_model.encode_documents([chunk])[0], "text": chunk})
+    # Convert corpus into list and process in batches
+    corpus_items = list(corpus.items())
+    total_docs = len(corpus_items)
+
+    for i in range(0, total_docs, batch_size):
+        batch_items = corpus_items[i:i + batch_size]
+        documents_batch = []
+
+        for doc_id, data in batch_items:
+            full_text = data["title"] + " " + data["text"]
+            chunks = llama_stack_style_chunker(full_text, chunk_size_in_tokens)
+            for chunk in chunks:
+                documents_batch.append({"doc_id": doc_id, "vector": embedding_model.encode_documents([chunk])[0], "text": chunk})
+
+        print(f"Inserting batch {i // batch_size + 1}/{(total_docs + batch_size - 1) // batch_size} ({len(documents_batch)} chunks)")
+        milvus_client.insert(collection_name=collection_name, data=documents_batch)
 
-    milvus_client.insert(collection_name=collection_name, data=documents)
     return milvus_client, collection_name, embedding_model
 
 
@@ -362,26 +410,33 @@ def print_scores(all_scores):
 def evaluate_retrieval_with_and_without_llama_stack(
     llama_stack_client,
     datasets,
+    custom_datasets_urls,
     vector_db_provider_id,
     embedding_model_id,
+    batch_size,
     chunk_size_in_tokens=512,
     number_of_search_results=10,
     save_files=False,
 ):
     all_scores = {}
     results_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "results")
+
+    custom_datasets_pairs = {}
+    if custom_datasets_urls:
+        custom_datasets_pairs = {dataset_name: custom_datasets_urls[i] for i, dataset_name in enumerate(datasets)}
+
     for dataset_name in datasets:
         all_scores[dataset_name] = {}
-        corpus, queries, qrels = load_beir_dataset(dataset_name)
-
+        corpus, queries, qrels = load_beir_dataset(dataset_name, custom_datasets_pairs)
+
         # Uncomment this line to select only a few documents for debugging
         #corpus = pick_arbitrary_pairs(corpus)
 
         retrievers = {}
 
         logger.info(f"Ingesting {dataset_name}, LlamaStackRAGRetriever")
         vector_db_id = inject_documents_llama_stack(
-            llama_stack_client, corpus, vector_db_provider_id, embedding_model_id, chunk_size_in_tokens
+            llama_stack_client, corpus, vector_db_provider_id, embedding_model_id, chunk_size_in_tokens, batch_size
         )
 
         # We set max_tokens_in_context=chunk_size_in_tokens*number_of_search_results so that we won't get errors saying that we have too many tokens.
@@ -395,7 +450,7 @@ def evaluate_retrieval_with_and_without_llama_stack(
 
         print(f"Ingesting {dataset_name}, MilvusRetriever")
         milvus_client, collection_name, embedding_model = inject_documents_milvus(
-            corpus, embedding_model_id, chunk_size_in_tokens
+            corpus, embedding_model_id, chunk_size_in_tokens, batch_size
         )
         milvus_retriever = MilvusRetriever(
             milvus_client, collection_name, embedding_model, top_k=number_of_search_results
@@ -462,11 +517,21 @@ def pick_arbitrary_pairs(input_dict, num_pairs=5):
 
 
 if __name__ == "__main__":
+    args = parse_args()
+
+    # If custom dataset URLs are provided, their count must match the number of dataset names
+    if args.custom_datasets_urls and len(args.custom_datasets_urls) != len(args.dataset_names):
+        raise ValueError(
+            f"Number of custom dataset URLs ({len(args.custom_datasets_urls)}) must match "
+            f"number of dataset names ({len(args.dataset_names)}). "
+            f"Got URLs: {args.custom_datasets_urls}, dataset names: {args.dataset_names}"
+        )
+
     llama_stack_client = LlamaStackAsLibraryClient("./run.yaml")
     llama_stack_client.initialize()
 
     all_scores = evaluate_retrieval_with_and_without_llama_stack(
-        llama_stack_client, DATASETS, "milvus", "ibm-granite/granite-embedding-125m-english"
+        llama_stack_client, args.dataset_names, args.custom_datasets_urls, "milvus", "ibm-granite/granite-embedding-125m-english", args.batch_size
     )
     has_significant_difference = print_scores(all_scores)
    if has_significant_difference:
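
For reference, a possible invocation of the CLI flags this diff adds (the script filename, the batch size of 100, and the example URL are assumptions for illustration, not taken from the diff; when --custom-datasets-urls is given, the number of URLs must match the number of dataset names):

    python beir_benchmark.py --dataset-names scifact --batch-size 100
    python beir_benchmark.py --dataset-names scifact --custom-datasets-urls https://example.com/datasets/scifact.zip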