Skip to content

Commit 044a445

Browse files
committed
First pass of embedding vLLM backend
1 parent bb62146 commit 044a445

File tree

8 files changed

+356
-14
lines changed

8 files changed

+356
-14
lines changed

nemo_retriever/README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,45 @@ To stop and remove both stacks:
122122
docker compose -p ingest-gpu0 down
123123
docker compose -p ingest-gpu1 down
124124
```
125+
126+
## Embedding backends
127+
128+
Embeddings can be served by a **remote HTTP endpoint** (NIM, vLLM, or any OpenAI-compatible server) or by a **local HuggingFace model** when no endpoint is configured.
129+
130+
- **Config**: Set `embedding_nim_endpoint` in `ingest-config.yaml` or stage config (e.g. `http://localhost:8000/v1`). Leave empty or null to use the local HF embedder.
131+
- **CLI**: Use `--embed-invoke-url` (inprocess/batch pipelines) or `--embedding-endpoint` / `--embedding-http-endpoint` (recall CLI) to point at a remote server.
132+
133+
### Using vLLM for embeddings
134+
135+
You can serve an embedding model with [vLLM](https://docs.vllm.ai/) and point the retriever at it. vLLM exposes an OpenAI-compatible `/v1/embeddings` API. Set the embedding endpoint to the vLLM base URL (e.g. `http://localhost:8000/v1`).
136+
137+
**vLLM compatibility**: The default NIM-style client sends `input_type` and `truncate` in the request body; some vLLM versions or configs may not accept these. When using a **vLLM** server, enable the vLLM-compatible payload:
138+
139+
- **Ingest**: `--embed-use-vllm-compat` (inprocess pipeline) or set `embed_use_vllm_compat: true` in `EmbedParams`.
140+
- **Recall**: `--embedding-use-vllm-compat` (recall CLI).
141+
142+
This sends only `model`, `input`, and `encoding_format` (minimal OpenAI-compatible payload).
143+
144+
### llama-nemotron-embed-1b-v2 with vLLM
145+
146+
For **nvidia/llama-nemotron-embed-1b-v2**, follow the model’s official vLLM instructions:
147+
148+
1. Use **vllm==0.11.0**.
149+
2. Clone the [model repo](https://huggingface.co/nvidia/llama-nemotron-embed-1b-v2) and **overwrite `config.json` with `config_vllm.json`** from that repo.
150+
3. Start the server (replace `<path_to_the_cloned_repository>` and `<num_gpus_to_use>`):
151+
152+
```bash
153+
vllm serve \
154+
<path_to_the_cloned_repository> \
155+
--trust-remote-code \
156+
--runner pooling \
157+
--model-impl vllm \
158+
--override-pooler-config '{"pooling_type": "MEAN"}' \
159+
--data-parallel-size <num_gpus_to_use> \
160+
--dtype float32 \
161+
--port 8000
162+
```
163+
164+
4. Set the retriever embedding endpoint to `http://localhost:8000/v1` and use `--embed-use-vllm-compat` / `--embedding-use-vllm-compat` as above.
165+
166+
See the [model README](https://huggingface.co/nvidia/llama-nemotron-embed-1b-v2) for the canonical vLLM setup and client example.

nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ def main(
173173
min=0.0,
174174
help="Parse stage batch size (enables Parse-only path when > 0.0 with parse workers/GPU).",
175175
),
176+
embed_use_vllm_compat: bool = typer.Option(
177+
False,
178+
"--embed-use-vllm-compat/--no-embed-use-vllm-compat",
179+
help="Use vLLM-compatible HTTP payload for embeddings (no input_type/truncate). "
180+
"Set when --embed-invoke-url is a vLLM server.",
181+
),
176182
embed_modality: str = typer.Option(
177183
"text",
178184
"--embed-modality",
@@ -212,6 +218,7 @@ def main(
212218
EmbedParams(
213219
model_name=str(embed_model_name),
214220
embed_invoke_url=embed_invoke_url,
221+
embed_use_vllm_compat=embed_use_vllm_compat,
215222
embed_modality=embed_modality,
216223
text_elements_modality=text_elements_modality,
217224
structured_elements_modality=structured_elements_modality,
@@ -238,6 +245,7 @@ def main(
238245
EmbedParams(
239246
model_name=str(embed_model_name),
240247
embed_invoke_url=embed_invoke_url,
248+
embed_use_vllm_compat=embed_use_vllm_compat,
241249
embed_modality=embed_modality,
242250
text_elements_modality=text_elements_modality,
243251
structured_elements_modality=structured_elements_modality,
@@ -280,6 +288,7 @@ def main(
280288
EmbedParams(
281289
model_name=str(embed_model_name),
282290
embed_invoke_url=embed_invoke_url,
291+
embed_use_vllm_compat=embed_use_vllm_compat,
283292
embed_modality=embed_modality,
284293
text_elements_modality=text_elements_modality,
285294
structured_elements_modality=structured_elements_modality,
@@ -321,6 +330,7 @@ def main(
321330
EmbedParams(
322331
model_name=str(embed_model_name),
323332
embed_invoke_url=embed_invoke_url,
333+
embed_use_vllm_compat=embed_use_vllm_compat,
324334
embed_modality=embed_modality,
325335
text_elements_modality=text_elements_modality,
326336
structured_elements_modality=structured_elements_modality,
@@ -379,6 +389,7 @@ def main(
379389
embedding_http_endpoint=embed_invoke_url,
380390
top_k=10,
381391
ks=(1, 5, 10),
392+
embedding_use_vllm_compat=bool(embed_use_vllm_compat),
382393
)
383394

384395
_df_query, _gold, _raw_hits, _retrieved_keys, metrics = retrieve_and_score(query_csv=query_csv, cfg=cfg)

nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ def _embed_group(
240240
inference_batch_size: int,
241241
output_column: str,
242242
resolved_model_name: str,
243+
use_vllm_compat: bool = False,
243244
) -> pd.DataFrame:
244245
"""Embed a single modality group via ``create_text_embeddings_for_df``.
245246
@@ -285,14 +286,17 @@ def _embed(texts: Sequence[str]) -> Sequence[Sequence[float]]: # noqa: F811
285286
embed_modality=group_modality,
286287
)
287288

289+
task_config = {
290+
"embedder": _embed,
291+
"multimodal_embedder": _multimodal_embedder,
292+
"endpoint_url": endpoint,
293+
"local_batch_size": int(inference_batch_size),
294+
}
295+
if use_vllm_compat:
296+
task_config["use_vllm_compat"] = True
288297
out_df, _ = create_text_embeddings_for_df(
289298
group_df,
290-
task_config={
291-
"embedder": _embed,
292-
"multimodal_embedder": _multimodal_embedder,
293-
"endpoint_url": endpoint,
294-
"local_batch_size": int(inference_batch_size),
295-
},
299+
task_config=task_config,
296300
transform_config=cfg,
297301
)
298302
return out_df
@@ -307,6 +311,7 @@ def embed_text_main_text_embed(
307311
model_name: Optional[str] = None,
308312
embedding_endpoint: Optional[str] = None,
309313
embed_invoke_url: Optional[str] = None,
314+
embed_use_vllm_compat: bool = False,
310315
text_column: str = "text",
311316
inference_batch_size: int = 16,
312317
output_column: str = "text_embeddings_1b_v2",
@@ -372,6 +377,7 @@ def embed_text_main_text_embed(
372377
inference_batch_size=inference_batch_size,
373378
output_column=output_column,
374379
resolved_model_name=_resolved_model_name,
380+
use_vllm_compat=bool(embed_use_vllm_compat),
375381
)
376382
else:
377383
# Multiple modalities: group, embed each, reassemble in original order.
@@ -390,6 +396,7 @@ def embed_text_main_text_embed(
390396
inference_batch_size=inference_batch_size,
391397
output_column=output_column,
392398
resolved_model_name=_resolved_model_name,
399+
use_vllm_compat=bool(embed_use_vllm_compat),
393400
)
394401
parts.append(part)
395402
out_df = pd.concat(parts).sort_index()

nemo_retriever/src/nemo_retriever/params/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ class EmbedParams(_ParamsModel):
181181
model_name: Optional[str] = None
182182
embedding_endpoint: Optional[str] = None
183183
embed_invoke_url: Optional[str] = None
184+
embed_use_vllm_compat: bool = False # Use vLLM-compatible HTTP payload when using remote endpoint
184185
input_type: str = "passage"
185186
embed_modality: str = "text" # "text", "image", or "text_image" — default for all element types
186187
text_elements_modality: Optional[str] = None # per-type override for page-text rows

nemo_retriever/src/nemo_retriever/recall/core.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ class RecallConfig:
4646
local_hf_device: Optional[str] = None
4747
local_hf_cache_dir: Optional[str] = None
4848
local_hf_batch_size: int = 64
49+
# When True and an HTTP embedding endpoint is set, use vLLM-compatible minimal
50+
# payload (no input_type/truncate). Set this when the endpoint is a vLLM server.
51+
embedding_use_vllm_compat: bool = False
4952

5053

5154
def _normalize_query_df(df: pd.DataFrame) -> pd.DataFrame:
@@ -106,6 +109,28 @@ def _resolve_embedding_endpoint(cfg: RecallConfig) -> Tuple[Optional[str], Optio
106109
return None, None
107110

108111

112+
def _embed_queries_vllm_http(
113+
queries: List[str],
114+
*,
115+
endpoint: str,
116+
model: str,
117+
api_key: str,
118+
batch_size: int = 256,
119+
) -> List[List[float]]:
120+
"""Embed queries via vLLM-compatible HTTP (minimal payload, no input_type/truncate)."""
121+
from nemo_retriever.text_embed.vllm_http import embed_via_vllm_http
122+
123+
# llama-nemotron-embed-1b-v2 expects "query: " prefix for queries (see model README).
124+
return embed_via_vllm_http(
125+
queries,
126+
endpoint_url=endpoint,
127+
model_name=model,
128+
api_key=(api_key or "").strip() or None,
129+
batch_size=batch_size,
130+
prefix="query: ",
131+
)
132+
133+
109134
def _embed_queries_nim(
110135
queries: List[str],
111136
*,
@@ -297,13 +322,22 @@ def retrieve_and_score(
297322

298323
endpoint, use_grpc = _resolve_embedding_endpoint(cfg)
299324
if endpoint is not None and use_grpc is not None:
300-
vectors = _embed_queries_nim(
301-
queries,
302-
endpoint=endpoint,
303-
model=cfg.embedding_model,
304-
api_key=cfg.embedding_api_key,
305-
grpc=bool(use_grpc),
306-
)
325+
if bool(cfg.embedding_use_vllm_compat) and not use_grpc:
326+
vectors = _embed_queries_vllm_http(
327+
queries,
328+
endpoint=endpoint,
329+
model=cfg.embedding_model,
330+
api_key=cfg.embedding_api_key,
331+
batch_size=256,
332+
)
333+
else:
334+
vectors = _embed_queries_nim(
335+
queries,
336+
endpoint=endpoint,
337+
model=cfg.embedding_model,
338+
api_key=cfg.embedding_api_key,
339+
grpc=bool(use_grpc),
340+
)
307341
else:
308342
vectors = _embed_queries_local_hf(
309343
queries,

nemo_retriever/src/nemo_retriever/recall/vdb_recall.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ def recall_with_main(
131131
min=1,
132132
help="Batch size for local HF embedding inference.",
133133
),
134+
embedding_use_vllm_compat: bool = typer.Option(
135+
False,
136+
"--embedding-use-vllm-compat/--no-embedding-use-vllm-compat",
137+
help="Use vLLM-compatible HTTP payload (no input_type/truncate). Set when endpoint is a vLLM server.",
138+
),
134139
) -> None:
135140
query_csv = _resolve_query_csv(Path(query_csv))
136141

@@ -155,6 +160,7 @@ def recall_with_main(
155160
local_hf_device=_coerce_endpoint_str(local_hf_device),
156161
local_hf_cache_dir=(str(local_hf_cache_dir) if local_hf_cache_dir is not None else None),
157162
local_hf_batch_size=int(local_hf_batch_size),
163+
embedding_use_vllm_compat=bool(embedding_use_vllm_compat),
158164
)
159165

160166
print("Reading and normalizing query CSV...")
@@ -251,6 +257,11 @@ def run(
251257
min=1,
252258
help="Batch size for local HF embedding inference.",
253259
),
260+
embedding_use_vllm_compat: bool = typer.Option(
261+
False,
262+
"--embedding-use-vllm-compat/--no-embedding-use-vllm-compat",
263+
help="Use vLLM-compatible HTTP payload (no input_type/truncate). Set when endpoint is a vLLM server.",
264+
),
254265
print_hits: bool = typer.Option(True, "--print-hits/--no-print-hits", help="Print top-k hits per query."),
255266
) -> None:
256267
"""
@@ -282,6 +293,7 @@ def run(
282293
local_hf_device=_coerce_endpoint_str(local_hf_device),
283294
local_hf_cache_dir=(str(local_hf_cache_dir) if local_hf_cache_dir is not None else None),
284295
local_hf_batch_size=int(local_hf_batch_size),
296+
embedding_use_vllm_compat=bool(embedding_use_vllm_compat),
285297
)
286298

287299
df_query, gold, raw_hits, retrieved_keys, metrics = retrieve_and_score(

nemo_retriever/src/nemo_retriever/text_embed/main_text_embed.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,42 @@ def _async_runner(
455455
return flat_results
456456

457457

458+
def _vllm_compat_runner(
459+
prompts: List[List[str]],
460+
api_key: Optional[str],
461+
endpoint_url: str,
462+
embedding_model: str,
463+
encoding_format: str,
464+
dimensions: Optional[int] = None,
465+
batch_size: int = 256,
466+
) -> dict:
467+
"""
468+
Request embeddings using vLLM-compatible minimal payload (no input_type/truncate).
469+
Returns the same {"embeddings": [...], "info_msgs": [...]} shape as _async_runner.
470+
"""
471+
from nemo_retriever.text_embed.vllm_http import embed_via_vllm_http
472+
473+
flat_prompts: List[str] = []
474+
for batch in prompts:
475+
flat_prompts.extend(batch)
476+
if not flat_prompts:
477+
return {"embeddings": [], "info_msgs": []}
478+
# llama-nemotron-embed-1b-v2 expects "passage: " for documents (see model README).
479+
vectors = embed_via_vllm_http(
480+
flat_prompts,
481+
endpoint_url=endpoint_url,
482+
model_name=embedding_model,
483+
api_key=api_key,
484+
dimensions=dimensions,
485+
encoding_format=encoding_format,
486+
batch_size=batch_size,
487+
prefix="passage: ",
488+
)
489+
# Normalize to list of list (or None for missing)
490+
embeddings = [v if v else None for v in vectors]
491+
return {"embeddings": embeddings, "info_msgs": [None] * len(embeddings)}
492+
493+
458494
def _callable_runner(
459495
prompts: List[List[str]],
460496
*,
@@ -656,7 +692,17 @@ def _text_image_content(r: pd.Series) -> Optional[str]:
656692
filtered_content_list, batch_size=int(transform_config.batch_size)
657693
)
658694

659-
if endpoint_url:
695+
if endpoint_url and task_config.get("use_vllm_compat"):
696+
content_embeddings = _vllm_compat_runner(
697+
filtered_content_batches,
698+
api_key=api_key,
699+
endpoint_url=str(endpoint_url),
700+
embedding_model=str(model_name),
701+
encoding_format=str(transform_config.encoding_format),
702+
dimensions=dimensions,
703+
batch_size=int(transform_config.batch_size),
704+
)
705+
elif endpoint_url:
660706
content_embeddings = _async_runner(
661707
filtered_content_batches,
662708
api_key,

0 commit comments

Comments
 (0)