Skip to content

Commit df98fc3

Browse files
committed
feat: init hf node with minLM embedding model
1 parent dac8402 commit df98fc3

File tree

16 files changed

+928
-196
lines changed

16 files changed

+928
-196
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
.env
33
.env.local
44
.env.*.local
5-
5+
models/
6+
myenv/
67
# Python
78
__pycache__/
89
*.py[cod]

dfx/components/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""DFX components for Hugging Face models node."""
2+
3+
from dfx.components.minilm_embeddings import MiniLMEmbeddingsComponent
4+
5+
__all__ = ["MiniLMEmbeddingsComponent"]
6+
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/usr/bin/env python3
"""Utility runner for the locally stored all-MiniLM-L6-V2 embeddings model."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Iterable, List, Tuple
import subprocess

# Import guard: this script is useless without sentence-transformers, so fail
# fast at import time with an actionable install hint instead of a traceback.
try:
    from sentence_transformers import SentenceTransformer  # type: ignore[import-not-found]
except ImportError as exc:  # pragma: no cover - import guard
    raise SystemExit(
        "sentence-transformers must be installed to run this script. "
        "Install it with `uv pip install sentence-transformers`."
    ) from exc
19+
20+
21+
# Folder name of the local model copy under <repo>/models/.
MODEL_NAME = "all-MiniLM-L6-V2"
# Repo root: this file lives at <root>/dfx/components/all-MiniLM-L6-V2/run.py,
# i.e. three directories below the root.
ROOT_DIR = Path(__file__).resolve().parents[3]
# Local directory holding the downloaded model assets.
MODEL_PATH = ROOT_DIR / "models" / MODEL_NAME
# Hugging Face repository id used when downloading.
HF_MODEL_ID = "sentence-transformers/all-MiniLM-L6-V2"
# Helper shell script that performs the actual download.
DOWNLOAD_SCRIPT = ROOT_DIR / "scripts" / "download.sh"
26+
27+
28+
def ensure_model(model_path: Path = MODEL_PATH) -> Path:
    """Return the local MiniLM model directory, downloading it on first use.

    Args:
        model_path: Directory expected to hold the model assets.

    Returns:
        The (possibly freshly downloaded) model directory.

    Raises:
        FileNotFoundError: If the download helper script is missing.
        RuntimeError: If the download fails or leaves no assets behind.
    """
    # Fast path: assets are already on disk.
    if model_path.exists():
        return model_path

    if not DOWNLOAD_SCRIPT.exists():
        raise FileNotFoundError(f"Download script not found at {DOWNLOAD_SCRIPT}")

    model_path.parent.mkdir(parents=True, exist_ok=True)

    command = [str(DOWNLOAD_SCRIPT), HF_MODEL_ID, str(model_path)]
    proc = subprocess.run(command, check=False, capture_output=True, text=True)

    if proc.returncode != 0:
        # Prefer stderr, then stdout, then the bare exit code as the reason.
        detail = (
            proc.stderr.strip()
            or proc.stdout.strip()
            or f"exit code {proc.returncode}"
        )
        raise RuntimeError(f"Failed to download {HF_MODEL_ID}: {detail}")

    if not model_path.exists():
        raise RuntimeError(f"Download reported success but assets missing at {model_path}")

    return model_path
51+
52+
53+
def load_model(model_path: Path = MODEL_PATH) -> SentenceTransformer:
    """Instantiate the SentenceTransformer from the local models directory.

    Triggers a download first when the assets are not yet on disk.
    """
    resolved = ensure_model(model_path)
    return SentenceTransformer(str(resolved))
57+
58+
59+
def run(texts: Iterable[str]) -> List[List[float]]:
    """Embed every non-blank text and return plain-list vectors.

    Args:
        texts: Candidate strings; blank/whitespace-only entries are dropped.

    Returns:
        One embedding (list of floats) per retained text, in input order.

    Raises:
        ValueError: When no non-blank text remains after filtering.
    """
    # str.strip() is falsy for whitespace-only strings, so filter drops them.
    retained = list(filter(str.strip, texts))
    if not retained:
        raise ValueError("At least one non-empty text input is required.")

    vectors = load_model().encode(retained)
    return [vector.tolist() for vector in vectors]
68+
69+
70+
def main() -> None:
    """CLI entry point: embed the given texts and print a JSON report.

    The report contains the model name, the texts that were embedded, and one
    embedding vector per text.
    """
    parser = argparse.ArgumentParser(
        description="Run the all-MiniLM-L6-V2 SentenceTransformer from the local cache."
    )
    parser.add_argument(
        "texts",
        nargs="*",
        default=[
            "LangFlow makes it easy to orchestrate AI workflows.",
            "Droq nodes can host specialized ML models.",
        ],
        help="Texts to embed. Defaults to two demo sentences.",
    )
    args = parser.parse_args()

    # Fix: run() silently drops blank/whitespace-only texts, so filter here as
    # well; otherwise "inputs" and "embeddings" could disagree in length.
    texts = [text for text in args.texts if text.strip()]
    result = {
        "model": MODEL_NAME,
        "inputs": texts,
        "embeddings": run(texts),
    }
    print(json.dumps(result, indent=2))
91+
92+
93+
# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
95+
96+
97+
def get_component_runner() -> Tuple[str, str, str, SentenceTransformer]:
    """Describe this component and hand back a ready-to-use runner.

    Returns:
        Tuple of (task, model_id, runner_kind, runner_instance).
    """
    runner = load_model()
    task, runner_kind = "embeddings", "sentence_transformer"
    return (task, HF_MODEL_ID, runner_kind, runner)
106+
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""Langflow component for all-MiniLM-L6-V2 embeddings model."""
2+
3+
from __future__ import annotations
4+
5+
import subprocess
6+
from pathlib import Path
7+
from typing import List, Tuple
8+
9+
from lfx.base.embeddings.model import LCEmbeddingsModel
10+
from lfx.field_typing import Embeddings
11+
from lfx.io import IntInput
12+
13+
try:
14+
from sentence_transformers import SentenceTransformer # type: ignore[import-not-found]
15+
except ImportError:
16+
SentenceTransformer = None # type: ignore
17+
18+
ROOT_DIR = Path(__file__).resolve().parents[2]
19+
MODELS_DIR = ROOT_DIR / "models"
20+
MODEL_NAME = "all-MiniLM-L6-V2"
21+
HF_MODEL_ID = "sentence-transformers/all-MiniLM-L6-V2"
22+
DOWNLOAD_SCRIPT = ROOT_DIR / "scripts" / "download.sh"
23+
24+
25+
class MiniLMEmbeddings(Embeddings):
    """Embeddings wrapper for local execution of a SentenceTransformer model.

    NOTE(review): this class was previously also named
    ``MiniLMEmbeddingsComponent``; the Langflow component class below then
    redefined (and silently shadowed) it, so ``build_embeddings`` actually
    re-instantiated the component class with a ``Path`` argument. Renaming
    the wrapper fixes the collision while keeping the module's public name
    ``MiniLMEmbeddingsComponent`` bound to the component, as before.
    """

    def __init__(self, model_path: Path) -> None:
        """Load the SentenceTransformer from the local *model_path*.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if SentenceTransformer is None:
            msg = "sentence-transformers must be installed. Install it with: uv pip install sentence-transformers"
            raise ImportError(msg)
        self.model = SentenceTransformer(str(model_path))
        self.model_path = model_path

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of documents (empty list -> [])."""
        if not texts:
            return []
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return [embedding.tolist() for embedding in embeddings]

    def embed_query(self, text: str) -> List[float]:
        """Generate an embedding for a single query text (empty text -> [])."""
        if not text:
            return []
        embedding = self.model.encode(text, convert_to_numpy=True)
        return embedding.tolist()


class MiniLMEmbeddingsComponent(LCEmbeddingsModel):
    """DroqFlow component for all-MiniLM-L6-V2 embeddings using local model."""

    display_name = "MiniLM Embeddings"
    description = "Generate embeddings using the locally stored all-MiniLM-L6-V2 SentenceTransformer model."
    documentation: str = "https://www.sbert.net/docs/pretrained_models.html"
    icon = "binary"
    name = "MiniLMEmbeddingsComponent"
    category = "models"

    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="Number of texts to embed in a single batch.",
            advanced=True,
            value=32,
        ),
    ]

    def build_embeddings(self) -> Embeddings:
        """Build and return the local MiniLM embeddings wrapper.

        Fix: due to the class-name collision described above, this previously
        constructed ``MiniLMEmbeddingsComponent`` itself (this component
        class) with a ``Path`` positional argument instead of the wrapper.
        """
        model_path = ensure_model()
        return MiniLMEmbeddings(model_path)


def ensure_model() -> Path:
    """Ensure the MiniLM model assets exist locally, downloading if necessary.

    Returns:
        Resolved path to the local model directory.

    Raises:
        FileNotFoundError: If the download helper script is missing.
        RuntimeError: If the download fails or leaves no assets behind.
    """
    target_dir = MODELS_DIR / MODEL_NAME
    if target_dir.exists():
        return target_dir.resolve()

    if not DOWNLOAD_SCRIPT.exists():
        raise FileNotFoundError(f"Download script not found at {DOWNLOAD_SCRIPT}")

    target_dir.parent.mkdir(parents=True, exist_ok=True)
    result = subprocess.run(
        [str(DOWNLOAD_SCRIPT), HF_MODEL_ID, str(target_dir)],
        check=False,  # return code is inspected below to build a richer error
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        detail = result.stderr.strip() or result.stdout.strip() or f"exit code {result.returncode}"
        raise RuntimeError(f"Failed to download {HF_MODEL_ID}: {detail}")

    if not target_dir.exists():
        raise RuntimeError(f"Download reported success but assets missing at {target_dir}")

    return target_dir.resolve()


def get_component_runner() -> Tuple[str, str, str, SentenceTransformer]:
    """Return task, model identifier, runner kind, and runner instance.

    Fix: the return annotation previously claimed the component class while
    the function actually returns a raw ``SentenceTransformer``.

    Returns:
        (task, model_id, runner_kind, runner_instance)

    Raises:
        ImportError: If sentence-transformers is not installed.
    """
    model_path = ensure_model()
    if SentenceTransformer is None:
        msg = "sentence-transformers must be installed. Install it with: uv pip install sentence-transformers"
        raise ImportError(msg)
    runner = SentenceTransformer(str(model_path))
    return ("embeddings", HF_MODEL_ID, "sentence_transformer", runner)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Example usage of the all-MiniLM-L6-V2 local embedding runner."""
2+
3+
from __future__ import annotations
4+
5+
import importlib.util
6+
import json
7+
from pathlib import Path
8+
9+
ROOT_DIR = Path(__file__).resolve().parents[2]
10+
RUNNER_PATH = ROOT_DIR / "dfx" / "components" / "all-MiniLM-L6-V2" / "run.py"
11+
12+
13+
def load_runner():
14+
spec = importlib.util.spec_from_file_location("all_minilm_runner", RUNNER_PATH)
15+
if spec is None or spec.loader is None:
16+
raise RuntimeError(f"Unable to load runner from {RUNNER_PATH}")
17+
module = importlib.util.module_from_spec(spec)
18+
spec.loader.exec_module(module)
19+
return module
20+
21+
22+
def main() -> None:
    """Embed two demo sentences via the file-loaded runner and print a summary."""
    demo_texts = [
        "AgentQL enables structured data extraction from web sources.",
        "Sentence transformers provide strong embeddings out-of-the-box.",
    ]
    module = load_runner()
    vectors = module.run(demo_texts)
    summary = {
        "inputs": demo_texts,
        "embedding_shapes": [len(vec) for vec in vectors],
    }
    print(json.dumps(summary, indent=2))
30+
31+
32+
# Allow running this example directly as a script.
if __name__ == "__main__":
    main()
34+

node.json

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
{
22
"version": "0.1.0",
3-
"node_id": "dfx-base-node-template-py",
4-
"name": "Droq Base Node Template",
5-
"description": "A Python boilerplate template for creating new DFX nodes.",
3+
"node_id": "dfx-hf-models-node",
4+
"name": "DFX Hugging Face Models Node",
5+
"description": "A node providing local execution of Hugging Face models as DroqFlow components.",
66
"author": "Droq Team",
7-
"api_url": "http://localhost:8003",
7+
"api_url": "http://localhost:8006",
88
"created_at": "2025-11-17T18:44:00Z",
99
"ip_address": "127.0.0.1",
1010
"status": "active",
11-
"docker_image": "droqai/dfx-base-node-template:latest",
11+
"docker_image": "droq-ai/dfx-hf-models-node:latest",
1212
"deployment_location": "local",
13-
"source_code_location": "https://github.com/droq-ai/dfx-base-node-template-py",
13+
"source_code_location": "https://github.com/droq-ai/dfx-hf-models-node",
1414
"components": {
15-
"DFXExampleComponent": {
16-
"path": "dfx.components.example",
17-
"description": "This is a sample droq component",
15+
"MiniLMEmbeddingsComponent": {
16+
"path": "dfx.components.minilm_embeddings",
17+
"description": "Generate embeddings using the locally stored all-MiniLM-L6-V2 SentenceTransformer model.",
1818
"author": "Droq Team"
1919
}
2020
}

pyproject.toml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,16 @@ keywords = ["droq", "node", "template", "workflow"]
2020
# ]
2121

2222
dependencies = [
23+
"fastapi>=0.115.0,<1.0.0", # API surface
24+
"uvicorn[standard]>=0.34.0,<1.0.0", # ASGI server
25+
"pydantic>=2.7.0,<3.0.0",
26+
"lfx",
2327
"nats-py>=2.3.0", # For NATS messaging and JetStream
2428
"aiohttp>=3.8.0", # For HTTP/API calls
29+
"sentence-transformers>=3.0.1,<4.0.0", # For running local Hugging Face embedding models
30+
"transformers>=4.46.0,<5.0.0", # Ensure modern tokenizers wheels are available
31+
"torch>=2.3.0,<3.0.0", # Required backend for sentence-transformers
32+
"huggingface-hub>=0.24.0", # For downloading Hugging Face models
2533
]
2634

2735
[project.optional-dependencies]
@@ -40,8 +48,8 @@ node = "node.main:main"
4048
requires = ["hatchling"]
4149
build-backend = "hatchling.build"
4250

43-
[tool.hatchling.build.targets.wheel]
44-
packages = ["src/node"]
51+
[tool.hatch.build.targets.wheel]
52+
packages = ["src/node", "dfx"]
4553

4654
[tool.black]
4755
line-length = 100

scripts/download.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env bash
# Download a Hugging Face model into the local models cache.
#
# Usage: download.sh <model-name> [destination]
#   <model-name>   Hugging Face repo id, e.g. sentence-transformers/all-MiniLM-L6-V2
#   [destination]  Target directory; defaults to <repo>/models/<basename of model id>

set -euo pipefail

if ! command -v uv >/dev/null 2>&1; then
  echo "uv is required. Install via https://github.com/astral-sh/uv" >&2
  exit 1
fi

if [ $# -lt 1 ] || [ -z "${1:-}" ]; then
  echo "Usage: $0 <model-name> [destination]" >&2
  exit 1
fi

MODEL_NAME="$1"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Fix: this script lives at <repo>/scripts/ (the Python callers resolve it as
# ROOT_DIR/scripts/download.sh), so the repo root is ONE level up. The
# previous "../../.." resolved two directories above the repository and would
# have created a stray models/ directory there.
ROOT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
MODELS_DIR="${ROOT_DIR}/models"
mkdir -p "$MODELS_DIR"

# Default destination strips any namespace prefix from the model id
# ("sentence-transformers/all-MiniLM-L6-V2" -> "all-MiniLM-L6-V2").
DEST_DIR="${2:-${MODELS_DIR}/${MODEL_NAME##*/}}"

mkdir -p "$DEST_DIR"

echo "Downloading ${MODEL_NAME} into ${DEST_DIR}..."
# NOTE(review): --local-dir-use-symlinks is deprecated (a no-op or removed) in
# newer huggingface_hub CLI releases; kept for compatibility with pinned
# versions — confirm against the installed huggingface-hub version.
uv run huggingface-cli download "$MODEL_NAME" --local-dir "$DEST_DIR" --local-dir-use-symlinks False

echo "Model downloaded to ${DEST_DIR}"

src/node/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
"""Droq Node Template - Agnostic Python node template."""
1+
"""Droq Hugging Face Models Node."""
22

33
__version__ = "0.1.0"
44

5-
# Export main components
6-
from .main import main, run_node, shutdown_event
5+
# Export main entry point
6+
from .main import main
77

8-
__all__ = ["main", "run_node", "shutdown_event"]
8+
__all__ = ["main"]

0 commit comments

Comments
 (0)