import re

from abc import ABC, abstractmethod

from memos.configs.embedder import BaseEmbedderConfig


def _count_tokens_for_embedding(text: str) -> int:
    """
    Count tokens in text for embedding truncation.
    Uses tiktoken if available; otherwise falls back to a character heuristic.

    Args:
        text: Text to count tokens for.

    Returns:
        Number of tokens.
    """
    try:
        import tiktoken

        try:
            enc = tiktoken.encoding_for_model("gpt-4o-mini")
        except Exception:
            enc = tiktoken.get_encoding("cl100k_base")
        return len(enc.encode(text or ""))
    except Exception:
        # Heuristic fallback: CJK characters count as ~1 token each;
        # other text averages ~1 token per ~4 characters.
        if not text:
            return 0
        zh_chars = re.findall(r"[\u4e00-\u9fff]", text)
        zh = len(zh_chars)
        rest = len(text) - zh
        return zh + max(1, rest // 4)

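# A minimal usage sketch (illustrative only, not part of this module). The
# expected counts in the comments assume the heuristic fallback path, i.e.
# that tiktoken is not installed; with tiktoken present the exact numbers
# will differ, but the function's contract is the same.
def _demo_token_counting() -> None:  # hypothetical helper, for illustration
    mixed = "hello world, 你好世界"  # 13 non-CJK chars + 4 CJK chars
    # Heuristic estimate: 4 (CJK) + 13 // 4 = 7 tokens.
    print(_count_tokens_for_embedding(mixed))
    print(_count_tokens_for_embedding(""))  # 0: empty text short-circuits
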
def _truncate_text_to_tokens(text: str, max_tokens: int) -> str:
    """
    Truncate text to fit within the max_tokens limit.
    Uses binary search to find the longest prefix that fits.

    Args:
        text: Text to truncate.
        max_tokens: Maximum number of tokens allowed.

    Returns:
        Truncated text.
    """
    if not text or max_tokens is None or max_tokens <= 0:
        return text

    current_tokens = _count_tokens_for_embedding(text)
    if current_tokens <= max_tokens:
        return text

    # Binary search for the longest prefix within the token limit.
    low, high = 0, len(text)
    best_text = ""

    while low < high:
        mid = (low + high + 1) // 2  # Round up so `low` always advances.
        truncated = text[:mid]
        tokens = _count_tokens_for_embedding(truncated)

        if tokens <= max_tokens:
            best_text = truncated
            low = mid
        else:
            high = mid - 1

    return best_text if best_text else text[:1]  # Keep at least one character.

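# A small sketch of the truncation behavior (illustrative only). The binary
# search keeps the longest prefix whose token count fits, so the result is
# always a prefix of the input and re-counting it stays within the limit.
def _demo_truncation() -> None:  # hypothetical helper, for illustration
    text = "word " * 100  # roughly 100-125 tokens under either counting path
    short = _truncate_text_to_tokens(text, max_tokens=10)
    assert text.startswith(short)  # truncation only ever drops a suffix
    assert _count_tokens_for_embedding(short) <= 10
    print(f"{len(text)} chars -> {len(short)} chars")
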
class BaseEmbedder(ABC):
    """Base class for all Embedding models."""

    @abstractmethod
    def __init__(self, config: BaseEmbedderConfig):
        """Initialize the embedding model with the given configuration."""
        self.config = config

    def _truncate_texts(self, texts: list[str], approx_char_per_token: float = 1.1) -> list[str]:
        """
        Truncate texts to fit within the configured max_tokens limit, if any.

        Args:
            texts: List of texts to truncate.
            approx_char_per_token: Conservative lower bound on characters per
                token, used to skip exact counting for clearly short texts.

        Returns:
            List of truncated texts.
        """
        if not hasattr(self, "config") or self.config.max_tokens is None:
            return texts
        max_tokens = self.config.max_tokens

        truncated = []
        for t in texts:
            if len(t) < max_tokens * approx_char_per_token:
                # Fast path: assuming >= ~1.1 characters per token
                # (conservative for most text), this text fits under the
                # limit, so skip the exact token count.
                truncated.append(t)
            else:
                truncated.append(_truncate_text_to_tokens(t, max_tokens))
        return truncated

    @abstractmethod
    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts into embedding vectors."""
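
# A minimal sketch of a concrete subclass (hypothetical, not part of memos):
# it shows where _truncate_texts hooks into embed. The zero-vector output is
# a stand-in for a real model call; an actual subclass would invoke its
# embedding client after truncation. It relies only on the optional
# config.max_tokens field already referenced by _truncate_texts above.
class DummyEmbedder(BaseEmbedder):  # hypothetical, for illustration only
    def __init__(self, config: BaseEmbedderConfig):
        super().__init__(config)  # stores config for _truncate_texts

    def embed(self, texts: list[str]) -> list[list[float]]:
        # Truncate first so oversized inputs cannot exceed the model window.
        texts = self._truncate_texts(texts)
        return [[0.0] * 4 for _ in texts]  # placeholder 4-dim vectors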