 import torch
+from collections.abc import Generator

 from transformers import (
     AutoModelForCausalLM,
@@ -71,6 +72,24 @@ def generate(self, messages: MessageList, past_key_values: DynamicCache | None =
         else:
             return self._generate_with_cache(prompt, past_key_values)

+    def generate_stream(self, messages: MessageList, past_key_values: DynamicCache | None = None) -> Generator[str, None, None]:
+        """
+        Generate a streaming response from the model.
+        Args:
+            messages (MessageList): Chat messages for prompt construction.
+            past_key_values (DynamicCache | None): Optional KV cache for fast generation.
+        Yields:
+            str: Streaming model response chunks.
+        """
+        prompt = self.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=self.config.add_generation_prompt
+        )
+        logger.info(f"HFLLM streaming prompt: {prompt}")
+        if past_key_values is None:
+            yield from self._generate_full_stream(prompt)
+        else:
+            yield from self._generate_with_cache_stream(prompt, past_key_values)
+
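For orientation, here is a minimal consumption sketch for the new streaming entry point. Only generate_stream itself comes from this diff; the instance name llm and the message contents are illustrative assumptions (the logging above suggests the wrapper class is called HFLLM).

    # Hypothetical usage; assumes `llm` is an already-constructed instance of the
    # wrapper class this diff patches, with tokenizer, model, and config set up.
    messages = [
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Explain KV caching in one sentence."},
    ]
    for chunk in llm.generate_stream(messages):
        print(chunk, end="", flush=True)  # emit chunks as they arrive
    print()
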
     def _generate_full(self, prompt: str) -> str:
         """
         Generate output from scratch using the full prompt.
@@ -104,6 +123,71 @@ def _generate_full(self, prompt: str) -> str:
             else response
         )

+    def _generate_full_stream(self, prompt: str) -> Generator[str, None, None]:
+        """
+        Generate output from scratch using the full prompt with streaming.
+        Args:
+            prompt (str): The input prompt string.
+        Yields:
+            str: Streaming response chunks.
+        """
+        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.model.device)
+
+        # Get generation parameters
+        max_new_tokens = getattr(self.config, "max_tokens", 128)
+        do_sample = getattr(self.config, "do_sample", True)
+        remove_think_prefix = getattr(self.config, "remove_think_prefix", False)
+
+        # Manual streaming generation
+        input_length = inputs.input_ids.shape[1]
+        generated_ids = inputs.input_ids.clone()
+        accumulated_text = ""
+
+        for _ in range(max_new_tokens):
+            # Forward pass
+            with torch.no_grad():
+                outputs = self.model(
+                    input_ids=generated_ids,
+                    use_cache=True,
+                    return_dict=True,
+                )
+
+            # Get next token logits
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # Apply logits processors if sampling
+            if do_sample:
+                batch_size, _ = next_token_logits.size()
+                dummy_ids = torch.zeros((batch_size, 1), dtype=torch.long, device=next_token_logits.device)
+                filtered_logits = self.logits_processors(dummy_ids, next_token_logits)
+                probs = torch.softmax(filtered_logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+
+            # Check for EOS token
+            if self._should_stop(next_token):
+                break
+
+            # Append new token
+            generated_ids = torch.cat([generated_ids, next_token], dim=-1)
+
+            # Decode and yield the new token
+            new_token_text = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
+            if new_token_text:  # Only yield non-empty tokens
+                accumulated_text += new_token_text
+
+                # Apply thinking tag removal if enabled
+                if remove_think_prefix:
+                    processed_text = remove_thinking_tags(accumulated_text)
+                    # Only yield the difference (new content)
+                    if len(processed_text) > len(accumulated_text) - len(new_token_text):
+                        yield processed_text[len(accumulated_text) - len(new_token_text):]
+                else:
+                    yield new_token_text
+            else:
+                yield new_token_text
+
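A design note on _generate_full_stream: the loop re-runs the model over the entire generated_ids sequence on every step (the cache returned by the forward pass is never fed back in), so each token costs a full-prefix forward pass; in exchange, the manual decode loop is what enables the incremental thinking-tag filtering. For comparison, a common alternative for the no-cache path is transformers.TextIteratorStreamer driving model.generate() from a worker thread. The sketch below is illustrative only, reusing the attribute and config names from the diff; it is not the repository's implementation and drops the thinking-tag handling.

    from threading import Thread

    from transformers import TextIteratorStreamer

    def _generate_full_stream_via_streamer(self, prompt: str):
        # Tokenize exactly as the diff does, then let generate() handle sampling.
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.model.device)
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=getattr(self.config, "max_tokens", 128),
            do_sample=getattr(self.config, "do_sample", True),
        )
        # generate() blocks until completion, so it runs in a worker thread
        # while the caller iterates the streamer on this thread.
        Thread(target=self.model.generate, kwargs=generation_kwargs).start()
        for text in streamer:
            yield text
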
     def _generate_with_cache(self, query: str, kv: DynamicCache) -> str:
         """
         Generate output incrementally using an existing KV cache.
@@ -137,6 +221,68 @@ def _generate_with_cache(self, query: str, kv: DynamicCache) -> str:
             else response
         )

+    def _generate_with_cache_stream(self, query: str, kv: DynamicCache) -> Generator[str, None, None]:
+        """
+        Generate output incrementally using an existing KV cache with streaming.
+        Args:
+            query (str): The new user query string.
+            kv (DynamicCache): The prefilled KV cache.
+        Yields:
+            str: Streaming response chunks.
+        """
+        query_ids = self.tokenizer(
+            query, return_tensors="pt", add_special_tokens=False
+        ).input_ids.to(self.model.device)
+
+        max_new_tokens = getattr(self.config, "max_tokens", 128)
+        do_sample = getattr(self.config, "do_sample", True)
+        remove_think_prefix = getattr(self.config, "remove_think_prefix", False)
+
+        # Initial forward pass
+        logits, kv = self._prefill(query_ids, kv)
+        next_token = self._select_next_token(logits)
+
+        # Yield first token
+        first_token_text = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
+        accumulated_text = ""
+        if first_token_text:
+            accumulated_text += first_token_text
+            if remove_think_prefix:
+                processed_text = remove_thinking_tags(accumulated_text)
+                if len(processed_text) > len(accumulated_text) - len(first_token_text):
+                    yield processed_text[len(accumulated_text) - len(first_token_text):]
+            else:
+                yield first_token_text
+        else:
+            yield first_token_text
+
+        generated = [next_token]
+
+        # Continue generation
+        for _ in range(max_new_tokens - 1):
+            if self._should_stop(next_token):
+                break
+            logits, kv = self._prefill(next_token, kv)
+            next_token = self._select_next_token(logits)

+            # Decode and yield the new token
+            new_token_text = self.tokenizer.decode(next_token[0], skip_special_tokens=True)
+            if new_token_text:
+                accumulated_text += new_token_text
+
+                # Apply thinking tag removal if enabled
+                if remove_think_prefix:
+                    processed_text = remove_thinking_tags(accumulated_text)
+                    # Only yield the difference (new content)
+                    if len(processed_text) > len(accumulated_text) - len(new_token_text):
+                        yield processed_text[len(accumulated_text) - len(new_token_text):]
+                else:
+                    yield new_token_text
+            else:
+                yield new_token_text
+
+            generated.append(next_token)
+
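One caveat worth noting on both streaming paths: each new token id is decoded in isolation via tokenizer.decode(next_token[0], ...). With BPE/SentencePiece vocabularies, characters that span multiple tokens (emoji, many non-Latin scripts) and leading-space markers may not round-trip cleanly when decoded one id at a time. A common remedy is to re-decode the accumulated ids each step and yield only the unseen suffix; the helper below is an illustrative sketch of that idea, not code from this commit.

    def decode_delta(tokenizer, generated_ids: list[int], emitted: str) -> str:
        """Decode the whole generated sequence and return only the new suffix."""
        full_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
        return full_text[len(emitted):]

    # Inside a streaming loop (hypothetical usage):
    #   delta = decode_delta(self.tokenizer, generated_token_ids, emitted)
    #   emitted += delta
    #   if delta:
    #       yield delta
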
     @torch.no_grad()
     def _prefill(
         self, input_ids: torch.Tensor, kv: DynamicCache