Skip to content

Commit 85e5629

Browse files
committed
add: multimodal processing
1 parent 0e8404b commit 85e5629

File tree

16 files changed

+1036
-13992
lines changed

16 files changed

+1036
-13992
lines changed

scripts/evals/locomo/PROMPT/eval

Lines changed: 0 additions & 23 deletions
This file was deleted.

scripts/evals/locomo/result.json

Lines changed: 0 additions & 13917 deletions
This file was deleted.

src/memu/app/service.py

Lines changed: 347 additions & 14 deletions
Large diffs are not rendered by default.

src/memu/llm/backends/base.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,18 @@ def build_summary_payload(
1717

1818
def parse_summary_response(self, data: dict[str, Any]) -> str:
    """Extract the summary text from a raw chat-completion response body.

    Args:
        data: Decoded JSON response from the backend's chat endpoint.

    Raises:
        NotImplementedError: Always; backend subclasses must override.
    """
    raise NotImplementedError
20+
21+
def build_vision_payload(
    self,
    *,
    prompt: str,
    base64_image: str,
    mime_type: str,
    system_prompt: str | None,
    chat_model: str,
    max_tokens: int | None,
) -> dict[str, Any]:
    """Build the request body for a vision (image + text) chat call.

    Args:
        prompt: Text prompt to pair with the image.
        base64_image: Base64-encoded image bytes (no data-URL prefix).
        mime_type: MIME type of the image, e.g. "image/png".
        system_prompt: Optional system message, or None to omit it.
        chat_model: Name of the chat model to request.
        max_tokens: Response token cap, or None for the server default.

    Raises:
        NotImplementedError: Always; backend subclasses must override.
    """
    raise NotImplementedError
2032

2133
def build_embedding_payload(self, *, inputs: list[str], embed_model: str) -> dict[str, Any]:
    """Build the request body for an embeddings call.

    Args:
        inputs: Texts to embed.
        embed_model: Name of the embedding model to request.

    Raises:
        NotImplementedError: Always; backend subclasses must override.
    """
    raise NotImplementedError

src/memu/llm/backends/openai.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,41 @@ def build_summary_payload(
2626

2727
def parse_summary_response(self, data: dict[str, Any]) -> str:
    """Pull the assistant message text out of a chat-completion response."""
    first_choice = data["choices"][0]
    return cast(str, first_choice["message"]["content"])
29+
30+
def build_vision_payload(
31+
self,
32+
*,
33+
prompt: str,
34+
base64_image: str,
35+
mime_type: str,
36+
system_prompt: str | None,
37+
chat_model: str,
38+
max_tokens: int | None,
39+
) -> dict[str, Any]:
40+
"""Build payload for OpenAI Vision API."""
41+
messages: list[dict[str, Any]] = []
42+
if system_prompt:
43+
messages.append({"role": "system", "content": system_prompt})
44+
45+
messages.append({
46+
"role": "user",
47+
"content": [
48+
{"type": "text", "text": prompt},
49+
{
50+
"type": "image_url",
51+
"image_url": {
52+
"url": f"data:{mime_type};base64,{base64_image}",
53+
},
54+
},
55+
],
56+
})
57+
58+
return {
59+
"model": chat_model,
60+
"messages": messages,
61+
"temperature": 0.2,
62+
"max_tokens": max_tokens,
63+
}
2964

3065
def build_embedding_payload(self, *, inputs: list[str], embed_model: str) -> dict[str, Any]:
    """Build the request body for the OpenAI embeddings endpoint."""
    payload: dict[str, Any] = {"model": embed_model, "input": inputs}
    return payload

src/memu/llm/http_client.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from __future__ import annotations
22

3+
import base64
34
import logging
45
from collections.abc import Callable
6+
from pathlib import Path
57

68
import httpx
79

@@ -53,6 +55,56 @@ async def summarize(self, text: str, max_tokens: int | None = None, system_promp
5355
data = resp.json()
5456
logger.debug("HTTP LLM summarize response: %s", data)
5557
return self.backend.parse_summary_response(data)
58+
59+
async def vision(
    self,
    prompt: str,
    image_path: str,
    *,
    max_tokens: int | None = None,
    system_prompt: str | None = None,
) -> str:
    """
    Send an image plus a text prompt to the backend's Vision API.

    Args:
        prompt: Text prompt to send with the image
        image_path: Path to the image file
        max_tokens: Maximum tokens in response
        system_prompt: Optional system prompt

    Returns:
        LLM response text
    """
    path = Path(image_path)
    encoded = base64.b64encode(path.read_bytes()).decode("utf-8")

    # Map the file extension to a MIME type; unknown extensions fall back to JPEG.
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    mime_type = mime_by_suffix.get(path.suffix.lower(), "image/jpeg")

    payload = self.backend.build_vision_payload(
        prompt=prompt,
        base64_image=encoded,
        mime_type=mime_type,
        system_prompt=system_prompt,
        chat_model=self.chat_model,
        max_tokens=max_tokens,
    )

    # Vision requests go through the same chat endpoint as summaries, so the
    # summary response parser applies.
    async with httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout) as client:
        resp = await client.post(self.summary_endpoint, json=payload, headers=self._headers())
        resp.raise_for_status()
        data = resp.json()
    logger.debug("HTTP LLM vision response: %s", data)
    return self.backend.parse_summary_response(data)
56108

57109
async def embed(self, inputs: list[str]) -> list[list[float]]:
58110
payload = self.backend.build_embedding_payload(inputs=inputs, embed_model=self.embed_model)
@@ -62,6 +114,61 @@ async def embed(self, inputs: list[str]) -> list[list[float]]:
62114
data = resp.json()
63115
logger.debug("HTTP LLM embedding response: %s", data)
64116
return self.backend.parse_embedding_response(data)
117+
118+
async def transcribe(
    self,
    audio_path: str,
    *,
    prompt: str | None = None,
    language: str | None = None,
    response_format: str = "text",
) -> str:
    """
    Transcribe audio file using OpenAI Audio API.

    Args:
        audio_path: Path to the audio file
        prompt: Optional prompt to guide the transcription
        language: Optional language code (e.g., 'en', 'zh')
        response_format: Response format ('text', 'json', 'verbose_json')

    Returns:
        Transcribed text
    """
    try:
        # Prepare multipart form data; keep the file handle open for the
        # duration of the upload.
        with open(audio_path, "rb") as audio_file:
            files = {"file": (Path(audio_path).name, audio_file, "application/octet-stream")}
            form: dict[str, str] = {
                "model": "gpt-4o-mini-transcribe",
                "response_format": response_format,
            }
            if prompt:
                form["prompt"] = prompt
            if language:
                form["language"] = language

            # Transcription is slower than chat; allow a longer timeout.
            async with httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout * 3) as client:
                resp = await client.post(
                    "/v1/audio/transcriptions",
                    files=files,
                    data=form,
                    headers=self._headers(),
                )
                resp.raise_for_status()

        if response_format == "text":
            result = resp.text
        else:
            result_data = resp.json()
            result = result_data.get("text", "")

        logger.debug("HTTP audio transcribe response for %s: %s chars", audio_path, len(result))
        return result or ""

    except Exception:
        # Fix: logger.exception preserves the traceback; the previous
        # logger.error("...: %s", e) dropped it before re-raising.
        logger.exception("Audio transcription failed for %s", audio_path)
        raise
65172

66173
def _headers(self) -> dict[str, str]:
    """Return the bearer-token authorization header for every request."""
    return {"Authorization": f"Bearer {self.api_key}"}

src/memu/llm/openai_sdk.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import base64
12
import logging
3+
from pathlib import Path
24
from typing import cast
35

46
from openai import AsyncOpenAI
@@ -38,6 +40,112 @@ async def summarize(
3840
logger.debug("OpenAI summarize response: %s", response)
3941
return content or ""
4042

43+
async def vision(
    self,
    prompt: str,
    image_path: str,
    *,
    max_tokens: int | None = None,
    system_prompt: str | None = None,
) -> str:
    """
    Ask the OpenAI Vision API about a single image file.

    Args:
        prompt: Text prompt to send with the image
        image_path: Path to the image file
        max_tokens: Maximum tokens in response
        system_prompt: Optional system prompt

    Returns:
        LLM response text
    """
    path = Path(image_path)
    encoded = base64.b64encode(path.read_bytes()).decode("utf-8")

    # Map the file extension to a MIME type; unknown extensions fall back to JPEG.
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    mime_type = mime_by_suffix.get(path.suffix.lower(), "image/jpeg")

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded}",
                },
            },
        ],
    }
    messages: list[dict] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    messages.append(user_message)

    # NOTE(review): temperature here is 1, but the HTTP backend's vision
    # payload uses 0.2 — confirm which is intended.
    response = await self.client.chat.completions.create(
        model=self.chat_model,
        messages=messages,
        temperature=1,
        max_completion_tokens=max_tokens,
    )
    reply = response.choices[0].message.content
    logger.debug("OpenAI vision response: %s", response)
    return reply or ""
104+
41105
async def embed(self, inputs: list[str]) -> list[list[float]]:
    """Embed the given texts and return one vector per input, in order."""
    response = await self.client.embeddings.create(model=self.embed_model, input=inputs)
    vectors: list[list[float]] = []
    for item in response.data:
        vectors.append(cast(list[float], item.embedding))
    return vectors
108+
109+
async def transcribe(
    self,
    audio_path: str,
    *,
    prompt: str | None = None,
    language: str | None = None,
    response_format: str = "text",
) -> str:
    """
    Transcribe audio file using OpenAI Audio API.

    Args:
        audio_path: Path to the audio file
        prompt: Optional prompt to guide the transcription
        language: Optional language code (e.g., 'en', 'zh')
        response_format: Response format ('text', 'json', 'verbose_json')

    Returns:
        Transcribed text
    """
    # Fix: only forward optional params when set. Passing prompt=None /
    # language=None explicitly overrides the SDK's NOT_GIVEN sentinel and
    # serializes nulls into the request, which some servers reject.
    extra: dict[str, str] = {}
    if prompt is not None:
        extra["prompt"] = prompt
    if language is not None:
        extra["language"] = language

    try:
        with open(audio_path, "rb") as audio_file:
            # Use gpt-4o-mini-transcribe for better performance and cost
            transcription = await self.client.audio.transcriptions.create(
                model="gpt-4o-mini-transcribe",
                file=audio_file,
                response_format=response_format,
                **extra,
            )

        # Handle different response formats: "text" may come back as a plain
        # string; structured formats carry a .text attribute.
        if response_format == "text":
            result = transcription if isinstance(transcription, str) else transcription.text
        else:
            result = transcription.text if hasattr(transcription, "text") else str(transcription)

        logger.debug("OpenAI transcribe response for %s: %s chars", audio_path, len(result))
        return result or ""

    except Exception:
        # Fix: logger.exception keeps the traceback; logger.error("...: %s", e)
        # discarded it before re-raising.
        logger.exception("Audio transcription failed for %s", audio_path)
        raise
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
from memu.prompts.preprocess import conversation
1+
from memu.prompts.preprocess import audio, conversation, document, image, video

# Modality name -> prompt module; each module exposes a PROMPT template string.
_SOURCES = {
    "conversation": conversation,
    "video": video,
    "image": image,
    "document": document,
    "audio": audio,
}

# Public registry of preprocessing prompts, whitespace-stripped.
PROMPTS: dict[str, str] = {name: module.PROMPT.strip() for name, module in _SOURCES.items()}

__all__ = ["PROMPTS"]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
PROMPT = """
2+
Analyze the following audio transcription and provide two outputs:
3+
4+
## Transcription:
5+
<transcription>
6+
{transcription}
7+
</transcription>
8+
9+
## Task:
10+
1. **Processed Content**: Provide a clean, well-formatted version of the transcription with proper punctuation and paragraph breaks if needed
11+
2. **Caption**: Provide a one-sentence summary describing what the audio is about
12+
13+
## Output Format:
14+
<processed_content>
15+
[Provide the cleaned and formatted transcription here]
16+
</processed_content>
17+
18+
<caption>
19+
[Provide a one-sentence summary of what the audio is about]
20+
</caption>
21+
"""

0 commit comments

Comments
 (0)