Skip to content

Commit b154e02

Browse files
sairin1202 and ankaisen authored
feat: add multimodal memory (#79)
Co-authored-by: ankaisen <51148505+ankaisen@users.noreply.github.com>
1 parent 0e8404b commit b154e02

File tree

16 files changed

+1169
-14027
lines changed

16 files changed

+1169
-14027
lines changed

scripts/evals/locomo/PROMPT/eval

Lines changed: 0 additions & 23 deletions
This file was deleted.

scripts/evals/locomo/result.json

Lines changed: 0 additions & 13917 deletions
This file was deleted.

src/memu/app/service.py

Lines changed: 427 additions & 44 deletions
Large diffs are not rendered by default.

src/memu/llm/backends/base.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,18 @@ def build_summary_payload(
1818
def parse_summary_response(self, data: dict[str, Any]) -> str:
    """Extract the summary text from a provider response body.

    Abstract hook: concrete backends must override.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
2020

21+
def build_vision_payload(
    self,
    *,
    prompt: str,
    base64_image: str,
    mime_type: str,
    system_prompt: str | None,
    chat_model: str,
    max_tokens: int | None,
) -> dict[str, Any]:
    """Build a provider-specific request payload for a vision (image) chat call.

    Abstract hook: concrete backends must override.

    Args:
        prompt: Text prompt to accompany the image.
        base64_image: Image bytes already encoded as a base64 string
            (no ``data:`` URI prefix).
        mime_type: MIME type of the image, e.g. ``image/png``.
        system_prompt: Optional system prompt; ``None`` to omit.
        chat_model: Model identifier to place in the payload.
        max_tokens: Response token cap, or ``None`` for the provider default.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
32+
2133
def build_embedding_payload(self, *, inputs: list[str], embed_model: str) -> dict[str, Any]:
    """Build a provider-specific request payload for an embedding call.

    Abstract hook: concrete backends must override.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
2335

src/memu/llm/backends/openai.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,41 @@ def build_summary_payload(
2727
def parse_summary_response(self, data: dict[str, Any]) -> str:
    """Pull the assistant message text out of a chat-completions response body."""
    first_choice = data["choices"][0]
    return cast(str, first_choice["message"]["content"])
2929

30+
def build_vision_payload(
31+
self,
32+
*,
33+
prompt: str,
34+
base64_image: str,
35+
mime_type: str,
36+
system_prompt: str | None,
37+
chat_model: str,
38+
max_tokens: int | None,
39+
) -> dict[str, Any]:
40+
"""Build payload for OpenAI Vision API."""
41+
messages: list[dict[str, Any]] = []
42+
if system_prompt:
43+
messages.append({"role": "system", "content": system_prompt})
44+
45+
messages.append({
46+
"role": "user",
47+
"content": [
48+
{"type": "text", "text": prompt},
49+
{
50+
"type": "image_url",
51+
"image_url": {
52+
"url": f"data:{mime_type};base64,{base64_image}",
53+
},
54+
},
55+
],
56+
})
57+
58+
return {
59+
"model": chat_model,
60+
"messages": messages,
61+
"temperature": 0.2,
62+
"max_tokens": max_tokens,
63+
}
64+
3065
def build_embedding_payload(self, *, inputs: list[str], embed_model: str) -> dict[str, Any]:
    """Build the request body for the OpenAI embeddings endpoint."""
    body: dict[str, Any] = {"model": embed_model, "input": inputs}
    return body
3267

src/memu/llm/http_client.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from __future__ import annotations
22

3+
import base64
34
import logging
45
from collections.abc import Callable
6+
from pathlib import Path
57

68
import httpx
79

@@ -54,6 +56,56 @@ async def summarize(self, text: str, max_tokens: int | None = None, system_promp
5456
logger.debug("HTTP LLM summarize response: %s", data)
5557
return self.backend.parse_summary_response(data)
5658

59+
async def vision(
    self,
    prompt: str,
    image_path: str,
    *,
    max_tokens: int | None = None,
    system_prompt: str | None = None,
) -> str:
    """
    Send an image plus a text prompt to the vision-capable chat endpoint.

    Args:
        prompt: Text prompt to send with the image
        image_path: Path to the image file
        max_tokens: Maximum tokens in response
        system_prompt: Optional system prompt

    Returns:
        LLM response text
    """
    # Inline the image as base64 so it can travel in the JSON payload.
    path = Path(image_path)
    encoded_image = base64.b64encode(path.read_bytes()).decode("utf-8")

    # Map the file extension to a MIME type; default to JPEG for unknown ones.
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    mime_type = mime_by_suffix.get(path.suffix.lower(), "image/jpeg")

    payload = self.backend.build_vision_payload(
        prompt=prompt,
        base64_image=encoded_image,
        mime_type=mime_type,
        system_prompt=system_prompt,
        chat_model=self.chat_model,
        max_tokens=max_tokens,
    )

    # Vision requests go through the same chat-completions endpoint as summaries.
    async with httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout) as client:
        response = await client.post(self.summary_endpoint, json=payload, headers=self._headers())
        response.raise_for_status()
        data = response.json()
    logger.debug("HTTP LLM vision response: %s", data)
    return self.backend.parse_summary_response(data)
108+
57109
async def embed(self, inputs: list[str]) -> list[list[float]]:
58110
payload = self.backend.build_embedding_payload(inputs=inputs, embed_model=self.embed_model)
59111
async with httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout) as client:
@@ -63,6 +115,61 @@ async def embed(self, inputs: list[str]) -> list[list[float]]:
63115
logger.debug("HTTP LLM embedding response: %s", data)
64116
return self.backend.parse_embedding_response(data)
65117

118+
async def transcribe(
    self,
    audio_path: str,
    *,
    prompt: str | None = None,
    language: str | None = None,
    response_format: str = "text",
) -> str:
    """
    Transcribe audio file using OpenAI Audio API.

    Args:
        audio_path: Path to the audio file
        prompt: Optional prompt to guide the transcription
        language: Optional language code (e.g., 'en', 'zh')
        response_format: Response format ('text', 'json', 'verbose_json')

    Returns:
        Transcribed text

    Raises:
        Exception: Any failure (I/O or HTTP) is logged and re-raised.
    """
    try:
        # Prepare multipart form data; the file handle must stay open for the
        # duration of the POST, so the request happens inside this `with`.
        with open(audio_path, "rb") as audio_file:
            files = {"file": (Path(audio_path).name, audio_file, "application/octet-stream")}
            data = {
                # NOTE(review): model is hard-coded rather than taken from config — confirm intended.
                "model": "gpt-4o-mini-transcribe",
                "response_format": response_format,
            }
            if prompt:
                data["prompt"] = prompt
            if language:
                data["language"] = language

            # Audio uploads are slow; allow triple the normal request timeout.
            async with httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout * 3) as client:
                resp = await client.post(
                    "/v1/audio/transcriptions",
                    files=files,
                    data=data,
                    headers=self._headers(),
                )
                resp.raise_for_status()

                # 'text' responses come back as a plain body; JSON variants
                # carry the transcript under a "text" key.
                if response_format == "text":
                    result = resp.text
                else:
                    result_data = resp.json()
                    result = result_data.get("text", "")

        logger.debug("HTTP audio transcribe response for %s: %s chars", audio_path, len(result))
    except Exception:
        logger.exception("Audio transcription failed for %s", audio_path)
        raise
    else:
        return result or ""
66173
def _headers(self) -> dict[str, str]:
67174
return {"Authorization": f"Bearer {self.api_key}"}
68175

src/memu/llm/openai_sdk.py

Lines changed: 130 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
1+
import base64
12
import logging
2-
from typing import cast
3+
from pathlib import Path
4+
from typing import Any, Literal, cast
35

46
from openai import AsyncOpenAI
7+
from openai.types.chat import (
8+
ChatCompletionContentPartImageParam,
9+
ChatCompletionContentPartTextParam,
10+
ChatCompletionMessageParam,
11+
ChatCompletionSystemMessageParam,
12+
ChatCompletionUserMessageParam,
13+
)
514

615
logger = logging.getLogger(__name__)
716

@@ -25,19 +34,134 @@ async def summarize(
2534
) -> str:
2635
prompt = system_prompt or "Summarize the text in one short paragraph."
2736

37+
system_message: ChatCompletionSystemMessageParam = {"role": "system", "content": prompt}
38+
user_message: ChatCompletionUserMessageParam = {"role": "user", "content": text}
39+
messages: list[ChatCompletionMessageParam] = [system_message, user_message]
40+
2841
response = await self.client.chat.completions.create(
2942
model=self.chat_model,
30-
messages=[
31-
{"role": "system", "content": prompt},
32-
{"role": "user", "content": text},
33-
],
43+
messages=messages,
3444
temperature=1,
35-
max_completion_tokens=max_tokens,
45+
max_tokens=max_tokens,
3646
)
3747
content = response.choices[0].message.content
3848
logger.debug("OpenAI summarize response: %s", response)
3949
return content or ""
4050

51+
async def vision(
    self,
    prompt: str,
    image_path: str,
    *,
    max_tokens: int | None = None,
    system_prompt: str | None = None,
) -> str:
    """
    Call OpenAI Vision API with an image.

    Args:
        prompt: Text prompt to send with the image
        image_path: Path to the image file
        max_tokens: Maximum tokens in response
        system_prompt: Optional system prompt

    Returns:
        LLM response text
    """
    # Inline the image as a base64 data URI for the chat-completions call.
    path = Path(image_path)
    encoded_image = base64.b64encode(path.read_bytes()).decode("utf-8")

    # Map the file extension to a MIME type, defaulting to JPEG.
    mime_type = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }.get(path.suffix.lower(), "image/jpeg")

    text_part: ChatCompletionContentPartTextParam = {"type": "text", "text": prompt}
    image_part: ChatCompletionContentPartImageParam = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime_type};base64,{encoded_image}",
        },
    }
    user_message: ChatCompletionUserMessageParam = {
        "role": "user",
        "content": [text_part, image_part],
    }

    # Prepend the optional system prompt ahead of the user turn.
    messages: list[ChatCompletionMessageParam]
    if system_prompt:
        system_message: ChatCompletionSystemMessageParam = {
            "role": "system",
            "content": system_prompt,
        }
        messages = [system_message, user_message]
    else:
        messages = [user_message]

    response = await self.client.chat.completions.create(
        model=self.chat_model,
        messages=messages,
        temperature=1,
        max_tokens=max_tokens,
    )
    logger.debug("OpenAI vision response: %s", response)
    return response.choices[0].message.content or ""
41117
async def embed(self, inputs: list[str]) -> list[list[float]]:
42118
response = await self.client.embeddings.create(model=self.embed_model, input=inputs)
43119
return [cast(list[float], d.embedding) for d in response.data]
120+
121+
async def transcribe(
    self,
    audio_path: str,
    *,
    prompt: str | None = None,
    language: str | None = None,
    response_format: Literal["text", "json", "verbose_json"] = "text",
) -> str:
    """
    Transcribe audio file using OpenAI Audio API.

    Args:
        audio_path: Path to the audio file
        prompt: Optional prompt to guide the transcription
        language: Optional language code (e.g., 'en', 'zh')
        response_format: Response format ('text', 'json', 'verbose_json')

    Returns:
        Transcribed text
    """
    try:
        # Forward only the optional parameters the caller actually supplied.
        extra: dict[str, Any] = {}
        if prompt is not None:
            extra["prompt"] = prompt
        if language is not None:
            extra["language"] = language

        with open(audio_path, "rb") as audio_stream:
            transcription = await self.client.audio.transcriptions.create(
                file=audio_stream,
                model="gpt-4o-mini-transcribe",
                response_format=response_format,
                **extra,
            )

        # 'text' format may come back as a bare string; JSON formats carry
        # the transcript on a `.text` attribute.
        if response_format == "text":
            text = transcription if isinstance(transcription, str) else transcription.text
        elif hasattr(transcription, "text"):
            text = transcription.text
        else:
            text = str(transcription)

        logger.debug("OpenAI transcribe response for %s: %s chars", audio_path, len(text))
    except Exception:
        logger.exception("Audio transcription failed for %s", audio_path)
        raise
    else:
        return text or ""
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
from memu.prompts.preprocess import conversation
1+
from memu.prompts.preprocess import audio, conversation, document, image, video

# Modality name -> module holding its preprocessing prompt; insertion order
# deliberately matches the original mapping.
_PROMPT_MODULES = {
    "conversation": conversation,
    "video": video,
    "image": image,
    "document": document,
    "audio": audio,
}

# Public registry of whitespace-trimmed preprocessing prompts per modality.
PROMPTS: dict[str, str] = {name: module.PROMPT.strip() for name, module in _PROMPT_MODULES.items()}

__all__ = ["PROMPTS"]

0 commit comments

Comments
 (0)