Skip to content

Commit f0936a8

Browse files
committed
memory usage optimization
1 parent d65753e commit f0936a8

File tree

7 files changed

+180
-59
lines changed

7 files changed

+180
-59
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,7 @@ htmlcov/
6262
*.sqlite3
6363

6464
# Vector database
65-
chroma_db/
65+
chroma_db/
66+
67+
# Cursor
68+
.cursor/

app.py

Lines changed: 133 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
"""
77

88
import logging
9+
import os
910

1011
import chainlit as cl
12+
import psutil # For memory tracking
1113
from langchain_core.runnables.config import RunnableConfig
1214
from limits import parse
1315
from limits.storage import MemoryStorage
@@ -32,6 +34,38 @@
3234
# logging.getLogger("openai").setLevel(logging.WARNING)
3335
logger = logging.getLogger(__name__)
3436

37+
# --- Memory Management Constants ---
38+
# Maximum number of message pairs (user+assistant) to keep in memory
39+
MAX_HISTORY_LENGTH = settings.MAX_HISTORY_LENGTH
40+
41+
42+
def trim_message_history(session_id: str) -> None:
    """Trim a session's message history once it grows past the cap.

    Keeps only the most recent MAX_HISTORY_LENGTH exchanges (each exchange
    is a user + assistant message pair) so long conversations do not
    accumulate unbounded memory.

    Args:
        session_id: The ID of the current session
    """
    try:
        messages = cl.user_session.get("message_history", [])
        limit = MAX_HISTORY_LENGTH * 2  # two messages per exchange
        if len(messages) <= limit:
            return
        # Drop everything except the most recent messages.
        messages = messages[-limit:]
        cl.user_session.set("message_history", messages)
        logger.info(
            f"Trimmed message history for session {session_id} "
            f"to {len(messages)} messages"
        )
    except Exception as e:
        # Trimming is best-effort; never let it break message handling.
        logger.warning(f"Failed to trim message history: {e}")
67+
68+
3569
# --- Global Initialization ---
3670
# Declare placeholders for global objects
3771
prompt_manager = None
@@ -88,6 +122,57 @@ def get_session_id():
88122
# Remove the slowapi limiter instance for messages
89123
# message_limiter = Limiter(key_func=get_session_id) # REMOVED
90124

125+
126+
# --- Helper Functions for Message Processing ---
async def check_initialization() -> bool:
    """Report whether global initialization succeeded.

    Sends an error message to the user and returns False when the app
    failed to initialize; returns True otherwise.
    """
    if INITIALIZATION_SUCCESSFUL:
        return True
    await cl.ErrorMessage(content="Application not initialized.").send()
    return False
133+
134+
135+
async def get_translation_service():
    """Fetch the TranslationService stored in the user session.

    Returns:
        The service instance, or None (after notifying the user) when the
        session has no translation service registered.
    """
    translation_service = cl.user_session.get("translation_service")
    if translation_service:
        return translation_service
    logger.error("TranslationService not found in user session.")
    await cl.ErrorMessage(
        content="Error: Translation service unavailable. "
        "Please restart the chat."
    ).send()
    return None
146+
147+
148+
async def perform_translation(service, message_content, config):
    """Run the translation, wrapping it in a progress step outside debug mode.

    In debug mode the Langchain callback handler renders its own steps, so
    no extra cl.Step is opened around the call.
    """
    if not settings.DEBUG:
        # Show a simple progress indicator while translating
        # (config carries an empty callbacks list in this branch).
        async with cl.Step(name="Translating..."):
            return await service.translate_text(message_content, config=config)
    # Debug mode: the callback handler manages step visibility itself.
    return await service.translate_text(message_content, config=config)
158+
159+
160+
async def log_memory_usage(session_id):
    """Emit the current process resident-set size (in MB) for monitoring."""
    try:
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        rss_mb = rss_bytes / 1024 / 1024
        logger.info(f"Memory usage: {rss_mb:.2f} MB for session {session_id}")
        # Warn as we approach the 512MB deployment limit.
        if rss_mb > 400:
            logger.warning(f"High memory usage detected: {rss_mb:.2f} MB")
    except Exception as e:
        # Monitoring must never break message handling.
        logger.error(f"Failed to log memory usage: {e}")
174+
175+
91176
# --- Chainlit Event Handlers ---
92177

93178

@@ -119,77 +204,63 @@ async def start():
119204
# @message_limiter.limit("5/minute") # REMOVED Decorator
120205
async def on_message(message: cl.Message):
121206
"""Handle incoming text messages and provide translations."""
122-
# --- MANUAL Rate Limit Check (using 'limits' library directly) --- <<< CORRECTED
207+
# --- Rate Limit Check ---
123208
session_id = get_session_id()
124-
# Use the limits strategy's hit() method. It returns False if the limit is exceeded.
125209
if not message_limit_strategy.hit(message_rate_limit, session_id):
126210
# Limit exceeded
127211
logger.warning(f"Rate limit exceeded for session {session_id}")
128212
await cl.ErrorMessage(
129213
content="Rate limit exceeded (5 messages per minute). Please wait a moment."
130214
).send()
131-
return # Stop processing this message
132-
# --- End of Rate Limit Check ---
215+
return
133216

134-
# Proceed with message handling only if the rate limit check passed
135-
try:
136-
# REMOVED await message_limiter.hit("5/minute", get_session_id())
217+
# --- Memory Management ---
218+
trim_message_history(session_id)
137219

138-
if not INITIALIZATION_SUCCESSFUL:
139-
await cl.ErrorMessage(content="Application not initialized.").send()
140-
return
220+
# Track this message in history
221+
history = cl.user_session.get("message_history", [])
222+
history.append({"role": "user", "content": message.content})
223+
cl.user_session.set("message_history", history)
141224

142-
service = cl.user_session.get("translation_service")
225+
try:
226+
# Basic validations
227+
if not await check_initialization():
228+
return
143229

230+
service = await get_translation_service()
144231
if not service:
145-
logger.error("TranslationService not found in user session.")
146-
await cl.ErrorMessage(
147-
content="Error: Translation service unavailable. "
148-
"Please restart the chat."
149-
).send()
150232
return
151233

152234
if not message.content:
153235
logger.warning("Received empty message.")
154-
return # Ignore empty messages
236+
return
155237

156-
# Conditionally add the callback handler for step visibility
238+
# Setup for translation
157239
callbacks = []
158240
if settings.DEBUG:
159241
callbacks.append(cl.LangchainCallbackHandler())
160-
logger.info(
161-
"Debug enabled: Adding LangchainCallbackHandler for step visibility."
162-
)
242+
logger.info("Debug enabled: Adding LangchainCallbackHandler.")
163243

164244
config = RunnableConfig(callbacks=callbacks)
165245

166-
# Use the service to translate, passing the config (with or without callbacks)
167-
if settings.DEBUG:
168-
# When debugging, let the callback handler manage steps
169-
translation_result = await service.translate_text(
170-
message.content, config=config
171-
)
172-
else:
173-
# When not debugging, show a simple progress step
174-
async with cl.Step(name="Translating..."):
175-
# Config will have empty callbacks list here
176-
translation_result = await service.translate_text(
177-
message.content, config=config
178-
)
179-
# Optionally set step output
180-
# (might be redundant if result is sent immediately after)
181-
# step.output = translation_result
182-
183-
# Send the final translation result
246+
# Perform translation
247+
translation_result = await perform_translation(service, message.content, config)
248+
249+
# Send result
184250
await cl.Message(content=f"Translation: {translation_result}").send()
185251

252+
# Update history
253+
history = cl.user_session.get("message_history", [])
254+
history.append(
255+
{"role": "assistant", "content": f"Translation: {translation_result}"}
256+
)
257+
cl.user_session.set("message_history", history)
258+
186259
except TranslationError as e:
187260
logger.error(
188261
f"Translation failed for '{message.content[:50]}...': {e}", exc_info=False
189-
) # exc_info=False to avoid redundant stack trace from service layer
190-
await cl.ErrorMessage(
191-
content=f"Sorry, translation failed: {e}"
192-
).send() # Show specific error if safe
262+
)
263+
await cl.ErrorMessage(content=f"Sorry, translation failed: {e}").send()
193264
except AppError as e:
194265
logger.error(
195266
f"Service error during translation for '{message.content[:50]}...': {e}",
@@ -198,20 +269,30 @@ async def on_message(message: cl.Message):
198269
await cl.ErrorMessage(
199270
content="Sorry, an application error occurred during translation."
200271
).send()
201-
except Exception as e: # Catch other potential exceptions from the core logic
202-
# This generic catch might now be redundant if specific errors are handled
203-
# but kept for safety, ensuring RateLimitExceeded is handled first.
272+
except Exception as e:
204273
logger.error(
205274
f"Unexpected error during translation for '{message.content[:50]}...': {e}",
206275
exc_info=True,
207276
)
208277
await cl.ErrorMessage(
209-
content=(
210-
"Sorry, an unexpected error occurred during translation. "
211-
"Please try again."
212-
)
278+
content="Sorry, an unexpected error occurred during translation."
213279
).send()
280+
finally:
281+
# Log memory usage
282+
await log_memory_usage(session_id)
283+
284+
285+
@cl.on_chat_end
async def on_chat_end():
    """Release per-session state when a chat session ends."""
    try:
        sid = cl.context.session.id  # captured for log correlation
        logger.info(f"Cleaning up resources for ending session {sid}")
        # Clearing the session store frees accumulated history and services.
        cl.user_session.clear()
        logger.info(f"Successfully cleaned up resources for session {sid}")
    except Exception as e:
        # Cleanup failures should be visible but must not propagate.
        logger.error(f"Error during session cleanup: {e}", exc_info=True)

config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ class Settings(BaseSettings):
3838
# 4. More user-friendly for development with automatic persistence
3939
CHROMA_PERSIST_DIRECTORY: str = "chroma_db"
4040

41+
# --- Memory Management Configuration ---
42+
# Maximum number of documents to retrieve for context (smaller = less memory)
43+
MAX_RETRIEVAL_DOCS: int = 3
44+
# Number of documents to process in a batch during vector store creation
45+
VECTORSTORE_BATCH_SIZE: int = 50
46+
# Maximum history length (in message pairs) for chat sessions
47+
MAX_HISTORY_LENGTH: int = 15
48+
4149
# --- Prompt Configuration ---
4250
PROMPTS_DIR: str = "prompts"
4351
SYSTEM_PROMPT_FILE: str = "system.md"

core/data_loader.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,16 @@ def _create_vector_store(documents: List[Document], api_key: str) -> VectorStore
171171
# Ensure directory exists
172172
os.makedirs(settings.CHROMA_PERSIST_DIRECTORY, exist_ok=True)
173173

174+
# Add batch_size parameter to control memory usage during indexing
174175
vector_store = Chroma.from_documents(
175176
documents=documents,
176177
embedding=embedding_model,
177178
persist_directory=settings.CHROMA_PERSIST_DIRECTORY,
179+
collection_metadata={
180+
"hnsw:space": "cosine"
181+
}, # More efficient distance calculation
182+
# Process documents in smaller batches to reduce peak memory usage
183+
batch_size=settings.VECTORSTORE_BATCH_SIZE,
178184
)
179185
# Persist to disk
180186
vector_store.persist()

core/translator.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ def __init__(
6767
streaming=True, # Enable streaming by default if needed later
6868
)
6969

70-
self.retriever = self._create_retriever()
70+
# Create retriever with configured number of documents to limit memory usage
71+
self.retriever = self._create_retriever(k=settings.MAX_RETRIEVAL_DOCS)
7172
self.chain = self._build_rag_chain()
7273
logger.info("ArgentinianTranslator initialized successfully.")
7374

@@ -123,7 +124,18 @@ def replace_match(match):
123124
def _create_retriever(self, k: int = 3):
    """Create an MMR retriever from the vector store.

    Args:
        k: Number of documents to return per query.

    Returns:
        A retriever configured for maximal-marginal-relevance search,
        which de-duplicates results and keeps retrieved context small.
    """
    logger.debug(f"Creating retriever with k={k}")
    # FIX: search_type must be passed to as_retriever() itself, not nested
    # inside search_kwargs — nested there it is never read, so MMR was
    # silently not activated and plain similarity search ran instead.
    return self.vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": k,
            # Candidate pool fetched before MMR re-ranking; kept small
            # to limit memory usage.
            "fetch_k": k * 3,
            # LangChain convention: 0 = maximum diversity, 1 = minimum
            # diversity (the original comment had this inverted).
            "lambda_mult": 0.8,
        },
    )
127139

128140
def _format_retrieved_docs(self, docs: List[Document]) -> str:
129141
"""Formats retrieved documents into a string for the prompt context."""

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1262,3 +1262,5 @@ yarl==1.18.3 \
12621262
zipp==3.21.0 \
12631263
--hash=sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4 \
12641264
--hash=sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931
1265+
memory-profiler==0.61.0 \
1266+
--hash=sha256:97c82e7e66a05ad5e1f2d0dfd23eae374cb1ab8aca87d9e0c27f03ab74fcef3d

services/translation_service.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,15 @@
3232
# --- Language Detection Prompt Template (Keep it minimal) ---
3333
# Use a single triple-quoted string for clarity and correct parsing
3434
# Prompt for LLM-based language detection. Built with str.format rather
# than an f-string so that UNKNOWN_LANG_CODE is baked in up front while the
# doubled braces in '{{user_input}}' collapse to '{user_input}', leaving a
# live placeholder for ChatPromptTemplate to fill at invocation time.
# NOTE(review): the trailing backslashes join the instruction sentences
# onto one line; the escaped triple quotes delimit the user text verbatim.
LANG_DETECT_PROMPT_TEMPLATE = ChatPromptTemplate.from_template(
    """Identify the primary language of the following text. \
Respond with ONLY the two-letter ISO 639-1 language code (e.g., 'en', 'es', 'fr'). \
If you are unsure, the text is nonsensical, gibberish, or not a real language, \
respond with '{unknown_lang_code}'. \
Text:
\"\"\"
{{user_input}}
\"\"\"
Language code:""".format(unknown_lang_code=UNKNOWN_LANG_CODE)
)
4545

4646
# Seed langdetect for consistent results
@@ -113,8 +113,17 @@ async def _detect_language_llm(self, text: str) -> str:
113113
"""Detect language using the LLM chain."""
114114
logger.debug(f"Using LLM for language detection for: '{text[:50]}...'")
115115
try:
116+
# Use a shortened version of the text for language detection to save tokens
117+
# For very short texts, use the whole text
118+
if len(text) > 100:
119+
detection_text = text[:100] # Only use the first 100 characters
120+
else:
121+
detection_text = text
122+
116123
# Use ainvoke for the async context
117-
result = await self._lang_detect_chain.ainvoke({"user_input": text})
124+
result = await self._lang_detect_chain.ainvoke(
125+
{"user_input": detection_text}
126+
)
118127
# Clean up potential whitespace and normalize case
119128
detected_lang = result.strip().lower()
120129
logger.debug(f"LLM detection result: {detected_lang}")

0 commit comments

Comments
 (0)