
Commit 161785e

Luca Candela authored and committed
add minimal ci and strengthen episode ingestion coverage
1 parent 0c0cf36 · commit 161785e

9 files changed (+497, −10 lines)

.github/workflows/ci.yml

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install uv
+        shell: bash
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+      - name: Cache uv packages
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/uv
+          key: ${{ runner.os }}-uv-${{ hashFiles('uv.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-uv-
+
+      - name: Install dependencies
+        run: uv sync --extra dev
+
+      - name: Run targeted tests
+        run: |
+          uv run pytest tests/orchestration/test_bulk_serialization.py tests/search/test_search_utils_filters.py
+          uv run pytest tests/test_graphium_mock.py::test_add_episode_persists_nodes_and_edges

mcp_server/graphium_mcp/queues.py

Lines changed: 71 additions & 5 deletions
@@ -1,14 +1,58 @@
-"""Episode queue orchestration."""
+"""Episode queue orchestration with retry tracking."""
 
 from __future__ import annotations
 
 import asyncio
 import logging
+from datetime import UTC, datetime
+from typing import Any
 
 from . import state
 
 logger = logging.getLogger(__name__)
 
+MAX_QUEUE_RETRIES = 3
+RETRY_BACKOFF_SECONDS = 0.5
+
+
+def _metadata(process_func: state.EpisodeProcessor) -> dict[str, Any]:
+    metadata = getattr(process_func, 'queue_metadata', {})
+    if not isinstance(metadata, dict):
+        metadata = {}
+    metadata.setdefault('attempts', 0)
+    return metadata
+
+
+def _record_failure(group_id: str, metadata: dict[str, Any], exc: Exception) -> None:
+    name = metadata.get('name', 'unknown')
+    attempts = metadata.get('attempts', 0)
+    failure = {
+        'name': str(name),
+        'error': f'{exc.__class__.__name__}: {exc}',
+        'attempts': str(attempts),
+        'timestamp': datetime.now(UTC).isoformat(),
+    }
+
+    failures = state.queue_failures.setdefault(group_id, [])
+    if name:
+        failures = [entry for entry in failures if entry.get('name') != name]
+    failures.append(failure)
+    state.queue_failures[group_id] = failures
+
+
+def _clear_failure(group_id: str, metadata: dict[str, Any]) -> None:
+    name = metadata.get('name')
+    if not name:
+        return
+    failures = state.queue_failures.get(group_id)
+    if not failures:
+        return
+    remaining = [entry for entry in failures if entry.get('name') != name]
+    if remaining:
+        state.queue_failures[group_id] = remaining
+    else:
+        state.queue_failures.pop(group_id, None)
 
 
 async def process_episode_queue(group_id: str) -> None:
     """Process episodes for a specific group_id sequentially."""
@@ -18,14 +62,33 @@ async def process_episode_queue(group_id: str) -> None:
     try:
         while True:
             process_func = await state.episode_queues[group_id].get()
+            metadata = _metadata(process_func)
+            name = metadata.get('name', 'queued-episode')
             try:
                 await process_func()
+                metadata['attempts'] = 0
+                _clear_failure(group_id, metadata)
             except Exception as exc:  # pragma: no cover - defensive logging
-                logger.error(
-                    'Error processing queued episode for group_id %s: %s',
+                metadata['attempts'] = metadata.get('attempts', 0) + 1
+                _record_failure(group_id, metadata, exc)
+                attempt = metadata['attempts']
+                logger.exception(
+                    "Error processing queued episode '%s' for group_id %s (attempt %s/%s)",
+                    name,
                     group_id,
-                    exc,
+                    attempt,
+                    MAX_QUEUE_RETRIES,
                 )
+                if attempt < MAX_QUEUE_RETRIES:
+                    setattr(process_func, 'queue_metadata', metadata)
+                    await asyncio.sleep(RETRY_BACKOFF_SECONDS)
+                    await state.episode_queues[group_id].put(process_func)
+                else:
+                    logger.error(
+                        "Episode '%s' for group_id %s exceeded max retries and will be discarded",
+                        name,
+                        group_id,
+                    )
             finally:
                 state.episode_queues[group_id].task_done()
     except asyncio.CancelledError:
@@ -46,6 +109,9 @@ async def enqueue_episode(group_id: str, process_func: state.EpisodeProcessor) -
     if group_id not in state.episode_queues:
         state.episode_queues[group_id] = asyncio.Queue()
 
+    metadata = _metadata(process_func)
+    setattr(process_func, 'queue_metadata', metadata)
+
     await state.episode_queues[group_id].put(process_func)
 
     if not state.queue_workers.get(group_id, False):
@@ -54,4 +120,4 @@ async def enqueue_episode(group_id: str, process_func: state.EpisodeProcessor) -
     return state.episode_queues[group_id].qsize()
 
 
-__all__ = ['enqueue_episode', 'process_episode_queue']
+__all__ = ['MAX_QUEUE_RETRIES', 'enqueue_episode', 'process_episode_queue']

mcp_server/graphium_mcp/state.py

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,7 @@
 episode_queues: dict[str, asyncio.Queue[EpisodeProcessor]] = {}
 queue_workers: dict[str, bool] = {}
 graphium_init_error: str | None = None
+queue_failures: dict[str, list[dict[str, str]]] = {}
 
 
 def set_config(new_config: GraphiumConfig) -> None:
@@ -42,6 +43,7 @@ def set_init_error(error: str | None) -> None:
     'graphium_config',
     'graphium_init_error',
     'queue_workers',
+    'queue_failures',
     'set_init_error',
     'set_client',
     'set_config',

mcp_server/graphium_mcp/status.py

Lines changed: 10 additions & 2 deletions
@@ -33,9 +33,17 @@ async def collect_status() -> StatusResponse:
         graphium = cast(Graphium, client)
         await graphium.driver.client.verify_connectivity()  # type: ignore
 
+        status_value = 'ok'
+        message = 'Graphium MCP server is running and connected to Neo4j'
+
+        if state.queue_failures:
+            failure_count = sum(len(entries) for entries in state.queue_failures.values())
+            status_value = 'warn'
+            message = f'{message} (pending queue failures: {failure_count})'
+
         return StatusResponse(
-            status='ok',
-            message='Graphium MCP server is running and connected to Neo4j',
+            status=status_value,
+            message=message,
        )
     except Exception as exc:
         logger.error('Error checking Neo4j connection: %s', exc)
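
The warn path just counts every recorded failure across all groups and folds the total into the existing message. A quick illustration with made-up failure entries (a local dict, not the real state module):

queue_failures = {
    'group-a': [{'name': 'ep-1', 'error': 'RuntimeError: boom'}],
    'group-b': [
        {'name': 'ep-2', 'error': 'TimeoutError: slow'},
        {'name': 'ep-3', 'error': 'ValueError: bad payload'},
    ],
}

failure_count = sum(len(entries) for entries in queue_failures.values())
message = 'Graphium MCP server is running and connected to Neo4j'
if queue_failures:
    message = f'{message} (pending queue failures: {failure_count})'

print(message)
# Graphium MCP server is running and connected to Neo4j (pending queue failures: 3)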

mcp_server/graphium_mcp/tools.py

Lines changed: 17 additions & 3 deletions
@@ -106,11 +106,25 @@ async def process_episode() -> None:
                 group_id_str,
             )
 
+        process_episode.queue_metadata = {
+            'name': name,
+            'group_id': group_id_str,
+            'source': source_type.value,
+        }
+
         position = await enqueue_episode(group_id_str, process_episode)
 
-        return SuccessResponse(
-            message=f"Episode '{name}' queued for processing (position: {position})"
-        )
+        pending_failures = state.queue_failures.get(group_id_str, [])
+        message = f"Episode '{name}' queued for processing (position: {position})"
+        if pending_failures:
+            last_failure = pending_failures[-1]
+            message = (
+                f"{message}. Warning: {len(pending_failures)} prior failure(s); "
+                f"last error from '{last_failure.get('name', 'unknown')}': "
+                f"{last_failure.get('error', 'unknown error')}"
+            )
+
+        return SuccessResponse(message=message)
     except Exception as exc:
         logger.exception(
             "Error queuing episode '%s' for group_id %s",

tests/mcp/test_episode_queue.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+import asyncio
+import logging
+from contextlib import suppress
+
+import pytest
+
+from mcp_server.graphium_mcp import queues, state
+
+
+@pytest.mark.asyncio
+async def test_enqueue_episode_retries_and_records_failures(monkeypatch, caplog):
+    group_id = 'queue-test-group'
+    attempts = 0
+    retry_complete = asyncio.Event()
+
+    async def failing_processor():
+        nonlocal attempts
+        attempts += 1
+        if attempts >= queues.MAX_QUEUE_RETRIES:
+            retry_complete.set()
+        raise RuntimeError('boom')
+
+    failing_processor.queue_metadata = {'name': 'test-episode'}
+
+    original_create_task = asyncio.create_task
+    created_tasks: list[asyncio.Task] = []
+
+    def capture_task(coro):
+        task = original_create_task(coro)
+        created_tasks.append(task)
+        return task
+
+    monkeypatch.setattr(asyncio, 'create_task', capture_task)
+    caplog.set_level(logging.WARNING, logger='mcp_server.graphium_mcp.queues')
+    state.queue_failures.pop(group_id, None)
+
+    try:
+        position = await queues.enqueue_episode(group_id, failing_processor)
+        assert position == 1
+
+        await asyncio.wait_for(retry_complete.wait(), timeout=3)
+        await asyncio.wait_for(state.episode_queues[group_id].join(), timeout=3)
+
+        assert attempts == queues.MAX_QUEUE_RETRIES
+        failures = state.queue_failures[group_id]
+        assert failures[-1]['name'] == 'test-episode'
+        assert failures[-1]['attempts'] == str(queues.MAX_QUEUE_RETRIES)
+        assert any('exceeded max retries' in record.message for record in caplog.records)
+    finally:
+        for task in created_tasks:
+            task.cancel()
+            with suppress(asyncio.CancelledError):
+                await task
+        state.episode_queues.pop(group_id, None)
+        state.queue_workers.pop(group_id, None)
+        state.queue_failures.pop(group_id, None)
+        monkeypatch.setattr(asyncio, 'create_task', original_create_task)

tests/orchestration/test_bulk_serialization.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+from datetime import UTC, datetime
+
+from graphium_core.nodes import EpisodicNode, EpisodeType
+from graphium_core.orchestration import bulk
+
+
+def test_serialize_episodes_preserves_entity_edges():
+    now = datetime.now(UTC)
+    episode = EpisodicNode(
+        name='episode-1',
+        group_id='group-123',
+        labels=['Conversation'],
+        created_at=now,
+        source=EpisodeType.message,
+        source_description='chat message',
+        content='user: hello world',
+        valid_at=now,
+        entity_edges=['edge-1', 'edge-2'],
+    )
+
+    payloads = bulk._serialize_episodes([episode])
+
+    assert len(payloads) == 1
+    payload = payloads[0]
+    assert payload.uuid == episode.uuid
+    assert payload.source == EpisodeType.message.value
+    assert payload.entity_edges == ['edge-1', 'edge-2']
