agent-memory-server/agent_memory_server/api.py at eb2e92e2a6ec835b41ab99a785341ee0fb183c13 · redis/agent-memory-server · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from typing import Any

import tiktoken
from fastapi import APIRouter, Depends, HTTPException, Query, Response
from mcp.server.fastmcp.prompts import base
from mcp.types import TextContent
from ulid import ULID

from agent_memory_server import long_term_memory, working_memory
from agent_memory_server.auth import UserInfo, get_current_user
from agent_memory_server.config import settings
from agent_memory_server.dependencies import HybridBackgroundTasks
from agent_memory_server.filters import SessionId, UserId
from agent_memory_server.llm import LLMClient
from agent_memory_server.logging import get_logger
from agent_memory_server.models import (
    AckResponse,
    CreateMemoryRecordRequest,
    CreateSummaryViewRequest,
    EditMemoryRecordRequest,
    GetSessionsQuery,
    MemoryMessage,
    MemoryPromptRequest,
    MemoryPromptResponse,
    MemoryRecord,
    MemoryRecordResultsResponse,
    ModelNameLiteral,
    RunSummaryViewPartitionRequest,
    RunSummaryViewRequest,
    SearchModeEnum,
    SearchRequest,
    SessionListResponse,
    SummaryView,
    SummaryViewPartitionResult,
    SystemMessage,
    Task,
    TaskStatusEnum,
    TaskTypeEnum,
    UpdateWorkingMemory,
    WorkingMemory,
    WorkingMemoryResponse,
)
from agent_memory_server.summarization import _incremental_summary
from agent_memory_server.summary_views import (
    get_summary_view as get_summary_view_config,
    list_partition_results,
    list_summary_views,
    save_partition_result,
    save_summary_view,
    summarize_partition_for_view,
)
from agent_memory_server.tasks import create_task, get_task
from agent_memory_server.utils.redis import get_redis_conn


logger = get_logger(__name__)

router = APIRouter()


@router.post("/v1/long-term-memory/forget")
async def forget_endpoint(
    policy: dict,
    namespace: str | None = None,
    user_id: str | None = None,
    session_id: str | None = None,
    limit: int = 1000,
    dry_run: bool = True,
    pinned_ids: list[str] | None = None,
    current_user: UserInfo = Depends(get_current_user),
):
    """Run a forgetting pass with the provided policy. Returns summary data.

    This is an admin-style endpoint; auth is enforced by the standard dependency.
    """
    return await long_term_memory.forget_long_term_memories(
        policy,
        namespace=namespace,
        user_id=user_id,
        session_id=session_id,
        limit=limit,
        dry_run=dry_run,
        pinned_ids=pinned_ids,
    )


def _get_effective_token_limit(
    model_name: ModelNameLiteral | None,
    context_window_max: int | None,
) -> int:
    """Calculate the effective token limit for working memory based on model context window."""
    # If context_window_max is explicitly provided, use that
    if context_window_max is not None:
        return context_window_max
    # If model_name is provided, get its max_tokens from our config
    if model_name is not None:
        model_config = LLMClient.get_model_config(model_name)
        return model_config.max_tokens
    # Otherwise use a conservative default (GPT-3.5 context window)
    return 16000  # Conservative default


def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
    """Calculate total token count for a list of messages."""
    encoding = tiktoken.get_encoding("cl100k_base")
    total_tokens = 0

    for msg in messages:
        msg_str = f"{msg.role}: {msg.content}"
        msg_tokens = len(encoding.encode(msg_str))
        total_tokens += msg_tokens

    return total_tokens


def _calculate_context_usage_percentages(
    messages: list[MemoryMessage],
    model_name: ModelNameLiteral | None,
    context_window_max: int | None,
) -> tuple[float | None, float | None]:
    """
    Calculate context usage percentages for total usage and until summarization triggers.

    Args:
        messages: List of messages to calculate token count for
        model_name: The client's LLM model name for context window determination
        context_window_max: Direct specification of context window max tokens

    Returns:
        Tuple of (total_percentage, until_summarization_percentage)
        - total_percentage: Percentage (0-100) of total context window used
        - until_summarization_percentage: Percentage (0-100) until summarization triggers
        Both values are None if no model info provided
    """
    # Return None only when no model information is provided
    if not model_name and not context_window_max:
        return None, None

    # If no messages but model info is provided, return 0% usage
    if not messages:
        return 0.0, 0.0

    # Calculate current token usage
    current_tokens = _calculate_messages_token_count(messages)

    # Get effective token limit for the client's model
    max_tokens = _get_effective_token_limit(model_name, context_window_max)

    # Calculate percentage of total context window used
    if max_tokens <= 0:
        return None, None

    total_percentage = (current_tokens / max_tokens) * 100.0

    # Calculate percentage until summarization threshold
    token_threshold = int(max_tokens * settings.summarization_threshold)
    if token_threshold <= 0:
        # If threshold is 0 or negative, we're already at 100% until summarization
        until_summarization_percentage = 100.0
    else:
        until_summarization_percentage = (current_tokens / token_threshold) * 100.0

    # Cap both at 100% for display purposes
    return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0)


def _build_recency_params(payload: SearchRequest) -> dict[str, Any]:
    """Build recency parameters dict from payload."""
    return {
        "semantic_weight": (
            payload.recency_semantic_weight
            if payload.recency_semantic_weight is not None
            else 0.8
        ),
        "recency_weight": (
            payload.recency_recency_weight
            if payload.recency_recency_weight is not None
            else 0.2
        ),
        "freshness_weight": (
            payload.recency_freshness_weight
            if payload.recency_freshness_weight is not None
            else 0.6
        ),
        "novelty_weight": (
            payload.recency_novelty_weight
            if payload.recency_novelty_weight is not None
            else 0.4
        ),
        "half_life_last_access_days": (
            payload.recency_half_life_last_access_days
            if payload.recency_half_life_last_access_days is not None
            else 7.0
        ),
        "half_life_created_days": (
            payload.recency_half_life_created_days
            if payload.recency_half_life_created_days is not None
            else 30.0
        ),
    }


async def _summarize_working_memory(
    memory: WorkingMemory,
    model_name: ModelNameLiteral | None = None,
    context_window_max: int | None = None,
    model: str = settings.generation_model,
) -> WorkingMemory:
    """
    Summarize working memory when it exceeds token limits.

    Args:
        memory: The working memory to potentially summarize
        model_name: The client's LLM model name for context window determination
        context_window_max: Direct specification of context window max tokens
        model: The model to use for summarization

    Returns:
        Updated working memory with summary and trimmed messages
    """
    # Skip summarization entirely if disabled via config
    if not settings.enable_working_memory_summarization:
        return memory

    # Calculate current token usage
    current_tokens = _calculate_messages_token_count(memory.messages)

    # Get effective token limit for the client's model
    max_tokens = _get_effective_token_limit(model_name, context_window_max)

    # Reserve space for new messages, function calls, and response generation
    # Use configurable threshold to leave room for new content
    token_threshold = int(max_tokens * settings.summarization_threshold)

    if current_tokens <= token_threshold:
        return memory

    # Get model config for summarization
    model_config = LLMClient.get_model_config(model)
    summarization_max_tokens = model_config.max_tokens

    # Token allocation for summarization (same logic as original summarize_session)
    if summarization_max_tokens < 10000:
        summary_max_tokens = max(512, summarization_max_tokens // 8)  # 12.5%
    elif summarization_max_tokens < 50000:
        summary_max_tokens = max(1024, summarization_max_tokens // 10)  # 10%
    else:
        summary_max_tokens = max(2048, summarization_max_tokens // 20)  # 5%

    buffer_tokens = min(max(230, summarization_max_tokens // 100), 1000)
    max_message_tokens = summarization_max_tokens - summary_max_tokens - buffer_tokens

    encoding = tiktoken.get_encoding("cl100k_base")
    total_tokens = 0
    messages_to_summarize = []

    # We want to keep recent messages that fit in our target token budget
    target_remaining_tokens = int(
        max_tokens * 0.4
    )  # Keep 40% of context for recent messages

    # Work backwards from the end to find how many recent messages we can keep
    recent_messages_tokens = 0
    keep_count = 0

    for i in range(len(memory.messages) - 1, -1, -1):
        msg = memory.messages[i]
        msg_str = f"{msg.role}: {msg.content}"
        msg_tokens = len(encoding.encode(msg_str))

        if recent_messages_tokens + msg_tokens <= target_remaining_tokens:
            recent_messages_tokens += msg_tokens
            keep_count += 1
        else:
            break

    # Messages to summarize are the ones we're not keeping
    messages_to_check = (
        memory.messages[:-keep_count] if keep_count > 0 else memory.messages[:-1]
    )

    for msg in messages_to_check:
        msg_str = f"{msg.role}: {msg.content}"
        msg_tokens = len(encoding.encode(msg_str))

        # Handle oversized messages
        if msg_tokens > max_message_tokens:
            msg_str = msg_str[: max_message_tokens // 2]
            msg_tokens = len(encoding.encode(msg_str))

        if total_tokens + msg_tokens <= max_message_tokens:
            total_tokens += msg_tokens
            messages_to_summarize.append(msg_str)
        else:
            break

    if not messages_to_summarize:
        # No messages to summarize, just return original memory
        return memory

    # Generate summary
    summary, summary_tokens_used = await _incremental_summary(
        model,
        memory.context,  # Use existing context as base
        messages_to_summarize,
    )

    # Update working memory with new summary and trimmed messages
    # Keep only the most recent messages that fit in our token budget
    updated_memory = memory.model_copy(deep=True)
    updated_memory.context = summary
    updated_memory.messages = (
        memory.messages[-keep_count:] if keep_count > 0 else [memory.messages[-1]]
    )
    updated_memory.tokens = memory.tokens + summary_tokens_used

    return updated_memory


@router.get("/v1/working-memory/", response_model=SessionListResponse)
async def list_sessions(
    options: GetSessionsQuery = Depends(),
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Get a list of session IDs, with optional pagination.

    Args:
        options: Query parameters (limit, offset, namespace, user_id)

    Returns:
        List of session IDs
    """
    redis = await get_redis_conn()

    total, session_ids = await working_memory.list_sessions(
        redis=redis,
        limit=options.limit,
        offset=options.offset,
        namespace=options.namespace,
        user_id=options.user_id,
    )

    return SessionListResponse(
        sessions=session_ids,
        total=total,
    )


@router.get("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
async def get_working_memory(
    session_id: str,
    user_id: str | None = None,
    namespace: str | None = None,
    model_name: ModelNameLiteral | None = None,
    context_window_max: int | None = None,
    recent_messages_limit: int | None = None,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Get working memory for a session.

    This includes stored conversation messages, context, and structured memory records.
    If the messages exceed the token limit, older messages will be truncated.

    Args:
        session_id: The session ID
        user_id: The user ID to retrieve working memory for
        namespace: The namespace to use for the session
        model_name: The client's LLM model name (will determine context window size if provided)
        context_window_max: Direct specification of the context window max tokens (overrides model_name)
        recent_messages_limit: Maximum number of recent messages to return (most recent first)

    Returns:
        Working memory containing messages, context, and structured memory records
    """
    redis = await get_redis_conn()

    working_mem = await working_memory.get_working_memory(
        session_id=session_id,
        namespace=namespace,
        redis_client=redis,
        user_id=user_id,
        recent_messages_limit=recent_messages_limit,
    )

    if not working_mem:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Apply token-based truncation if we have messages and model info
    if working_mem.messages and (model_name or context_window_max):
        token_limit = _get_effective_token_limit(model_name, context_window_max)
        current_token_count = _calculate_messages_token_count(working_mem.messages)

        # If we exceed the token limit, truncate from the beginning (keep recent messages)
        if current_token_count > token_limit:
            # Keep removing oldest messages until we're under the limit
            truncated_messages = working_mem.messages[:]
            while len(truncated_messages) > 1:  # Always keep at least 1 message
                truncated_messages = truncated_messages[1:]  # Remove oldest
                if _calculate_messages_token_count(truncated_messages) <= token_limit:
                    break
            working_mem.messages = truncated_messages

    logger.debug(f"Working mem: {working_mem}")

    # Calculate context usage percentages
    total_percentage, until_summarization_percentage = (
        _calculate_context_usage_percentages(
            messages=working_mem.messages,
            model_name=model_name,
            context_window_max=context_window_max,
        )
    )

    working_mem_data = working_mem.model_dump()
    working_mem_data["context_percentage_total_used"] = total_percentage
    working_mem_data["context_percentage_until_summarization"] = (
        until_summarization_percentage
    )
    return WorkingMemoryResponse(**working_mem_data)


async def put_working_memory_core(
    session_id: str,
    memory: UpdateWorkingMemory,
    background_tasks: HybridBackgroundTasks,
    model_name: ModelNameLiteral | None = None,
    context_window_max: int | None = None,
) -> WorkingMemoryResponse:
    """
    Core implementation of put_working_memory.

    This function contains the business logic for setting working memory and can be
    called from both the REST API endpoint and MCP tools.

    Args:
        session_id: The session ID
        memory: Working memory data to save
        background_tasks: Background tasks handler
        model_name: The client's LLM model name for context window determination
        context_window_max: Direct specification of context window max tokens

    Returns:
        Updated working memory response
    """
    redis = await get_redis_conn()

    # PUT semantics: we simply replace whatever exists (or create if it doesn't exist)

    # Convert UpdateWorkingMemory to WorkingMemory with session_id from URL path
    working_memory_obj = memory.to_working_memory(session_id)

    # Validate that all long-term memories have id (if any)
    for long_term_mem in working_memory_obj.memories:
        if not long_term_mem.id:
            raise HTTPException(
                status_code=400,
                detail="All long-term memory records in working memory must have an ID",
            )

    # Validate that all messages have non-empty content
    for msg in working_memory_obj.messages:
        if not msg.content or not msg.content.strip():
            raise HTTPException(
                status_code=400,
                detail=f"Message content cannot be empty (message ID: {msg.id})",
            )

    # Handle summarization if needed (before storing) - now token-based
    updated_memory = working_memory_obj
    if working_memory_obj.messages:
        updated_memory = await _summarize_working_memory(
            working_memory_obj,
            model_name=model_name,
            context_window_max=context_window_max,
        )

    await working_memory.set_working_memory(
        working_memory=updated_memory,
        redis_client=redis,
    )

    # Background tasks for long-term memory promotion and indexing (if enabled)
    if settings.long_term_memory and (
        updated_memory.memories or updated_memory.messages
    ):
        # Promote structured memories from working memory to long-term storage
        # TODO: Evaluate if this is an optimal way to pass around user ID. We
        # need it to construct the key to get the working memory session from
        # this task, if the session was saved with a user ID to begin with.
        background_tasks.add_task(
            long_term_memory.promote_working_memory_to_long_term,
            session_id=session_id,
            user_id=updated_memory.user_id,
            namespace=updated_memory.namespace,
        )

    # Calculate context usage percentages based on the final state (after potential summarization)
    # This represents the current state of the session
    total_percentage, until_summarization_percentage = (
        _calculate_context_usage_percentages(
            messages=updated_memory.messages,
            model_name=model_name,
            context_window_max=context_window_max,
        )
    )

    # Return WorkingMemoryResponse with percentage values (no new_session for PUT)
    updated_memory_data = updated_memory.model_dump()
    updated_memory_data["context_percentage_total_used"] = total_percentage
    updated_memory_data["context_percentage_until_summarization"] = (
        until_summarization_percentage
    )
    return WorkingMemoryResponse(**updated_memory_data)


@router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
async def put_working_memory(
    session_id: str,
    memory: UpdateWorkingMemory,
    background_tasks: HybridBackgroundTasks,
    response: Response,
    model_name: ModelNameLiteral | None = None,
    context_window_max: int | None = None,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Set working memory for a session. Replaces existing working memory.

    The session_id comes from the URL path, not the request body.
    If the token count exceeds the context window threshold, messages will be summarized
    immediately and the updated memory state returned to the client.

    NOTE on context_percentage_* fields:
    The response includes `context_percentage_total_used` and `context_percentage_until_summarization`
    fields that show token usage. These fields will be `null` unless you provide either:
    - `model_name` query parameter (e.g., `?model_name=gpt-4o-mini`)
    - `context_window_max` query parameter (e.g., `?context_window_max=500`)

    Args:
        session_id: The session ID (from URL path)
        memory: Working memory data to save (session_id not required in body)
        model_name: The client's LLM model name for context window determination
        context_window_max: Direct specification of context window max tokens (overrides model_name)
        background_tasks: DocketBackgroundTasks instance (injected automatically)
        response: FastAPI Response object for setting headers

    Returns:
        Updated working memory (potentially with summary if tokens were condensed).
        Includes context_percentage_total_used and context_percentage_until_summarization
        if model information is provided.
    """
    # Check if any messages are missing created_at timestamps and add deprecation header
    messages_missing_timestamp = any(
        not getattr(msg, "_created_at_was_provided", True) for msg in memory.messages
    )
    if messages_missing_timestamp:
        response.headers["X-Deprecation-Warning"] = (
            "messages[].created_at will become required in the next major version. "
            "Please provide timestamps for all messages."
        )

    return await put_working_memory_core(
        session_id=session_id,
        memory=memory,
        background_tasks=background_tasks,
        model_name=model_name,
        context_window_max=context_window_max,
    )


@router.delete("/v1/working-memory/{session_id}", response_model=AckResponse)
async def delete_working_memory(
    session_id: str,
    user_id: str | None = None,
    namespace: str | None = None,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Delete working memory for a session.

    This deletes all stored memory (messages, context, structured memories) for a session.

    Args:
        session_id: The session ID
        user_id: Optional user ID for the session
        namespace: Optional namespace for the session

    Returns:
        Acknowledgement response
    """
    redis = await get_redis_conn()

    # Delete unified working memory
    await working_memory.delete_working_memory(
        session_id=session_id,
        user_id=user_id,
        namespace=namespace,
        redis_client=redis,
    )

    return AckResponse(status="ok")


@router.post("/v1/long-term-memory/", response_model=AckResponse)
async def create_long_term_memory(
    payload: CreateMemoryRecordRequest,
    background_tasks: HybridBackgroundTasks,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Create a long-term memory

    Args:
        payload: Long-term memory payload
        background_tasks: DocketBackgroundTasks instance (injected automatically)

    Returns:
        Acknowledgement response
    """
    if not settings.long_term_memory:
        raise HTTPException(status_code=400, detail="Long-term memory is disabled")

    # Validate and process memories
    for memory in payload.memories:
        # Enforce that ID is required on memory sent from clients
        if not memory.id:
            raise HTTPException(
                status_code=400, detail="id is required for all memory records"
            )

        # Ensure persisted_at is server-assigned and read-only for clients
        # Clear any client-provided persisted_at value
        memory.persisted_at = None

    background_tasks.add_task(
        long_term_memory.index_long_term_memories,
        memories=payload.memories,
        deduplicate=payload.deduplicate,
    )
    return AckResponse(status="ok")


@router.post("/v1/long-term-memory/search", response_model=MemoryRecordResultsResponse)
async def search_long_term_memory(
    payload: SearchRequest,
    background_tasks: HybridBackgroundTasks,
    optimize_query: bool = False,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Run a long-term memory search with semantic, keyword, or hybrid options.

    Args:
        payload: Search payload with filter objects for precise queries
        optimize_query: Whether to optimize the query for semantic (vector) search using a fast model; ignored for keyword and hybrid modes (default: False)

    Returns:
        List of search results
    """
    if not settings.long_term_memory:
        raise HTTPException(status_code=400, detail="Long-term memory is disabled")

    if (
        payload.distance_threshold is not None
        and payload.search_mode != SearchModeEnum.SEMANTIC
    ):
        raise HTTPException(
            status_code=400,
            detail="distance_threshold is only supported for semantic search mode",
        )

    # Extract filter objects from the payload
    filters = payload.get_filters()

    logger.debug(f"Long-term search filters: {filters}")

    kwargs = {
        "search_mode": payload.search_mode,
        "hybrid_alpha": payload.hybrid_alpha,
        "text_scorer": payload.text_scorer,
        "distance_threshold": payload.distance_threshold,
        "limit": payload.limit,
        "offset": payload.offset,
        "optimize_query": optimize_query,
        **filters,
    }

    kwargs["text"] = payload.text or ""

    logger.debug(f"Long-term search kwargs: {kwargs}")

    # Server-side recency rerank toggle
    server_side_recency = (
        payload.server_side_recency
        if payload.server_side_recency is not None
        else False
    )
    if server_side_recency:
        kwargs["server_side_recency"] = True
        kwargs["recency_params"] = _build_recency_params(payload)
        return await long_term_memory.search_long_term_memories(**kwargs)

    raw_results = await long_term_memory.search_long_term_memories(**kwargs)

    # Soft-filter fallback: if strict filters yield no results, relax filters and
    # inject hints into the query text to guide semantic search.
    try:
        had_any_strict_filters = any(
            key in kwargs and kwargs[key] is not None
            for key in ("topics", "entities", "namespace", "memory_type", "event_date")
        )
        if (
            raw_results.total == 0
            and had_any_strict_filters
            and kwargs.get("search_mode", SearchModeEnum.SEMANTIC)
            == SearchModeEnum.SEMANTIC
        ):
            fallback_kwargs = dict(kwargs)
            for key in ("topics", "entities", "namespace", "memory_type", "event_date"):
                fallback_kwargs.pop(key, None)

            def _vals(f):
                vals: list[str] = []
                if not f:
                    return vals
                for attr in ("eq", "any", "all"):
                    v = getattr(f, attr, None)
                    if isinstance(v, list):
                        vals.extend([str(x) for x in v])
                    elif v is not None:
                        vals.append(str(v))
                return vals

            topics_vals = _vals(filters.get("topics")) if filters else []
            entities_vals = _vals(filters.get("entities")) if filters else []
            namespace_vals = _vals(filters.get("namespace")) if filters else []
            memory_type_vals = _vals(filters.get("memory_type")) if filters else []

            hint_parts: list[str] = []
            if topics_vals:
                hint_parts.append(f"topics: {', '.join(sorted(set(topics_vals)))}")
            if entities_vals:
                hint_parts.append(f"entities: {', '.join(sorted(set(entities_vals)))}")
            if namespace_vals:
                hint_parts.append(
                    f"namespace: {', '.join(sorted(set(namespace_vals)))}"
                )
            if memory_type_vals:
                hint_parts.append(f"type: {', '.join(sorted(set(memory_type_vals)))}")

            base_text = payload.text or ""
            hint_suffix = f" ({'; '.join(hint_parts)})" if hint_parts else ""
            fallback_kwargs["text"] = (base_text + hint_suffix).strip()

            logger.debug(
                f"Soft-filter fallback engaged. Fallback kwargs: { {k: (str(v) if k == 'text' else v) for k, v in fallback_kwargs.items()} }"
            )
            raw_results = await long_term_memory.search_long_term_memories(
                **fallback_kwargs
            )
    except Exception as e:
        logger.warning(f"Soft-filter fallback failed: {e}")

    # Recency-aware re-ranking of results (configurable)
    # TODO: Why did we need to go this route instead of using recency boost at
    # the query level?
    try:
        from datetime import UTC, datetime as _dt

        # Decide whether to apply recency boost
        recency_boost = (
            payload.recency_boost if payload.recency_boost is not None else True
        )
        if not recency_boost or not raw_results.memories:
            return raw_results

        now = _dt.now(UTC)
        recency_params = _build_recency_params(payload)
        ranked = long_term_memory.rerank_with_recency(
            raw_results.memories, now=now, params=recency_params
        )
        # Update last_accessed in background with rate limiting
        ids = [m.id for m in ranked if m.id]
        if ids:
            background_tasks.add_task(long_term_memory.update_last_accessed, ids)

        raw_results.memories = ranked
        return raw_results
    except Exception:
        return raw_results


@router.delete("/v1/long-term-memory", response_model=AckResponse)
async def delete_long_term_memory(
    memory_ids: list[str] = Query(default=[], alias="memory_ids"),
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Delete long-term memories by ID

    Args:
        memory_ids: List of memory IDs to delete (passed as query parameters)
    """
    if not settings.long_term_memory:
        raise HTTPException(status_code=400, detail="Long-term memory is disabled")

    count = await long_term_memory.delete_long_term_memories(ids=memory_ids)
    return AckResponse(status=f"ok, deleted {count} memories")


@router.post("/v1/long-term-memory/compact", response_model=AckResponse)
async def compact_long_term_memories_endpoint(
    namespace: str | None = Query(default=None),
    user_id: str | None = Query(default=None),
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Compact long-term memories by merging hash-based and semantic duplicates.

    Args:
        namespace: Optional namespace filter
        user_id: Optional user ID filter
    """
    if not settings.long_term_memory:
        raise HTTPException(status_code=400, detail="Long-term memory is disabled")

    remaining = await long_term_memory.compact_long_term_memories(
        namespace=namespace,
        user_id=user_id,
    )
    return AckResponse(status=f"ok, {remaining} memories remaining after compaction")


@router.get("/v1/long-term-memory/{memory_id}", response_model=MemoryRecord)
async def get_long_term_memory(
    memory_id: str,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Get a long-term memory by its ID

    Args:
        memory_id: The ID of the memory to retrieve

    Returns:
        The memory record if found

    Raises:
        HTTPException: 404 if memory not found, 400 if long-term memory disabled
    """
    if not settings.long_term_memory:
        raise HTTPException(status_code=400, detail="Long-term memory is disabled")

    memory = await long_term_memory.get_long_term_memory_by_id(memory_id)
    if not memory:
        raise HTTPException(
            status_code=404, detail=f"Memory with ID {memory_id} not found"
        )

    return memory


@router.patch("/v1/long-term-memory/{memory_id}", response_model=MemoryRecord)
async def update_long_term_memory(
    memory_id: str,
    updates: EditMemoryRecordRequest,
    current_user: UserInfo = Depends(get_current_user),
):
    """
    Update a long-term memory by its ID

    Args:
        memory_id: The ID of the memory to update
        updates: The fields to update

    Returns:
        The updated memory record

    Raises:
        HTTPException: 404 if memory not found, 400 if invalid fields or long-term memory disabled
    """
    if not settings.long_term_memory:
        raise HTTPException(status_code=400, detail="Long-term memory is disabled")

    # Convert request model to dictionary, excluding None values
    update_dict = {k: v for k, v in updates.model_dump().items() if v is not None}

    if not update_dict:
        raise HTTPException(status_code=400, detail="No fields provided for update")

    try:
        updated_memory = await long_term_memory.update_long_term_memory(
            memory_id, update_dict
        )
        if not updated_memory:
            raise HTTPException(
                status_code=404, detail=f"Memory with ID {memory_id} not found"
            )

        return updated_memory
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e


@router.post("/v1/memory/prompt", response_model=MemoryPromptResponse)
async def memory_prompt(
    params: MemoryPromptRequest,
    background_tasks: HybridBackgroundTasks,
    optimize_query: bool = False,
    current_user: UserInfo = Depends(get_current_user),
) -> MemoryPromptResponse:
    """
    Hydrate a user query with memory context and return a prompt
    ready to send to an LLM.

    `query` is the query for vector search that the caller of this API wants to use to find
    relevant context. If `session_id` is provided and matches an existing
    session, the resulting prompt will include those messages as the immediate
    history of messages leading to a message containing `query`.

    If `long_term_search_payload` is provided, the resulting prompt will include
    relevant long-term memories found via semantic search with the options
    provided in the payload.

    Args:
        params: MemoryPromptRequest
        optimize_query: Whether to optimize the query for semantic (vector) search using a fast model; ignored for keyword and hybrid modes (default: False)

    Returns:
        List of messages to send to an LLM, hydrated with relevant memory context
    """
    if not params.session and not params.long_term_search:
        raise HTTPException(
            status_code=400,
            detail="Either session or long_term_search must be provided",
        )

    redis = await get_redis_conn()
    _messages = []

    if params.session:
        # Use token limit for memory prompt - model info is required now
        if params.session.model_name or params.session.context_window_max:
            token_limit = _get_effective_token_limit(
                model_name=params.session.model_name,
                context_window_max=params.session.context_window_max,
            )
            effective_token_limit = token_limit
        else:
            # No model info provided - use all messages without truncation
            effective_token_limit = None
        working_mem = await working_memory.get_working_memory(
            session_id=params.session.session_id,
            namespace=params.session.namespace,
            user_id=params.session.user_id,
            redis_client=redis,
        )

        logger.debug(f"Found working memory: {working_mem}")

        # Create empty working memory if session doesn't exist
        if not working_mem:
            working_mem = WorkingMemory(
                session_id=params.session.session_id,
                namespace=params.session.namespace,
                user_id=params.session.user_id,
                messages=[],
                memories=[],
            )
            logger.debug(
                f"Created empty working memory for session: {params.session.session_id}"
            )

        if working_mem:
            if working_mem.context:
                # TODO: Weird to use MCP types here?
                _messages.append(
                    SystemMessage(
                        content=TextContent(
                            type="text",
                            text=f"## A summary of the conversation so far:\n{working_mem.context}",
                        ),
                    )
                )
            # Apply token-based truncation if model info is provided
            if effective_token_limit is not None:
                # Token-based truncation
                if (
                    _calculate_messages_token_count(working_mem.messages)
                    > effective_token_limit
                ):
                    # Keep removing oldest messages until we're under the limit
                    recent_messages = working_mem.messages[:]
                    while len(recent_messages) > 1:  # Always keep at least 1 message
                        recent_messages = recent_messages[1:]  # Remove oldest
                        if (
                            _calculate_messages_token_count(recent_messages)
                            <= effective_token_limit