Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit a5a5b73

Browse files
Refine caching non-copilot FIM calls
Closes: #376 We were using the whole content of the prompt to hash requests that don't come from Copilot. This is inefficient, since the prompt can vary considerably between requests. Instead, use the filepath included in every request. After some investigation, we found that `copilot` puts the filepath at the top of the prompt, while the rest of the providers include the filepath at the bottom of the context section.
1 parent 6dca3a7 commit a5a5b73

File tree

2 files changed

+54
-11
lines changed

2 files changed

+54
-11
lines changed

src/codegate/db/connection.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import hashlib
33
import json
44
import re
5+
from datetime import timedelta
56
from pathlib import Path
67
from typing import List, Optional
78

@@ -200,20 +201,23 @@ def _extract_request_message(self, request: str) -> Optional[dict]:
200201

201202
def _create_hash_key(self, message: str, provider: str) -> str:
202203
"""Creates a hash key from the message and includes the provider"""
203-
# Try to extract the path from the message. Most of the times is at the top of the message.
204-
# The pattern was generated using ChatGPT. Should match common occurrences like:
204+
# Try to extract the path from the FIM message. The path appears in FIM requests in these formats:
205205
# folder/testing_file.py
206206
# Path: file3.py
207-
pattern = r"(?:[a-zA-Z]:\\|\/)?(?:[^\s\/]+\/)*[^\s\/]+\.[^\s\/]+"
208-
match = re.search(pattern, message)
209-
# Copilot it's the only provider that has an easy path to extract.
210-
# Other providers are harder to extract. This part needs to be revisited for the moment
211-
# hashing the entire request message.
212-
if match is None or provider != "copilot":
213-
logger.warning("No path found in message or not copilot. Creating hash from message.")
207+
pattern = r"^#.*?\b([a-zA-Z0-9_\-\/]+\.\w+)\b"
208+
matches = re.findall(pattern, message, re.MULTILINE)
209+
# If no path is found, hash the entire prompt message.
210+
if not matches:
211+
logger.warning("No path found in messages. Creating hash cache from message.")
214212
message_to_hash = f"{message}-{provider}"
215213
else:
216-
message_to_hash = f"{match.group(0)}-{provider}"
214+
# Copilot puts the path at the top of the file. Continue providers contain
215+
# several paths, the one in which the fim is triggered is the last one.
216+
if provider == "copilot":
217+
filepath = matches[0]
218+
else:
219+
filepath = matches[-1]
220+
message_to_hash = f"{filepath}-{provider}"
217221

218222
logger.debug(f"Message to hash: {message_to_hash}")
219223
hashed_content = hashlib.sha256(message_to_hash.encode("utf-8")).hexdigest()
@@ -247,7 +251,10 @@ def _should_record_context(self, context: Optional[PipelineContext]) -> bool:
247251

248252
elapsed_seconds = (context.input_request.timestamp - old_timestamp).total_seconds()
249253
if elapsed_seconds < Config.get_config().max_fim_hash_lifetime:
250-
logger.info(f"Skipping context recording. Elapsed time: {elapsed_seconds} seconds.")
254+
logger.info(
255+
f"Skipping DB context recording. "
256+
f"Elapsed time since last FIM cache: {timedelta(seconds=elapsed_seconds)}."
257+
)
251258
return False
252259

253260
async def record_context(self, context: Optional[PipelineContext]) -> None:

tests/db/test_connection.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import hashlib
2+
from unittest.mock import patch
3+
4+
import pytest
5+
6+
from codegate.db.connection import DbRecorder
7+
8+
9+
@patch("codegate.db.connection.DbRecorder.__init__", return_value=None)
10+
def mock_db_recorder(mocked_init) -> DbRecorder:
11+
db_recorder = DbRecorder()
12+
return db_recorder
13+
14+
15+
fim_message = """
16+
# Path: folder/testing_file.py
17+
# another_folder/another_file.py
18+
19+
This is a test message
20+
"""
21+
22+
23+
@pytest.mark.parametrize(
24+
"message, provider, expected_message_to_hash",
25+
[
26+
("This is a test message", "test_provider", "This is a test message-test_provider"),
27+
(fim_message, "copilot", "folder/testing_file.py-copilot"),
28+
(fim_message, "other", "another_folder/another_file.py-other"),
29+
],
30+
)
31+
def test_create_hash_key(message, provider, expected_message_to_hash):
32+
mocked_db_recorder = mock_db_recorder()
33+
expected_hash = hashlib.sha256(expected_message_to_hash.encode("utf-8")).hexdigest()
34+
35+
result_hash = mocked_db_recorder._create_hash_key(message, provider)
36+
assert result_hash == expected_hash

0 commit comments

Comments
 (0)