Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 1790e7f

Browse files
authored
feat: group common messages in the same thread (#897)
It looks for PartialQuestions objects whose timestamps differ by at most 5 seconds and that share at least one common message in their message lists. Those are grouped together. Closes: #694
1 parent 0599103 commit 1790e7f

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

src/codegate/api/v1_processing.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -235,9 +235,9 @@ def _clean_secrets_from_message(message: str) -> str:
235235
return pattern.sub("REDACTED_SECRET", message)
236236

237237

238-
def _group_partial_messages(
238+
def _group_partial_messages( # noqa: C901
239239
pq_list: List[PartialQuestions],
240-
) -> List[List[PartialQuestions]]: # noqa: C901
240+
) -> List[List[PartialQuestions]]:
241241
"""
242242
A PartialQuestion is an object that contains several user messages provided from a
243243
chat conversation. Example:
@@ -272,9 +272,7 @@ def _group_partial_messages(
272272
# (If sup's messages == sub's messages, that also counts, because sub ⊆ sup)
273273
possible_subsets: List[PartialQuestions] = []
274274
for sub in pq_list_sorted:
275-
if sub.message_id == sup.message_id:
276-
continue
277-
if sub.message_id in used:
275+
if sub.message_id == sup.message_id or sub.message_id in used:
278276
continue
279277
if (
280278
set(sub.messages).issubset(set(sup.messages))
@@ -283,10 +281,23 @@ def _group_partial_messages(
283281
):
284282
possible_subsets.append(sub)
285283

286-
# 3) If there are no subsets, this sup stands alone
284+
# 3) If there are no subsets, check for time-based grouping
287285
if not possible_subsets:
288-
groups.append([sup])
286+
new_group = [sup]
289287
used.add(sup.message_id)
288+
289+
for other in pq_list_sorted:
290+
if other.message_id in used or other.message_id == sup.message_id:
291+
continue
292+
if abs((other.timestamp - sup.timestamp).total_seconds()) <= 5 and set(
293+
other.messages
294+
) & set(
295+
sup.messages
296+
): # At least one message in common
297+
new_group.append(other)
298+
used.add(other.message_id)
299+
300+
groups.append(new_group)
290301
else:
291302
# 4) Group subsets by messages to discard duplicates e.g.: 2 subsets with single 'hello'
292303
subs_group_by_messages = defaultdict(list)

0 commit comments

Comments
 (0)