Skip to content

Commit 0c4f099

Browse files
authored
fix: resolve hash randomization in retrieval task ID generation (#3553)
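This commit fixes non-deterministic query ID assignment in three retrieval tasks. The IDs were generated with `enumerate(set(...))`; because Python randomizes string hashes per interpreter process, the iteration order of a set of strings (and therefore the ID given to each question or answer) could differ from run to run. IDs are now assigned in first-occurrence order while iterating over the dataset rows.

Affected tasks:
- PublicHealthQARetrieval (8 languages including Korean)
- BelebeleRetrieval (122 language variants including Korean)
- GeorgianFAQRetrieval (Georgian)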
1 parent 8b02789 commit 0c4f099

File tree

3 files changed

+25
-12
lines changed

3 files changed

+25
-12
lines changed

mteb/tasks/retrieval/kat/georgian_faq_retrieval.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -46,10 +46,17 @@ def load_data(self) -> None:
46 46
split=_EVAL_SPLIT,
47 47
revision=self.metadata.dataset["revision"],
48 48
)
49-
question_ids = {
50-
question: _id for _id, question in enumerate(set(data["question"]))
51-
}
52-
answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))}
49+
50+
question_ids = {}
51+
answer_ids = {}
52+
53+
for row in data:
54+
question = row["question"]
55+
answer = row["answer"]
56+
if question not in question_ids:
57+
question_ids[question] = len(question_ids)
58+
if answer not in answer_ids:
59+
answer_ids[answer] = len(answer_ids)
53 60

54 61
for row in data:
55 62
question = row["question"]

mteb/tasks/retrieval/multilingual/belebele_retrieval.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -230,10 +230,11 @@ def load_data(self, **kwargs) -> None:
230 230
ds_corpus = self.dataset[lang_corpus]
231 231
ds_question = self.dataset[lang_question]
232 232

233-
question_ids = {
234-
question: _id
235-
for _id, question in enumerate(set(ds_question["question"]))
236-
}
233+
question_ids = {}
234+
for row in ds_question:
235+
question = row["question"]
236+
if question not in question_ids:
237+
question_ids[question] = len(question_ids)
237 238

238 239
link_to_context_id = {}
239 240
context_idx = 0

mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,10 +32,15 @@ def _load_publichealthqa_data(
32 32
split=split,
33 33
revision=revision,
34 34
)
35-
question_ids = {
36-
question: _id for _id, question in enumerate(set(data["question"]))
37-
}
38-
answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))}
35+
36+
question_ids = {}
37+
answer_ids = {}
38+
39+
for row in data:
40+
if row["question"] is not None and row["question"] not in question_ids:
41+
question_ids[row["question"]] = len(question_ids)
42+
if row["answer"] is not None and row["answer"] not in answer_ids:
43+
answer_ids[row["answer"]] = len(answer_ids)
39 44

40 45
for row in data:
41 46
if row["question"] is None or row["answer"] is None:

0 commit comments

Comments
 (0)