Skip to content

Commit 5f6eef3

Browse files
committed
Updated e5 Retrieval with Prefix
1 parent ca75410 commit 5f6eef3

File tree

4 files changed

+39
-12
lines changed

4 files changed

+39
-12
lines changed

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ Like SimCSE, [ConGen: Unsupervised Control and Generalization Distillation For S
9797
| [SCT-IndoBERT Base](https://huggingface.co/LazarusNLP/sct-indobert-base) | 40.41 | 47.29 | 40.68 |
9898
| [distiluse-base-multilingual-cased-v2](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2) | 41.35 | 54.93 | 48.79 |
9999
| [paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | 52.81 | 65.07 | 57.97 |
100-
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 68.33 | 78.85 | 73.84 |
101-
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 68.95 | 78.92 | 74.58 |
102-
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **69.89** | **80.09** | **75.64** |
100+
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 70.20 | 79.61 | 74.80 |
101+
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 70.00 | 79.50 | 75.16 |
102+
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **70.83** | **80.58** | **76.16** |
103103

104104
#### TyDiQA
105105

@@ -112,9 +112,9 @@ Like SimCSE, [ConGen: Unsupervised Control and Generalization Distillation For S
112112
| [SCT-IndoBERT Base](https://huggingface.co/LazarusNLP/sct-indobert-base) | 76.81 | 83.16 | 85.87 |
113113
| [distiluse-base-multilingual-cased-v2](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2) | 70.44 | 77.94 | 81.56 |
114114
| [paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | 81.41 | 87.05 | 89.44 |
115-
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 90.97 | 94.14 | 95.25 |
116-
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 91.85 | 94.88 | 95.82 |
117-
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **94.15** | **96.36** | **97.14** |
115+
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 91.50 | 94.34 | 95.39 |
116+
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 93.45 | 95.88 | 96.69 |
117+
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **94.69** | **96.71** | **97.44** |
118118

119119
### Classification
120120

docs/index.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ Like SimCSE, [ConGen: Unsupervised Control and Generalization Distillation For S
9797
| [SCT-IndoBERT Base](https://huggingface.co/LazarusNLP/sct-indobert-base) | 40.41 | 47.29 | 40.68 |
9898
| [distiluse-base-multilingual-cased-v2](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2) | 41.35 | 54.93 | 48.79 |
9999
| [paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | 52.81 | 65.07 | 57.97 |
100-
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 68.33 | 78.85 | 73.84 |
101-
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 68.95 | 78.92 | 74.58 |
102-
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **69.89** | **80.09** | **75.64** |
100+
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 70.20 | 79.61 | 74.80 |
101+
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 70.00 | 79.50 | 75.16 |
102+
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **70.83** | **80.58** | **76.16** |
103103

104104
#### TyDiQA
105105

@@ -112,9 +112,9 @@ Like SimCSE, [ConGen: Unsupervised Control and Generalization Distillation For S
112112
| [SCT-IndoBERT Base](https://huggingface.co/LazarusNLP/sct-indobert-base) | 76.81 | 83.16 | 85.87 |
113113
| [distiluse-base-multilingual-cased-v2](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2) | 70.44 | 77.94 | 81.56 |
114114
| [paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | 81.41 | 87.05 | 89.44 |
115-
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 90.97 | 94.14 | 95.25 |
116-
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 91.85 | 94.88 | 95.82 |
117-
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **94.15** | **96.36** | **97.14** |
115+
| [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) | 91.50 | 94.34 | 95.39 |
116+
| [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | 93.45 | 95.88 | 96.69 |
117+
| [multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) | **94.69** | **96.71** | **97.44** |
118118

119119
### Classification
120120

evaluation/retrieval/eval_miracl.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ class Args:
1515
test_dataset_split: str = "dev"
1616
test_batch_size: int = 32
1717
output_folder: str = "results"
18+
query_prefix: str = None
19+
passage_prefix: str = None
1820

1921

2022
def main(args: Args):
@@ -25,6 +27,22 @@ def main(args: Args):
2527
# Load dataset
2628
test_ds = load_dataset(args.test_dataset_name, args.test_dataset_config, split=args.test_dataset_split)
2729

30+
# Prepend the optional query/passage prefixes (when set) to queries and to positive/negative passages, as e5 models expect
31+
if args.query_prefix:
32+
test_ds = test_ds.map(lambda ex: {"query": args.query_prefix + ex["query"]})
33+
34+
if args.passage_prefix:
35+
test_ds = test_ds.map(
36+
lambda ex: {
37+
"positive_passages": [{"text": args.passage_prefix + d["text"]} for d in ex["positive_passages"]]
38+
}
39+
)
40+
test_ds = test_ds.map(
41+
lambda ex: {
42+
"negative_passages": [{"text": args.passage_prefix + d["text"]} for d in ex["negative_passages"]]
43+
}
44+
)
45+
2846
# Preprocess datasets
2947
queries, answers, documents = [], [], []
3048
for data in test_ds:

evaluation/retrieval/eval_tydiqa.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ class Args:
1515
test_dataset_split: str = "validation"
1616
test_batch_size: int = 32
1717
output_folder: str = "results"
18+
query_prefix: str = None
19+
passage_prefix: str = None
1820

1921

2022
def main(args: Args):
@@ -25,6 +27,13 @@ def main(args: Args):
2527
# Load dataset
2628
test_ds = load_dataset(args.test_dataset_name, args.test_dataset_config, split=args.test_dataset_split)
2729

30+
# Prepend the optional query/passage prefixes (when set) to question_text and passage_text, as e5 models expect
31+
if args.query_prefix:
32+
test_ds = test_ds.map(lambda ex: {"question_text": args.query_prefix + ex["question_text"]})
33+
34+
if args.passage_prefix:
35+
test_ds = test_ds.map(lambda ex: {"passage_text": args.passage_prefix + ex["passage_text"]})
36+
2837
# Get all queries and documents
2938
queries = test_ds["question_text"]
3039
documents = list(set(test_ds["passage_text"]))

0 commit comments

Comments
 (0)