diff --git a/FlagEmbedding/abc/finetune/embedder/AbsArguments.py b/FlagEmbedding/abc/finetune/embedder/AbsArguments.py
index fde2b80a..923f1312 100644
--- a/FlagEmbedding/abc/finetune/embedder/AbsArguments.py
+++ b/FlagEmbedding/abc/finetune/embedder/AbsArguments.py
@@ -30,6 +30,10 @@ class AbsEmbedderModelArguments:
         default=False,
         metadata={"help": "Trust remote code"}
     )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use fast tokenizer or not."}
+    )
     token: str = field(
         default_factory=lambda: os.getenv('HF_TOKEN', None),
         metadata={"help": "The token to use when accessing the model."}
diff --git a/FlagEmbedding/abc/finetune/reranker/AbsArguments.py b/FlagEmbedding/abc/finetune/reranker/AbsArguments.py
index 3c6a2e95..99bd3414 100644
--- a/FlagEmbedding/abc/finetune/reranker/AbsArguments.py
+++ b/FlagEmbedding/abc/finetune/reranker/AbsArguments.py
@@ -34,6 +34,10 @@ class AbsRerankerModelArguments:
         default='encoder',
         metadata={"help": "Type of finetune, ['encoder', 'decoder']"}
     )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use fast tokenizer or not."}
+    )
     token: str = field(
         default_factory=lambda: os.getenv('HF_TOKEN', None),
         metadata={"help": "The token to use when accessing the model."}
diff --git a/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py b/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
index 463c2378..16d73d37 100644
--- a/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
+++ b/FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
@@ -43,7 +43,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path,
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
-            use_fast=False,
+            use_fast=self.model_args.use_fast_tokenizer,
             add_eos_token=True,
             trust_remote_code=self.model_args.trust_remote_code,
         )
diff --git a/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py b/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
index dc5f016a..f40aba62 100644
--- a/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
+++ b/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
@@ -44,7 +44,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path,
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
-            use_fast=False,
+            use_fast=self.model_args.use_fast_tokenizer,
             add_eos_token=True,
             trust_remote_code=self.model_args.trust_remote_code,
         )
diff --git a/FlagEmbedding/finetune/embedder/encoder_only/base/runner.py b/FlagEmbedding/finetune/embedder/encoder_only/base/runner.py
index 94558b2a..83a2fe2b 100644
--- a/FlagEmbedding/finetune/embedder/encoder_only/base/runner.py
+++ b/FlagEmbedding/finetune/embedder/encoder_only/base/runner.py
@@ -26,6 +26,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             self.model_args.model_name_or_path,
             cache_dir=self.model_args.cache_dir,
             token=self.model_args.token,
+            use_fast=self.model_args.use_fast_tokenizer,
             trust_remote_code=self.model_args.trust_remote_code
         )
         base_model = AutoModel.from_pretrained(
diff --git a/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py b/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
index 4f17ad20..bd3de30d 100644
--- a/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
+++ b/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
@@ -107,6 +107,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             self.model_args.model_name_or_path,
             cache_dir=self.model_args.cache_dir,
             token=self.model_args.token,
+            use_fast=self.model_args.use_fast_tokenizer,
             trust_remote_code=self.model_args.trust_remote_code
         )

diff --git a/FlagEmbedding/finetune/reranker/decoder_only/base/runner.py b/FlagEmbedding/finetune/reranker/decoder_only/base/runner.py
index 7194c311..14fa3dd9 100644
--- a/FlagEmbedding/finetune/reranker/decoder_only/base/runner.py
+++ b/FlagEmbedding/finetune/reranker/decoder_only/base/runner.py
@@ -43,7 +43,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerMode
             self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path,
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
-            use_fast=False,
+            use_fast=self.model_args.use_fast_tokenizer,
             add_eos_token=False,
             trust_remote_code=self.model_args.trust_remote_code,
         )
diff --git a/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.py b/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.py
index 16086a80..b2502004 100644
--- a/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.py
+++ b/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.py
@@ -43,7 +43,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerMode
             self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path,
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
-            # use_fast=False,
+            use_fast=self.model_args.use_fast_tokenizer,
             add_eos_token=False,
             trust_remote_code=self.model_args.trust_remote_code
         )
diff --git a/FlagEmbedding/finetune/reranker/encoder_only/base/runner.py b/FlagEmbedding/finetune/reranker/encoder_only/base/runner.py
index 3b1a67f7..324c9e30 100644
--- a/FlagEmbedding/finetune/reranker/encoder_only/base/runner.py
+++ b/FlagEmbedding/finetune/reranker/encoder_only/base/runner.py
@@ -26,6 +26,7 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerMode
             self.model_args.model_name_or_path,
             cache_dir=self.model_args.cache_dir,
             token=self.model_args.token,
+            use_fast=self.model_args.use_fast_tokenizer,
             trust_remote_code=self.model_args.trust_remote_code
         )

diff --git a/research/BGE_Reasoner/README.md b/research/BGE_Reasoner/README.md
index 77b3407d..54a7884d 100644
--- a/research/BGE_Reasoner/README.md
+++ b/research/BGE_Reasoner/README.md
@@ -13,13 +13,15 @@ We introduce **BGE-Reasoner**, an end-to-end reasoning-intensive information ret

 ## Open-source resources

-| Resource Type | Name | Link |
-| ------------------ | --------------------- | ----------- |
-| Model | BGE-Reasoner-Rewriter | [πŸ€—]() (TBA) |
-| Model | BGE-Reasoner-Embed | [πŸ€—]() (TBA) |
-| Model | BGE-Reasoner-Reranker | [πŸ€—]() (TBA) |
-| Training Data | BGE-Reasoner-Data | [πŸ€—]() (TBA) |
-| Evaluation Scripts | - | (TBA) |
+| Resource Type | Name | Link | Release Date | Comments |
+| ------------------ | --------------------- | ----------- | ------------------ | ------------------ |
+| Model | BGE-Reasoner-Rewriter | [πŸ€—]() (TBA) | - | |
+| Model | BGE-Reasoner-Reranker | [πŸ€—]() (TBA) | - | |
+| Model | BGE-Reasoner-Embed | [πŸ€—]() (TBA) | - | |
+| Search Results | BGE-Reasoner-Embed-0821 Search Results | [πŸ€—](https://huggingface.co/datasets/hanhainebula/bright-search-results_bge-reasoner-embed-0821/tree/main) | Sep 4, 2025 | nDCG@10 = 32.5, submission to BRIGHT leaderboard on Aug 21, 2025 |
+| Training Data | BGE-Reasoner-Data | [πŸ€—](https://huggingface.co/datasets/hanhainebula/bge-reasoner-data/tree/main/bge-reasoner-data-0904) | Sep 4, 2025 | part of our training data; full data to be released in the future |
+| Evaluation Scripts | - | (TBA) | - | |
+

 ## Performance

@@ -71,7 +73,7 @@ Note:
 - Using the GPT-4 reasoning queries provided by BRIGHT, the score increases to **37.7**, which is **+5.6** higher than DIVER’s corresponding result (32.1). Combining our embedding-based retrieval with BM25 (hybrid fusion, weights: 0.75 / 0.25) yields **nDCG@10 = 40.2**.
 - Finally, when using rewritten queries produced by **BGE-Reasoner-Rewriter** and fusing with BM25 (weights: 0.75 / 0.25), we reach **nDCG@10 = 40.8**.

-> On Sep 4, 2025, we released the first-stage search results of BGE-Reasoner-Embed-0821 using original queries and GPT-4 reasoning queries (Top-2000 candidates; excluded IDs removed) [here](https://huggingface.co/datasets/hanhainebula/bright-search-results_bge-reasoner-embed-0821/tree/main), and part of our training data [here](https://huggingface.co/datasets/hanhainebula/bge-reasoner-data/tree/main/bge-reasoner-data-0904).
+> On Sep 4, 2025, we released the first-stage search results of BGE-Reasoner-Embed-0821 using original queries and GPT-4 reasoning queries (Top-2000 candidates; excluded IDs removed) [here](https://huggingface.co/datasets/hanhainebula/bright-search-results_bge-reasoner-embed-0821/tree/main).

 ![BGE-Reasoner-Embed & BGE-Reasoner-Rewriter Results](./imgs/embedder-rewriter_results.png)
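
For context on what the new `use_fast_tokenizer` option toggles: `AutoTokenizer.from_pretrained` returns a Rust-backed "fast" tokenizer when `use_fast=True` and a pure-Python "slow" tokenizer when `use_fast=False`, and the two can differ on edge cases such as special-token handling, which is presumably why the decoder-only runners previously pinned `use_fast=False`. Since these dataclasses are typically parsed with `HfArgumentParser`, the flag should be settable from the finetune CLI (e.g. `--use_fast_tokenizer False` to restore the old behavior); that CLI spelling is an assumption, not something this diff adds. A minimal sketch with an illustrative checkpoint name:

```python
# Sketch of what model_args.use_fast_tokenizer is forwarded into; the
# checkpoint name is illustrative, any HF checkpoint behaves the same way.
from transformers import AutoTokenizer

# use_fast=True (the new default) selects the Rust-backed tokenizer when one
# exists for the checkpoint; use_fast=False keeps the pure-Python one.
fast = AutoTokenizer.from_pretrained("BAAI/bge-m3", use_fast=True)
slow = AutoTokenizer.from_pretrained("BAAI/bge-m3", use_fast=False)

print(type(fast).__name__)  # e.g. XLMRobertaTokenizerFast
print(type(slow).__name__)  # e.g. XLMRobertaTokenizer
```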
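
The README hunk above also cites hybrid fusion of embedding-based retrieval with BM25 at weights 0.75 / 0.25. Below is a minimal sketch of one standard way to implement such weighted score fusion; the min-max normalization and the `min_max`/`fuse` helpers are assumptions for illustration, since the README does not state how the two score scales are reconciled:

```python
# Weighted fusion of dense-retrieval and BM25 scores (weights 0.75 / 0.25).
# Min-max normalization is an assumed choice to put the two score
# distributions on a comparable [0, 1] scale before mixing.
from typing import Dict

def min_max(scores: Dict[str, float]) -> Dict[str, float]:
    lo, hi = min(scores.values()), max(scores.values())
    span = (hi - lo) or 1.0  # guard against a constant score list
    return {doc: (s - lo) / span for doc, s in scores.items()}

def fuse(dense: Dict[str, float], bm25: Dict[str, float],
         w_dense: float = 0.75, w_bm25: float = 0.25) -> Dict[str, float]:
    dense, bm25 = min_max(dense), min_max(bm25)
    # A document missing from one retriever contributes 0 from that side.
    return {d: w_dense * dense.get(d, 0.0) + w_bm25 * bm25.get(d, 0.0)
            for d in set(dense) | set(bm25)}

# Example: fuse two small result lists and rank for an nDCG@10-style cutoff.
fused = fuse({"d1": 0.82, "d2": 0.47}, {"d1": 11.2, "d3": 7.9})
top10 = sorted(fused, key=fused.get, reverse=True)[:10]
```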