Merge pull request #1531 from hanhainebula/master

hanhainebula · web-flow · commit 5e1f42f15cc0 · 2025-09-23T22:53:56.000+08:00
Update modeling_mapping for embedder and update README of BGE-Reasoner
diff --git a/FlagEmbedding/inference/embedder/decoder_only/base.py b/FlagEmbedding/inference/embedder/decoder_only/base.py
@@ -40,7 +40,7 @@ class BaseLLMEmbedder(AbsEmbedder):
             degradation. Defaults to :data:`True`.
         query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used with
             :attr:`query_instruction_format`. Defaults to :data:`None`.
-        query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`.
+        query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"Instruct: {}\nQuery: {}"`.
         devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`.
         trust_remote_code (bool, optional): Whether to allow running custom code shipped with HF datasets or models when loading them. Defaults to :data:`False`.
         cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`.
diff --git a/FlagEmbedding/inference/embedder/model_mapping.py b/FlagEmbedding/inference/embedder/model_mapping.py
@@ -38,6 +38,14 @@ class EmbedderConfig:
 
 # BGE models mapping
 BGE_MAPPING = OrderedDict([
+    (
+        "bge-reasoner-embed-qwen3-8b-0923",
+        EmbedderConfig(FlagLLMModel, PoolingMethod.LAST_TOKEN, query_instruction_format="Instruct: {}\nQuery: {}")
+    ),
+    (
+        "bge-code-v1",
+        EmbedderConfig(FlagLLMModel, PoolingMethod.LAST_TOKEN, trust_remote_code=True, query_instruction_format="<instruct>{}\n<query>{}")
+    ),
     (
         "bge-en-icl", 
         EmbedderConfig(FlagICLModel, PoolingMethod.LAST_TOKEN, query_instruction_format="<instruct>{}\n<query>{}")
@@ -100,6 +108,23 @@ class EmbedderConfig:
     ),
 ])
 
+# Qwen3-Embedding models mapping
+QWEN3_EMBEDDING_MAPPING = OrderedDict([
+    (
+        "Qwen3-Embedding-0.6B",
+        EmbedderConfig(FlagLLMModel, PoolingMethod.LAST_TOKEN, query_instruction_format="Instruct: {}\nQuery:{}")
+    ),
+    (
+        "Qwen3-Embedding-4B",
+        EmbedderConfig(FlagLLMModel, PoolingMethod.LAST_TOKEN, query_instruction_format="Instruct: {}\nQuery:{}")
+    ),
+    (
+        "Qwen3-Embedding-8B",
+        EmbedderConfig(FlagLLMModel, PoolingMethod.LAST_TOKEN, query_instruction_format="Instruct: {}\nQuery:{}")
+    ),
+])
+
+
 # E5 models mapping
 E5_MAPPING = OrderedDict([
     (
@@ -231,6 +256,7 @@ class EmbedderConfig:
 # Combine all mappings
 AUTO_EMBEDDER_MAPPING = OrderedDict()
 AUTO_EMBEDDER_MAPPING.update(BGE_MAPPING)
+AUTO_EMBEDDER_MAPPING.update(QWEN3_EMBEDDING_MAPPING)
 AUTO_EMBEDDER_MAPPING.update(E5_MAPPING)
 AUTO_EMBEDDER_MAPPING.update(GTE_MAPPING)
 AUTO_EMBEDDER_MAPPING.update(SFR_MAPPING)
diff --git a/research/BGE_Reasoner/README.md b/research/BGE_Reasoner/README.md
@@ -17,8 +17,8 @@ We introduce **BGE-Reasoner**, an end-to-end reasoning-intensive information ret
 | ------------------ | --------------------- | ----------- | ------------------ | ------------------ |
 | Model              | BGE-Reasoner-Rewriter | [🤗]() (TBA)     | -    |      |
 | Model              | BGE-Reasoner-Reranker | [🤗]() (TBA)     | -    |      |
-| Model              | BGE-Reasoner-Embed-Qwen3-8B-0923 | [🤗](https://huggingface.co/BAAI/bge-reasoner-embed-qwen3-8b-0923) | Sep 23, 2025 | nDCG@10 = 37.2 using original query, fine-tuned on [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) with our latest refined training data (data to be released) |
-| Search Results | BGE-Reasoner-Embed-Qwen3-8B-0923 Search Results | [🤗](https://huggingface.co/BAAI/bge-reasoner-embed-qwen3-8b-0923/tree/main/search_results) | Sep 23, 2025 | nDCG@10 = 37.2 using original query |
+| Model              | BGE-Reasoner-Embed-Qwen3-8B-0923 | [🤗](https://huggingface.co/BAAI/bge-reasoner-embed-qwen3-8b-0923) | Sep 23, 2025 | nDCG@10 = 37.1 using original query, fine-tuned on [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) with our latest refined training data (data to be released) |
+| Search Results | BGE-Reasoner-Embed-Qwen3-8B-0923 Search Results | [🤗](https://huggingface.co/BAAI/bge-reasoner-embed-qwen3-8b-0923/tree/main/search_results) | Sep 23, 2025 | nDCG@10 = 37.1 using original query |
 | Search Results | BGE-Reasoner-Embed-0821 Search Results | [🤗](https://huggingface.co/datasets/hanhainebula/bright-search-results_bge-reasoner-embed-0821/tree/main) | Sep 4, 2025 | nDCG@10 = 32.5 using original query, submission to BRIGHT leaderboard on Aug 21, 2025 |
 | Training Data      | BGE-Reasoner-Data | [🤗](https://huggingface.co/datasets/hanhainebula/bge-reasoner-data/tree/main/bge-reasoner-data-0904) | Sep 4, 2025 | part of our training data; full data to be released in the future |
 | Evaluation Scripts | -                     | (TBA)             | -            |              |
@@ -72,7 +72,7 @@ Note:
 
 **BGE-Reasoner-Embed-Qwen3-8B-0923**, fine-tuned on [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) with our latest refined training data (data to be released), achieves strong performance on the BRIGHT benchmark:
 
-- With original queries, it attains **nDCG@10 = 37.2**, an absolute improvement of **+8.3** over the previous best ([DIVER](https://arxiv.org/pdf/2508.07995): 28.9).
+- With original queries, it attains **nDCG@10 = 37.1**, an absolute improvement of **+8.2** over the previous best ([DIVER](https://arxiv.org/pdf/2508.07995): 28.9).
 - Using the GPT-4 reasoning queries provided by BRIGHT, the score increases to **39.7**, which is **+7.6** higher than DIVER’s corresponding result (32.1).
 
 > On Sep 23, 2025, we released the first-stage search results of BGE-Reasoner-Embed-Qwen3-8B-0923 using original queries and GPT-4 reasoning queries (Top-2000 candidates; excluded IDs removed) [here](https://huggingface.co/BAAI/bge-reasoner-embed-qwen3-8b-0923/tree/main/search_results). The model checkpoint is available [here](https://huggingface.co/BAAI/bge-reasoner-embed-qwen3-8b-0923).
@@ -139,6 +139,11 @@ Note:
 The technical details for each component of **BGE-Reasoner** will be released soon. Please stay tuned!
 
 
+## Contact Information
+
+Some resources are not yet publicly available. If you have urgent research needs for any of these resources (e.g., model checkpoints, search results, evaluation scripts) or have any questions, please contact Jianlyu Chen at jianlvchen@gmail.com.
+
+
 ## Citation
 
 TBA
diff --git a/research/BGE_Reasoner/imgs/embedder-0923_results.png b/research/BGE_Reasoner/imgs/embedder-0923_results.png