
Commit f18806c

Merge branch 'FlagOpen:master' into master
2 parents: 0874413 + 3c40623

99 files changed, +15499 -907 lines changed

.github/workflows/documentation.yml

Lines changed: 8 additions & 2 deletions
@@ -11,12 +11,18 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
-      - name: Install dependencies
+      - name: Install doc dependencies
         run: |
-          pip install . sphinx sphinx_rtd_theme myst_parser myst-nb furo
+          pip install . sphinx myst_parser myst-nb sphinx-design pydata-sphinx-theme sphinxcontrib-googleanalytics
+      - name: Install content dependencies
+        run: |
+          pip install faiss-cpu mteb air-benchmark beir
       - name: Sphinx build
         run: |
           sphinx-build docs/source docs/build
+      - name: Add CNAME
+        run: |
+          echo bge-model.com > docs/build/CNAME
       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@v3
         if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

FlagEmbedding/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 from .inference import *
+from .evaluation import *

FlagEmbedding/abc/finetune/reranker/AbsDataset.py

Lines changed: 1 addition & 1 deletion
@@ -183,7 +183,7 @@ class AbsRerankerCollator(DataCollatorWithPadding):
     query_max_len: int = 32
     passage_max_len: int = 128

-    def __call__(self, features) -> list[BatchEncoding]:
+    def __call__(self, features) -> List[BatchEncoding]:
         teacher_scores = [f[1] for f in features]
         if teacher_scores[0] is None:
             teacher_scores = None
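
The only change here is the return annotation, swapping the built-in generic `list[BatchEncoding]` for `typing.List[BatchEncoding]`. A plausible motivation (an assumption, not stated in the commit) is interpreter compatibility: subscripting the built-in `list` in evaluated annotations requires Python 3.9+, while the `typing.List` form works on older versions too. A minimal sketch of the difference:

```python
from typing import List

# typing.List is subscriptable on every supported Python version.
def collate_typing(features) -> List[dict]:
    return list(features)

# Built-in generics (PEP 585) only work in evaluated annotations on Python 3.9+;
# on 3.8 this `def` raises "TypeError: 'type' object is not subscriptable".
def collate_builtin(features) -> list[dict]:
    return list(features)
```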

FlagEmbedding/abc/inference/AbsEmbedder.py

Lines changed: 2 additions & 0 deletions
@@ -462,6 +462,8 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
             Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
         """
         if isinstance(results_list[0], torch.Tensor):
+            # move all tensors to the same device
+            results_list = [res.to(self.target_devices[0]) for res in results_list]
             return torch.cat(results_list, dim=0)
         elif isinstance(results_list[0], np.ndarray):
             return np.concatenate(results_list, axis=0)
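
The two inserted lines move each worker's result tensor onto the first target device before concatenation. This matters because `torch.cat` cannot join tensors that live on different GPUs, which is the usual situation when embeddings are produced by one process per device. A minimal sketch (hypothetical shapes and device names, not the library's code) of the failure mode and the fix:

```python
import torch

# torch.cat refuses to concatenate tensors on different devices, which is
# exactly what happens when each worker returns its shard on its own GPU.
if torch.cuda.device_count() >= 2:
    shard_0 = torch.randn(4, 8, device="cuda:0")
    shard_1 = torch.randn(4, 8, device="cuda:1")
    try:
        torch.cat([shard_0, shard_1], dim=0)  # raises RuntimeError
    except RuntimeError as err:
        print(f"cat across devices failed: {err}")

    # Moving every shard to one target device first makes the concat valid.
    merged = torch.cat([t.to("cuda:0") for t in [shard_0, shard_1]], dim=0)
    print(merged.shape)  # torch.Size([8, 8])
```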

FlagEmbedding/evaluation/beir/data_loader.py

Lines changed: 3 additions & 3 deletions
@@ -145,7 +145,7 @@ def _load_remote_qrels(
         if dataset_name != 'cqadupstack':
             qrels = datasets.load_dataset(
                 'BeIR/{d}-qrels'.format(d=dataset_name),
-                split=split,
+                split=split if split != 'dev' else 'validation',
                 trust_remote_code=True,
                 cache_dir=self.cache_dir,
                 download_mode=self.hf_download_mode
@@ -409,7 +409,7 @@ def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, s
         Returns:
             datasets.DatasetDict: A dict of relevance of query and document.
         """
-        checked_split = self.check_splits(split)
+        checked_split = self.check_splits(split, dataset_name=dataset_name)
         if len(checked_split) == 0:
             raise ValueError(f"Split {split} not found in the dataset.")
         split = checked_split[0]
@@ -450,7 +450,7 @@ def _load_local_queries(self, save_dir: str, dataset_name: Optional[str] = None,
         Returns:
             datasets.DatasetDict: A dict of queries with id as key, query text as value.
         """
-        checked_split = self.check_splits(split)
+        checked_split = self.check_splits(split, dataset_name=dataset_name)
         if len(checked_split) == 0:
             raise ValueError(f"Split {split} not found in the dataset.")
         split = checked_split[0]
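
The first hunk remaps a requested `dev` split to `validation` when pulling qrels from the `BeIR/{dataset}-qrels` Hub repositories, which name their development split `validation` rather than `dev`; the other two hunks pass `dataset_name` through to `check_splits` so per-dataset split availability is also respected for local files. A standalone sketch of the remapping (assuming the Hugging Face `datasets` package; `load_beir_qrels` is a hypothetical helper, not part of FlagEmbedding):

```python
from datasets import load_dataset

def load_beir_qrels(dataset_name: str, split: str):
    # The BeIR qrels repos on the Hub expose a 'validation' split instead of
    # 'dev', so a caller asking for 'dev' is redirected before loading.
    hub_split = "validation" if split == "dev" else split
    return load_dataset(
        f"BeIR/{dataset_name}-qrels",
        split=hub_split,
        trust_remote_code=True,
    )

# Hypothetical usage:
# qrels = load_beir_qrels("scifact", "dev")  # actually loads 'validation'
```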

FlagEmbedding/evaluation/mteb/runner.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 from .arguments import MTEBEvalArgs
 from .searcher import MTEBEvalDenseRetriever, MTEBEvalReranker
 from .prompts import get_task_def_by_task_name_and_type
-from .examples import examples_dict
+

 logger = logging.getLogger(__name__)

FlagEmbedding/finetune/embedder/decoder_only/base/arguments.py

Lines changed: 5 additions & 0 deletions
@@ -69,3 +69,8 @@ class DecoderOnlyEmbedderModelArguments(AbsEmbedderModelArguments):
         default=False,
         metadata={"help": "If passed, will merge the lora modules and save the entire model."}
     )
+
+    only_merge_lora_model: bool = field(
+        default=False,
+        metadata={"help": "If passed, will only merge the lora modules and save the entire model."}
+    )

FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py

Lines changed: 15 additions & 7 deletions
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -129,13 +132,15 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -152,6 +157,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -171,7 +177,9 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
     model = PeftModel.from_pretrained(model, find_largest_checkpoint(output_dir))
     model = model.merge_and_unload()

-    model.save_pretrained(os.path.join(output_dir, 'merged_model'))
-
-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
+
+    # modify the vocab size in the model configuration
+    model.config.vocab_size = len(tokenizer)
+    model.save_pretrained(os.path.join(output_dir, 'merged_model'))
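
Beyond threading `trust_remote_code` through every `from_pretrained` call, the last hunk reorders `save_merged_model` so the tokenizer is written first and the merged model's `config.vocab_size` is synced to the tokenizer length before the model is saved. A condensed sketch of that merge-and-save sequence (assuming `peft` and `transformers`; paths and the function name are placeholders, not the library's API):

```python
import os
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

def merge_and_save(base_model_path: str, adapter_dir: str, output_dir: str,
                   trust_remote_code: bool = False) -> None:
    # Load the base model, attach the trained LoRA adapter, and fold it in.
    model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=trust_remote_code)
    model = PeftModel.from_pretrained(model, adapter_dir)
    model = model.merge_and_unload()

    # Save the tokenizer first, then align config.vocab_size with it before
    # saving the model, mirroring the order introduced in this commit.
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir, trust_remote_code=trust_remote_code)
    merged_dir = os.path.join(output_dir, "merged_model")
    tokenizer.save_pretrained(merged_dir)
    model.config.vocab_size = len(tokenizer)
    model.save_pretrained(merged_dir)
```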

FlagEmbedding/finetune/embedder/decoder_only/base/runner.py

Lines changed: 10 additions & 5 deletions
@@ -29,6 +29,9 @@ def __init__(
         training_args: AbsEmbedderTrainingArguments
     ):
         super().__init__(model_args, data_args, training_args)
+        self.model_args: DecoderOnlyEmbedderModelArguments
+        self.data_args: AbsEmbedderDataArguments
+        self.training_args: AbsEmbedderTrainingArguments

     def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]:
         """Load tokenizer and model.
@@ -41,7 +44,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )

         if tokenizer.pad_token is None:
@@ -116,11 +120,12 @@ def run(self):
         """
         Run the finetune.
         """
-        Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True)
+        if not self.model_args.only_merge_lora_model:
+            Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True)

-        # Training
-        self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint)
-        self.trainer.save_model()
+            # Training
+            self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint)
+            self.trainer.save_model()

         # save merged model
         if self.model_args.save_merged_lora_model and self.training_args.process_index == 0:
FlagEmbedding/finetune/embedder/decoder_only/icl/arguments.py

Lines changed: 5 additions & 0 deletions
@@ -73,6 +73,11 @@ class DecoderOnlyEmbedderICLModelArguments(AbsEmbedderModelArguments):
         metadata={"help": "If passed, will merge the lora modules and save the entire model."}
     )

+    only_merge_lora_model: bool = field(
+        default=False,
+        metadata={"help": "If passed, will only merge the lora modules and save the entire model."}
+    )
+

 @dataclass
 class DecoderOnlyEmbedderICLDataArguments(AbsEmbedderDataArguments):