Skip to content

Commit 068bb40

Browse files
committed
🐛 Fix bugs
1 parent e56faa6 commit 068bb40

File tree

3 files changed

+10
-7
lines changed

3 files changed

+10
-7
lines changed

ontoaligner/ontology_matchers/rag/rag.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,6 @@ def generate_for_llm(self, tokenized_input_data: Any) -> Any:
122122
**tokenized_input_data,
123123
pad_token_id=self.tokenizer.eos_token_id,
124124
max_new_tokens=self.kwargs["max_new_tokens"],
125-
do_sample=False,
126125
output_scores=True,
127126
return_dict_in_generate=True
128127
)

ontoaligner/ontology_matchers/retrieval/retrieval.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,10 @@ def get_top_k(self, query_embed: Any, candidate_embeds: Any) -> [List, List]:
114114
values = [(score, index) for index, score in enumerate(results)]
115115
dtype = [("score", float), ("index", int)]
116116
results = np.array(values, dtype=dtype)
117-
top_k_items = np.sort(results, order="score")[-self.kwargs["top_k"] :][::-1]
117+
try:
118+
top_k_items = np.sort(results, order="score")[-self.kwargs["top_k"]:][::-1]
119+
except IndexError:
120+
top_k_items = np.sort(results, order="score")[::-1]
118121
top_k_indexes, top_k_scores = [], []
119122
for top_k in top_k_items:
120123
top_k_scores.append(top_k[0])
@@ -170,7 +173,7 @@ def load(self, path: str):
170173
Returns:
171174
None
172175
"""
173-
self.model = SentenceTransformer(path, device=self.kwargs["device"])
176+
self.model = SentenceTransformer(path, device=self.kwargs["device"], trust_remote_code=True)
174177

175178
def fit(self, inputs: Any) -> Any:
176179
"""

ontoaligner/pipeline.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,9 @@ def _collect_dataset(self):
6767
def __call__(self, method: str, encoder_model: BaseEncoder = None, model_class: BaseOMModel = None, dataset_class: Dataset = None, postprocessor: Any = None,
6868
llm_path: str = None, retriever_path: str = None, device: str = "cuda", batch_size: int = 2048, max_length: int = 300, max_new_tokens: int = 10,
6969
top_k: int = 10, fuzzy_sm_threshold: float = 0.2, evaluate: bool = False, return_matching: bool = True, output_file_name: str = "matchings",
70-
save_matchings: bool = False, ir_threshold: float = 0.5, llm_threshold: float = 0.5, llm_mapper: LabelMapper = None, llm_mapper_interested_class: str = 'yes',
71-
answer_set: Dict = {"yes": ["yes", "true"], "no": ["no", "false"]}, huggingface_access_token: str = "", openai_key: str = "", device_map: str = "auto",
72-
positive_ratio: float = 0.7, n_shots: int = 5) -> [Any, Any]:
70+
save_matchings: bool = False, ir_threshold: float = 0.5, ir_rag_threshold: float = 0.7, llm_threshold: float = 0.5, llm_mapper: LabelMapper = None,
71+
llm_mapper_interested_class: str = 'yes', answer_set: Dict = {"yes": ["yes", "true"], "no": ["no", "false"]}, huggingface_access_token: str = "",
72+
openai_key: str = "", device_map: str = "auto", positive_ratio: float = 0.7, n_shots: int = 5) -> [Any, Any]:
7373
"""
7474
Executes the ontology alignment process using the specified method.
7575
@@ -92,6 +92,7 @@ def __call__(self, method: str, encoder_model: BaseEncoder = None, model_class:
9292
output_file_name (str, optional): Output file name without file type. Defaults to "matchings".
9393
save_matchings (bool, optional): Whether to save the matching results. Defaults to False.
9494
ir_threshold (float, optional): Retrieval postprocessor threshold.
95+
ir_rag_threshold (float, optional): Retrieval postprocessor threshold in RAG module.
9596
llm_threshold (float, optional): LLM postprocessor threshold.
9697
llm_mapper (LabelMapper, optional): Mapper for LLM outputs.
9798
llm_mapper_interested_class (str, optional): Class to filter output pairs in LLM postprocessing.
@@ -127,7 +128,7 @@ def __call__(self, method: str, encoder_model: BaseEncoder = None, model_class:
127128
else:
128129
encoder_model = encoder_model or ConceptRAGEncoder()
129130
matchings = self._run_rag(method, encoder_model, model_class, postprocessor or rag_hybrid_postprocessor,
130-
llm_threshold, ir_threshold, retriever_path, llm_path, rag_config)
131+
llm_threshold, ir_rag_threshold, retriever_path, llm_path, rag_config)
131132
else:
132133
raise ValueError(f"Unknown method: {method}")
133134
return self._process_results(matchings, method, evaluate, return_matching, output_file_name, save_matchings)

0 commit comments

Comments (0)