From a63f6b245aadb6e20fed4ed353a4f3a62cfdffa6 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 22 Oct 2025 23:16:08 +0200 Subject: [PATCH 01/19] add rwth_dbis learner models --- ...llm_learner_rwthdbis_taxonomy_discovery.py | 57 ++ examples/llm_learner_rwthdbis_term_typing.py | 50 ++ ontolearner/__init__.py | 6 +- ontolearner/learner/__init__.py | 2 + .../learner/taxonomy_discovery/__init__.py | 15 + .../learner/taxonomy_discovery/rwthdbis.py | 792 ++++++++++++++++++ ontolearner/learner/term_typing/__init__.py | 15 + ontolearner/learner/term_typing/rwthdbis.py | 255 ++++++ requirements.txt | 3 + 9 files changed, 1194 insertions(+), 1 deletion(-) create mode 100644 examples/llm_learner_rwthdbis_taxonomy_discovery.py create mode 100644 examples/llm_learner_rwthdbis_term_typing.py create mode 100644 ontolearner/learner/taxonomy_discovery/__init__.py create mode 100644 ontolearner/learner/taxonomy_discovery/rwthdbis.py create mode 100644 ontolearner/learner/term_typing/__init__.py create mode 100644 ontolearner/learner/term_typing/rwthdbis.py diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py new file mode 100644 index 0000000..fea5539 --- /dev/null +++ b/examples/llm_learner_rwthdbis_taxonomy_discovery.py @@ -0,0 +1,57 @@ +# Import core modules from the OntoLearner library +from ontolearner import LearnerPipeline, train_test_split +from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner + +# Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery +ontology = ChordOntology() +ontology.load() # Read entities, type system, and taxonomic edges into memory + +# Extract typed taxonomic edges and split into train/test while preserving the structured shape +train_data, test_data = train_test_split( + ontology.extract(), + test_size=0.2, + random_state=42 +) + +# Initialize a supervised taxonomy classifier (encoder-based fine-tuning) +# Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views +# Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions +learner = RWTHDBISTaxonomyLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + gradient_accumulation_steps=4, + learning_rate=2e-5, + max_length=256, + seed=42, + negative_ratio=5, + bidirectional_templates=True, + context_json_path=None, + ontology_name=ontology.ontology_full_name, +) + +# Build the pipeline +pipeline = LearnerPipeline( + llm=learner, + llm_id=learner.model_name, + ontologizer_data=False, +) + +# # Run the full learning pipeline on the taxonomy-discovery task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs['elapsed_time']) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py new file mode 100644 index 0000000..67d207f --- /dev/null +++ b/examples/llm_learner_rwthdbis_term_typing.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import 
LearnerPipeline, train_test_split, AgrO +from ontolearner import RWTHDBISTermTypingLearner + +#load the AgrO ontology. +# AgrO provides term-typing supervision where each term can be annotated with one or more types. +ontology = AgrO() +ontology.load() +data = ontology.extract() + +# Split the labeled term-typing data into train and test sets +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# Configure a supervised encoder-based classifier for term typing. +# This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results. +learner = RWTHDBISTermTypingLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/deberta-v3", + num_train_epochs=30, + per_device_train_batch_size=16, + gradient_accumulation_steps=2, + learning_rate=2e-5, + max_length=64, + seed=42, +) + +# Build the pipeline and pass raw structured objects end-to-end. +pipeline = LearnerPipeline( + llm=learner, + llm_id=learner.model_name, + ontologizer_data=False, +) + +# Run the full learning pipeline on the term-typing task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs['elapsed_time']) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 237bee8..0b6fd26 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -29,7 +29,9 @@ AutoRetrieverLearner, AutoRAGLearner, StandardizedPrompting, - LabelMapper) + LabelMapper, + RWTHDBISTaxonomyLearner, + RWTHDBISTermTypingLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -47,6 +49,8 @@ "LabelMapper", "LearnerPipeline", "Processor", + "RWTHDBISTaxonomyLearner", + "RWTHDBISTermTypingLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 0baf580..ad38f0b 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -17,3 +17,5 @@ from .rag import AutoRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper +from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner +from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py new file mode 100644 index 0000000..ab5b4f8 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .rwthdbis import RWTHDBISSFTLearner diff --git a/ontolearner/learner/taxonomy_discovery/rwthdbis.py b/ontolearner/learner/taxonomy_discovery/rwthdbis.py new file mode 100644 index 0000000..47989c5 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/rwthdbis.py @@ -0,0 +1,792 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import random +import re +import time +import platform +import multiprocessing +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Callable +from functools import partial +from tqdm.auto import tqdm +import g4f +from g4f.client import Client as _G4FClient +import torch +from datasets import Dataset, DatasetDict +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + set_seed, +) + +from ...base import AutoLearner + +class RWTHDBISSFTLearner(AutoLearner): + """ + Supervised classifier for (parent, child) taxonomy edges. + + Model input format: + " ## " + + If no `context_json_path` is provided, the class precomputes a + context file ({ontology_name}_processed.json) directly from the ontology + object. + """ + + # Sentences containing any of these phrases are pruned from term_info. 
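+    # These are typically refusal/filler phrases emitted by the g4f provider instead of a
+    # real definition; at merge time the dataset name itself is also added as a removal marker.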
+ _CONTEXT_REMOVALS = [ + "couldn't find any", + "does not require", + "assist you further", + "feel free to", + "I'm currently unable", + "the search results", + "I'm unable to", + "recommend referring directly", + "bear with me", + "searching for the most relevant information", + "I'm currently checking the most relevant", + "already in English", + "require further", + "any additional information", + "already an English", + "don't have information", + "I'm sorry,", + "For further exploration", + "For more detailed information", + ] + + def __init__( + self, + min_predictions: int = 1, + model_name: str = "distilroberta-base", + output_dir: str = "./results/{model_name}", + max_length: int = 256, + per_device_train_batch_size: int = 8, + gradient_accumulation_steps: int = 4, + num_train_epochs: int = 1, + learning_rate: float = 2e-5, + weight_decay: float = 0.01, + logging_steps: int = 25, + save_strategy: str = "epoch", + save_total_limit: int = 1, + fp16: bool = True, + bf16: bool = False, + seed: int = 42, + negative_ratio: int = 5, + bidirectional_templates: bool = True, + context_json_path: Optional[str] = None, + ontology_name: str = "Geonames" + ) -> None: + super().__init__() + + self.model_name = model_name + self.safe_model_name = model_name.replace("/", "__") + + resolved_output = output_dir.format(model_name=self.safe_model_name) + self.output_dir = str(Path(resolved_output)) + Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + self.min_predictions = int(min_predictions) + self.max_length = int(max_length) + self.per_device_train_batch_size = int(per_device_train_batch_size) + self.gradient_accumulation_steps = int(gradient_accumulation_steps) + self.num_train_epochs = float(num_train_epochs) + self.learning_rate = float(learning_rate) + self.weight_decay = float(weight_decay) + self.logging_steps = int(logging_steps) + self.save_strategy = str(save_strategy) + self.save_total_limit = int(save_total_limit) + self.fp16 = bool(fp16) + self.bf16 = bool(bf16) + self.seed = int(seed) + + self.negative_ratio = int(negative_ratio) + self.bidirectional_templates = bool(bidirectional_templates) + self.context_json_path = context_json_path + + self.ontology_name = ontology_name + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model: Optional[AutoModelForSequenceClassification] = None + self.tokenizer: Optional[AutoTokenizer] = None + + os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + os.environ.setdefault("WANDB_DISABLED", "true") + os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") + + self._context_exact: Dict[str, str] = {} # lower(term) -> info + self._context_rows: List[Dict[str, str]] = [] # [{'term': str, 'term_info': str}, ...] 
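+    # Illustrative model input (hypothetical terms, for orientation only):
+    #   "Region is the superclass / parent / supertype / ancestor class of City
+    #    ## Context. 'Region': <generated description> 'City': <generated description>"
+    # The statement half is built by _format_input(); the context half is looked up in the
+    # preprocessed {ontology_name}_processed.json via _lookup_context_info().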
+ + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + return self._predict_pairs(data) if test else self._train_from_pairs(data) + + def _train_from_pairs(self, train_data: Any) -> None: + # Always (re)build context from ontology unless an explicit file is provided + if not self.context_json_path: + context_dir = Path(self.output_dir) / "context" + context_dir.mkdir(parents=True, exist_ok=True) + processed_context_file = context_dir / f"{self.ontology_name}_processed.json" + + # Remove stale file then regenerate + if processed_context_file.exists(): + try: + processed_context_file.unlink() + except Exception: + pass + + self.preprocess_context_from_ontology( + ontology=train_data, + processed_dir=context_dir, + dataset_name=self.ontology_name, + num_workers=max(1, min(os.cpu_count() or 2, 4)), + provider=partial(self._default_gpt_inference_with_dataset, dataset_name=self.ontology_name), + max_retries=5, + ) + + self.context_json_path = str(processed_context_file) + + # Reproducibility + set_seed(self.seed) + random.seed(self.seed) + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + + # Build labeled pairs from ontology; context comes from preprocessed map + positive_pairs = self._extract_positive_pairs(train_data) + if not positive_pairs: + raise ValueError("No positive (parent, child) pairs found in train_data.") + + entity_names = sorted({parent for parent, _ in positive_pairs} | {child for _, child in positive_pairs}) + negative_pairs = self._generate_negatives( + positives=positive_pairs, + entities=entity_names, + ratio=self.negative_ratio, + ) + + labels, texts = self._build_text_dataset(positive_pairs, negative_pairs) + + + datasets = DatasetDict({"train": Dataset.from_dict({"label": labels, "text": texts})}) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = ( + getattr(self.tokenizer, "eos_token", None) + or getattr(self.tokenizer, "sep_token", None) + or getattr(self.tokenizer, "cls_token", None) + ) + + def tokenize_batch(batch: Dict[str, List[str]]): + return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + + tokenized = datasets.map(tokenize_batch, batched=True, remove_columns=["text"]) + collator = DataCollatorWithPadding(self.tokenizer) + + self.model = AutoModelForSequenceClassification.from_pretrained( + self.model_name, + num_labels=2, + id2label={0: "incorrect", 1: "correct"}, + label2id={"incorrect": 0, "correct": 1}, + ) + if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + train_args = TrainingArguments( + output_dir=self.output_dir, + learning_rate=self.learning_rate, + per_device_train_batch_size=self.per_device_train_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + num_train_epochs=self.num_train_epochs, + weight_decay=self.weight_decay, + save_strategy=self.save_strategy, + save_total_limit=self.save_total_limit, + logging_steps=self.logging_steps, + dataloader_pin_memory = bool(torch.cuda.is_available()), + fp16=self.fp16, + bf16=self.bf16, + report_to="none", + save_safetensors=True, + ) + + trainer = Trainer( + model=self.model, + args=train_args, + train_dataset=tokenized["train"], + tokenizer=self.tokenizer, + data_collator=collator, + ) + trainer.train() + trainer.save_model(self.output_dir) + 
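+        # Persist the tokenizer next to the fine-tuned weights so that
+        # _ensure_loaded_for_inference() can reload both from self.output_dir.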
self.tokenizer.save_pretrained(self.output_dir) + + def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: + import torch.nn.functional as F + + self._ensure_loaded_for_inference() + + candidate_pairs = self._extract_pairs_for_eval(eval_data) + if not candidate_pairs: + return [] + + accepted: List[Dict[str, str]] = [] + scored_candidates: List[Tuple[float, str, str, int]] = [] + + self.model.eval() + with torch.no_grad(): + for parent_term, child_term in candidate_pairs: + input_text = self._format_input(parent_term, child_term) + inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, max_length=self.max_length) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + logits = self.model(**inputs).logits + probs = F.softmax(logits, dim=-1).squeeze(0) + p_positive = float(probs[1].item()) + predicted_label = int(torch.argmax(logits, dim=-1).item()) + scored_candidates.append((p_positive, parent_term, child_term, predicted_label)) + if predicted_label == 1: + accepted.append({"parent": parent_term, "child": child_term}) + + if accepted: + return accepted + + top_k = max(0, int(self.min_predictions)) + if top_k == 0: + return [] + scored_candidates.sort(key=lambda item: item[0], reverse=True) + return [{"parent": parent_term, "child": child_term} + for (_prob, parent_term, child_term, _pred) in scored_candidates[:top_k]] + + def _ensure_loaded_for_inference(self) -> None: + if self.model is not None and self.tokenizer is not None: + return + self.model = AutoModelForSequenceClassification.from_pretrained(self.output_dir).to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir) + if self.tokenizer.pad_token_id is None and getattr(self.model.config, "pad_token_id", None) is not None: + self.tokenizer.pad_token_id = self.model.config.pad_token_id + + def _load_context_map(self) -> None: + """Build exact and fuzzy maps from {ontology_name}_processed.json.""" + if not (self.context_json_path): + self._context_exact = {} + self._context_rows = [] + return + try: + rows = json.load(open(self.context_json_path, "r", encoding="utf-8")) + self._context_exact = { + str(row.get("term", "")).strip().lower(): str(row.get("term_info", "")).strip() + for row in rows + } + self._context_rows = [ + {"term": str(row.get("term", "")), "term_info": str(row.get("term_info", ""))} + for row in rows + ] + except Exception: + self._context_exact = {} + self._context_rows = [] + + def _lookup_context_info(self, raw_term: str) -> str: + """ + Loose context lookup: split by commas, strip whitespace, case-insensitive + substring match against any row['term']. Join hits with '.'. + """ + if not raw_term: + return "" + term_key = raw_term.strip().lower() + if term_key in self._context_exact: + return self._context_exact[term_key] + + subterms = [re.sub(r"\s+", "", piece) for piece in raw_term.split(",")] + matched_infos: List[str] = [] + for subterm in subterms: + if not subterm: + continue + lower_subterm = subterm.lower() + for row in self._context_rows: + if lower_subterm in row["term"].lower(): + info = row.get("term_info", "") + if info: + matched_infos.append(info) + break # one hit per subterm + return ".".join(matched_infos) + + def _extract_positive_pairs(self, ontology_obj: Any) -> List[Tuple[str, str]]: + """ + Read pairs from ontology_obj.type_taxonomies.taxonomies (or fallback to .taxonomies). + Each item must provide 'parent' and 'child' attributes/keys. 
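+        Illustrative item (hypothetical values): {'parent': 'Place', 'child': 'City'},
+        or an object exposing equivalent attributes.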
+ """ + type_taxonomies = getattr(ontology_obj, "type_taxonomies", None) + items = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology_obj, "taxonomies", None) + pairs: List[Tuple[str, str]] = [] + if items: + for item in items: + parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") + child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + if parent_term and child_term: + pairs.append((str(parent_term), str(child_term))) + return pairs + + def _extract_pairs_for_eval(self, ontology_obj: Any) -> List[Tuple[str, str]]: + candidate_pairs = getattr(ontology_obj, "pairs", None) + if candidate_pairs: + pairs: List[Tuple[str, str]] = [] + for item in candidate_pairs: + parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") + child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + if parent_term and child_term: + pairs.append((str(parent_term), str(child_term))) + return pairs + return self._extract_positive_pairs(ontology_obj) + + def _generate_negatives( + self, + positives: List[Tuple[str, str]], + entities: List[str], + ratio: int, + ) -> List[Tuple[str, str]]: + positive_set = set(positives) + all_possible = {(parent_term, child_term) for parent_term in entities for child_term in entities if parent_term != child_term} + negative_candidates = list(all_possible - positive_set) + + target_count = max(len(positive_set) * max(1, ratio), len(positive_set)) + sample_count = min(target_count, len(negative_candidates)) + return random.sample(negative_candidates, k=sample_count) if sample_count > 0 else [] + + def _build_text_dataset( + self, + positives: List[Tuple[str, str]], + negatives: List[Tuple[str, str]], + ) -> Tuple[List[int], List[str]]: + self._load_context_map() + + labels: List[int] = [] + input_texts: List[str] = [] + + def add_example(parent_term: str, child_term: str, label_value: int) -> None: + input_texts.append(self._format_input(parent_term, child_term)) + labels.append(label_value) + if self.bidirectional_templates: + input_texts.append(self._format_input(child_term, parent_term, reverse=True)) + labels.append(label_value) + + for parent_term, child_term in positives: + add_example(parent_term, child_term, 1) + for parent_term, child_term in negatives: + add_example(parent_term, child_term, 0) + + return labels, input_texts + + def _format_input(self, parent_term: str, child_term: str, reverse: bool = False) -> str: + relation_text = ( + f"{child_term} is a subclass / child / subtype / descendant class of {parent_term}" + if reverse + else f"{parent_term} is the superclass / parent / supertype / ancestor class of {child_term}" + ) + + parent_info = self._lookup_context_info(parent_term) + child_info = self._lookup_context_info(child_term) + if not parent_info and not child_info: + return relation_text + + context_text = f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}" + return f"{relation_text} {context_text}" + + @staticmethod + def _is_windows() -> bool: + return (os.name == "nt") or (platform.system().lower() == "windows") + + @staticmethod + def _default_gpt_inference_with_dataset(term: str, dataset_name: str) -> str: + """ + Generate a plain-text description for `term`, tailored by `dataset_name`. + Uses g4f if available; otherwise returns an empty string. 
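+        Empty or very short results (< 50 characters after cleaning) are retried later
+        by _re_infer_short_entries().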
+ """ + prompt = ( + f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, " + "Provide as detailed a definition of this term as possible in plain text.without any markdown format." + "No reference link in result. " + "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n" + "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n" + "Output: Plain text paragraphs only, neutral and factual." + f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result." + ) + + try: + client = _G4FClient() + response = client.chat.completions.create( + model=g4f.models.default, + messages=[{"role": "user", "content": prompt}], + ) + raw_text = response.choices[0].message.content if response and response.choices else "" + except Exception: + raw_text = "" # or some deterministic fallback + + # Clean up + cleaned = re.sub(r"[\*\-\#]", "", raw_text) + cleaned = re.sub(r"\n\s*\n", " ", cleaned) + cleaned = cleaned.replace("\n", " ") + cleaned = re.sub(r"\s{2,}", " ", cleaned) + cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", cleaned) + sentences = [sentence for sentence in cleaned.split(".") if "?" not in sentence] + return ".".join(sentences).strip() + + @staticmethod + def _clean_term_info(raw_text: str) -> str: + """Normalize whitespace and remove link artifacts.""" + cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", str(raw_text)) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned + + @classmethod + def _merge_part_files(cls, dataset_name: str, merged_path: Path, part_paths: List[Path]) -> None: + merged_rows: List[dict] = [] + for part_path in part_paths: + try: + if not part_path.is_file(): + continue + part_content = json.load(open(part_path, "r", encoding="utf-8")) + if isinstance(part_content, list): + merged_rows.extend(part_content) + elif isinstance(part_content, dict): + merged_rows.append(part_content) + except Exception: + continue + + removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + for row in merged_rows: + term_info_raw = str(row.get("term_info", "")) + kept_sentences: List[str] = [] + for sentence in term_info_raw.split("."): + sentence_no_links = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence) + if any(marker in sentence_no_links for marker in removal_markers): + continue + kept_sentences.append(sentence_no_links) + row["term_info"] = cls._clean_term_info(".".join(kept_sentences)) + + merged_path.parent.mkdir(parents=True, exist_ok=True) + json.dump(merged_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + + # best-effort cleanup + for part_path in part_paths: + try: + os.remove(part_path) + except Exception: + pass + + @staticmethod + def _fill_bucket_threaded(bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str]) -> None: + start_index = 0 + try: + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + start_index = len(existing_rows) + except Exception: + pass + + for row_index in range(start_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) + except Exception: + bucket_rows[row_index]["term_info"] = "" + if row_index % 10 == 1: + 
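+                # Checkpoint roughly every 10 rows so an interrupted worker can resume
+                # from the partially written shard (see the resume block above).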
json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + + json.dump(bucket_rows, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + + @staticmethod + def _fill_bucket_process( + worker_id: int, + bucket_rows: List[dict], + output_path: Path, + provider: Callable[[str], str], + progress_map: "multiprocessing.managers.DictProxy", + ) -> None: + current_index = 0 + try: + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + current_index = len(existing_rows) + except Exception: + pass + + progress_map[worker_id] = current_index + + for row_index in range(current_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) + except Exception: + bucket_rows[row_index]["term_info"] = "" + progress_map[worker_id] = row_index + 1 + if row_index % 10 == 1: + json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + + json.dump(bucket_rows, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + progress_map[worker_id] = len(bucket_rows) + + @classmethod + def _execute_for_terms( + cls, + terms: List[str], + merged_path: Path, + part_paths: List[Path], + provider: Callable[[str], str], + dataset_name: str, + num_workers: int = 2, + ) -> None: + """ + Generate context for `terms`, writing shards to `part_paths`, then merge. + Threads on Windows; processes on POSIX. + """ + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + all_rows = [{"id": row_index, "term": term, "term_info": ""} for row_index, term in enumerate(terms)] + + buckets: List[List[dict]] = [[] for _ in range(worker_count)] + for reversed_index, row in enumerate(reversed(all_rows)): + buckets[reversed_index % worker_count].append(row) + + if cls._is_windows(): + total_rows = len(terms) + progress_bar = tqdm(total=total_rows, desc=f"{dataset_name} generation (threads)") + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + cls._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [pool.submit(run_bucket, buckets[bucket_index], part_paths[bucket_index]) + for bucket_index in range(worker_count)] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + else: + manager = multiprocessing.Manager() + progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) + + processes: List[multiprocessing.Process] = [] + for worker_index, bucket_rows in enumerate(buckets): + process = multiprocessing.Process( + target=cls._fill_bucket_process, + args=(worker_index, bucket_rows, part_paths[worker_index], provider, progress_map), + ) + processes.append(process) + process.start() + + total_rows = len(terms) + with tqdm(total=total_rows, desc=f"{dataset_name} generation") as progress_bar: + previous_total = 0 + while any(process.is_alive() for process in processes): + current_total = int(sum(progress_map.values())) + progress_bar.update(current_total - previous_total) + previous_total = current_total + time.sleep(0.5) + current_total = int(sum(progress_map.values())) + if current_total > previous_total: + 
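+                    # Flush any progress reported between the final poll and worker exit.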
progress_bar.update(current_total - previous_total) + + for process in processes: + process.join() + + cls._merge_part_files(dataset_name, merged_path, part_paths) + + @classmethod + def _re_infer_short_entries( + cls, + merged_path: Path, + re_part_paths: List[Path], + re_merged_path: Path, + provider: Callable[[str], str], + dataset_name: str, + num_workers: int, + ) -> int: + """ + Re-query terms with too-short term_info (< 50 chars). Returns remaining count. + """ + merged_rows = json.load(open(merged_path, "r", encoding="utf-8")) + + removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + short_rows: List[dict] = [] + long_rows: List[dict] = [] + + for row in merged_rows: + term_info_raw = str(row.get("term_info", "")) + sentences = term_info_raw.split(".") + for marker in removal_markers: + sentences = [sentence if marker not in sentence else "" for sentence in sentences] + filtered_info = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", ".".join(sentences)) + row["term_info"] = filtered_info + (short_rows if len(filtered_info) < 50 else long_rows).append(row) + + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + buckets: List[List[dict]] = [[] for _ in range(worker_count)] + for row_index, row in enumerate(short_rows): + buckets[row_index % worker_count].append(row) + + # clean old re-inference shards + for path in re_part_paths: + try: + os.remove(path) + except Exception: + pass + + total_candidates = len(short_rows) + if cls._is_windows(): + progress_bar = tqdm(total=total_candidates, desc=f"{dataset_name} re-inference (threads)") + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + cls._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [pool.submit(run_bucket, buckets[bucket_index], re_part_paths[bucket_index]) + for bucket_index in range(worker_count)] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + else: + manager = multiprocessing.Manager() + progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) + + processes: List[multiprocessing.Process] = [] + for worker_index, bucket_rows in enumerate(buckets): + process = multiprocessing.Process( + target=cls._fill_bucket_process, + args=(worker_index, bucket_rows, re_part_paths[worker_index], provider, progress_map), + ) + processes.append(process) + process.start() + + with tqdm(total=total_candidates, desc=f"{dataset_name} re-inference") as progress_bar: + previous_total = 0 + while any(process.is_alive() for process in processes): + current_total = int(sum(progress_map.values())) + progress_bar.update(current_total - previous_total) + previous_total = current_total + time.sleep(1) + if progress_bar.n < total_candidates: + progress_bar.update(total_candidates - progress_bar.n) + + for process in processes: + process.join() + + # merge and write back + cls._merge_part_files(dataset_name, re_merged_path, re_part_paths) + new_rows = json.load(open(re_merged_path, "r", encoding="utf-8")) if re_merged_path.is_file() else [] + final_rows = long_rows + new_rows + json.dump(final_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + + remaining_short = sum(1 for row in final_rows if len(str(row.get("term_info", ""))) < 50) + return remaining_short + + @staticmethod + def _extract_terms_from_ontology(ontology: Any) -> 
List[str]: + """ + Collect unique term names from ontology.type_taxonomies.taxonomies. + """ + type_taxonomies = getattr(ontology, "type_taxonomies", None) + taxonomies = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology, "taxonomies", None) + unique_terms: set[str] = set() + if taxonomies: + for row in taxonomies: + parent_term = getattr(row, "parent", None) if not isinstance(row, dict) else row.get("parent") + child_term = getattr(row, "child", None) if not isinstance(row, dict) else row.get("child") + if parent_term: + unique_terms.add(str(parent_term)) + if child_term: + unique_terms.add(str(child_term)) + return sorted(unique_terms) + + def preprocess_context_from_ontology( + self, + ontology: Any, + processed_dir: str | Path, + dataset_name: str = "GeoNames", + num_workers: int = 2, + provider: Optional[Callable[[str], str]] = None, + max_retries: int = 5, + ) -> Path: + """ + Build {id, term, term_info} from an ontology object. + Always regenerates {dataset_name}_processed.json. + """ + provider = provider or provider or partial(self._default_gpt_inference_with_dataset, dataset_name=dataset_name) + + processed_dir = Path(processed_dir) + processed_dir.mkdir(parents=True, exist_ok=True) + + merged_path = processed_dir / f"{dataset_name}_processed.json" + if merged_path.exists(): + try: + merged_path.unlink() + except Exception: + pass + + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + shard_paths = [processed_dir / f"{dataset_name}_type_part{shard_index}.json" for shard_index in range(worker_count)] + reinf_paths = [processed_dir / f"{dataset_name}_re_inference{shard_index}.json" for shard_index in range(worker_count)] + reinf_merged_path = processed_dir / f"{dataset_name}_Types_re_inference.json" + + # remove any leftover shards + for path in shard_paths + reinf_paths + [reinf_merged_path]: + try: + if path.exists(): + path.unlink() + except Exception: + pass + + unique_terms = self._extract_terms_from_ontology(ontology) + print(f"[Preprocess] Unique terms from ontology: {len(unique_terms)}") + + self._execute_for_terms( + terms=unique_terms, + merged_path=merged_path, + part_paths=shard_paths, + provider=provider, + dataset_name=dataset_name, + num_workers=worker_count, + ) + + retry_round = 0 + while retry_round < max_retries: + remaining_count = self._re_infer_short_entries( + merged_path=merged_path, + re_part_paths=reinf_paths, + re_merged_path=reinf_merged_path, + provider=provider, + dataset_name=dataset_name, + num_workers=worker_count, + ) + print(f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}") + retry_round += 1 + if remaining_count == 0: + break + + print(f"[Preprocess] Done. Merged context at: {merged_path}") + self.context_json_path = str(merged_path) + return merged_path diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py new file mode 100644 index 0000000..ab5b4f8 --- /dev/null +++ b/ontolearner/learner/term_typing/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .rwthdbis import RWTHDBISSFTLearner diff --git a/ontolearner/learner/term_typing/rwthdbis.py b/ontolearner/learner/term_typing/rwthdbis.py new file mode 100644 index 0000000..f27fd56 --- /dev/null +++ b/ontolearner/learner/term_typing/rwthdbis.py @@ -0,0 +1,255 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +from typing import Any, Dict, List, Optional, Tuple + +import torch +from datasets import Dataset, DatasetDict +from tqdm.auto import tqdm +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + set_seed, +) +from transformers import DebertaV2Tokenizer + +from ...base import AutoLearner + +class RWTHDBISSFTLearner(AutoLearner): + """ + Supervised term-typing + + Training expands multi-label examples into multiple single-label rows. + Inference returns: [{"term": "", "types": [""]}, ...] + """ + + def __init__( + self, + model_name: str = "microsoft/deberta-v3-small", + trained_model_path: Optional[str] = None, + output_dir: Optional[str] = None, + max_length: int = 64, + per_device_train_batch_size: int = 16, + gradient_accumulation_steps: int = 2, + num_train_epochs: int = 3, + learning_rate: float = 2e-5, + weight_decay: float = 0.01, + logging_steps: int = 50, + save_strategy: str = "epoch", + save_total_limit: int = 1, + fp16: bool = False, + bf16: bool = False, + seed: int = 42 + ) -> None: + super().__init__() + self.model_name = model_name + self.trained_model_path = trained_model_path + self.output_dir = output_dir or "./term_typing" + os.makedirs(self.output_dir, exist_ok=True) + + self.max_length = max_length + self.per_device_train_batch_size = per_device_train_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.num_train_epochs = num_train_epochs + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.logging_steps = logging_steps + self.save_strategy = save_strategy + self.save_total_limit = save_total_limit + self.fp16 = fp16 + self.bf16 = bf16 + self.seed = seed + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model: Optional[AutoModelForSequenceClassification] = None + self.tokenizer: Optional[AutoTokenizer] = None + self.id2label: Dict[int, str] = {} + self.label2id: Dict[str, int] = {} + + def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: + """ + train: expects ontology-like object with .term_typings + test: returns List[{"term": str, "types": [str]}] (for evaluator) + """ + if not test: + return self._train_from_term_typings(train_data=data) + + terms = self._collect_eval_terms(data) + return self._predict_structured_output(terms) + + def _load_robust_tokenizer(self, backbone: str) -> AutoTokenizer: + try: + return AutoTokenizer.from_pretrained(backbone, use_fast=True) + except Exception as fast_err: + print(f"[tokenizer] Fast 
tokenizer failed: {fast_err}. Trying DebertaV2Tokenizer (slow)...") + + try: + return DebertaV2Tokenizer.from_pretrained(backbone) + except Exception as slow_err: + print(f"[tokenizer] DebertaV2Tokenizer failed: {slow_err}. Trying AutoTokenizer(use_fast=False)...") + + try: + return AutoTokenizer.from_pretrained(backbone, use_fast=False) + except Exception as final_err: + raise RuntimeError( + "Failed to load a tokenizer for this DeBERTa model.\n" + "Try:\n" + " - pip install --upgrade sentencepiece\n" + " - ensure network access for model files\n" + " - clear your HF cache and retry\n" + " - pin versions: transformers==4.43.*, tokenizers<0.20\n" + f"Original error: {final_err}" + ) + + def _expand_multilabel_training_rows( + self, term_typings: List[Any] + ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]: + """ + From multi-label instances -> (texts, label_ids), and label maps. + """ + label_strings: List[str] = [] + for instance in term_typings: + label_strings.extend([str(label) for label in instance.types]) + + unique_labels = sorted(set(label_strings)) + id2label = {i: label for i, label in enumerate(unique_labels)} + label2id = {label: i for i, label in enumerate(unique_labels)} + + texts: List[str] = [] + label_ids: List[int] = [] + for instance in term_typings: + term_text = str(instance.term) + for label in instance.types: + texts.append(term_text) + label_ids.append(label2id[str(label)]) + + return texts, label_ids, id2label, label2id + + def _collect_eval_terms(self, eval_data: Any) -> List[str]: + """ + Accepts List[str] OR object with .term_typings; returns list of term texts. + """ + if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): + terms = eval_data + else: + term_typings = getattr(eval_data, "term_typings", None) + if term_typings is None: + raise ValueError("Provide a List[str] OR an object with .term_typings for test=True.") + terms = [str(instance.term) for instance in term_typings] + return terms + + def _train_from_term_typings(self, train_data: Any) -> None: + set_seed(self.seed) + random.seed(self.seed) + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + + term_typings: List[Any] = getattr(train_data, "term_typings", None) + if term_typings is None: + raise ValueError("train_data must provide .term_typings for term-typing.") + + texts, label_ids, self.id2label, self.label2id = self._expand_multilabel_training_rows(term_typings) + + dataset = DatasetDict({"train": Dataset.from_dict({"labels": label_ids, "text": texts})}) + + backbone = self.trained_model_path or self.model_name + self.tokenizer = self._load_robust_tokenizer(backbone) + + def tokenize_batch(batch: Dict[str, List[str]]): + return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + + tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"]) + data_collator = DataCollatorWithPadding(self.tokenizer) + + self.model = AutoModelForSequenceClassification.from_pretrained( + backbone, + num_labels=len(self.id2label), + id2label=self.id2label, + label2id=self.label2id, + ) + + if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + training_args = TrainingArguments( + output_dir=self.output_dir, + learning_rate=self.learning_rate, + per_device_train_batch_size=self.per_device_train_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, 
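+            # Effective per-device batch size = per_device_train_batch_size *
+            # gradient_accumulation_steps (16 * 2 = 32 with the class defaults).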
+ num_train_epochs=self.num_train_epochs, + weight_decay=self.weight_decay, + save_strategy=self.save_strategy, + save_total_limit=self.save_total_limit, + logging_steps=self.logging_steps, + fp16=self.fp16, + bf16=self.bf16, + report_to=[], + ) + + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=tokenized["train"], + tokenizer=self.tokenizer, + data_collator=data_collator, + ) + + trainer.train() + trainer.save_model(self.output_dir) + self.tokenizer.save_pretrained(self.output_dir) + + def _ensure_loaded_for_inference(self) -> None: + if self.model is not None and self.tokenizer is not None: + return + model_path = self.trained_model_path or self.output_dir + self.model = AutoModelForSequenceClassification.from_pretrained(model_path) + self.tokenizer = self._load_robust_tokenizer(model_path) + + cfg = self.model.config + if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"): + self.id2label = dict(cfg.id2label) + self.label2id = dict(cfg.label2id) + + self.model.to(self.device).eval() + + def _predict_label_ids(self, terms: List[str]) -> List[int]: + self._ensure_loaded_for_inference() + predictions: List[int] = [] + for term_text in tqdm(terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"): + inputs = self.tokenizer(term_text, return_tensors="pt", truncation=True, max_length=self.max_length) + inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()} + with torch.no_grad(): + logits = self.model(**inputs).logits + predictions.append(int(torch.argmax(logits, dim=-1).item())) + return predictions + + def _predict_structured_output(self, terms: List[str]) -> List[Dict[str, List[str]]]: + """ + Convert predicted IDs into evaluator format: + [{"term": "", "types": [""]}, ...] + """ + label_ids = self._predict_label_ids(terms) + id2label_map = self.id2label or {} # fallback handled below + + results: List[Dict[str, List[str]]] = [] + for term_text, label_id in zip(terms, label_ids): + label_str = id2label_map.get(int(label_id), str(int(label_id))) + results.append({"term": term_text, "types": [label_str]}) + return results diff --git a/requirements.txt b/requirements.txt index 3ce19f7..6d71bd5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,6 @@ sentence-transformers~=5.1.0 scikit-learn~=1.6.1 bitsandbytes~=0.45.1 mistral-common[sentencepiece]~=1.8.5 +g4f +protobuf<5 +accelerate>=0.26.0 From 16457094ce35731b67f55b7f1bc27b5621242b20 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 29 Oct 2025 15:34:23 +0100 Subject: [PATCH 02/19] added skhnlp learner models --- ..._learner_skhnlp_sft_taxonomoy_discovery.py | 64 ++ ...m_learner_skhnlp_zs_taxonomoy_discovery.py | 50 ++ ontolearner/__init__.py | 6 +- ontolearner/learner/__init__.py | 1 + .../learner/taxonomy_discovery/__init__.py | 1 + .../learner/taxonomy_discovery/skhnlp.py | 761 ++++++++++++++++++ requirements.txt | 1 + 7 files changed, 883 insertions(+), 1 deletion(-) create mode 100644 examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py create mode 100644 examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py create mode 100644 ontolearner/learner/taxonomy_discovery/skhnlp.py diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py new file mode 100644 index 0000000..3661a5b --- /dev/null +++ b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py @@ -0,0 +1,64 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, 
LearnerPipeline +from ontolearner import SKHNLPSequentialFTLearner + +# Load ontology and split +# Load the GeoNames ontology for taxonomy discovery. +# GeoNames provides geographic parent-child relationships (is-a hierarchy). +ontology = GeoNames() +ontology.load() +data = ontology.extract() + +# Split the taxonomic relationships into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42 +) + +# Configure the learner with user-defined training args + device +# Configure the supervised BERT SFT Learner for taxonomy discovery. +# This fine-tunes BERT-Large using Sequential Prompts on (Parent, Child) pairs. +bert_learner = SKHNLPSequentialFTLearner( + model_name="bert-large-uncased", + n_prompts=2, + random_state=1403, + device="cpu", # Note: CPU training for BERT-Large is very slow. + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + warmup_steps=500, + weight_decay=0.01, + logging_dir="./logs/", + logging_steps=50, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) + +# Build pipeline and run +# Build the pipeline, passing the BERT Learner. +pipeline = LearnerPipeline( + llm=bert_learner, + llm_id="bert-large-uncased", + ontologizer_data=False, +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py new file mode 100644 index 0000000..90391f5 --- /dev/null +++ b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline, SKHNLPZSLearner + +#Load ontology and split data +# The GeoNames ontology provides geographic term types and relationships. +ontology = GeoNames() +ontology.load() +train_data, test_data = train_test_split( + ontology.extract(), + test_size=0.2, + random_state=42, +) + +# Configure the learner with user-defined generation and normalization settings +# Configure the Zero-Shot Qwen Learner for taxonomy discovery. +# This model uses a fixed prompt and string normalization (Levenshtein) to classify terms. 
+llm_learner = SKHNLPZSLearner( + model_name="Qwen/Qwen2.5-0.5B-Instruct", + device="cpu", # use "cuda" if you have a GPU + max_new_tokens=16, + save_path="./outputs/", # directory or full file path for CSV + verbose=True, + normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" +) + +# Build pipeline and run +pipe = LearnerPipeline( + llm=llm_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=False, + device="cpu", +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipe( + train_data=train_data, # zero-shot; ignored by the LLM learner + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 0b6fd26..d9ba608 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -31,7 +31,9 @@ StandardizedPrompting, LabelMapper, RWTHDBISTaxonomyLearner, - RWTHDBISTermTypingLearner) + RWTHDBISTermTypingLearner + SKHNLPZSLearner, + SKHNLPSequentialFTLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -51,6 +53,8 @@ "Processor", "RWTHDBISTaxonomyLearner", "RWTHDBISTermTypingLearner", + "SKHNLPZSLearner", + "SKHNLPSequentialFTLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index ad38f0b..3c56154 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -19,3 +19,4 @@ from .label_mapper import LabelMapper from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner +from .taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py index ab5b4f8..2c6b452 100644 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .rwthdbis import RWTHDBISSFTLearner +from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner diff --git a/ontolearner/learner/taxonomy_discovery/skhnlp.py b/ontolearner/learner/taxonomy_discovery/skhnlp.py new file mode 100644 index 0000000..fbe53b4 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/skhnlp.py @@ -0,0 +1,761 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
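+# SKH-NLP taxonomy-discovery learners: SKHNLPSequentialFTLearner fine-tunes BERT with
+# sequential taxonomy prompts, while SKHNLPZSLearner prompts a causal LM zero-shot
+# (Qwen in the examples) and normalizes its free-text output against candidate labels.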
+ +import os +import re +import random + +import pandas as pd +import torch +import Levenshtein +from datasets import Dataset +from typing import Any, Optional, List, Tuple, Dict +from transformers import ( + AutoTokenizer, + AutoModelForCausalLM, + BertTokenizer, + BertForSequenceClassification, + pipeline, + Trainer, + TrainingArguments, +) + +from ...base import AutoLearner, AutoPrompt +from ...utils import taxonomy_split, train_test_split as ontology_split +from ...data_structure import OntologyData, TaxonomicRelation + +class SKHNLPTaxonomyPrompts(AutoPrompt): + """Builds the 7 taxonomy prompts used during fine-tuning / inference.""" + def __init__(self) -> None: + super().__init__(prompt_template="{parent} is the superclass of {child}. This statement is [MASK].") + self.templates: List[str] = [ + "{parent} is the superclass of {child}. This statement is [MASK].", + "{child} is a subclass of {parent}. This statement is [MASK].", + "{parent} is the parent class of {child}. This statement is [MASK].", + "{child} is a child class of {parent}. This statement is [MASK].", + "{parent} is a supertype of {child}. This statement is [MASK].", + "{child} is a subtype of {parent}. This statement is [MASK].", + "{parent} is an ancestor class of {child}. This statement is [MASK].", + ] + + def make(self, parent: str, child: str, template_idx: int) -> str: + return self.templates[template_idx].format(parent=parent, child=child) + + +class SKHNLPSequentialFTLearner(AutoLearner): + """ + BERT-based classifier for taxonomy discovery. + + With OntologyData: + * TRAIN: ontology-aware split; create balanced train/eval with negatives. + * PREDICT/TEST: notebook-style parent selection -> list[{'parent', 'child'}]. + + With DataFrame/list: + * TRAIN: taxonomy_split + negatives; build prompts and fine-tune. + * PREDICT/TEST: pairwise binary classification (returns label + score). + """ + + # Fixed constants defining data split size and the proportional mix of + # negative sample types (reversed vs. manipulated) for balancing. + _EVAL_FRACTION: float = 0.16 + _NEG_RATIO_REVERSED: float = 1/3 + _NEG_RATIO_MANIPULATED: float = 2/3 + + def __init__( + self, + # core + model_name: str = "bert-large-uncased", + n_prompts: int = 7, + random_state: int = 1403, + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + + # ---- expose TrainingArguments as individual user-defined args ---- + output_dir: str = "./results/", + num_train_epochs: int = 1, + per_device_train_batch_size: int = 4, + per_device_eval_batch_size: int = 4, + warmup_steps: int = 500, + weight_decay: float = 0.01, + logging_dir: str = "./logs/", + logging_steps: int = 50, + eval_strategy: str = "epoch", + save_strategy: str = "epoch", + load_best_model_at_end: bool = True, + ) -> None: + super().__init__() + self.model_name = model_name + self.n_prompts = n_prompts + self.random_state = random_state + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + self.tokenizer: Optional[BertTokenizer] = None + self.model: Optional[BertForSequenceClassification] = None + self.prompter = SKHNLPTaxonomyPrompts() + + # Candidate parents (unique parent list) for multi-class parent selection. 
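+        # Populated from the parents observed in _taxonomy_discovery() calls; extended
+        # (never reset) across subsequent test calls.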
+ self._candidate_parents: Optional[List[str]] = None + + # Keep last train/eval tables for inspection + self._last_train: Optional[pd.DataFrame] = None + self._last_eval: Optional[pd.DataFrame] = None + + random.seed(self.random_state) + + # Build TrainingArguments from the individual user-defined values + self.training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_train_epochs, + per_device_train_batch_size=per_device_train_batch_size, + per_device_eval_batch_size=per_device_eval_batch_size, + warmup_steps=warmup_steps, + weight_decay=weight_decay, + logging_dir=logging_dir, + logging_steps=logging_steps, + eval_strategy=eval_strategy, + save_strategy=save_strategy, + load_best_model_at_end=load_best_model_at_end, + ) + + def load(self, model_id: Optional[str] = None, **_: Any) -> None: + """Load tokenizer and model; move model to the requested device.""" + model_id = model_id or self.model_name + self.tokenizer = BertTokenizer.from_pretrained(model_id) + self.model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2) + self.model.config.problem_type = "single_label_classification" + + # place on device chosen by user (or auto) + target_device = self.device + if target_device not in {"cuda", "cpu"}: + target_device = "cuda" if torch.cuda.is_available() else "cpu" + self.model.to(target_device) + + def tasks_ground_truth_former(self, data: Any, task: str) -> Any: + if task != "taxonomy-discovery": + return super().tasks_ground_truth_former(data, task) + + if isinstance(data, pd.DataFrame): + if "label" in data.columns: + return [ + {"parent": p, "child": c, "label": bool(lbl)} + for p, c, lbl in zip(data["parent"], data["child"], data["label"]) + ] + return [{"parent": p, "child": c} for p, c in zip(data["parent"], data["child"])] + + if isinstance(data, list): + return data + + return super().tasks_ground_truth_former(data, task) + + def _make_negatives(self, positives_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Return (reversed_df, manipulated_df).""" + unique_parents = positives_df["parent"].unique().tolist() + + def as_reversed(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + out[["parent", "child"]] = out[["child", "parent"]].values + out["label"] = False + return out + + def with_random_parent(df: pd.DataFrame) -> pd.DataFrame: + def pick_other_parent(p: str) -> str: + pool = [x for x in unique_parents if x != p] + return random.choice(pool) if pool else p + out = df.copy() + out["parent"] = out["parent"].apply(pick_other_parent) + out["label"] = False + return out + + return as_reversed(positives_df), with_random_parent(positives_df) + + def _balance_with_negatives( + self, + positives_df: pd.DataFrame, + reversed_df: pd.DataFrame, + manipulated_df: pd.DataFrame, + ) -> pd.DataFrame: + """Combine positives and negatives with the same ratios as before.""" + n_pos = len(positives_df) + n_rev = int(n_pos * self._NEG_RATIO_REVERSED) + n_man = int(n_pos * self._NEG_RATIO_MANIPULATED) + + combined = pd.concat( + [ + positives_df.sample(n_pos, random_state=self.random_state), + reversed_df.sample(n_rev, random_state=self.random_state), + manipulated_df.sample(n_man, random_state=self.random_state), + ], + ignore_index=True, + ) + combined = combined.drop_duplicates(subset=["parent", "child", "label"]).reset_index(drop=True) + return combined + + def _add_prompt_columns(self, df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + for i in range(self.n_prompts): + out[f"prompt_{i+1}"] = out.apply( + lambda r, k=i: 
self.prompter.make(r["parent"], r["child"], k), axis=1
+            )
+        return out
+
+    @staticmethod
+    def _df_from_relations(relations: List[TaxonomicRelation], label: bool = True) -> pd.DataFrame:
+        if not relations:
+            return pd.DataFrame(columns=["parent", "child", "label"])
+        return pd.DataFrame([{"parent": r.parent, "child": r.child, "label": label} for r in relations])
+
+    @staticmethod
+    def _relations_from_df(df: pd.DataFrame) -> List[TaxonomicRelation]:
+        return [TaxonomicRelation(parent=p, child=c) for p, c in zip(df["parent"], df["child"])]
+
+    def _build_masked_prompt(self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]") -> str:
+        prompts_1based = [
+            f"{parent} is the superclass of {child}. This statement is {mask_token}.",
+            f"{child} is a subclass of {parent}. This statement is {mask_token}.",
+            f"{parent} is the parent class of {child}. This statement is {mask_token}.",
+            f"{child} is a child class of {parent}. This statement is {mask_token}.",
+            f"{parent} is a supertype of {child}. This statement is {mask_token}.",
+            f"{child} is a subtype of {parent}. This statement is {mask_token}.",
+            f"{parent} is an ancestor class of {child}. This statement is {mask_token}.",
+            f"{child} is a descendant class of {parent}. This statement is {mask_token}.",
+            f"\"{parent}\" is the superclass of \"{child}\". This statement is {mask_token}.",
+        ]
+        return prompts_1based[index_1_based - 1]
+
+    @torch.no_grad()
+    def _predict_prompt_true_false(self, sentence: str) -> bool:
+        enc = self.tokenizer(sentence, return_tensors="pt").to(self.model.device)
+        logits = self.model(**enc).logits
+        predicted_label = torch.argmax(logits, dim=1).item()
+        return predicted_label == 1
+
+    def _select_parent_via_prompts(self, child: str) -> str:
+        assert self._candidate_parents, "Candidate parents not initialized."
+        scores: dict[str, int] = {p: 0 for p in self._candidate_parents}
+
+        def prompt_indices_for_level(level: int) -> List[int]:
+            if level == 0:
+                return [1]
+            return [2 * level, 2 * level + 1]
+
+        def recurse(active_parents: List[str], level: int) -> str:
+            idxs = [i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts]
+            if idxs:
+                for parent in active_parents:
+                    votes = sum(
+                        1
+                        for idx in idxs
+                        if self._predict_prompt_true_false(
+                            self._build_masked_prompt(parent=parent, child=child, index_1_based=idx)
+                        )
+                    )
+                    scores[parent] += votes
+
+            max_score = max(scores[p] for p in active_parents)
+            tied = [p for p in active_parents if scores[p] == max_score]
+            if len(tied) == 1:
+                return tied[0]
+            if level < 4:
+                return recurse(tied, level + 1)
+            return random.choice(tied)
+
+        return recurse(list(scores.keys()), level=0)
+
+    def _taxonomy_discovery(self, data: Any, test: bool = False):
+        """
+        TRAIN:
+          - OntologyData -> ontology-aware split; negatives per split; balanced sets.
+          - DataFrame/list -> taxonomy_split for positives; negatives proportional.
+ TEST: + - OntologyData -> parent selection: [{'parent': predicted, 'child': child}] + - DataFrame/list -> binary pair classification with 'label' + 'score' + """ + is_ontology_object = isinstance(data, OntologyData) + + # Normalize input + if isinstance(data, pd.DataFrame): + pairs_df = data.copy() + elif isinstance(data, list): + pairs_df = pd.DataFrame(data) + else: + gt_pairs = super().tasks_ground_truth_former(data, "taxonomy-discovery") + pairs_df = pd.DataFrame(gt_pairs) + if "label" not in pairs_df.columns: + pairs_df["label"] = True + + # Maintain candidate parents across calls + if "parent" in pairs_df.columns: + parents_in_call = sorted(pd.unique(pairs_df["parent"]).tolist()) + if test: + if self._candidate_parents is None: + self._candidate_parents = parents_in_call + else: + self._candidate_parents = sorted(set(self._candidate_parents).union(parents_in_call)) + else: + if self._candidate_parents is None: + self._candidate_parents = parents_in_call + + if test: + if is_ontology_object and self._candidate_parents: + predictions: List[dict[str, str]] = [] + for _, row in pairs_df.iterrows(): + child_term = row["child"] + chosen_parent = self._select_parent_via_prompts(child_term) + predictions.append({"parent": chosen_parent, "child": child_term}) + return predictions + + # pairwise binary classification + prompts_df = self._add_prompt_columns(pairs_df.copy()) + true_probs_by_prompt: List[torch.Tensor] = [] + + for i in range(self.n_prompts): + col = f"prompt_{i+1}" + enc = self.tokenizer( + prompts_df[col].tolist(), + return_tensors="pt", + padding=True, + truncation=True, + ).to(self.model.device) + with torch.no_grad(): + logits = self.model(**enc).logits + true_probs_by_prompt.append(torch.softmax(logits, dim=1)[:, 1]) + + avg_true_prob = torch.stack(true_probs_by_prompt, dim=0).mean(0) + predicted_bool = (avg_true_prob >= 0.5).cpu().tolist() + + results: List[dict[str, Any]] = [] + for p, c, s, yhat in zip( + pairs_df["parent"], pairs_df["child"], avg_true_prob.tolist(), predicted_bool + ): + results.append({"parent": p, "child": c, "label": int(bool(yhat)), "score": float(s)}) + return results + + if isinstance(data, OntologyData): + train_onto, eval_onto = ontology_split( + data, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + ) + + train_pos_rel: List[TaxonomicRelation] = getattr(train_onto.type_taxonomies, "taxonomies", []) or [] + eval_pos_rel: List[TaxonomicRelation] = getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + + train_pos_df = self._df_from_relations(train_pos_rel, label=True) + eval_pos_df = self._df_from_relations(eval_pos_rel, label=True) + + tr_rev_df, tr_man_df = self._make_negatives(train_pos_df) + ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) + + train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df) + eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + else: + if "label" not in pairs_df.columns or pairs_df["label"].nunique() == 1: + positives_df = pairs_df[pairs_df.get("label", True)][["parent", "child"]].copy() + pos_rel = self._relations_from_df(positives_df) + + tr_rel, ev_rel = taxonomy_split( + pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + ) + train_pos_df = self._df_from_relations(tr_rel, label=True) + eval_pos_df = self._df_from_relations(ev_rel, label=True) + + tr_rev_df, tr_man_df = 
self._make_negatives(train_pos_df) + ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) + + train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df) + eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + else: + positives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy() + pos_rel = self._relations_from_df(positives_df) + + tr_rel, ev_rel = taxonomy_split( + pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + ) + train_pos_df = self._df_from_relations(tr_rel, label=True) + eval_pos_df = self._df_from_relations(ev_rel, label=True) + + negatives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy() + negatives_df = negatives_df.sample(frac=1.0, random_state=self.random_state).reset_index(drop=True) + + n_eval_neg = max(1, int(len(negatives_df) * self._EVAL_FRACTION)) if len(negatives_df) > 0 else 0 + eval_neg_df = negatives_df.iloc[:n_eval_neg].copy() if n_eval_neg > 0 else negatives_df.iloc[:0].copy() + train_neg_df = negatives_df.iloc[n_eval_neg:].copy() + + train_neg_df["label"] = False + eval_neg_df["label"] = False + + train_df = pd.concat([train_pos_df, train_neg_df], ignore_index=True) + eval_df = pd.concat([eval_pos_df, eval_neg_df], ignore_index=True) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + # Ensure labels are int64 + train_df["label"] = train_df["label"].astype("int64") + eval_df["label"] = eval_df["label"].astype("int64") + + # Sequential fine-tuning across prompts + for i in range(self.n_prompts): + prompt_col = f"prompt_{i+1}" + train_ds = Dataset.from_pandas(train_df[[prompt_col, "label"]].reset_index(drop=True)) + eval_ds = Dataset.from_pandas(eval_df[[prompt_col, "label"]].reset_index(drop=True)) + + train_ds = train_ds.rename_column("label", "labels") + eval_ds = eval_ds.rename_column("label", "labels") + + def tokenize_batch(batch): + return self.tokenizer(batch[prompt_col], padding="max_length", truncation=True) + + train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col]) + eval_ds = eval_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col]) + + train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) + eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) + + trainer = Trainer( + model=self.model, + args=self.training_args, + train_dataset=train_ds, + eval_dataset=eval_ds, + ) + trainer.train() + + self._last_train = train_df + self._last_eval = eval_df + return None + + +class SKHNLPZSLearner(AutoLearner): + """ + Zero-shot taxonomy learner using an instruction-tuned causal LLM. + + Behavior + -------- + - Builds a fixed classification prompt listing 9 GeoNames parent classes. + - For each input row (child term), generates a short completion and parses + the predicted class from a strict '#[ ... ]#' format. + - Optionally normalizes the raw prediction to one of the valid 9 labels via: + * "none" : keep the parsed text as-is + * "substring" : snap to a label if either is a substring of the other + * "levenshtein" : snap to the closest label by edit distance + * "auto" : substring, then Levenshtein if needed + - Saves raw and normalized predictions to CSV if `save_path` is provided. 
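+    - Example: a raw completion of '#[road, railway]#' parses to 'road, railway', which
+      'substring' cannot snap but 'levenshtein'/'auto' map to the valid label 'road, railroad'.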
+ + Inputs the learner accepts (via `_to_dataframe`): + - pandas.DataFrame with columns: ['child', 'parent'] or ['child', 'parent', 'label'] + - list[dict] with keys: 'child', 'parent' (and optionally 'label') + - list of tuples/lists: (child, parent) or (child, parent, label) + - OntoLearner-style object exposing .type_taxonomies.taxonomies iterable with (child, parent) + """ + + # Fixed class inventory (GeoNames parents) + CLASS_LIST = [ + "city, village", + "country, state, region", + "forest, heath", + "mountain, hill, rock", + "parks, area", + "road, railroad", + "spot, building, farm", + "stream, lake", + "undersea", + ] + + # Strict format: #[ ... ]# + _PREDICTION_PATTERN = re.compile(r"#\[\s*([^\]]+?)\s*\]#") + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + max_new_tokens: int = 16, + save_path: Optional[str] = None, # directory or full path + verbose: bool = True, + normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" + random_state: int = 1403, + ) -> None: + super().__init__() + self.model_name = model_name + self.verbose = verbose + self.max_new_tokens = max_new_tokens + self.save_path = save_path + self.normalize_mode = (normalize_mode or "none").lower().strip() + self.random_state = random_state + + random.seed(self.random_state) + + # Device: auto-detect CUDA if not specified + if device is None: + self._has_cuda = torch.cuda.is_available() + else: + self._has_cuda = (device == "cuda") + self._pipe_device = 0 if self._has_cuda else -1 + self._model_device_map = {"": "cuda"} if self._has_cuda else None + + self._tokenizer = None + self._model = None + self._pipeline = None + + # Prompt template used for every example + self._classification_prompt = ( + "My task is classification. My classes are as follows: " + "(city, village), (country, state, region), (forest, heath), " + "(mountain, hill, rock), (parks, area), (road, railroad), " + "(spot, building, farm), (stream, lake), (undersea). " + 'I will provide you with a phrase like "wadi mouth". ' + "The name of each class is placed within a pair of parentheses. " + "I want you to choose the most appropriate class from those mentioned above " + "based on the given phrase and present it in a format like #[parks, area]#. " + "So, the general format for each response will be #[class name]#. " + "Pay attention to the format of the response. Start with a '#' character, " + "include the class name inside it, and end with another '#' character. " + "Additionally, make sure to include a '#' character at the end to indicate " + "that the answer is complete. I don't need any additional explanations." + ) + + def load(self, model_id: str = "") -> None: + """ + Load tokenizer, model, and text-generation pipeline. 
+ """ + model_id = model_id or self.model_name + if self.verbose: + print(f"[ZeroShotTaxonomyLearner] Loading {model_id}") + + self._tokenizer = AutoTokenizer.from_pretrained(model_id) + + # Ensure a pad token is set for generation + if self._tokenizer.pad_token_id is None and self._tokenizer.eos_token_id is not None: + self._tokenizer.pad_token = self._tokenizer.eos_token + + self._model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=self._model_device_map, + torch_dtype="auto", + ) + + self._pipeline = pipeline( + task="text-generation", + model=self._model, + tokenizer=self._tokenizer, + device=self._pipe_device, # 0 for GPU, -1 for CPU + ) + + if self.verbose: + print("Device set to use", "cuda" if self._has_cuda else "cpu") + print("[ZeroShotTaxonomyLearner] Model loaded.") + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[List[Dict[str, str]]]: + """ + Zero-shot prediction over all incoming rows (no filtering/augmentation). + Returns a list of dictionaries: [{'parent': predicted_label, 'child': child}, ...] + """ + if not test: + if self.verbose: + print("[ZeroShot] Training skipped (zero-shot).") + return None + + df = self._to_dataframe(data) + + if self.verbose: + print(f"[ZeroShot] Incoming rows: {len(df)}; columns: {list(df.columns)}") + + eval_df = pd.DataFrame(df).reset_index(drop=True) + if eval_df.empty: + return [] + + # Prepare columns for inspection and saving + eval_df["prediction_raw"] = "" + eval_df["prediction_sub"] = "" + eval_df["prediction_lvn"] = "" + eval_df["prediction_auto"] = "" + eval_df["prediction"] = "" # final (per normalize_mode) + + # Generate predictions row by row + for idx, row in eval_df.iterrows(): + child_term = str(row["child"]) + raw_text, parsed_raw = self._generate_and_parse(child_term) + + # Choose a string to normalize (parsed token if matched, otherwise whole output) + basis = parsed_raw if parsed_raw != "unknown" else raw_text + + # Compute all normalization variants + sub_norm = self._normalize_substring_only(basis) + lvn_norm = self._normalize_levenshtein_only(basis) + auto_norm = self._normalize_auto(basis) + + # Final selection by mode + if self.normalize_mode == "none": + final_label = parsed_raw + elif self.normalize_mode == "substring": + final_label = sub_norm + elif self.normalize_mode == "levenshtein": + final_label = lvn_norm + elif self.normalize_mode == "auto": + final_label = auto_norm + else: + final_label = parsed_raw # fallback + + # Persist to DataFrame for inspection/export + eval_df.at[idx, "prediction_raw"] = parsed_raw + eval_df.at[idx, "prediction_sub"] = sub_norm + eval_df.at[idx, "prediction_lvn"] = lvn_norm + eval_df.at[idx, "prediction_auto"] = auto_norm + eval_df.at[idx, "prediction"] = final_label + + # Return in the format expected by the pipeline + return [{"parent": p, "child": c} for p, c in zip(eval_df["prediction"], eval_df["child"])] + + def _generate_and_parse(self, child_term: str) -> (str, str): + """ + Generate a completion for the given child term and extract the raw predicted class + using the strict '#[ ... ]#' pattern. 
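+        Decoding is greedy (do_sample=False), so the same term always yields the same
+        completion.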
+ + Returns + ------- + (raw_generation_text, parsed_prediction_or_unknown) + """ + messages = [ + {"role": "system", "content": "You are a helpful classifier."}, + {"role": "user", "content": f"{self._classification_prompt} {child_term}"}, + ] + + prompt = self._tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + generation = self._pipeline( + prompt, + max_new_tokens=self.max_new_tokens, + do_sample=False, + temperature=0.0, + top_p=1.0, + eos_token_id=self._tokenizer.eos_token_id, + pad_token_id=self._tokenizer.eos_token_id, + return_full_text=False, + )[0]["generated_text"] + + match = self._PREDICTION_PATTERN.search(generation) + parsed = match.group(1).strip() if match else "unknown" + return generation, parsed + + # ------------------------------------------------------------------------- + # Normalization helpers + # ------------------------------------------------------------------------- + + def _normalize_substring_only(self, text: str) -> str: + """ + Snap to a label if the string is equal to / contained in / contains a valid label (case-insensitive). + """ + if not isinstance(text, str): + return "unknown" + lowered = text.strip().lower() + if not lowered: + return "unknown" + + for label in self.CLASS_LIST: + label_lower = label.lower() + if lowered == label_lower or lowered in label_lower or label_lower in lowered: + return label + return "unknown" + + def _normalize_levenshtein_only(self, text: str) -> str: + """ + Snap to the nearest label by Levenshtein (edit) distance. + """ + if not isinstance(text, str): + return "unknown" + lowered = text.strip().lower() + if not lowered: + return "unknown" + + best_label = None + best_distance = 10**9 + for label in self.CLASS_LIST: + label_lower = label.lower() + distance = Levenshtein.distance(lowered, label_lower) + if distance < best_distance: + best_distance = distance + best_label = label + return best_label or "unknown" + + def _normalize_auto(self, text: str) -> str: + """ + Cascade: try substring-first; if no match, fall back to Levenshtein snapping. + """ + snapped = self._normalize_substring_only(text) + return snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + + def _to_dataframe(data: Any) -> pd.DataFrame: + """ + Normalize various input formats into a DataFrame with columns: + ['child', 'parent'] or ['child', 'parent', 'label']. 
+ """ + # Already a DataFrame + if isinstance(data, pd.DataFrame): + df = data.copy() + df.columns = [str(c).lower() for c in df.columns] + return df.reset_index(drop=True) + + # List[dict] + if isinstance(data, list) and data and isinstance(data[0], dict): + rows = [{str(k).lower(): v for k, v in d.items()} for d in data] + return pd.DataFrame(rows).reset_index(drop=True) + + # Iterable of tuples/lists: (child, parent[, label]) + if isinstance(data, (list, tuple)) and data: + first = data[0] + if isinstance(first, (list, tuple)) and not isinstance(first, dict): + n = len(first) + if n >= 3: + return pd.DataFrame(data, columns=["child", "parent", "label"]).reset_index(drop=True) + if n == 2: + return pd.DataFrame(data, columns=["child", "parent"]).reset_index(drop=True) + + # OntoLearner-style object (with .type_taxonomies.taxonomies) + try: + type_taxonomies = getattr(data, "type_taxonomies", None) + if type_taxonomies is not None: + taxonomies = getattr(type_taxonomies, "taxonomies", None) + if taxonomies is not None: + rows = [] + for rel in taxonomies: + parent = getattr(rel, "parent", None) + child = getattr(rel, "child", None) + label = getattr(rel, "label", None) if hasattr(rel, "label") else None + if parent is not None and child is not None: + rows.append({"child": child, "parent": parent, "label": label}) + if rows: + return pd.DataFrame(rows).reset_index(drop=True) + except Exception: + pass + + raise ValueError( + "Unsupported data format. Provide a DataFrame, a list of dicts, " + "a list of (child, parent[, label]) tuples/lists, or an object with " + ".type_taxonomies.taxonomies." + ) + + def _resolve_save_path(save_path: str, default_filename: str) -> str: + """ + If `save_path` is a directory, join it with `default_filename`. + If it's a file path, return as-is. 
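+        For example, ("./results", "preds.csv") resolves to "./results/preds.csv", while
+        "./results/run1.csv" is returned unchanged.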
+ """ + base = os.path.basename(save_path) + has_ext = os.path.splitext(base)[1] != "" + return save_path if has_ext else os.path.join(save_path, default_filename) diff --git a/requirements.txt b/requirements.txt index 6d71bd5..28a92bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ mistral-common[sentencepiece]~=1.8.5 g4f protobuf<5 accelerate>=0.26.0 +Levenshtein From 844de4f0a0b6a2aa1240941fe6283fca1f0c52ed Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 29 Oct 2025 16:07:24 +0100 Subject: [PATCH 03/19] adding sbunlp learner models --- ...lm_learner_sbunlp_fs_taxonomy_discovery.py | 66 +++ examples/llm_learner_sbunlp_text2onto.py | 81 +++ examples/llm_learner_sbunlp_zs_term_typing.py | 55 ++ ontolearner/__init__.py | 10 +- ontolearner/learner/__init__.py | 3 + .../learner/taxonomy_discovery/__init__.py | 1 + .../learner/taxonomy_discovery/sbunlp.py | 317 +++++++++++ ontolearner/learner/term_typing/__init__.py | 1 + ontolearner/learner/term_typing/sbunlp.py | 400 +++++++++++++ ontolearner/learner/text2onto/__init__.py | 15 + ontolearner/learner/text2onto/sbunlp.py | 525 ++++++++++++++++++ 11 files changed, 1472 insertions(+), 2 deletions(-) create mode 100644 examples/llm_learner_sbunlp_fs_taxonomy_discovery.py create mode 100644 examples/llm_learner_sbunlp_text2onto.py create mode 100644 examples/llm_learner_sbunlp_zs_term_typing.py create mode 100644 ontolearner/learner/taxonomy_discovery/sbunlp.py create mode 100644 ontolearner/learner/term_typing/sbunlp.py create mode 100644 ontolearner/learner/text2onto/__init__.py create mode 100644 ontolearner/learner/text2onto/sbunlp.py diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py new file mode 100644 index 0000000..19797a9 --- /dev/null +++ b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py @@ -0,0 +1,66 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +# Import the specific Few-Shot Learner implementation +from ontolearner import SBUNLPFewShotLearner + +# Load ontology and split +# Load the GeoNames ontology for taxonomy discovery. +# GeoNames provides geographic parent-child relationships (is-a hierarchy). +ontology = GeoNames() +ontology.load() +data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object + +# Split the taxonomic relationships into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing (terms to find relations for) + random_state=42, +) + +# Configure the learner with user-defined inference args + device +# Configure the SBUNLP Few-Shot Learner using the Qwen model. +# This performs in-context learning via N x M batch prompting. 
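+# Each prompt pairs one chunk of gold (parent, child) examples with one chunk of test terms;
+# the per-prompt JSON outputs are then merged and deduplicated into the final relation list.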
+llm_learner = SBUNLPFewShotLearner(
+    # Model / decoding
+    model_name="Qwen/Qwen2.5-0.5B-Instruct",  # The Qwen model to load
+    try_4bit=True,  # uses 4-bit if bitsandbytes + CUDA available for memory efficiency
+    max_new_tokens=140,  # limit the length of the model's response (for JSON output)
+    max_input_tokens=1500,  # limit the total prompt length (context window)
+    temperature=0.0,  # set to 0.0 for deterministic output (best for structured JSON)
+    top_p=1.0,  # top-p sampling disabled with temperature=0.0
+
+    # Grid settings (N x M prompts)
+    num_train_chunks=7,  # N: split training examples (few-shot context) into 7 chunks
+    num_test_chunks=7,  # M: split test terms (vocabulary) into 7 chunks (total 49 prompts)
+
+    # Run controls
+    limit_num_prompts=None,  # None runs all N x M prompts; set to an integer for a dry-run
+    output_dir="./outputs/taskC_batches",  # Optional: dump per-prompt JSON results for debugging
+)
+
+# Build pipeline and run
+# Build the pipeline, passing the Few-Shot Learner.
+pipe = LearnerPipeline(
+    llm=llm_learner,
+    llm_id=llm_learner.model_name,
+    ontologizer_data=True,  # Let the learner flatten structured ontology objects via its tasks_* helpers
+    device="auto",  # automatically select CUDA or CPU
+)
+
+# Run the full learning pipeline on the taxonomy-discovery task
+outputs = pipe(
+    train_data=train_data,
+    test_data=test_data,
+    task="taxonomy-discovery",
+    evaluate=True,
+    ontologizer_data=True,
+)
+
+# Display the evaluation results
+print("Metrics:", outputs.get("metrics"))
+
+# Display total elapsed time for training + prediction + evaluation
+print("Elapsed time:", outputs["elapsed_time"])
+
+# Print all returned outputs (include predictions)
+print(outputs)
diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py
new file mode 100644
index 0000000..564f641
--- /dev/null
+++ b/examples/llm_learner_sbunlp_text2onto.py
@@ -0,0 +1,81 @@
+import os
+import torch
+# Import all the required classes
+from ontolearner import SBUNLPText2OntoLearner
+from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM
+
+# Local folder where the dataset is stored
+# This path is relative to the directory where the script is executed
+# (e.g., E:\OntoLearner\examples)
+LOCAL_DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology"
+
+# Ensure the base directories exist
+# Creates the train and test subdirectories if they don't already exist.
+os.makedirs(os.path.join(LOCAL_DATA_DIR, 'train'), exist_ok=True)
+os.makedirs(os.path.join(LOCAL_DATA_DIR, 'test'), exist_ok=True)
+
+# Define local file paths: POINTING TO ALREADY SAVED FILES
+# These files are used as input for the Fit and Predict phases.
+DOCS_ALL_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/documents.jsonl"
+TERMS2DOC_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/terms2docs.json"
+DOCS_TEST_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/text2onto_ecology_test_documents.jsonl"
+
+# Output files for predictions (saved directly under LOCAL_DATA_DIR/test)
+# These files will be created by the predict_terms/types methods.
+TERMS_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" +TYPES_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" + +#Initialize and Load Learner --- +MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +# Determine the device for inference (GPU or CPU) +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +# Instantiate the underlying LLM helper +# (LocalAutoLLM handles model loading and generation) +llm_model_helper = LocalAutoLLM(device=DEVICE) + +# Instantiate the main learner class, passing the LLM helper to its constructor +learner = SBUNLPText2OntoLearner(model=llm_model_helper, device=DEVICE) + +# Load the model (This calls llm_model_helper.load) +LOAD_IN_4BIT = torch.cuda.is_available() +learner.model.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) + +# Build Few-Shot Exemplars (Fit Phase) +# The fit method uses the local data paths to build the in-context learning prompts. +learner.fit( + train_docs_jsonl=DOCS_ALL_PATH, + terms2doc_json=TERMS2DOC_PATH, + sample_size=28, + seed=123 # Seed for stratified random sampling stability +) + +MAX_NEW_TOKENS = 100 + +terms_written = learner.predict_terms( + docs_test_jsonl=DOCS_TEST_PATH, + out_jsonl=TERMS_PRED_OUT, + max_new_tokens=MAX_NEW_TOKENS +) +print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.") + +# Type Extraction subtask +types_written = learner.predict_types( + docs_test_jsonl=DOCS_TEST_PATH, + out_jsonl=TYPES_PRED_OUT, + max_new_tokens=MAX_NEW_TOKENS +) +print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.") + +try: + # Evaluate Term Extraction using the custom F1 function and gold data + f1_term = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TERMS_PRED_OUT, key="term") + print(f"Final Term Extraction F1: {f1_term:.4f}") + + # Evaluate Type Extraction + f1_type = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TYPES_PRED_OUT, key="type") + print(f"Final Type Extraction F1: {f1_type:.4f}") + +except Exception as e: + # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) + print(f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created.") diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py new file mode 100644 index 0000000..75d01da --- /dev/null +++ b/examples/llm_learner_sbunlp_zs_term_typing.py @@ -0,0 +1,55 @@ +# Import core modules from the OntoLearner library +from ontolearner import AgrO, train_test_split, LearnerPipeline +# Import the specific Zero-Shot Learner implementation for Term Typing +from ontolearner import SBUNLPZSLearner + +# Load ontology and split +# Load the AgrO ontology for type inventory and test data. +ontology = AgrO() +ontology.load() +data = ontology.extract() # Extract the full set of relationships/terms + +# Split the data into train (to learn type inventory) and test (terms to predict) +train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing + random_state=42, +) + +# Configure the Qwen Zero-Shot learner (inference-only) +# This learner's 'fit' phase learns the vocabulary of allowed type labels. 
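+# At predict time each term is prompted individually and only types from that learned inventory are kept.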
+llm_learner = SBUNLPZSLearner( + # Model / decoding + model_id="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load + # device= is auto-detected + max_new_tokens=64, # Sufficient length for JSON list of types + temperature=0.0, # Ensures deterministic (greedy) output + # token= None, # Assuming public model access +) + +# Build pipeline and run +# Build the pipeline, passing the Zero-Shot Learner. +pipe = LearnerPipeline( + llm=llm_learner, + llm_id=llm_learner.model_id, + ontologizer_data=False, + device="cpu", # select CUDA or CPU +) + +# Run the full learning pipeline on the Term-Typing task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for learning (type inventory) + prediction + evaluation +print("Elapsed time:", outputs.get("elapsed_time")) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index d9ba608..49b94c4 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -31,9 +31,12 @@ StandardizedPrompting, LabelMapper, RWTHDBISTaxonomyLearner, - RWTHDBISTermTypingLearner + RWTHDBISTermTypingLearner, SKHNLPZSLearner, - SKHNLPSequentialFTLearner) + SKHNLPSequentialFTLearner, + SBUNLPFewShotLearner, + SBUNLPZSLearner, + SBUNLPText2OntoLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -55,6 +58,9 @@ "RWTHDBISTermTypingLearner", "SKHNLPZSLearner", "SKHNLPSequentialFTLearner", + "SBUNLPFewShotLearner", + "SBUNLPZSLearner", + "SBUNLPText2OntoLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 3c56154..4f41586 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -20,3 +20,6 @@ from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner from .taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner +from .taxonomy_discovery.sbunlp import SBUNLPFewShotLearner +from .term_typing.sbunlp import SBUNLPZSLearner +from .text2onto import SBUNLPFewShotLearner as SBUNLPText2OntoLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py index 2c6b452..d52513b 100644 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -14,3 +14,4 @@ from .rwthdbis import RWTHDBISSFTLearner from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner +from .sbunlp import SBUNLPFewShotLearner diff --git a/ontolearner/learner/taxonomy_discovery/sbunlp.py b/ontolearner/learner/taxonomy_discovery/sbunlp.py new file mode 100644 index 0000000..9fc520d --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/sbunlp.py @@ -0,0 +1,317 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import json +import importlib.util +from typing import Any, Dict, List, Optional, Tuple + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +from ...base import AutoLearner + +class SBUNLPFewShotLearner(AutoLearner): + """ + Taxonomy-discovery via N×M batch prompting with a small Qwen model. + + Lifecycle + --------- + fit(): + Cache + clean training parent–child pairs. + predict(): + Chunk (train pairs × test terms), prompt per chunk pair, parse, merge, + and deduplicate predicted relations. + """ + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", + try_4bit: bool = True, + num_train_chunks: int = 7, + num_test_chunks: int = 7, + max_new_tokens: int = 140, + max_input_tokens: int = 1500, + temperature: float = 0.0, + top_p: float = 1.0, + limit_num_prompts: Optional[int] = None, + output_dir: Optional[str] = None, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + self.model_name = model_name + self.try_4bit = try_4bit + + self.num_train_chunks = num_train_chunks + self.num_test_chunks = num_test_chunks + + self.max_new_tokens = max_new_tokens + self.max_input_tokens = max_input_tokens + self.temperature = temperature + self.top_p = top_p + self.limit_num_prompts = limit_num_prompts + + self.output_dir = output_dir + + self.tokenizer: Optional[AutoTokenizer] = None + self.model: Optional[AutoModelForCausalLM] = None + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + self.train_pairs_clean: List[Dict[str, str]] = [] + + # ----------------------- small helpers ---------------------- + def _clean_pairs(pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: + """ + Normalize, drop empty or self-relations, and deduplicate by (parent, child). + """ + cleaned_pairs: List[Dict[str, str]] = [] + seen_parent_child: set[Tuple[str, str]] = set() + + for pair_record in pair_rows or []: + if not isinstance(pair_record, dict): + continue + + parent_label = str(pair_record.get("parent", "")).strip() + child_label = str(pair_record.get("child", "")).strip() + if not parent_label or not child_label: + continue + + normalized_key = (parent_label.lower(), child_label.lower()) + if normalized_key[0] == normalized_key[1]: # parent==child + continue + if normalized_key in seen_parent_child: + continue + + seen_parent_child.add(normalized_key) + cleaned_pairs.append({"parent": parent_label, "child": child_label}) + + return cleaned_pairs + + def _chunk_list(items: List[Any], num_chunks: int) -> List[List[Any]]: + """ + Split `items` into `num_chunks` near-equal parts. Some chunks may be empty. + """ + if num_chunks <= 0: + return [items] + total_items = len(items) + base_size, remainder = divmod(total_items, num_chunks) + + chunks: List[List[Any]] = [] + start_index = 0 + for chunk_index in range(num_chunks): + current_size = base_size + (1 if chunk_index < remainder else 0) + end_index = start_index + current_size + chunks.append(items[start_index:end_index]) + start_index = end_index + return chunks + + def _ensure_dir(self, maybe_path: Optional[str]) -> None: + if maybe_path: + os.makedirs(maybe_path, exist_ok=True) + + # ---------------------- model load/gen ---------------------- + def load(self, **_: Any) -> None: + """ + Load tokenizer/model; use 4-bit nf4 on CUDA if available + requested. 
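+        Falls back to full-precision fp16 (CUDA) or fp32 (CPU) weights when bitsandbytes
+        is not installed or no GPU is available.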
+ """ + bnb_available = importlib.util.find_spec("bitsandbytes") is not None + use_4bit_quant = bool(self.try_4bit and bnb_available and self.device == "cuda") + + quant_config = None + if use_4bit_quant: + from transformers import BitsAndBytesConfig + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map=("auto" if self.device == "cuda" else None), + torch_dtype=(torch.float16 if self.device == "cuda" else torch.float32), + quantization_config=quant_config, + ) + + def _format_chat(self, user_text: str) -> str: + """ + Wrap user text with the model's chat template (if present). + """ + if hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None): + return self.tokenizer.apply_chat_template( + [{"role": "user", "content": user_text}], + tokenize=False, + add_generation_prompt=True, + ) + return user_text + + @torch.no_grad() + def _generate(self, prompt_text: str) -> str: + """ + Single prompt → model text. Clips *input* tokens to avoid overflow. + """ + formatted_prompt = self._format_chat(prompt_text) + prompt_token_ids = self.tokenizer(formatted_prompt, add_special_tokens=False, return_tensors=None)["input_ids"] + if len(prompt_token_ids) > self.max_input_tokens: + prompt_token_ids = prompt_token_ids[-self.max_input_tokens:] + + prompt_tensor = torch.tensor([prompt_token_ids]).to(self.model.device) + + generation = self.model.generate( + input_ids=prompt_tensor, + max_new_tokens=self.max_new_tokens, + do_sample=(self.temperature > 0.0), + temperature=self.temperature, + top_p=self.top_p, + pad_token_id=self.tokenizer.pad_token_id, + eos_token_id=getattr(self.tokenizer, "eos_token_id", None), + use_cache=True, + ) + + decoded_full = self.tokenizer.decode(generation[0], skip_special_tokens=True) + decoded_prompt = self.tokenizer.decode(prompt_tensor[0], skip_special_tokens=True) + return decoded_full[len(decoded_prompt):].strip() if decoded_full.startswith(decoded_prompt) else decoded_full.strip() + + # ------------------ prompt build & parsing ------------------ + def _build_prompt(train_pairs_chunk: List[Dict[str, str]], + test_terms_chunk: List[str]) -> str: + """ + Few-shot with JSON examples + a block of test terms. + The model must return ONLY a JSON array of {parent, child}. + """ + examples_json = json.dumps(train_pairs_chunk, ensure_ascii=False, indent=2) + test_types_block = "\n".join(test_terms_chunk) + return ( + "From this file, extract all parent–child relations like in the examples.\n" + "Return ONLY a JSON array of objects with keys 'parent' and 'child'.\n" + "Output format:\n" + "[\n" + ' {"parent": "parent1", "child": "child1"},\n' + ' {"parent": "parent2", "child": "child2"}\n' + "]\n\n" + "EXAMPLES (JSON):\n" + f"{examples_json}\n\n" + "TEST TYPES (between [PAIR] tags):\n" + "[PAIR]\n" + f"{test_types_block}\n" + "[PAIR]\n" + "Return only JSON." + ) + + def _parse_pairs(model_text: str) -> List[Dict[str, str]]: + """ + Parse a model response into a list of {'parent','child'} dicts. 
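+        Tries json.loads on the full text first, then falls back to the outermost
+        [...] block; if neither parses, an empty list is returned.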
+ """ + def deduplicate_and_normalize(dict_list: List[Dict[str, str]]) -> List[Dict[str, str]]: + return SBUNLPFewShotLearner._clean_pairs(dict_list) + + response_text = model_text.strip() + + # 1) Direct JSON list + try: + maybe_json = json.loads(response_text) + if isinstance(maybe_json, list): + return deduplicate_and_normalize(maybe_json) + except Exception: + pass + + # 2) Find outermost [ ... ] and parse that + outer_list_match = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", response_text) + if outer_list_match: + try: + array_json = json.loads(outer_list_match.group(0)) + if isinstance(array_json, list): + return deduplicate_and_normalize(array_json) + except Exception: + pass + + # 3) Nothing parsable + return [] + + # --------------------- AutoLearner hooks -------------------- + def fit(self, train_data: Any, task: str, ontologizer: bool = True): + """ + Build the training example bank (parent–child pairs). + """ + if task != "taxonomy-discovery": + return super().fit(train_data, task, ontologizer) + + if ontologizer: + # Convert ontology object → list of {"parent","child"} gold pairs + gold_pairs_from_ontology = self.tasks_ground_truth_former( + train_data, task="taxonomy-discovery" + ) + self.train_pairs_clean = self._clean_pairs(gold_pairs_from_ontology) + else: + # Already a Python list of dicts + self.train_pairs_clean = self._clean_pairs(train_data) + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """ + Main prediction path. Returns a deduplicated list of relations. + """ + if not test: + return None + + if self.model is None or self.tokenizer is None: + self.load() + + # Build test vocabulary of types/terms + if isinstance(data, list) and (len(data) == 0 or isinstance(data[0], str)): + test_type_list: List[str] = data + else: + test_type_list = super().tasks_data_former( + data=data, task="taxonomy-discovery", test=True + ) + + # Create N×M grid + train_chunks = self._chunk_list(self.train_pairs_clean, self.num_train_chunks) + test_chunks = self._chunk_list(test_type_list, self.num_test_chunks) + + self._ensure_dir(self.output_dir) + + merged_predicted_pairs: List[Dict[str, str]] = [] + issued_prompt_count = 0 + + for train_chunk_index, train_pairs_chunk in enumerate(train_chunks, start=1): + for test_chunk_index, test_terms_chunk in enumerate(test_chunks, start=1): + issued_prompt_count += 1 + if self.limit_num_prompts and issued_prompt_count > self.limit_num_prompts: + break + + prompt_text = self._build_prompt(train_pairs_chunk, test_terms_chunk) + model_response = self._generate(prompt_text) + parsed_relation_pairs = self._parse_pairs(model_response) + + # Optional per-batch dump for debugging + if self.output_dir: + batch_json_path = os.path.join( + self.output_dir, f"pairs_T{train_chunk_index}_S{test_chunk_index}.json" + ) + with open(batch_json_path, "w", encoding="utf-8") as fp: + json.dump(parsed_relation_pairs, fp, ensure_ascii=False, indent=2) + + merged_predicted_pairs.extend(parsed_relation_pairs) + + if self.limit_num_prompts and issued_prompt_count >= (self.limit_num_prompts or 0): + break + + # Deduplicate final list + return self._clean_pairs(merged_predicted_pairs) diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py index ab5b4f8..ebd8cd9 100644 --- a/ontolearner/learner/term_typing/__init__.py +++ b/ontolearner/learner/term_typing/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
from .rwthdbis import RWTHDBISSFTLearner +from .sbunlp import SBUNLPZSLearner diff --git a/ontolearner/learner/term_typing/sbunlp.py b/ontolearner/learner/term_typing/sbunlp.py new file mode 100644 index 0000000..f838bd0 --- /dev/null +++ b/ontolearner/learner/term_typing/sbunlp.py @@ -0,0 +1,400 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional +import re + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from ...base import AutoLearner + +class SBUNLPZSLearner(AutoLearner): + """ + Qwen-based blind term typing learner (Task B), implemented as an AutoLearner. + + This class reproduces the notebook logic: + - Fit phase learns the *allowed type inventory* from training data. + - Predict phase performs blind prompting per term using the learned type list. + - Outputs are restricted to the allowed types and returned as [{"id", "types"}]. + + Expected I/O (recommended): + - fit(train_data, task="term-typing", ontologizer=True): + The framework's AutoLearner.tasks_data_former() provides a unique list of + type labels; we store it to `self.allowed_types`. + - predict(eval_data, task="term-typing", ontologizer=False): + Pass a list of dicts with keys {"id": str, "term": str} so IDs are preserved. + Returns a list of dicts [{"id": ..., "types": [...] }]. + """ + + def __init__( + self, + model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", + device: Optional[str] = None, + max_new_tokens: int = 64, + temperature: float = 0.0, + token: Optional[str] = None, + ) -> None: + """ + Args: + model_id: HF model id for Qwen. + device: "cuda", "mps", or "cpu". Auto-detected if None. + max_new_tokens: Generation cap per prompt. + temperature: Not used for greedy decoding (kept for future). + token: HF token if the model is gated. + """ + super().__init__() + + # Basic configuration + self.model_id = model_id + # default device detection: prefer CUDA if available + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.max_new_tokens = max_new_tokens + self.temperature = temperature + self.token = token + + # Model/tokenizer placeholders (populated by load()) + self.tokenizer: Optional[AutoTokenizer] = None + self.model: Optional[AutoModelForCausalLM] = None + + # Learned inventory of allowed type labels (populated by fit()) + self.allowed_types: List[str] = [] + + # Regex used to extract quoted strings from model output (e.g. "type") + self._quoted_re = re.compile(r'"([^"]+)"') + + def load(self, **kwargs: Any): + """ + Load Qwen model and tokenizer. + + NOTE: + - The HF arguments used here mirror your original code (`token=...`). + You may see a deprecation warning for `torch_dtype` (older transformers); + switching to `dtype=` is recommended but I did not change behavior here. + """ + # Respect overrides from kwargs if provided + model_id = kwargs.get("model_id", self.model_id) + token = kwargs.get("token", self.token) + + # Load tokenizer. If the model is gated, pass token (original code uses `token`). 
+ # If your environment requires `use_auth_token=` replace here. + self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) + + # Ensure tokenizer has a pad token (some models omit it) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Device mapping for from_pretrained -> keep same behavior as original code + device_map = "auto" if self.device != "cpu" else "cpu" + # original code used torch_dtype; left as-is to avoid behavioral change + torch_dtype = torch.float16 if self.device != "cpu" else torch.float32 + + # Load the model weights. This can be heavy; keep same params as original. + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch_dtype, + token=token, + ) + return self + + # ------------------------------------------------------------------------- + # Fit / Predict interface + # ------------------------------------------------------------------------- + def fit(self, train_data: Any, task: str, ontologizer: bool = True): + """ + Learn the allowed type inventory from the training data. + + Expected behavior: + - If `tasks_data_former(..., test=False)` returns a list of strings, + set allowed_types to that list (deduped & sorted). + - If it returns a list of dicts (relationships), extract unique 'parent' + fields and use those as the allowed type inventory. + + This method contains a tolerant branch for the framework's custom container: + If the returned `train_fmt` is not a list but has a `.term_typings` attribute + (e.g., OntologyData object used by the framework), iterate that attribute + and collect any `types` values found. + """ + train_fmt = self.tasks_data_former(data=train_data, task=task, test=False) if ontologizer else train_data + if task != "term-typing": + raise ValueError("SBUNLPZSLearner only implements 'term-typing'.") + + # If framework passed a container with `.term_typings`, extract types from there + if not isinstance(train_fmt, list): + # handle OntologyData-like object with attribute 'term_typings' + if hasattr(train_fmt, "term_typings"): + try: + # term_typings is expected to be an iterable of objects with attribute `types` + collected = set() + for tt in getattr(train_fmt, "term_typings") or []: + # tt.types could be list[str] or a single str + if hasattr(tt, "types"): + tvals = tt.types + elif isinstance(tt, dict) and "types" in tt: + tvals = tt["types"] + else: + tvals = None + + # Normalize both list and single-string cases + if isinstance(tvals, (list, tuple, set)): + for x in tvals: + if isinstance(x, str): + collected.add(x) + elif isinstance(tvals, str): + collected.add(tvals) + + # If we successfully collected types, set allowed_types and return + if collected: + self.allowed_types = sorted(collected) + return self + # else fall through to error below (no types found) + except Exception: + # If anything unexpected occurs while iterating term_typings, + # gracefully fall through and raise the original TypeError below. + pass + + # not a supported non-list type -> keep original behavior (raise) + raise TypeError("For term-typing, expected a list of type labels at fit().") + + # At this point train_fmt is a list (original logic preserved) + if train_fmt and isinstance(train_fmt[0], dict) and "parent" in train_fmt[0]: + # Case A: Received raw relationships/pairs (e.g., from train_test_split). + # Extract unique parent types from the relationship records. 
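+            # (These labels become the inventory the blind prompt is allowed to choose from.)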
+ unique_types = set(r.get("parent") for r in train_fmt if r.get("parent")) + self.allowed_types = sorted(unique_types) + elif all(isinstance(x, str) for x in train_fmt): + # Case B: Received a clean list of type labels (List[str]). + self.allowed_types = sorted(set(train_fmt)) + else: + # The input is a list but not in either expected format -> raise + raise TypeError("For term-typing, input data format for fit() is invalid. Expected list of strings (types) or list of relationships (dicts).") + + return self + + def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: + """ + Predict types for each term. + + Expected inputs: + - With ontologizer=True: a list[str] of term strings (IDs are autogenerated). + - With ontologizer=False: a list[dict] where each dict has keys {'id','term'}. + + This method tolerantly converts common framework containers (e.g., an + OntologyData object exposing `.term_typings`) into the expected list[dict] + shape so that the internal _term_typing() can run unchanged. + """ + if task != "term-typing": + # Delegate to base for other tasks (not implemented here) + return super().predict(eval_data, task, ontologizer=ontologizer) + + def _extract_list_of_dicts_from_term_typings(obj) -> Optional[List[Dict[str, str]]]: + """ + Helper: try to produce a list of {"id","term"} dicts from objects + exposing a `term_typings` iterable. Supports either object-like + TermTyping (attributes) or dict-style entries. + """ + tts = getattr(obj, "term_typings", None) + if tts is None: + return None + out = [] + for tt in tts: + # support object-style TermTyping (attributes) and dict-style + if isinstance(tt, dict): + # try several common key names for ID + tid = tt.get("ID") or tt.get("id") or tt.get("Id") or tt.get("ID_") + tterm = tt.get("term") or tt.get("label") or tt.get("name") + else: + # object-style access + tid = getattr(tt, "ID", None) or getattr(tt, "id", None) or getattr(tt, "Id", None) + tterm = getattr(tt, "term", None) or getattr(tt, "label", None) or getattr(tt, "name", None) + if tid is None or tterm is None: + # skip malformed entry - this is defensive so downstream code has valid inputs + continue + out.append({"id": str(tid), "term": str(tterm)}) + return out if out else None + + # Case A: ontologizer=True -> framework often provides list[str] + if ontologizer: + if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): + # Simple case: convert list of terms to list of dicts with generated IDs + eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data)] + else: + # Try to extract from a framework container (e.g., OntologyData) + maybe = _extract_list_of_dicts_from_term_typings(eval_data) + if maybe is not None: + eval_pack = maybe + else: + # Last resort: if eval_data is some iterable of strings, convert it + try: + if hasattr(eval_data, "__iter__") and not isinstance(eval_data, (str, bytes)): + lst = list(eval_data) + if all(isinstance(x, str) for x in lst): + eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(lst)] + else: + raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") + else: + raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") + except TypeError: + # re-raise to preserve original error semantics + raise + # Delegate to internal inference routine + return self._term_typing(eval_pack, test=True) + + # Case B: ontologizer=False -> we expect list[dict], but tolerate common containers + else: + if isinstance(eval_data, 
list) and all(isinstance(x, dict) for x in eval_data): + eval_pack = eval_data + else: + # Try to extract from framework container (term_typings) + maybe = _extract_list_of_dicts_from_term_typings(eval_data) + if maybe is not None: + eval_pack = maybe + else: + # As a final attempt, allow eval_data to be a dict with a list under some known keys + if isinstance(eval_data, dict): + for key in ("term_typings", "terms", "items"): + if key in eval_data and isinstance(eval_data[key], (list, tuple)): + converted = [] + for x in eval_data[key]: + # Accept dict-style entries that include id and term/name + if isinstance(x, dict) and ("id" in x or "ID" in x) and ("term" in x or "name" in x): + tid = x.get("ID") or x.get("id") + tterm = x.get("term") or x.get("name") + converted.append({"id": str(tid), "term": str(tterm)}) + if converted: + eval_pack = converted + break + else: + # Could not convert; raise same TypeError as before + raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") + else: + # Not a supported container -> raise + raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") + # Delegate to internal inference routine + return self._term_typing(eval_pack, test=True) + + + # ------------------------------------------------------------------------- + # Internal task implementations (AutoLearner hooks) + # ------------------------------------------------------------------------- + def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: + """ + Core implementation: + - training mode (test=False): `data` is a list of allowed type labels -> store them. + - inference mode (test=True): `data` is a list of {"id","term"} -> produce [{"id","types"}]. + """ + if not test: + # training: expect a list of strings (type labels) + if not isinstance(data, list): + raise TypeError("Expected a list of type labels at training time.") + self.allowed_types = sorted(set(data)) + return None + + # Inference path + if not isinstance(data, list) or not all(isinstance(x, dict) for x in data): + raise TypeError("At prediction time, expected a list of {'id','term'} dicts.") + + # Ensure model and tokenizer are loaded + if self.model is None or self.tokenizer is None: + raise RuntimeError("Model/tokenizer not loaded. Call .load() before predict().") + + results = [] + for item in data: + # preserve incoming IDs and terms + term_id = item["id"] + term_text = item["term"] + + # build the blind JSON-prompt that instructs the model to output types + prompt = self._build_blind_prompt(term_id, term_text, self.allowed_types) + + # generate and parse model output into allowed types + types = self._generate_and_parse_types(prompt) + + # append result for this term (keep original id) + # include the original term so downstream evaluation (and any consumers) can match by term + results.append({"id": term_id, "term": term_text, "types": types}) + + return results + + # ------------------------------------------------------------------------- + # Prompting + parsing + # ------------------------------------------------------------------------- + + def _format_types_inline(allowed: List[str]) -> str: + """ + Format allowed types as comma-separated quoted strings for insertion into the prompt. + Example: '"type1", "type2", "type3"' + """ + return ", ".join(f'"{t}"' for t in allowed) + + def _build_blind_prompt(self, term_id: str, term: str, allowed_types: List[str]) -> str: + """ + Construct the prompt given a single term. 
The prompt: + - Instructs the model to produce a JSON array of {id, types} objects. + - Provides the allowed types list (so the model should only use those). + - Includes the single input item for which the model must decide types. + + Note: This is the same blind-prompting approach used in the original notebook. + """ + allowed_str = self._format_types_inline(allowed_types) + return ( + "Identify the type(s) of the term in a second JSON file.\n" + "A term can have more than one type.\n" + "Output file must be in this format:\n" + "[\n" + '{ "id": "TT_465e8904", "types": [ "type1" ] },\n' + '{ "id": "TT_01c7707e", "types": [ "type2", "type3" ] },\n' + '{ "id": "TT_b20cb478", "types": [ "type4" ] }\n' + "]\n" + "The id must be taken from the input JSON file.\n" + "You must find the type(s) for each term in the JSON file.\n" + "Types must be selected only from the types list.\n\n" + f"Types list: {allowed_str}\n\n" + f'{{ "id": "{term_id}", "term": "{term}" }}' + ) + + def _generate_and_parse_types(self, prompt: str) -> List[str]: + """ + Greedy generate, then extract quoted strings and filter by allowed types. + + Important details: + - We assert model/tokenizer presence before calling. + - Tokenized inputs are moved to the model device (original code uses .to(self.model.device)). + - The decoded text is scanned for quoted substrings using self._quoted_re. + - Only quoted strings that are present in self.allowed_types are kept. + - Returned list is unique & sorted for deterministic ordering. + """ + assert self.model is not None and self.tokenizer is not None + + # Tokenize prompt and move tensors to model device to avoid device mismatch + inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device) + + with torch.no_grad(): + outputs = self.model.generate( + **inputs, + max_new_tokens=self.max_new_tokens, + do_sample=False, # deterministic (greedy) decoding + pad_token_id=self.tokenizer.eos_token_id, + ) + + # Decode full generated sequence (prompt + generation). Then extract quoted strings. + text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) + candidates = self._quoted_re.findall(text) + + # Filter candidates to the allowed inventory + filtered = [c for c in candidates if c in self.allowed_types] + + # Return unique & sorted for stability across runs + return sorted(set(filtered)) diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py new file mode 100644 index 0000000..30e8372 --- /dev/null +++ b/ontolearner/learner/text2onto/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
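A minimal, standalone sketch of the decode-and-filter step used by _generate_and_parse_types above; the exact pattern behind self._quoted_re is assumed here to match double-quoted substrings:

import re

QUOTED_RE = re.compile(r'"([^"]+)"')  # assumed shape of self._quoted_re

def filter_generated_types(decoded_text: str, allowed_types: list[str]) -> list[str]:
    """Keep only quoted substrings that belong to the allowed type inventory."""
    candidates = QUOTED_RE.findall(decoded_text)
    return sorted({c for c in candidates if c in allowed_types})

# Example: out-of-inventory strings are dropped and duplicates collapse:
# filter_generated_types('[{ "id": "TT_000001", "types": [ "type1", "banana", "type1" ] }]',
#                        ["type1", "type2"])  ->  ["type1"]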
+ +from .sbunlp import SBUNLPFewShotLearner diff --git a/ontolearner/learner/text2onto/sbunlp.py b/ontolearner/learner/text2onto/sbunlp.py new file mode 100644 index 0000000..8ab617d --- /dev/null +++ b/ontolearner/learner/text2onto/sbunlp.py @@ -0,0 +1,525 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#      https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import random +import re +import ast +import gc +from typing import Any, Dict, List, Optional, Set, Tuple +from collections import defaultdict + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig + +from ...base import AutoLearner, AutoLLM + +# ----------------------------------------------------------------------------- +# Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface +# ----------------------------------------------------------------------------- +class LocalAutoLLM(AutoLLM): + """ + Handles loading and generation for a Hugging Face Causal Language Model (Qwen/TinyLlama). + Uses 4-bit quantization for efficiency and greedy decoding by default. + """ + + def __init__(self, label_mapper: Any = None, device: str = "cpu", token: str = "") -> None: + super().__init__(label_mapper=label_mapper, device=device, token=token) + self.model = None + self.tokenizer = None + + def load(self, model_id: str, load_in_4bit: bool = False, dtype: str = "auto", trust_remote_code: bool = True): + """Load tokenizer + model, applying 4-bit quantization if specified and possible.""" + + # Determine the target data type (default to float32 for CPU, float16 for GPU) + torch_dtype_val = (torch.float16 if torch.cuda.is_available() else torch.float32) + + # Load the tokenizer + self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + quant_config = None + if load_in_4bit: + # Configure BitsAndBytes for 4-bit loading + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + if torch_dtype_val is None: + torch_dtype_val = torch.float16 + + # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise) + device_map = "auto" if (self.device != "cpu") else {"": "cpu"} + + # Load the Causal Language Model + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch_dtype_val, + quantization_config=quant_config, + trust_remote_code=trust_remote_code, + ) + + # Ensure model is on the correct device (redundant if device_map="auto" but safe) + if self.device == "cpu": + self.model.to("cpu") + + def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: float = 0.0, top_p: float = 1.0) -> List[str]: + """Generate continuations for a list of prompts, returning only the generated part.""" + if self.model is None or self.tokenizer is None: + raise RuntimeError("Model/tokenizer not loaded. 
Call .load() first.") + + # --- Generation Setup --- + # Tokenize batch (padding is essential for batch inference) + enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True) + input_ids = enc["input_ids"] + attention_mask = enc["attention_mask"] + + # Move tensors to the model's device (e.g., cuda:0) + model_device = next(self.model.parameters()).device + input_ids = input_ids.to(model_device) + attention_mask = attention_mask.to(model_device) + + # --- Generate --- + with torch.no_grad(): + outputs = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=max_new_tokens, + do_sample=(temperature > 0.0), # Use greedy decoding if temperature is 0.0 + temperature=temperature, + top_p=top_p, + pad_token_id=self.tokenizer.eos_token_id, + ) + + # --- Post-processing: Extract only the generated tail --- + decoded_outputs: List[str] = [] + for i, output_ids in enumerate(outputs): + full_decoded_text = self.tokenizer.decode(output_ids, skip_special_tokens=True) + prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True) + + # Safely strip the prompt text from the full output + if full_decoded_text.startswith(prompt_text): + generated_tail = full_decoded_text[len(prompt_text):].strip() + else: + # Fallback extraction (less robust if padding affects token indices) + prompt_len = input_ids.shape[1] + generated_tail = self.tokenizer.decode(output_ids[prompt_len:], skip_special_tokens=True).strip() + decoded_outputs.append(generated_tail) + + return decoded_outputs + +# ----------------------------------------------------------------------------- +# Main Learner: SBUNLPFewShotLearner (Task A Text2Onto) +# ----------------------------------------------------------------------------- +class SBUNLPFewShotLearner(AutoLearner): + """ + Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction). + It uses Few-Shot prompts generated from training data for inference. + """ + + def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"): + super().__init__() + # self.model is an instance of LocalAutoLLM + self.model = model or LocalAutoLLM(device=device) + self.device = device + # Cached in-memory prompt blocks built during the fit phase + self.fewshot_terms_block: str = "" + self.fewshot_types_block: str = "" + + # --- Few-shot construction (terms) --- + def build_stratified_fewshot_prompt( + self, + documents_path: str, + terms_path: str, + sample_size: int = 28, + seed: int = 123, + max_chars_per_text: int = 1200, + ) -> str: + """ + Builds the few-shot exemplar block for Term Extraction using stratified sampling. 
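+        Each stratum groups the documents associated with one term; a stratum contributes
+        roughly sample_size * len(stratum) / len(corpus) documents. The pooled sample is then
+        deduplicated by document id and topped up or trimmed to exactly `sample_size` exemplars.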
+ """ + random.seed(seed) + + # Read documents (JSONL) into a list + corpus_documents: List[Dict[str, Any]] = [] + with open(documents_path, "r", encoding="utf-8") as file_handle: + for line in file_handle: + if line.strip(): + corpus_documents.append(json.loads(line)) + + num_total_docs = len(corpus_documents) + num_sample_docs = min(sample_size, num_total_docs) + + # Load the map of term -> [list of document IDs] + with open(terms_path, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + # Invert map: document ID -> [list of terms] + doc_id_to_terms_map = defaultdict(list) + for term, doc_ids in term_to_doc_map.items(): + for doc_id in doc_ids: + doc_id_to_terms_map[doc_id].append(term) + + # Define strata (groups of documents associated with specific terms) + strata_map = defaultdict(list) + for doc in corpus_documents: + doc_id = doc.get("id", "") + associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"]) + for term in associated_terms: + strata_map[term].append(doc) + + # Perform proportional sampling across strata + sampled_documents: List[Dict[str, Any]] = [] + for term_str, stratum_docs in strata_map.items(): + num_stratum_docs = len(stratum_docs) + if num_stratum_docs == 0: + continue + + # Calculate proportional sample size + proportion = num_stratum_docs / num_total_docs + num_to_sample_from_stratum = int(num_sample_docs * proportion) + + if num_to_sample_from_stratum > 0: + sampled_documents.extend(random.sample(stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs))) + + # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size' + unique_docs_by_id = {} + for doc in sampled_documents: + unique_docs_by_id[doc.get("id", "")] = doc + + final_sample_docs = list(unique_docs_by_id.values()) + + if len(final_sample_docs) > num_sample_docs: + final_sample_docs = random.sample(final_sample_docs, num_sample_docs) + elif len(final_sample_docs) < num_sample_docs: + remaining_docs = [d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id] + needed_count = min(num_sample_docs - len(final_sample_docs), len(remaining_docs)) + final_sample_docs.extend(random.sample(remaining_docs, needed_count)) + + # Format the few-shot exemplar text block + prompt_lines: List[str] = [] + for doc in final_sample_docs: + doc_id = doc.get("id", "") + title = doc.get("title", "") + text = doc.get("text", "") + + # Truncate text if it exceeds the maximum character limit + if max_chars_per_text and len(text) > max_chars_per_text: + text = text[:max_chars_per_text] + "…" + + associated_terms = doc_id_to_terms_map.get(doc_id, []) + prompt_lines.append( + f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------" + ) + + prompt_block = "\n".join(prompt_lines) + self.fewshot_terms_block = prompt_block + return prompt_block + + # --- Few-shot construction (types) --- + def build_types_fewshot_block( + self, + docs_jsonl: str, + terms2doc_json: str, + sample_per_term: int = 1, + full_word: bool = True, + case_sensitive: bool = True, + max_chars_per_text: int = 800, + ) -> str: + """ + Builds the few-shot block for Type Extraction. + This method samples documents based on finding an associated term/type within the text. 
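+        For each term, at most `sample_per_term` documents are kept, and a document qualifies
+        only if the term occurs in its title or text (as a whole word when full_word=True,
+        case-sensitively unless case_sensitive=False).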
+ """ + # Load documents into dict by ID + docs_by_id = {} + with open(docs_jsonl, "r", encoding="utf-8") as file_handle: + for line in file_handle: + line_stripped = line.strip() + if line_stripped: + try: + doc = json.loads(line_stripped) + doc_id = doc.get("id", "") + if doc_id: + docs_by_id[doc_id] = doc + except json.JSONDecodeError: + continue + + # Load term -> [doc_id,...] map + with open(terms2doc_json, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + flags = 0 if case_sensitive else re.IGNORECASE + prompt_lines: List[str] = [] + + # Iterate over terms (which act as types in this context) + for term, doc_ids in term_to_doc_map.items(): + escaped_term = re.escape(term) + # Create regex pattern for matching the term in the text + pattern = rf"\b{escaped_term}\b" if full_word else escaped_term + term_regex = re.compile(pattern, flags=flags) + + picked_count = 0 + for doc_id in doc_ids: + doc = docs_by_id.get(doc_id) + if not doc: + continue + + title = doc.get("title", "") + text = doc.get("text", "") + + # Check if the term/type is actually present in the document text/title + if term_regex.search(f"{title} {text}"): + text_content = text + + # Truncate text if necessary + if max_chars_per_text and len(text_content) > max_chars_per_text: + text_content = text_content[:max_chars_per_text] + "…" + + # Escape single quotes in the term for Python list formatting in the prompt + term_for_prompt = term.replace("'", "\\'") + + prompt_lines.append( + f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------" + ) + picked_count += 1 + + if picked_count >= sample_per_term: + break # Move to the next term + + prompt_block = "\n".join(prompt_lines) + self.fewshot_types_block = prompt_block + return prompt_block + + def fit(self, train_docs_jsonl: str, terms2doc_json: str, sample_size: int = 28, seed: int = 123) -> None: + """ + Fit phase: Builds and caches the few-shot prompt blocks from the training files. + No model training occurs (Few-Shot/In-Context Learning). + """ + # Build prompt block for Term extraction + _ = self.build_stratified_fewshot_prompt(train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed) + # Build prompt block for Type extraction + _ = self.build_types_fewshot_block(train_docs_jsonl, terms2doc_json, sample_per_term=1) + + # ------------------------- + # Inference helpers (prompt construction and output parsing) + # ------------------------- + def _build_term_prompt(self, example_block: str, title: str, text: str) -> str: + """Constructs the full prompt for Term Extraction.""" + return f"""{example_block} + [var] + Title: {title} + Text: {text} + [var] + Extract all relevant terms that could form the basis of an ontology from the above document. + Return ONLY a Python list like ['term1', 'term2', ...] and nothing else. + If no terms are found, return []. + """ + + def _build_type_prompt(self, example_block: str, title: str, text: str) -> str: + """Constructs the full prompt for Type Extraction.""" + return f"""{example_block} + [var] + Title: {title} + Text: {text} + [var] + Extract all relevant TYPES mentioned in the above document that could serve as ontology classes. + Only consider content inside the [var] ... [var] block. + Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return []. 
+ """ + + def _parse_list_like(self, raw_string: str) -> List[str]: + """Try to extract a Python list of strings from model output robustly.""" + processed_string = raw_string.strip() + if processed_string in ("[]", ""): + return [] + + # 1. Try direct evaluation + try: + parsed_value = ast.literal_eval(processed_string) + if isinstance(parsed_value, list): + # Filter to ensure only strings are returned + return [item for item in parsed_value if isinstance(item, str)] + except Exception: + pass + + # 2. Try finding and evaluating text within outermost brackets [ ... ] + bracket_match = re.search(r"\[[\s\S]*?\]", processed_string) + if bracket_match: + try: + parsed_value = ast.literal_eval(bracket_match.group(0)) + if isinstance(parsed_value, list): + return [item for item in parsed_value if isinstance(item, str)] + except Exception: + pass + + # 3. Fallback: Find comma-separated quoted substrings (less robust, but catches errors) + # Finds content inside either single quotes ('...') or double quotes ("...") + quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string) + flattened_list = [a_match or b_match for a_match, b_match in quoted_matches] + return flattened_list + + def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str: + """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output.""" + # self.model is an instance of LocalAutoLLM + model_output = self.model.generate([prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0) + return model_output[0] if model_output else "" + + def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + """ + Runs Term Extraction on the test documents and saves results to a JSONL file. + Returns: The count of individual terms written. + """ + if not self.fewshot_terms_block: + raise RuntimeError("Few-shot block for terms is empty. Call fit() first.") + + num_written_terms = 0 + with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + for line_index, line in enumerate(file_in, start=1): + if 0 < max_lines < line_index: + break + + try: + document = json.loads(line.strip()) + except Exception: + continue # Skip malformed JSON lines + + doc_id = document.get("id", "unknown") + title = document.get("title", "") + text = document.get("text", "") + + # Construct and call model + prompt = self._build_term_prompt(self.fewshot_terms_block, title, text) + raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) + predicted_terms = self._parse_list_like(raw_output) + + # Write extracted terms + for term_or_type in predicted_terms: + if isinstance(term_or_type, str) and term_or_type.strip(): + file_out.write(json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + "\n") + num_written_terms += 1 + + # Lightweight memory management for long runs + if line_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return num_written_terms + + def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + """ + Runs Type Extraction on the test documents and saves results to a JSONL file. + Returns: The count of individual types written. + """ + if not self.fewshot_types_block: + raise RuntimeError("Few-shot block for types is empty. 
Call fit() first.") + + num_written_types = 0 + with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + for line_index, line in enumerate(file_in, start=1): + if 0 < max_lines < line_index: + break + + try: + document = json.loads(line.strip()) + except Exception: + continue # Skip malformed JSON lines + + doc_id = document.get("id", "unknown") + title = document.get("title", "") + text = document.get("text", "") + + # Construct and call model using the dedicated type prompt block + prompt = self._build_type_prompt(self.fewshot_types_block, title, text) + raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) + predicted_types = self._parse_list_like(raw_output) + + # Write extracted types + for term_or_type in predicted_types: + if isinstance(term_or_type, str) and term_or_type.strip(): + file_out.write(json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + "\n") + num_written_types += 1 + + if line_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return num_written_types + + # --- Evaluation utilities (unchanged from prior definition, added docstrings) --- + def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]: + """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased.""" + gold_pairs = set() + with open(terms2doc_path, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + for term, doc_ids in term_to_doc_map.items(): + clean_term = term.strip().lower() + for doc_id in doc_ids: + gold_pairs.add((doc_id, clean_term)) + return gold_pairs + + def load_predicted_pairs(self, predicted_jsonl_path: str, key: str = "term") -> Set[Tuple[str, str]]: + """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased.""" + predicted_pairs = set() + with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle: + for line in file_handle: + try: + entry = json.loads(line.strip()) + except Exception: + continue + doc_id = entry.get("doc_id") + value = entry.get(key) + if doc_id and value: + predicted_pairs.add((doc_id, value.strip().lower())) + return predicted_pairs + + def evaluate_extraction_f1(self, terms2doc_path: str, predicted_jsonl: str, key: str = "term") -> float: + """ + Computes set-based binary Precision, Recall, and F1 score against the gold pairs. 
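+        With TP = |gold ∩ predicted| over (doc_id, value) pairs, this reduces to
+        Precision = TP / |predicted|, Recall = TP / |gold|, and
+        F1 = 2 * Precision * Recall / (Precision + Recall).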
+ """ + # Load the ground truth and predictions + gold_set = self.load_gold_pairs(terms2doc_path) + predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key) + + # Build combined universe of all pairs for score calculation + all_pairs = sorted(gold_set | predicted_set) + + # Create binary labels (1=present, 0=absent) + y_true = [1 if pair in gold_set else 0 for pair in all_pairs] + y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs] + + # Use scikit-learn for metric calculation + from sklearn.metrics import precision_recall_fscore_support + precision, recall, f1, _ = precision_recall_fscore_support( + y_true, y_pred, average="binary", zero_division=0 + ) + + # Display results + num_true_positives = len(gold_set & predicted_set) + + print("\n📊 Evaluation Results:") + print(f" ✅ Precision: {precision:.4f}") + print(f" ✅ Recall: {recall:.4f}") + print(f" ✅ F1 Score: {f1:.4f}") + print(f" 📌 Gold pairs: {len(gold_set)}") + print(f" 📌 Predicted pairs:{len(predicted_set)}") + print(f" 🎯 True Positives: {num_true_positives}") + + return float(f1) From be80e735b2de7cbc48c2c5bfcb0c34b065c537a0 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Mon, 3 Nov 2025 23:09:36 +0100 Subject: [PATCH 04/19] alexbek learner models --- .../llm_learner_alexbek_rag_term_typing.py | 50 + .../llm_learner_alexbek_rf_term_typing.py | 54 + ...er_alexbek_self_attn_taxonomy_discovery.py | 41 + examples/llm_learner_alexbek_text2onto.py | 74 ++ ontolearner/__init__.py | 10 +- ontolearner/learner/__init__.py | 3 + .../learner/taxonomy_discovery/__init__.py | 1 + .../learner/taxonomy_discovery/alexbek.py | 305 +++++ ontolearner/learner/term_typing/__init__.py | 1 + ontolearner/learner/term_typing/alexbek.py | 809 ++++++++++++ ontolearner/learner/text2onto/__init__.py | 1 + ontolearner/learner/text2onto/alexbek.py | 1084 +++++++++++++++++ 12 files changed, 2432 insertions(+), 1 deletion(-) create mode 100644 examples/llm_learner_alexbek_rag_term_typing.py create mode 100644 examples/llm_learner_alexbek_rf_term_typing.py create mode 100644 examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py create mode 100644 examples/llm_learner_alexbek_text2onto.py create mode 100644 ontolearner/learner/taxonomy_discovery/alexbek.py create mode 100644 ontolearner/learner/term_typing/alexbek.py create mode 100644 ontolearner/learner/text2onto/alexbek.py diff --git a/examples/llm_learner_alexbek_rag_term_typing.py b/examples/llm_learner_alexbek_rag_term_typing.py new file mode 100644 index 0000000..5723e36 --- /dev/null +++ b/examples/llm_learner_alexbek_rag_term_typing.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner import AlexbekRAGLearner + +# Load the GeoNames ontology. +ontology = GeoNames() +ontology.load() + +# Extract labeled items and split into train/test sets for evaluation +train_data, test_data = train_test_split(ontology.extract(), test_size=0.2, random_state=42) + +# Configure a Retrieval-Augmented Generation (RAG) term-typing classifier. 
+# - llm_model_id: generator used to predict types from the prompt + retrieved examples +# - retriever_model_id: encoder used to embed items and fetch top-k similar (RAG) examples +# - device: "cuda" for GPU or "cpu" +# - top_k: number of nearest examples to retrieve per query term +# - max_new_tokens: decoding budget of the LLM during prediction +# - output_dir: where intermediate artifacts / logs can be stored +rag_learner = AlexbekRAGLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id="sentence-transformers/all-MiniLM-L6-v2", + device="cuda", + top_k=3, + max_new_tokens=256, + output_dir="./results/", +) + +# Build the pipeline and pass raw structured objects end-to-end. +# We place the RAG learner in the llm slot and set llm_id accordingly. +pipe = LearnerPipeline( + llm=rag_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=True, +) + +# Run the full learning pipeline on the term-typing task +# - task="term-typing" (Task B) +# - evaluate=True computes precision/recall/F1 on the held-out test split +# - ontologizer_data=True must match the pipeline flag above +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=True, +) + +# Display the evaluation results and runtime +print("Metrics:", outputs.get("metrics")) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} +print("Elapsed time (s):", outputs.get("elapsed_time")) diff --git a/examples/llm_learner_alexbek_rf_term_typing.py b/examples/llm_learner_alexbek_rf_term_typing.py new file mode 100644 index 0000000..c5c7454 --- /dev/null +++ b/examples/llm_learner_alexbek_rf_term_typing.py @@ -0,0 +1,54 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner import AlexbekRFLearner # A random-forest term-typing learner over text+graph features + +# Load the GeoNames ontology and extract labeled term-typing data + +ontology = GeoNames() +ontology.load() + +data = ontology.extract() + +# Split the labeled term-typing data into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42 +) + +# Configure the RF-based learner (embeddings + optional graph features) +# - device: "cpu" or "cuda" +# - threshold: decision threshold for multi-label assignment +# - use_graph_features: include ontology-graph-derived features if available +rf_learner = AlexbekRFLearner( + device="cpu", # switch to "cuda" if you have a GPU + batch_size=16, + max_length=512, # max tokenizer length for embedding model inputs + threshold=0.30, # probability cutoff for assigning each type + use_graph_features=True # set False for pure RF on text embeddings only +) + +# Build the pipeline and pass raw structured objects end-to-end. 
+pipe = LearnerPipeline( + retriever=rf_learner, + retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory + ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} + device="cpu", + batch_size=16 +) + +# Run the full learning pipeline on the term-typing task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=True, +) + +# Display evaluation summary and runtime +print("Metrics:", outputs.get("metrics")) + +print("Elapsed time:", outputs["elapsed_time"]) + +print(ontology) diff --git a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py new file mode 100644 index 0000000..b78976f --- /dev/null +++ b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py @@ -0,0 +1,41 @@ +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner import AlexbekCrossAttnLearner +# 1) Load & split +ontology = GeoNames() +ontology.load() +data = ontology.extract() +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# 2) Configure the cross-attention learner +cross_learner = AlexbekCrossAttnLearner( + embedding_model="sentence-transformers/all-MiniLM-L6-v2", # or "Qwen/Qwen2.5-1.5B-... (if wrapped as ST)" + device="cpu", + num_heads=8, + lr=5e-5, + weight_decay=0.01, + num_epochs=1, + batch_size=256, + neg_ratio=1.0, + output_dir="./results/crossattn/", + seed=42, +) + +# 3) Build pipeline +pipeline = LearnerPipeline( + llm=cross_learner, # <- our learner + llm_id="cross-attn", # label for bookkeeping + ontologizer_data=False # pass raw ontology objects as in your example +) + +# 4) Train + predict + evaluate +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +print("Metrics:", outputs.get("metrics")) +print("Elapsed time:", outputs["elapsed_time"]) +print(outputs) diff --git a/examples/llm_learner_alexbek_text2onto.py b/examples/llm_learner_alexbek_text2onto.py new file mode 100644 index 0000000..caf4c5b --- /dev/null +++ b/examples/llm_learner_alexbek_text2onto.py @@ -0,0 +1,74 @@ +import os +import json +import torch + +# LocalAutoLLM handles model loading/generation; AlexbekFewShotLearner provides fit/predict APIs +from ontolearner.learner.text2onto.alexbek import LocalAutoLLM, AlexbekFewShotLearner + +# Local folder where the dataset is stored (relative to this script) +DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" + +# Input paths (already saved) +TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") +TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") +TEST_DOCS_FULL_PATH = os.path.join(DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl") + +# Output paths +DOC_TERMS_OUT_PATH = os.path.join(DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl") +TERMS2TYPES_OUT_PATH = os.path.join(DATA_DIR, "test", "terms2types_pred_ecology.fast.json") +TYPES2DOCS_OUT_PATH = os.path.join(DATA_DIR, "test", "types2docs_pred_ecology.fast.json") + +# Device selection +DEVICE = ( + "cuda" + if torch.cuda.is_available() + else ("mps" if torch.backends.mps.is_available() else "cpu") +) + +# Model config +MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" +LOAD_IN_4BIT = (DEVICE == "cuda") # 4-bit helps on GPU + +# 1) Load LLM +llm = LocalAutoLLM(device=DEVICE) +llm.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) + +# 
2) Build few-shot exemplars from training split +learner = AlexbekFewShotLearner(model=llm, device=DEVICE) +learner.fit( + train_docs_jsonl=TRAIN_DOCS_PATH, + terms2doc_json=TRAIN_TERMS2DOCS_PATH, + # use defaults for sample size/seed +) + +# 3) Predict terms per test document +os.makedirs(os.path.dirname(DOC_TERMS_OUT_PATH), exist_ok=True) +num_written_doc_terms = learner.predict_terms( + docs_test_jsonl=TEST_DOCS_FULL_PATH, + out_jsonl=DOC_TERMS_OUT_PATH, + # use defaults for max_new_tokens and few_shot_k +) +print(f"[terms] wrote {num_written_doc_terms} lines → {DOC_TERMS_OUT_PATH}") + +# 4) Predict types for extracted terms, using the JSONL we just wrote +typing_summary = learner.predict_types_from_terms( + doc_terms_jsonl=DOC_TERMS_OUT_PATH, # read the predictions directly + doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) + model_id=MODEL_ID, # reuse the same small model + out_terms2types=TERMS2TYPES_OUT_PATH, + out_types2docs=TYPES2DOCS_OUT_PATH, + # use defaults for everything else +) + +print(f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types") +print(f"[saved] {TERMS2TYPES_OUT_PATH}") +print(f"[saved] {TYPES2DOCS_OUT_PATH}") + +# 5) Small preview of term→types +try: + with open(TERMS2TYPES_OUT_PATH, "r", encoding="utf-8") as fin: + preview = json.load(fin)[:3] + print("[preview] first 3:") + print(json.dumps(preview, ensure_ascii=False, indent=2)) +except Exception as e: + print(f"[preview] skipped: {e}") diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 49b94c4..5ebd3f6 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -36,7 +36,11 @@ SKHNLPSequentialFTLearner, SBUNLPFewShotLearner, SBUNLPZSLearner, - SBUNLPText2OntoLearner) + SBUNLPText2OntoLearner, + AlexbekCrossAttnLearner, + AlexbekRFLearner, + AlexbekRAGLearner, + AlexbekFewShotLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -61,6 +65,10 @@ "SBUNLPFewShotLearner", "SBUNLPZSLearner", "SBUNLPText2OntoLearner", + "AlexbekCrossAttnLearner", + "AlexbekRFLearner", + "AlexbekRAGLearner", + "AlexbekFewShotLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 4f41586..71020e8 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -23,3 +23,6 @@ from .taxonomy_discovery.sbunlp import SBUNLPFewShotLearner from .term_typing.sbunlp import SBUNLPZSLearner from .text2onto import SBUNLPFewShotLearner as SBUNLPText2OntoLearner +from .taxonomy_discovery.alexbek import AlexbekCrossAttnLearner +from .term_typing.alexbek import AlexbekRFLearner, AlexbekRAGLearner +from .text2onto.alexbek import AlexbekFewShotLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py index d52513b..57a845b 100644 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -15,3 +15,4 @@ from .rwthdbis import RWTHDBISSFTLearner from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner from .sbunlp import SBUNLPFewShotLearner +from .alexbek import AlexbekCrossAttnLearner diff --git a/ontolearner/learner/taxonomy_discovery/alexbek.py b/ontolearner/learner/taxonomy_discovery/alexbek.py new file mode 100644 index 0000000..616d50f --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/alexbek.py @@ -0,0 +1,305 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple + +import math +import os +import random +import torch +import torch.nn as nn +import torch.nn.functional as F +from sentence_transformers import SentenceTransformer + +from ...base import AutoLearner + +class RMSNorm(nn.Module): + """Root Mean Square normalization with learnable scale. + + Computes: y = weight * x / sqrt(mean(x^2) + eps) + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + rms_inv = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + return self.weight * (x * rms_inv) + +class CrossAttentionHead(nn.Module): + """Minimal multi-head *pair* scorer using cross-attention-style projections. + + Given child vector c and parent vector p: + q = Wq * c, k = Wk * p + per-head score = (q_h · k_h) / sqrt(d_head) + aggregate by mean across heads, then sigmoid to get probability. + """ + + def __init__(self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6): + super().__init__() + assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads" + self.hidden_size = hidden_size + self.num_heads = num_heads + self.dim_per_head = hidden_size // num_heads + + # Linear projections for queries (child) and keys (parent) + self.query_projection = nn.Linear(hidden_size, hidden_size, bias=False) + self.key_projection = nn.Linear(hidden_size, hidden_size, bias=False) + + # Pre-projection normalization for stability + self.query_norm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.key_norm = RMSNorm(hidden_size, eps=rms_norm_eps) + + # Xavier init helps stabilize training + nn.init.xavier_uniform_(self.query_projection.weight) + nn.init.xavier_uniform_(self.key_projection.weight) + + def forward(self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor) -> torch.Tensor: + """Score (child, parent) pairs. + + Args: + child_embeddings: Tensor of shape (batch, hidden_size) + parent_embeddings: Tensor of shape (batch, hidden_size) + Returns: + Tensor of probabilities with shape (batch,) + """ + batch_size, _ = child_embeddings.shape + + # Project and normalize + queries = self.query_norm(self.query_projection(child_embeddings)) + keys = self.key_norm(self.key_projection(parent_embeddings)) + + # Reshape into heads: (batch, heads, dim_per_head) + queries = queries.view(batch_size, self.num_heads, self.dim_per_head) + keys = keys.view(batch_size, self.num_heads, self.dim_per_head) + + # Scaled dot-product similarity per head -> (batch, heads) + per_head_scores = (queries * keys).sum(-1) / math.sqrt(self.dim_per_head) + + # Aggregate across heads -> (batch,) + mean_score = per_head_scores.mean(-1) + + # Map to probability + return torch.sigmoid(mean_score) + +class AlexbekCrossAttnLearner(AutoLearner): + """Cross-Attention Taxonomy Learner (inherits AutoLearner). + + - Encodes type strings with a SentenceTransformer. 
+ - Trains a small cross-attention head to score (parent, child) edges. + - Predicts probabilities for provided pairs. + + Helper functions live in this same module (below), *not* as class methods. + """ + + def __init__( + self, + *, + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "cpu", + num_heads: int = 8, + lr: float = 5e-5, + weight_decay: float = 0.01, + num_epochs: int = 1, + batch_size: int = 256, + neg_ratio: float = 1.0, # negatives per positive + output_dir: str = "./results/", + seed: int = 42, + **kwargs: Any, + ): + """Configure the learner. + + All configuration is kept directly on the learner (no separate Config class). + """ + super().__init__(**kwargs) + + # ----- hyperparameters / settings ----- + self.embedding_model_id = embedding_model + self.requested_device = device + self.num_heads = num_heads + self.learning_rate = lr + self.weight_decay = weight_decay + self.num_epochs = num_epochs + self.batch_size = batch_size + self.negative_ratio = neg_ratio + self.output_dir = output_dir + self.seed = seed + + # Prefer requested device but gracefully fall back to CPU + if torch.cuda.is_available() or self.requested_device == "cpu": + self.device = torch.device(self.requested_device) + else: + self.device = torch.device("cpu") + + # Will be set in load() + self.embedder: Optional[SentenceTransformer] = None + self.cross_attn_head: Optional[CrossAttentionHead] = None + self.embedding_dim: Optional[int] = None + + # Cache of term -> embedding tensor (on device) + self.term_to_vector: Dict[str, torch.Tensor] = {} + + os.makedirs(self.output_dir, exist_ok=True) + random.seed(self.seed) + torch.manual_seed(self.seed) + + def load(self, **kwargs: Any): + """Load the sentence embedding model and initialize the cross-attention head.""" + model_id = kwargs.get("embedding_model", self.embedding_model_id) + self.embedder = SentenceTransformer(model_id, trust_remote_code=True, device=str(self.device)) + + # Probe output dimensionality using a dummy encode + probe_embedding = self.embedder.encode(["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False) + self.embedding_dim = int(probe_embedding.shape[-1]) + + # Initialize the cross-attention head + self.cross_attn_head = CrossAttentionHead(hidden_size=self.embedding_dim, num_heads=self.num_heads).to( + self.device + ) + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + if self.embedder is None or self.cross_attn_head is None: + self.load() + + if not test: + positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + self._ensure_term_embeddings(unique_terms) + negative_pairs = self._sample_negative_pairs( + positive_pairs, unique_terms, ratio=self.negative_ratio, seed=self.seed + ) + self._train_cross_attn_head(positive_pairs, negative_pairs) + return None + else: + candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + self._ensure_term_embeddings(unique_terms, append_only=True) + probabilities = self._score_parent_child_pairs(candidate_pairs) + + predictions = [ + {"parent": parent, "child": child, "score": float(prob), "label": int(prob >= 0.5)} + for (parent, child), prob in zip(candidate_pairs, probabilities) + ] + return predictions + + def _ensure_term_embeddings(self, terms: List[str], append_only: bool = False) -> None: + """Encode terms with the sentence embedder and store in cache. 
+ + Args: + terms: list of unique strings to embed + append_only: if True, only embed terms missing from cache + """ + if self.embedder is None: + raise RuntimeError("Call load() before building term embeddings") + + terms_to_encode = [t for t in terms if t not in self.term_to_vector] if append_only else terms + if not terms_to_encode: + return + + embeddings = self.embedder.encode( + terms_to_encode, + convert_to_tensor=True, + normalize_embeddings=False, + batch_size=256, + show_progress_bar=False, + ) + for term, embedding in zip(terms_to_encode, embeddings): + self.term_to_vector[term] = embedding.detach().to(self.device) + + def _pairs_as_tensors(self, pairs: List[Tuple[str, str]]) -> Tuple[torch.Tensor, torch.Tensor]: + """Turn list of (parent, child) strings into two aligned tensors on device.""" + # child embeddings tensor of shape (batch, dim) + child_tensor = torch.stack([self.term_to_vector[child] for (_, child) in pairs], dim=0).to(self.device) + # parent embeddings tensor of shape (batch, dim) + parent_tensor = torch.stack([self.term_to_vector[parent] for (parent, _) in pairs], dim=0).to(self.device) + return child_tensor, parent_tensor + + def _train_cross_attn_head(self, positive_pairs: List[Tuple[str, str]], negative_pairs: List[Tuple[str, str]]) -> None: + """Train the cross-attention head with BCE loss on labeled pairs.""" + if self.cross_attn_head is None: + raise RuntimeError("Head not initialized. Call load().") + + self.cross_attn_head.train() + optimizer = torch.optim.AdamW( + self.cross_attn_head.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay + ) + + # Build a simple supervised dataset: 1 for positive, 0 for negative + labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [(1, pc) for pc in positive_pairs] + [ + (0, nc) for nc in negative_pairs + ] + random.shuffle(labeled_pairs) + + def iterate_minibatches(items: List[Tuple[int, Tuple[str, str]]], batch_size: int): + for start in range(0, len(items), batch_size): + yield items[start : start + batch_size] + + for epoch in range(self.num_epochs): + epoch_loss_sum = 0.0 + for minibatch in iterate_minibatches(labeled_pairs, self.batch_size): + labels = torch.tensor([y for y, _ in minibatch], dtype=torch.float32, device=self.device) + string_pairs = [pc for _, pc in minibatch] + child_tensor, parent_tensor = self._pairs_as_tensors(string_pairs) + + probs = self.cross_attn_head(child_tensor, parent_tensor) + loss = F.binary_cross_entropy(probs, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + epoch_loss_sum += float(loss.item()) * len(minibatch) + + + def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float]: + """Compute probability scores for (parent, child) pairs.""" + if self.cross_attn_head is None: + raise RuntimeError("Head not initialized. 
Call load().") + + self.cross_attn_head.eval() + scores: List[float] = [] + with torch.no_grad(): + for start in range(0, len(pairs), self.batch_size): + chunk = pairs[start : start + self.batch_size] + child_tensor, parent_tensor = self._pairs_as_tensors(chunk) + prob = self.cross_attn_head(child_tensor, parent_tensor) + scores.extend(prob.detach().cpu().tolist()) + return scores + + def _extract_parent_child_pairs_and_terms(self, data): + parent_child_pairs = [] + unique_terms = set() + for edge in getattr(data, "type_taxonomies").taxonomies: + parent, child = str(edge.parent), str(edge.child) + parent_child_pairs.append((parent, child)) + unique_terms.add(parent) + unique_terms.add(child) + return parent_child_pairs, sorted(unique_terms) + + def _sample_negative_pairs(self, positive_pairs, terms, ratio: float = 1.0, seed: int = 42): + random.seed(seed) + term_list = list(terms) + positive_set = set(positive_pairs) + negatives = [] + target_negative_count = int(len(positive_pairs) * ratio) + while len(negatives) < target_negative_count: + parent = random.choice(term_list) + child = random.choice(term_list) + if parent == child: + continue + candidate = (parent, child) + if candidate in positive_set: + continue + negatives.append(candidate) + return negatives diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py index ebd8cd9..a42d716 100644 --- a/ontolearner/learner/term_typing/__init__.py +++ b/ontolearner/learner/term_typing/__init__.py @@ -14,3 +14,4 @@ from .rwthdbis import RWTHDBISSFTLearner from .sbunlp import SBUNLPZSLearner +from .alexbek import AlexbekRFLearner, AlexbekRAGLearner diff --git a/ontolearner/learner/term_typing/alexbek.py b/ontolearner/learner/term_typing/alexbek.py new file mode 100644 index 0000000..7aa6033 --- /dev/null +++ b/ontolearner/learner/term_typing/alexbek.py @@ -0,0 +1,809 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import json +import re +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +import networkx as nx +from tqdm import tqdm +from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.ensemble import RandomForestClassifier +from sklearn.multiclass import OneVsRestClassifier + +from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM +from sentence_transformers import SentenceTransformer + +from ...base import AutoLearner, AutoRetriever + +class AlexbekRFLearner(AutoRetriever): + """ + Embedding-based multi-label classifier for *term typing*. + + Pipeline overview: + 1) Load a Hugging Face encoder (tokenizer + model). + 2) Encode input terms into sentence embeddings. + 3) Optionally augment with simple graph (co-occurrence) features. + 4) Train a One-vs-Rest RandomForest on the concatenated features. + 5) Predict multi-label types with a probability threshold (fallback to top-1). 
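+    The feature vector for each term is its L2-normalized sentence embedding, optionally
+    concatenated with four graph features (degree, clustering coefficient, degree centrality,
+    PageRank) computed on a term co-occurrence graph built from the training rows.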
+ + API expected by LearnerPipeline: + - load(model_id) + - fit(data, task, ontologizer=True) + - predict(data, task, ontologizer=True) + - tasks_ground_truth_former(data, task) + """ + + def __init__( + self, + device: str = "cpu", + batch_size: int = 16, + max_length: int = 256, + threshold: float = 0.30, + use_graph_features: bool = True, + rf_kwargs: Optional[Dict[str, Any]] = None, + ): + # Runtime / inference settings + self.device = torch.device(device) + self.batch_size = batch_size + self.max_length = max_length + self.threshold = threshold # probability cutoff for selecting labels + self.use_graph_features = use_graph_features + + # RandomForest hyperparameters (with sensible defaults) + self.rf_kwargs = rf_kwargs or dict( + n_estimators=200, max_depth=20, class_weight="balanced", random_state=42 + ) + + # Filled during load/fit + self.model_name: Optional[str] = None + self.tokenizer: Optional[AutoTokenizer] = None + self.embedding_model: Optional[AutoModel] = None + + # Label processing / classifier / optional graph + self.label_binarizer = MultiLabelBinarizer() + self.ovr_random_forest: Optional[OneVsRestClassifier] = None + self.term_graph: Optional[nx.Graph] = None + + def load(self, model_id: str, **_: Any) -> None: + """Load a Hugging Face encoder by model id (tokenizer + base model).""" + self.model_name = model_id + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.embedding_model = AutoModel.from_pretrained(model_id) + self.embedding_model.eval().to(self.device) + + def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: + """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features).""" + if task != "term-typing": + raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + + # Normalize incoming training data into a list of dicts: {term, types, RAG} + training_rows = self._as_term_types_dicts(data) + if not training_rows: + raise ValueError("No valid training examples found (need 'term' and 'types').") + + # Split out terms and raw labels + training_terms: List[str] = [row["term"] for row in training_rows] + raw_label_lists: List[List[str]] = [row["types"] for row in training_rows] + + # Fit label binarizer to learn label space/order + self.label_binarizer.fit(raw_label_lists) + + # Encode terms to sentence embeddings + term_embeddings_train = self._encode(training_terms) + + # Optionally build a light-weight co-occurrence graph and extract features + if self.use_graph_features: + self.term_graph = self._create_term_graph(training_rows) + graph_features_train = self._extract_graph_features(self.term_graph, training_terms) + X_train = np.hstack([term_embeddings_train, graph_features_train]) + else: + self.term_graph = None + X_train = term_embeddings_train + + # Multi-label targets (multi-hot) + Y_train = self.label_binarizer.transform(raw_label_lists) + + # One-vs-Rest RandomForest (one binary RF per label) + self.ovr_random_forest = OneVsRestClassifier(RandomForestClassifier(**self.rf_kwargs)) + self.ovr_random_forest.fit(X_train, Y_train) + + + def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> List[Dict[str, Any]]: + """Predict multi-label types for input terms. + + Returns a list of dicts with keys: {id, term, types}. 
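+        A type is included when its predicted probability exceeds `self.threshold`; if no
+        type passes the threshold, the single most probable type is returned as a fallback.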
+ """ + if task != "term-typing": + raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + if self.ovr_random_forest is None or self.tokenizer is None or self.embedding_model is None: + raise RuntimeError("Call load() and fit() before predict().") + + # Normalize prediction input into parallel lists of terms and example ids + test_terms, example_ids = self._as_predict_terms_ids(data) + + # Encode terms + term_embeddings_test = self._encode(test_terms) + + # Match feature layout used during training + if self.use_graph_features and self.term_graph is not None: + graph_features_test = self._extract_graph_features(self.term_graph, test_terms) + X_test = np.hstack([term_embeddings_test, graph_features_test]) + else: + X_test = term_embeddings_test + + # Probabilities per label (shape: [n_samples, n_labels]) + probability_matrix = self.ovr_random_forest.predict_proba(X_test) + + predictions: List[Dict[str, Any]] = [] + label_names = self.label_binarizer.classes_ + threshold = float(self.threshold) + + # Select labels above threshold; fallback to argmax if none exceed it + for row_index, label_probabilities in enumerate(probability_matrix): + selected_label_indices = np.where(label_probabilities > threshold)[0] + if len(selected_label_indices) == 0: + selected_label_indices = [int(np.argmax(label_probabilities))] + + predicted_types = [label_names[label_idx] for label_idx in selected_label_indices] + + predictions.append( + { + "id": example_ids[row_index], + "term": test_terms[row_index], + "types": predicted_types, + } + ) + return predictions + + def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, Any]]: + """Normalize ground-truth into a list of {id, term, types} dicts for evaluation.""" + if task != "term-typing": + raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + return self._as_gold_id_term_types(data) + + def _encode(self, texts: List[str]) -> np.ndarray: + """Encode a list of strings into L2-normalized sentence embeddings (NumPy array). + + If no texts are provided, returns an empty array with width equal to the model hidden size. + """ + assert self.tokenizer is not None and self.embedding_model is not None, "Call load(model_id) first." 
+ + if not texts: + hidden_size = getattr(getattr(self.embedding_model, "config", None), "hidden_size", 768) + return np.zeros((0, hidden_size), dtype=np.float32) + + batch_embeddings: List[torch.Tensor] = [] + + for start_idx in tqdm(range(0, len(texts), self.batch_size), desc="Embedding"): + end_idx = start_idx + self.batch_size + batch_texts = texts[start_idx:end_idx] + + # Tokenize and move to device + tokenized_batch = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ).to(self.device) + + # Forward pass without gradients + with torch.no_grad(): + model_output = self.embedding_model(**tokenized_batch) + + # Prefer dedicated pooler if provided; otherwise pool by last valid token + if hasattr(model_output, "pooler_output") and model_output.pooler_output is not None: + sentence_embeddings = model_output.pooler_output + else: + sentence_embeddings = self._last_token_pool( + model_output.last_hidden_state, tokenized_batch["attention_mask"] + ) + + # L2-normalize embeddings for stability + sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) + + # Detach, move to CPU, collect + batch_embeddings.append(sentence_embeddings.detach().cpu()) + + # Best-effort memory cleanup (especially useful on CUDA) + del tokenized_batch, model_output, sentence_embeddings + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + # Concatenate all batches and convert to NumPy + return torch.cat(batch_embeddings, dim=0).numpy() + + def _last_token_pool(self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Select the last *non-padding* token embedding for each sequence in the batch.""" + last_valid_token_idx = attention_mask.sum(dim=1) - 1 # (batch,) + batch_row_idx = torch.arange(last_hidden_states.size(0), device=last_hidden_states.device) + return last_hidden_states[batch_row_idx, last_valid_token_idx] + + def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: + """Create a simple undirected co-occurrence graph from training rows. + + Nodes: terms (with node attribute 'types'). + Edges: between a term and each neighbor from its optional RAG list. + Edge weight = number of shared types (or 0.1 if none shared). + """ + graph = nx.Graph() + + for row in training_rows: + term = row["term"] + term_types = row.get("types", []) + graph.add_node(term, types=term_types) + + # RAG may be a list of neighbor dicts like {"term": ..., "types": [...]} + for neighbor in (row.get("RAG", []) or []): + neighbor_term = neighbor.get("term") + neighbor_types = neighbor.get("types", []) + + # Shared-type-based edge weight (weak edge if no overlap) + shared_types = set(term_types).intersection(set(neighbor_types)) + edge_weight = float(len(shared_types)) if shared_types else 0.1 + + graph.add_edge(term, neighbor_term, weight=edge_weight) + + return graph + + def _extract_graph_features(self, term_graph: nx.Graph, terms: List[str]) -> np.ndarray: + """Compute simple per-term graph features. + + For each term we compute a 4-dim vector: + [degree, clustering_coefficient, degree_centrality, pagerank_score] + Returns an array of shape [len(terms), 4]. 
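+        Terms that are not nodes in the graph receive an all-zero feature row.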
+ """ + if len(term_graph): + degree_centrality = nx.degree_centrality(term_graph) + pagerank_scores = nx.pagerank(term_graph) + else: + degree_centrality, pagerank_scores = {}, {} + + feature_rows: List[List[float]] = [] + for term in terms: + if term in term_graph: + feature_rows.append( + [ + float(term_graph.degree(term)), + float(nx.clustering(term_graph, term)), + float(degree_centrality.get(term, 0.0)), + float(pagerank_scores.get(term, 0.0)), + ] + ) + else: + feature_rows.append([0.0, 0.0, 0.0, 0.0]) + + return np.asarray(feature_rows, dtype=np.float32) + + def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: + """Normalize diverse training data formats to a list of dicts: {term, types, RAG}.""" + normalized_rows: List[Dict[str, Any]] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for item in term_typings_attr: + term_text = getattr(item, "term", None) + type_list = getattr(item, "types", None) + rag_neighbors = getattr(item, "RAG", None) + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + ) + return normalized_rows + + # Otherwise: must be a list/tuple-like container + if not isinstance(data, (list, tuple)): + raise ValueError("Training data must be a list/tuple or expose .term_typings") + + if not data: + return normalized_rows + + # Case 2: list of dicts + if isinstance(data[0], dict): + for row in data: + term_text = row.get("term") + type_list = row.get("types") + rag_neighbors = row.get("RAG") + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + ) + return normalized_rows + + # Case 3: list of tuples/lists: (term, types[, RAG]) + for item in data: + if not isinstance(item, (list, tuple)) or len(item) < 2: + continue + term_text, type_list = item[0], item[1] + rag_neighbors = item[2] if len(item) > 2 else None + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + ) + + return normalized_rows + + def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: + """Normalize prediction input into parallel lists: (terms, ids).""" + terms: List[str] = [] + example_ids: List[Any] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for idx, item in enumerate(term_typings_attr): + terms.append(str(getattr(item, "term", ""))) + example_ids.append(getattr(item, "id", getattr(item, "ID", idx))) + return terms, example_ids + + # Case 2: list/tuple container + if isinstance(data, (list, tuple)) and data: + first_element = data[0] + + # 2a) list of dicts + if isinstance(first_element, dict): + for i, row in enumerate(data): + terms.append(str(row.get("term", ""))) + example_ids.append(row.get("id", row.get("ID", i))) + return terms, example_ids + + # 2b) list of tuples/lists: (term, id[, ...]) + if isinstance(first_element, (list, tuple)): + for i, tuple_row in enumerate(data): + if not tuple_row: + continue + 
terms.append(str(tuple_row[0])) + example_ids.append(tuple_row[1] if len(tuple_row) > 1 else i) + return terms, example_ids + + # 2c) list of strings (terms only) + if isinstance(first_element, str): + terms = [str(x) for x in data] # type: ignore[arg-type] + example_ids = list(range(len(terms))) + return terms, example_ids + + raise ValueError("Unsupported predict() input format.") + + def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: + """Normalize gold labels into a list of dicts: {id, term, types}.""" + gold_rows: List[Dict[str, Any]] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for idx, item in enumerate(term_typings_attr): + gold_id = getattr(item, "id", getattr(item, "ID", idx)) + term_text = str(getattr(item, "term", "")) + type_list = getattr(item, "types", []) + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + return gold_rows + + # Case 2: list/tuple container + if isinstance(data, (list, tuple)) and data: + first_element = data[0] + + # 2a) list of dicts + if isinstance(first_element, dict): + for i, row in enumerate(data): + gold_id = row.get("id", row.get("ID", i)) + term_text = str(row.get("term", "")) + type_list = row.get("types", []) + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + return gold_rows + + # 2b) list of tuples/lists: (term, types[, id]) + if isinstance(first_element, (list, tuple)): + for i, tuple_row in enumerate(data): + if not tuple_row or len(tuple_row) < 2: + continue + term_text = str(tuple_row[0]) + type_list = tuple_row[1] + gold_id = tuple_row[2] if len(tuple_row) > 2 else i + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + return gold_rows + + raise ValueError("Unsupported ground-truth input format for tasks_ground_truth_former().") + +class AlexbekRAGLearner(AutoLearner): + """Retrieval-Augmented Term Typing learner (single task: term-typing). + + Flow: + 1) fit: collect (term -> [types]) examples, build an in-memory index + using a sentence-embedding model. + 2) predict: for each new term, retrieve top-k similar examples, compose a + structured prompt, query an instruction-tuned causal LLM, and parse types. + + Returns a list of dicts: {"term": str, "types": List[str], "id": Optional[str]}. 
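+
+    Example (illustrative sketch; `train_data` and `test_data` stand for any supported
+    term-typing payload, e.g. an object exposing `.term_typings`):
+
+        learner = AlexbekRAGLearner(llm_model_id="Qwen/Qwen2.5-0.5B-Instruct")
+        learner.load()
+        learner.fit(train_data, task="term-typing")
+        predictions = learner.predict(test_data, task="term-typing")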
+ """ + + def __init__( + self, + llm_model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "auto", # "auto" | "cuda" | "cpu" + token: str = "", # HF token if needed + top_k: int = 3, + max_new_tokens: int = 256, + gen_batch_size: int = 4, # generation batch size + enc_batch_size: int = 64, # embedding batch size + **kwargs: Any, # absorb extra pipeline-style args + ) -> None: + super().__init__() + + # Consolidated configuration for simple serialization + self.cfg: Dict[str, Any] = { + "llm_model_id": llm_model_id, + "retriever_model_id": retriever_model_id, + "device": device, + "token": token, + "top_k": int(top_k), + "max_new_tokens": int(max_new_tokens), + "gen_batch_size": int(gen_batch_size), + "enc_batch_size": int(enc_batch_size), + } + self.extra_cfg: Dict[str, Any] = dict(kwargs) + + # LLM components + self.tokenizer: Optional[AutoTokenizer] = None + self.generation_model: Optional[AutoModelForCausalLM] = None + + # Retriever components + self.embedder: Optional[SentenceTransformer] = None + self.indexed_corpus: List[str] = [] # items: " || [...]" + self.corpus_embeddings: Optional[torch.Tensor] = None + + # Training cache of (term, [types]) tuples + self.train_term_types: List[Tuple[str, List[str]]] = [] + + # Prompt templates + self._system_prompt: str = ( + "You are an expert in ontologies and semantic term classification.\n" + "Task: determine semantic types for the TERM using the EXAMPLES provided.\n" + "Rules:\n" + "1) Types must be generalizing categories from the domain ontology.\n" + "2) Be concise. Respond ONLY in JSON using double quotes.\n" + 'Format: {"term":"...", "reasoning":"<<=100 words>>", "types":["...", "..."]}\n' + ) + self._user_prompt_template: str = ( + """{examples} + + TERM: {term} + + TASK: Determine semantic types for the given term based on the domain ontology. + Remember: types are generalizing categories, not the term itself. Respond in JSON. + """ + ) + + def load( + self, + model_id: Optional[str] = None, + retriever_id: Optional[str] = None, + device: Optional[str] = None, + token: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Load the LLM and the embedding retriever. 
Overrides constructor values if provided.""" + if model_id is not None: + self.cfg["llm_model_id"] = model_id + if retriever_id is not None: + self.cfg["retriever_model_id"] = retriever_id + if device is not None: + self.cfg["device"] = device + if token is not None: + self.cfg["token"] = token + self.extra_cfg.update(kwargs) + + # Choose device & dtype for the LLM + cuda_available: bool = torch.cuda.is_available() + use_cuda: bool = cuda_available and (self.cfg["device"] != "cpu") + device_map: str = "auto" if use_cuda else "cpu" + torch_dtype = torch.bfloat16 if use_cuda else torch.float32 + + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + self.cfg["llm_model_id"], padding_side="left", token=self.cfg["token"] + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # LLM + self.generation_model = AutoModelForCausalLM.from_pretrained( + self.cfg["llm_model_id"], + device_map=device_map, + torch_dtype=torch_dtype, + token=self.cfg["token"], + ) + + # Deterministic decoding defaults + generation_cfg = self.generation_model.generation_config + generation_cfg.do_sample = False + generation_cfg.temperature = None + generation_cfg.top_p = None + generation_cfg.top_k = None + generation_cfg.num_beams = 1 + + # Retriever + self.embedder = SentenceTransformer(self.cfg["retriever_model_id"], trust_remote_code=True) + + def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: + """Prepare the retrieval index from training examples.""" + if task != "term-typing": + return super().fit(train_data, task, ontologizer) + + # Normalize incoming training data -> list[(term, [types])] + self.train_term_types = self._unpack_train(train_data) + + # Build the textual corpus to index + self.indexed_corpus = [ + f"{term} || {json.dumps(types, ensure_ascii=False)}" for term, types in self.train_term_types + ] + + # Embed the corpus if available; else fall back to zero-shot prompting + if self.indexed_corpus and self.embedder is not None: + self.corpus_embeddings = self._encode_texts(self.indexed_corpus) + else: + self.corpus_embeddings = None + + def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: + """Predict types for evaluation items; returns a list of {term, types, id?}.""" + if task != "term-typing": + return super().predict(eval_data, task, ontologizer) + + eval_terms, eval_ids = self._unpack_eval(eval_data) + if not eval_terms: + return [] + + # Use RAG if we have an indexed corpus & embeddings; otherwise zero-shot + rag_available = ( + self.corpus_embeddings is not None and self.embedder is not None and len(self.indexed_corpus) > 0 + ) + + if rag_available: + neighbor_docs_per_query = self._retrieve_batch(eval_terms, top_k=int(self.cfg["top_k"])) + else: + neighbor_docs_per_query = [[] for _ in eval_terms] + + # Compose prompts + prompts: List[str] = [] + for term, neighbor_docs in zip(eval_terms, neighbor_docs_per_query): + example_pairs = self._decode_examples(neighbor_docs) + examples_block = self._format_examples(example_pairs) + prompt_text = self._compose_prompt(examples_block, term) + prompts.append(prompt_text) + + predicted_types_lists = self._generate_and_parse(prompts) + + # Build standardized results + results: List[Dict[str, Any]] = [] + for term, example_id, predicted_types in zip(eval_terms, eval_ids, predicted_types_lists): + result_row: Dict[str, Any] = { + "term": term, + "types": sorted({t for t in predicted_types}), # unique + sorted + } + if example_id is not None: + 
result_row["id"] = example_id + results.append(result_row) + + assert all(("term" in row and "types" in row) for row in results), "predict() must return term + types" + return results + + def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: + """Extract (term, [types]) tuples from supported training payloads.""" + term_typings = getattr(data, "term_typings", None) + if term_typings is not None: + parsed_pairs: List[Tuple[str, List[str]]] = [] + for item in term_typings: + term = getattr(item, "term", None) + types = list(getattr(item, "types", []) or []) + if term and types: + parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + return parsed_pairs + + if isinstance(data, list) and data and isinstance(data[0], dict): + parsed_pairs = [] + for row in data: + term = row.get("term") + types = row.get("types") or [] + if term and isinstance(types, list) and types: + parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + return parsed_pairs + + # If only a list of strings is provided, there's nothing to index for RAG + if isinstance(data, (list, set, tuple)) and all(isinstance(x, str) for x in data): + return [] + + return [] + + def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: + """Extract (terms, ids) from supported evaluation payloads.""" + term_typings = getattr(data, "term_typings", None) + if term_typings is not None: + terms: List[str] = [] + ids: List[Optional[str]] = [] + for item in term_typings: + terms.append(getattr(item, "term", "")) + ids.append(getattr(item, "id", None)) + return terms, ids + + if isinstance(data, list) and data and isinstance(data[0], str): + return list(data), [None] * len(data) + + if isinstance(data, list) and data and isinstance(data[0], dict): + terms: List[str] = [] + ids: List[Optional[str]] = [] + for row in data: + terms.append(row.get("term", "")) + ids.append(row.get("id")) + return terms, ids + + return [], [] + + def _encode_texts(self, texts: List[str]) -> torch.Tensor: + """Encode a batch of texts with the sentence-embedding model.""" + batch_size = int(self.cfg["enc_batch_size"]) + batch_embeddings: List[torch.Tensor] = [] + + for batch_start in range(0, len(texts), batch_size): + batch_texts = texts[batch_start : batch_start + batch_size] + embeddings = self.embedder.encode(batch_texts, convert_to_tensor=True, show_progress_bar=False) + batch_embeddings.append(embeddings) + + return torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + + def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: + """Return for each query the top-k most similar corpus entries (as raw text rows).""" + if self.corpus_embeddings is None or not self.indexed_corpus: + return [[] for _ in queries] + + query_embeddings = self._encode_texts(queries) # [Q, D] + doc_embeddings = self.corpus_embeddings # [N, D] + if query_embeddings.shape[-1] != doc_embeddings.shape[-1]: + raise ValueError( + f"Embedding dim mismatch: {query_embeddings.shape[-1]} vs {doc_embeddings.shape[-1]}" + ) + + # Cosine similarity via L2-normalized dot product + q_norm = F.normalize(query_embeddings, p=2, dim=1) + d_norm = F.normalize(doc_embeddings, p=2, dim=1) + cos_sim = torch.matmul(q_norm, d_norm.T) # [Q, N] + + k = min(max(1, top_k), len(self.indexed_corpus)) + _, top_indices = torch.topk(cos_sim, k=k, dim=1) + return [[self.indexed_corpus[j] for j in row.tolist()] for row in top_indices] + + def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: + 
"""Parse raw corpus rows ('term || [types]') into (term, [types]) pairs.""" + example_pairs: List[Tuple[str, List[str]]] = [] + for raw_row in docs: + try: + term_raw, types_json = raw_row.split("||", 1) + term = term_raw.strip() + types_list = json.loads(types_json.strip()) + if isinstance(types_list, list): + example_pairs.append((term, [t for t in types_list if isinstance(t, str)])) + except Exception: + continue + return example_pairs + + def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: + """Format retrieved example pairs into a compact block for the prompt.""" + if not pairs: + return "EXAMPLES: (none provided)" + lines: List[str] = ["CLASSIFICATION EXAMPLES:"] + for idx, (term, types) in enumerate(pairs, 1): + preview_types = types[:3] # keep context small + lines.append(f"{idx}. Term: '{term}' → Types: {list(preview_types)}") + lines.append("END OF EXAMPLES.") + return "\n".join(lines) + + def _compose_prompt(self, examples_block: str, term: str) -> str: + """Compose the final prompt from system + user blocks.""" + user_block = self._user_prompt_template.format(examples=examples_block, term=term) + return f"{self._system_prompt}\n\n{user_block}\n" + + def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: + """Run generation for a batch of prompts and parse the JSON 'types' from outputs.""" + batch_size = int(self.cfg["gen_batch_size"]) + all_predicted_types: List[List[str]] = [] + + for batch_start in range(0, len(prompts), batch_size): + prompt_batch = prompts[batch_start : batch_start + batch_size] + + # Tokenize and move to the LLM's device + model_device = getattr(self.generation_model, "device", None) + encodings = self.tokenizer(prompt_batch, return_tensors="pt", padding=True).to(model_device) + input_token_length = encodings["input_ids"].shape[1] + + # Deterministic decoding (greedy) + with torch.no_grad(): + generated_tokens = self.generation_model.generate( + **encodings, + do_sample=False, + num_beams=1, + temperature=None, + top_p=None, + top_k=None, + max_new_tokens=int(self.cfg["max_new_tokens"]), + pad_token_id=self.tokenizer.eos_token_id, + ) + + # Slice off the prompt tokens and decode only newly generated tokens + new_token_span = generated_tokens[:, input_token_length:] + decoded_texts = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in new_token_span] + + parsed_types_per_prompt = [self._parse_types(text) for text in decoded_texts] + all_predicted_types.extend(parsed_types_per_prompt) + + return all_predicted_types + + def _parse_types(self, text: str) -> List[str]: + """Extract a list of type strings from LLM output. + + Attempts (in order): + 1) Strict JSON object with "types". + 2) Regex-extract JSON object containing "types". + 3) Regex-extract first bracketed list. + 4) Comma-split fallback. 
+ """ + try: + obj = json.loads(text) + if isinstance(obj, dict) and isinstance(obj.get("types"), list): + return [t for t in obj["types"] if isinstance(t, str)] + except Exception: + pass + + try: + obj_match = re.search(r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S) + if obj_match: + obj = json.loads(obj_match.group(0)) + types = obj.get("types", []) + return [t for t in types if isinstance(t, str)] + except Exception: + pass + + try: + list_match = re.search(r'\[([^\]]+)\]', text) + if list_match: + items = [x.strip().strip('"').strip("'") for x in list_match.group(1).split(",")] + return [t for t in items if t] + except Exception: + pass + + if "," in text: + items = [x.strip().strip('"').strip("'") for x in text.split(",")] + return [t for t in items if t] + + return [] diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py index 30e8372..6408881 100644 --- a/ontolearner/learner/text2onto/__init__.py +++ b/ontolearner/learner/text2onto/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .sbunlp import SBUNLPFewShotLearner +from .alexbek import AlexbekFewShotLearner diff --git a/ontolearner/learner/text2onto/alexbek.py b/ontolearner/learner/text2onto/alexbek.py new file mode 100644 index 0000000..5760dca --- /dev/null +++ b/ontolearner/learner/text2onto/alexbek.py @@ -0,0 +1,1084 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple, Iterable +import json +from json.decoder import JSONDecodeError +import os +import random +import re + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +from ...base import AutoLearner, AutoLLM + +try: + from outlines.models import Transformers as OutlinesTFModel + from outlines.generate import json as outlines_generate_json + from pydantic import BaseModel + + class _PredictedTypesSchema(BaseModel): + """Schema used when generating structured JSON { "types": [...] }.""" + types: List[str] + + OUTLINES_AVAILABLE: bool = True +except Exception: + # If outlines is unavailable, we will fall back to greedy decoding + regex parsing. + OUTLINES_AVAILABLE = False + _PredictedTypesSchema = None + OutlinesTFModel = None + outlines_generate_json = None + +class LocalAutoLLM(AutoLLM): + """ + Minimal local LLM helper. + + - Inherits AutoLLM but overrides load/generate to avoid label_mapper. + - Optional 4-bit loading with `load_in_4bit=True` in .load(). + - Greedy decoding by default (deterministic). + """ + + def __init__(self, device: str = "cpu", token: str = "") -> None: + """ + Initialize the local LLM holder. + + Parameters + ---------- + device : str + Execution device: "cpu" or "cuda". + token : str + Optional auth token for private model hubs. 
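+
+        Example (illustrative):
+            llm = LocalAutoLLM(device="cuda")
+            llm.load("Qwen/Qwen2.5-0.5B-Instruct", load_in_4bit=True)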
+ """ + super().__init__(label_mapper=None, device=device, token=token) + self.model: Optional[AutoModelForCausalLM] = None + self.tokenizer: Optional[AutoTokenizer] = None + + def load(self, model_id: str, *, load_in_4bit: bool = False) -> None: + """ + Load a Hugging Face causal model + tokenizer and set deterministic + generation defaults. + + Parameters + ---------- + model_id : str + Model identifier resolvable by HF `from_pretrained`. + load_in_4bit : bool + If True and bitsandbytes is available, load using 4-bit quantization. + """ + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, padding_side="left", token=self.token + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Model (optionally quantized) + if load_in_4bit: + from transformers import BitsAndBytesConfig + + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + quantization_config=quantization_config, + token=self.token, + ) + else: + device_map = "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + token=self.token, + ) + + # Deterministic generation defaults + generation_cfg = self.model.generation_config + generation_cfg.do_sample = False + generation_cfg.temperature = None + generation_cfg.top_k = None + generation_cfg.top_p = None + generation_cfg.num_beams = 1 + + def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: + """ + Greedy-generate continuations for a list of prompts. + + Parameters + ---------- + prompts : List[str] + Prompts to generate for (batched). + max_new_tokens : int + Maximum number of new tokens per continuation. + + Returns + ------- + List[str] + Decoded new-token texts (no special tokens, stripped). + """ + if self.model is None or self.tokenizer is None: + raise RuntimeError("Call .load(model_id) on LocalAutoLLM before generate().") + + tokenized_batch = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True) + input_seq_len = tokenized_batch["input_ids"].shape[1] + tokenized_batch = {k: v.to(self.model.device) for k, v in tokenized_batch.items()} + + with torch.no_grad(): + outputs = self.model.generate( + **tokenized_batch, + max_new_tokens=max_new_tokens, + pad_token_id=self.tokenizer.eos_token_id, + do_sample=False, + num_beams=1, + ) + + # Only return the newly generated part for each row in the batch + continuation_token_ids = outputs[:, input_seq_len:] + return [self.tokenizer.decode(row, skip_special_tokens=True).strip() for row in continuation_token_ids] + +class AlexbekFewShotLearner(AutoLearner): + """ + Text2Onto learner for LLMS4OL Task A (term & type extraction). + + Public API (A1 + convenience): + - fit(train_docs_jsonl, terms2doc_json, sample_size=24, seed=42) + - predict_terms(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int + - predict_types(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int + - evaluate_extraction_f1(gold_item2docs_json, preds_jsonl, key="term"|"type") -> float + + Option A (A2, term→types) bridge: + - predict_types_from_terms_option_a(...) 
+ Reads your A1 results (docs→terms), predicts types for each term, and + writes two files: terms2types_pred.json + types2docs_pred.json + """ + def __init__(self, model: LocalAutoLLM, device: str = "cpu", **_: Any) -> None: + """ + Initialize learner state and canned prompts. + + Parameters + ---------- + model : LocalAutoLLM + Loaded local LLM helper instance. + device : str + Device name ("cpu" or "cuda"). + """ + super().__init__(**_) + self.model = model + self.device = device + + # Few-shot exemplars for A1 (Docs→Terms) and for Docs→Types: + # Each exemplar is a tuple: (title, text, gold_list) + self._fewshot_terms_docs: List[Tuple[str, str, List[str]]] = [] + self._fewshot_types_docs: List[Tuple[str, str, List[str]]] = [] + + # System prompts + self._system_prompt_terms = ( + "You are an expert in ontology term extraction.\n" + "Extract only terms that explicitly appear in the document.\n" + 'Answer strictly as JSON: {"terms": ["..."]}\n' + ) + self._system_prompt_types = ( + "You are an expert in ontology type classification.\n" + "List ontology *types* that characterize the document’s terminology.\n" + 'Answer strictly as JSON: {"types": ["..."]}\n' + ) + + # Compiled regex for robust JSON extraction from LLM outputs + self._json_object_regex = re.compile(r"\{[^{}]*\}", re.S) + self._json_array_regex = re.compile(r"\[[^\]]*\]", re.S) + + # Term→Types (Option A) specific prompt + self._system_prompt_term_to_types = ( + "You are an expert in ontology and semantic type classification.\n" + "Given a term, predict its semantic types from the domain-specific ontology.\n" + 'Answer strictly as JSON:\n{"types": ["type1", "type2", "..."]}' + ) + + def fit( + self, + *, + train_docs_jsonl: str, + terms2doc_json: str, + sample_size: int = 24, + seed: int = 42, + ) -> None: + """ + Build internal few-shot exemplars from a labeled training split. + + Parameters + ---------- + train_docs_jsonl : str + Path to JSONL (or tolerant JSON/JSONL) with train documents. + terms2doc_json : str + JSON mapping item -> [doc_id,...]; "item" can be a term or type. + sample_size : int + Number of exemplar documents to keep for few-shot prompting. + seed : int + RNG seed for reproducible sampling. 
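+
+        Example (illustrative; the paths are placeholders):
+            learner.fit(
+                train_docs_jsonl="train/documents.jsonl",
+                terms2doc_json="train/terms2docs.json",
+                sample_size=24,
+                seed=42,
+            )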
+ """ + rng = random.Random(seed) + + # Load documents and map doc_id -> row + document_map = self._load_documents_jsonl(train_docs_jsonl) + if not document_map: + raise FileNotFoundError(f"No documents found in: {train_docs_jsonl}") + + # Load item -> [doc_ids] + item_to_docs_map = self._load_json(terms2doc_json) + if not isinstance(item_to_docs_map, dict): + raise ValueError(f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]") + + # Reverse mapping: doc_id -> [items] + doc_id_to_items_map: Dict[str, List[str]] = {} + for item_label, doc_id_list in item_to_docs_map.items(): + for doc_id in doc_id_list: + doc_id_to_items_map.setdefault(doc_id, []).append(item_label) + + # Build candidate exemplars (title, text, gold_list) + exemplar_candidates: List[Tuple[str, str, List[str]]] = [] + for doc_id, labeled_items in doc_id_to_items_map.items(): + doc_row = document_map.get(doc_id) + if not doc_row: + continue + doc_title = str(doc_row.get("title", "")) # be defensive (may be None) + doc_text = self._to_text(doc_row.get("text", "")) # string-ify list if needed + if not doc_text: + continue + gold_items = self._unique_preserve([s for s in labeled_items if isinstance(s, str)]) + if gold_items: + exemplar_candidates.append((doc_title, doc_text, gold_items)) + + if not exemplar_candidates: + raise RuntimeError("No candidate docs with items found to build few-shot exemplars.") + + chosen_exemplars = rng.sample(exemplar_candidates, k=min(sample_size, len(exemplar_candidates))) + # Reuse exemplars for both docs→terms and docs→types prompting + self._fewshot_terms_docs = chosen_exemplars + self._fewshot_types_docs = chosen_exemplars + + def predict_terms( + self, + *, + docs_test_jsonl: str, + out_jsonl: str, + max_new_tokens: int = 128, + few_shot_k: int = 6, + ) -> int: + """ + Extract terms that explicitly appear in each document. + + Writes one JSON object per line: + {"id": "", "terms": ["...", "...", ...]} + + Parameters + ---------- + docs_test_jsonl : str + Path to test/dev documents in JSONL or tolerant JSON/JSONL. + out_jsonl : str + Output JSONL path where predictions are written (one line per doc). + max_new_tokens : int + Max generation length. + few_shot_k : int + Number of few-shot exemplars to prepend per prompt. + + Returns + ------- + int + Number of lines written (i.e., number of processed documents). 
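+
+        Example (illustrative; the paths are placeholders):
+            n_docs = learner.predict_terms(
+                docs_test_jsonl="test/documents.jsonl",
+                out_jsonl="test/extracted_terms.jsonl",
+                few_shot_k=6,
+            )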
+ """ + if self.model is None or self.model.model is None: + raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") + + test_documents = self._load_documents_jsonl(docs_test_jsonl) + prompts: List[str] = [] + document_order: List[str] = [] + + for document_id, document_row in test_documents.items(): + title = str(document_row.get("title", "")) + text = self._to_text(document_row.get("text", "")) + + fewshot_block = self._format_fewshot_block( + self._system_prompt_terms, self._fewshot_terms_docs, key="terms", k=few_shot_k + ) + user_block = self._format_user_block(title, text) + + prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") + document_order.append(document_id) + + generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) + parsed_term_lists = [self._parse_json_list(generated, key="terms") for generated in generations] + + os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) + lines_written = 0 + with open(out_jsonl, "w", encoding="utf-8") as fp_out: + for document_id, term_list in zip(document_order, parsed_term_lists): + payload = {"id": document_id, "terms": self._unique_preserve(term_list)} + fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") + lines_written += 1 + return lines_written + + + def predict_types( + self, + *, + docs_test_jsonl: str, + out_jsonl: str, + max_new_tokens: int = 128, + few_shot_k: int = 6, + ) -> int: + """ + Predict ontology types that characterize each document’s terminology. + + Writes one JSON object per line: + {"id": "", "types": ["...", "...", ...]} + + Parameters + ---------- + docs_test_jsonl : str + Path to test/dev documents in JSONL or tolerant JSON/JSONL. + out_jsonl : str + Output JSONL path where predictions are written (one line per doc). + max_new_tokens : int + Max generation length. + few_shot_k : int + Number of few-shot exemplars to prepend per prompt. + + Returns + ------- + int + Number of lines written (i.e., number of processed documents). + """ + if self.model is None or self.model.model is None: + raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") + + test_documents = self._load_documents_jsonl(docs_test_jsonl) + prompts: List[str] = [] + document_order: List[str] = [] + + for document_id, document_row in test_documents.items(): + title = str(document_row.get("title", "")) + text = self._to_text(document_row.get("text", "")) + + fewshot_block = self._format_fewshot_block( + self._system_prompt_types, self._fewshot_types_docs, key="types", k=few_shot_k + ) + user_block = self._format_user_block(title, text) + + prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") + document_order.append(document_id) + + generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) + parsed_type_lists = [self._parse_json_list(generated, key="types") for generated in generations] + + os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) + lines_written = 0 + with open(out_jsonl, "w", encoding="utf-8") as fp_out: + for document_id, type_list in zip(document_order, parsed_type_lists): + payload = {"id": document_id, "types": self._unique_preserve(type_list)} + fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") + lines_written += 1 + return lines_written + + def evaluate_extraction_f1( + self, + gold_item2docs_json: str, + preds_jsonl: str, + *, + key: str = "term", + ) -> float: + """ + Compute micro-F1 over (doc_id, item) pairs. + + Parameters + ---------- + gold_item2docs_json : str + JSON mapping item -> [doc_ids]. 
+ preds_jsonl : str + JSONL lines like {"id": "...", "terms":[...]} or {"id":"...","types":[...]}. + key : str + "term" or "type" depending on what you are evaluating. + + Returns + ------- + float + Micro-averaged F1 score. + """ + item_to_doc_ids: Dict[str, List[str]] = self._load_json(gold_item2docs_json) + + # Build gold: doc_id -> set(items) + gold_doc_to_items: Dict[str, set] = {} + for item_label, doc_id_list in item_to_doc_ids.items(): + for document_id in doc_id_list: + gold_doc_to_items.setdefault(document_id, set()).add(self._norm(item_label)) + + # Build predictions: doc_id -> set(items) + pred_doc_to_items: Dict[str, set] = {} + with open(preds_jsonl, "r", encoding="utf-8") as fp_in: + for line in fp_in: + row = json.loads(line.strip()) + document_id = str(row.get("id", "")) + items_list = row.get("terms" if key == "term" else "types", []) + pred_doc_to_items[document_id] = {self._norm(x) for x in items_list if isinstance(x, str)} + + # Micro counts + true_positive = false_positive = false_negative = 0 + all_document_ids = set(gold_doc_to_items.keys()) | set(pred_doc_to_items.keys()) + for document_id in all_document_ids: + gold_set = gold_doc_to_items.get(document_id, set()) + pred_set = pred_doc_to_items.get(document_id, set()) + true_positive += len(gold_set & pred_set) + false_positive += len(pred_set - gold_set) + false_negative += len(gold_set - pred_set) + + precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0 + recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0 + return f1 + + def predict_types_from_terms( + self, + *, + doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl + doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list + few_shot_jsonl: Optional[str] = None, # JSONL lines: {"term":"...", "types":[...]} + rag_terms_json: Optional[str] = None, # JSON list; items may contain "term" and "RAG":[...] + random_few_shot: Optional[int] = 3, + model_id: str = "Qwen/Qwen2.5-1.5B-Instruct", + use_structured_output: bool = True, + seed: int = 42, + out_terms2types: str = "terms2types_pred.json", + out_types2docs: str = "types2docs_pred.json", + ) -> Dict[str, Any]: + """ + Predict types for each unique term extracted per document and derive a types→docs map. + + Parameters + ---------- + doc_terms_jsonl : Optional[str] + Path to JSONL with lines like {"id": "...", "terms": [...]} or a JSON with {"results":[...]}. + doc_terms_list : Optional[List[Dict]] + In-memory results like [{"id":"...","extracted_terms":[...]}] or {"id":"...","terms":[...]}. + few_shot_jsonl : Optional[str] + Global few-shot exemplars: one JSON object per line with {"term": "...", "types":[...]}. + rag_terms_json : Optional[str] + Optional per-term RAG exemplars: a JSON list of {"term": "...", "RAG":[{"term": "...", "types":[...]}]}. + random_few_shot : Optional[int] + If provided, randomly select up to this many few-shot examples for each prediction. + model_id : str + HF model id used specifically for term→types predictions. + use_structured_output : bool + If True and outlines is available, enforce structured {"types":[...]} output. + seed : int + Random seed for reproducibility. + out_terms2types : str + Output JSON path for list of {"term": "...", "predicted_types":[...]}. + out_types2docs : str + Output JSON path for dict {"TYPE":[doc_ids,...], ...}. 
+ + Returns + ------- + Dict[str, Any] + Summary with predictions and counts. + """ + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + # Load normalized document→terms results + doc_term_extractions = self._load_doc_term_extractions( + results_json_path=doc_terms_jsonl, + in_memory_results=doc_terms_list, + ) + if not doc_term_extractions: + raise ValueError("No document→terms results provided (doc_terms_jsonl/doc_terms_list).") + + # Prepare unique term list and term→doc occurrences + unique_terms = self._collect_unique_terms_from_extractions(doc_term_extractions) + term_to_doc_ids_map = self._build_term_to_doc_ids(doc_term_extractions) + + # Load optional global few-shot examples + global_few_shot_examples: List[Dict] = [] + if few_shot_jsonl and os.path.exists(few_shot_jsonl): + with open(few_shot_jsonl, "r", encoding="utf-8") as few_shot_file: + for raw_line in few_shot_file: + raw_line = raw_line.strip() + if not raw_line: + continue + try: + json_obj = json.loads(raw_line) + except Exception: + continue + if isinstance(json_obj, dict) and "term" in json_obj and "types" in json_obj: + global_few_shot_examples.append(json_obj) + + # Optional per-term RAG examples: {normalized_term -> [examples]} + rag_examples_lookup: Dict[str, List[Dict]] = {} + if rag_terms_json and os.path.exists(rag_terms_json): + try: + rag_payload = self._load_json(rag_terms_json) + if isinstance(rag_payload, list): + for rag_item in rag_payload: + if isinstance(rag_item, dict): + normalized_term = self._normalize_term(rag_item.get("term", "")) + rag_examples_lookup[normalized_term] = rag_item.get("RAG", []) + except Exception: + pass + + # Load a small chat LLM dedicated to Term→Types + typing_model, typing_tokenizer = self._load_llm_for_types(model_id) + + # Predict types per term + term_to_predicted_types_list: List[Dict] = [] + for term_text in unique_terms: + normalized_term = self._normalize_term(term_text) + + # Prefer per-term RAG for this term, else use global few-shot + few_shot_examples_for_term = rag_examples_lookup.get(normalized_term, None) or global_few_shot_examples + + # Build conversation and prompt + conversation_messages = self._build_conv_for_type_infer( + term=term_text, + few_shot_examples=few_shot_examples_for_term, + random_k=random_few_shot, + ) + typing_prompt_string = self._apply_chat_template_safe_types(typing_tokenizer, conversation_messages) + + predicted_types: List[str] = [] + raw_generation_text: str = "" + + # Structured JSON path (if requested and available) + if use_structured_output and OUTLINES_AVAILABLE and _PredictedTypesSchema is not None: + try: + outlines_model = OutlinesTFModel(typing_model, typing_tokenizer) # type: ignore + generator = outlines_generate_json(outlines_model, _PredictedTypesSchema) # type: ignore + structured = generator(typing_prompt_string, max_tokens=512) + predicted_types = [label for label in structured.types if isinstance(label, str)] + raw_generation_text = json.dumps({"types": predicted_types}, ensure_ascii=False) + except Exception: + # Fall back to greedy decoding + use_structured_output = False + + # Greedy decode fallback + if not use_structured_output or not OUTLINES_AVAILABLE or _PredictedTypesSchema is None: + tokenized_prompt = typing_tokenizer(typing_prompt_string, return_tensors="pt", truncation=True, max_length=2048) + if torch.cuda.is_available(): + tokenized_prompt = {name: tensor.cuda() for name, tensor in tokenized_prompt.items()} + with torch.no_grad(): + output_ids = 
typing_model.generate( + **tokenized_prompt, + max_new_tokens=256, + do_sample=False, + num_beams=1, + pad_token_id=typing_tokenizer.eos_token_id, + ) + new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1]:] + raw_generation_text = typing_tokenizer.decode(new_token_span, skip_special_tokens=True) + predicted_types = self._extract_types_from_text(raw_generation_text) + + term_to_predicted_types_list.append({ + "term": term_text, + "predicted_types": sorted(set(predicted_types)), + }) + + # 7) Build types→docs from (term→types) and (term→docs) + types_to_doc_id_set: Dict[str, set] = {} + for term_prediction in term_to_predicted_types_list: + normalized_term = self._normalize_term(term_prediction["term"]) + doc_ids_for_term = term_to_doc_ids_map.get(normalized_term, []) + for type_label in term_prediction.get("predicted_types", []): + types_to_doc_id_set.setdefault(type_label, set()).update(doc_ids_for_term) + + types_to_doc_ids: Dict[str, List[str]] = { + type_label: sorted(doc_id_set) for type_label, doc_id_set in types_to_doc_id_set.items() + } + + # 8) Save outputs + os.makedirs(os.path.dirname(out_terms2types) or ".", exist_ok=True) + with open(out_terms2types, "w", encoding="utf-8") as fp_terms2types: + json.dump(term_to_predicted_types_list, fp_terms2types, ensure_ascii=False, indent=2) + + os.makedirs(os.path.dirname(out_types2docs) or ".", exist_ok=True) + with open(out_types2docs, "w", encoding="utf-8") as fp_types2docs: + json.dump(types_to_doc_ids, fp_types2docs, ensure_ascii=False, indent=2) + + # Cleanup VRAM if any + del typing_model, typing_tokenizer + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return { + "terms2types_pred": term_to_predicted_types_list, + "types2docs_pred": types_to_doc_ids, + "unique_terms": len(unique_terms), + "types_count": len(types_to_doc_ids), + } + + def _load_json(self, path: str) -> Dict[str, Any]: + """Load a JSON file from disk and return its parsed object.""" + with open(path, "r", encoding="utf-8") as file_obj: + return json.load(file_obj) + + + def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: + """ + Iterate over *all* JSON objects found inside a string. + + Supports cases where multiple JSON objects are concatenated back-to-back + in a single line. It skips stray commas/whitespace between objects. + + Parameters + ---------- + blob : str + A string that may contain one or more JSON objects. + + Yields + ------ + Dict[str, Any] + Each parsed JSON object. + """ + json_decoder = json.JSONDecoder() + cursor_index, text_length = 0, len(blob) + while cursor_index < text_length: + # Skip whitespace/commas between objects + while cursor_index < text_length and blob[cursor_index] in " \t\r\n,": + cursor_index += 1 + if cursor_index >= text_length: + break + try: + json_obj, end_index = json_decoder.raw_decode(blob, idx=cursor_index) + except JSONDecodeError: + # Can't decode from this position; stop scanning this chunk + break + yield json_obj + cursor_index = end_index + + + def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: + """ + Robust reader that supports: + • True JSONL (one object per line) + • Lines with multiple concatenated JSON objects + • Whole file as a JSON array + + Returns + ------- + Dict[str, Dict[str, Any]] + Mapping doc_id -> full document row. 
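+
+        Example (illustrative): a JSONL line such as
+            {"id": "doc_1", "title": "Wetlands", "text": "..."}
+        produces the entry "doc_1" -> {"id": "doc_1", "title": "Wetlands", "text": "..."}
+        in the returned mapping.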
+ """ + documents_by_id: Dict[str, Dict[str, Any]] = {} + + with open(path, "r", encoding="utf-8") as file_obj: + content = file_obj.read().strip() + + # Case A: whole-file JSON array + if content.startswith("["): + try: + json_array = json.loads(content) + if isinstance(json_array, list): + for record in json_array: + if not isinstance(record, dict): + continue + document_id = str( + record.get("id") + or record.get("doc_id") + or (record.get("doc") or {}).get("id") + or "" + ) + if document_id: + documents_by_id[document_id] = record + return documents_by_id + except Exception: + # Fall back to line-wise handling if array parsing fails + pass + + # Case B: treat as JSONL-ish; parse *all* objects per line + for raw_line in content.splitlines(): + line = raw_line.strip() + if not line: + continue + for record in self._iter_json_objects(line): + if not isinstance(record, dict): + continue + document_id = str( + record.get("id") + or record.get("doc_id") + or (record.get("doc") or {}).get("id") + or "" + ) + if document_id: + documents_by_id[document_id] = record + + return documents_by_id + + + def _to_text(self, text_field: Any) -> str: + """ + Convert a 'text' field into a single string (handles list-of-strings). + + Parameters + ---------- + text_field : Any + The value found under "text" in the dataset row. + + Returns + ------- + str + A single-string representation of the text. + """ + if isinstance(text_field, str): + return text_field + if isinstance(text_field, list): + return " ".join(str(part) for part in text_field) + return str(text_field) if text_field is not None else "" + + + def _unique_preserve(self, values: List[str]) -> List[str]: + """ + Deduplicate values while preserving the original order. + + Parameters + ---------- + values : List[str] + Sequence possibly containing duplicates. + + Returns + ------- + List[str] + Sequence without duplicates, order preserved. + """ + seen_values: set = set() + ordered_values: List[str] = [] + for candidate in values: + if candidate not in seen_values: + seen_values.add(candidate) + ordered_values.append(candidate) + return ordered_values + + + def _norm(self, text: str) -> str: + """ + Lowercased, single-spaced normalization (for comparisons). + + Parameters + ---------- + text : str + Input string. + + Returns + ------- + str + Normalized string. + """ + return " ".join(text.lower().split()) + + + def _normalize_term(self, term: str) -> str: + """ + Normalization tailored for term keys / lookups. + + Parameters + ---------- + term : str + Term to normalize. + + Returns + ------- + str + Lowercased, trimmed and single-spaced term. + """ + return " ".join(str(term).strip().split()).lower() + + + def _format_fewshot_block( + self, + system_prompt: str, + fewshot_examples: List[Tuple[str, str, List[str]]], + *, + key: str, + k: int = 6, + ) -> str: + """ + Render a few-shot block like: + + + + ### Example + User: + Title: ... + + Assistant: + {"terms": [...]} or {"types": [...]} + + Parameters + ---------- + system_prompt : str + Instructional system text to prepend. + fewshot_examples : List[Tuple[str, str, List[str]]] + Examples as (title, text, labels_list). + key : str + Either "terms" or "types" depending on the task. + k : int + Number of examples to include. + + Returns + ------- + str + Formatted few-shot block text. 
+ """ + lines: List[str] = [system_prompt.strip(), ""] + for example_title, example_text, gold_list in fewshot_examples[:k]: + lines.append("### Example") + lines.append(f"User:\nTitle: {example_title}\n{example_text}") + lines.append(f'Assistant:\n{{"{key}": ' + json.dumps(gold_list, ensure_ascii=False) + "}") + return "\n".join(lines) + + + def _format_user_block(self, title: str, text: str) -> str: + """ + Format the 'Task' block for the current document. + + Parameters + ---------- + title : str + Document title. + text : str + Document text (single string). + + Returns + ------- + str + Formatted user block. + """ + return f"### Task\nUser:\nTitle: {title}\n{text}" + + + def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: + """ + Extract a list from model output, trying: + 1) JSON object with the key ({"terms":[...]} or {"types":[...]}). + 2) Any top-level JSON array. + 3) Fallback: comma-split. + + Parameters + ---------- + generated_text : str + Raw generation text to parse. + key : str + "terms" or "types". + + Returns + ------- + List[str] + Parsed strings (best-effort). + """ + # 1) Try a JSON object and read key + try: + object_match = self._json_object_regex.search(generated_text) + if object_match: + json_obj = json.loads(object_match.group(0)) + json_array = json_obj.get(key) + if isinstance(json_array, list): + return [value for value in json_array if isinstance(value, str)] + except Exception: + pass + + # 2) Any JSON array + try: + array_match = self._json_array_regex.search(generated_text) + if array_match: + json_array = json.loads(array_match.group(0)) + if isinstance(json_array, list): + return [value for value in json_array if isinstance(value, str)] + except Exception: + pass + + # 3) Fallback: comma-split (last resort) + if "," in generated_text: + return [part.strip().strip('"').strip("'") for part in generated_text.split(",") if part.strip()] + return [] + + + def _apply_chat_template_safe_types(self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]]) -> str: + """ + Safely build a prompt string for chat models. Uses the model's chat template + when available; otherwise falls back to a simple concatenation. + """ + try: + return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + except Exception: + system_text = next((m["content"] for m in messages if m.get("role") == "system"), "") + last_user_text = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + return f"{system_text}\n\nUser:\n{last_user_text}\n\nAssistant:" + + + def _build_conv_for_type_infer( + self, + term: str, + few_shot_examples: Optional[List[Dict]] = None, + random_k: Optional[int] = None, + ) -> List[Dict[str, str]]: + """ + Create a chat-style conversation for a single term→types query, + optionally prepending few-shot examples. 
+ """ + messages: List[Dict[str, str]] = [{"role": "system", "content": self._system_prompt_term_to_types}] + examples = list(few_shot_examples or []) + if random_k and len(examples) > random_k: + import random as _rnd + examples = _rnd.sample(examples, random_k) + for exemplar in examples: + example_term = exemplar.get("term", "") + example_types = exemplar.get("types", []) + messages.append({"role": "user", "content": f"Term: {example_term}"}) + messages.append({"role": "assistant", "content": json.dumps({"types": example_types}, ensure_ascii=False)}) + messages.append({"role": "user", "content": f"Term: {term}"}) + return messages + + + def _extract_types_from_text(self, generated_text: str) -> List[str]: + """ + Parse {"types":[...]} from a free-form generation. + """ + try: + object_match = re.search(r'\{[^}]*"types"[^}]*\}', generated_text) + if object_match: + json_obj = json.loads(object_match.group(0)) + types_array = json_obj.get("types", []) + return [type_label for type_label in types_array if isinstance(type_label, str)] + except Exception: + pass + return [] + + + def _load_llm_for_types(self, model_id: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + """ + Load a *separate* small chat model for Term→Types (keeps LocalAutoLLM untouched). + """ + tokenizer = AutoTokenizer.from_pretrained(model_id) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + device_map="auto" if torch.cuda.is_available() else None, + ) + return model, tokenizer + + + def _load_doc_term_extractions( + self, + *, + results_json_path: Optional[str] = None, + in_memory_results: Optional[List[Dict]] = None, + ) -> List[Dict]: + """ + Normalize document→terms outputs to a list of: + {"id": "", "extracted_terms": ["...", ...]} + + Accepts either: + - in_memory_results (list of dicts) + - results_json_path pointing to: + • a JSONL file with lines: {"id": "...", "terms": [...]} + • OR a JSON file with {"results":[{"id":..., "extracted_terms": [...]}, ...]} + • OR a JSON list of dicts + """ + normalized_records: List[Dict] = [] + + def _coerce_to_record(source_row: Dict) -> Optional[Dict]: + document_id = str(source_row.get("id", "")) or str(source_row.get("doc_id", "")) + if not document_id: + return None + terms = source_row.get("extracted_terms") + if terms is None: + terms = source_row.get("terms") + if terms is None and "payload" in source_row and isinstance(source_row["payload"], dict): + terms = source_row["payload"].get("terms") + if not isinstance(terms, list): + terms = [] + return {"id": document_id, "extracted_terms": [t for t in terms if isinstance(t, str)]} + + if in_memory_results is not None: + for source_row in in_memory_results: + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + return normalized_records + + if not results_json_path: + raise ValueError("Provide results_json_path or in_memory_results") + + # Detect JSON vs JSONL by extension (best-effort) + if results_json_path.endswith(".jsonl"): + with open(results_json_path, "r", encoding="utf-8") as file_in: + for raw_line in file_in: + raw_line = raw_line.strip() + if not raw_line: + continue + # Multiple concatenated objects per line? Iterate them all. 
+ for json_obj in self._iter_json_objects(raw_line): + if isinstance(json_obj, dict): + coerced_record = _coerce_to_record(json_obj) + if coerced_record: + normalized_records.append(coerced_record) + else: + payload_obj = self._load_json(results_json_path) + if isinstance(payload_obj, dict) and "results" in payload_obj: + for source_row in payload_obj["results"]: + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + elif isinstance(payload_obj, list): + for source_row in payload_obj: + if isinstance(source_row, dict): + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + + return normalized_records + + + def _collect_unique_terms_from_extractions(self, doc_term_extractions: List[Dict]) -> List[str]: + """ + Collect unique terms (original casing) from normalized document→terms results. + """ + seen_normalized_terms: set = set() + ordered_unique_terms: List[str] = [] + for record in doc_term_extractions: + for term_text in record.get("extracted_terms", []): + normalized = self._normalize_term(term_text) + if normalized and normalized not in seen_normalized_terms: + seen_normalized_terms.add(normalized) + ordered_unique_terms.append(term_text.strip()) + return ordered_unique_terms + + + def _build_term_to_doc_ids(self, doc_term_extractions: List[Dict]) -> Dict[str, List[str]]: + """ + Build lookup: normalized_term -> sorted unique list of doc_ids. + """ + term_to_doc_set: Dict[str, set] = {} + for record in doc_term_extractions: + document_id = str(record.get("id", "")) + for term_text in record.get("extracted_terms", []): + normalized = self._normalize_term(term_text) + if not normalized or not document_id: + continue + term_to_doc_set.setdefault(normalized, set()).add(document_id) + return {normalized_term: sorted(doc_ids) for normalized_term, doc_ids in term_to_doc_set.items()} From 1abbbc91e7c65321da0f25f1f41b190c3776986d Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Mon, 10 Nov 2025 23:52:46 +0100 Subject: [PATCH 05/19] added changes for taxonomy discovery and term typing --- .../llm_learner_alexbek_rag_term_typing.py | 10 +- .../llm_learner_alexbek_rf_term_typing.py | 24 +- ...er_alexbek_self_attn_taxonomy_discovery.py | 9 +- examples/llm_learner_alexbek_text2onto.py | 32 +- ...llm_learner_rwthdbis_taxonomy_discovery.py | 17 +- examples/llm_learner_rwthdbis_term_typing.py | 13 +- ...lm_learner_sbunlp_fs_taxonomy_discovery.py | 33 +- examples/llm_learner_sbunlp_text2onto.py | 29 +- examples/llm_learner_sbunlp_zs_term_typing.py | 20 +- ..._learner_skhnlp_sft_taxonomoy_discovery.py | 10 +- ...m_learner_skhnlp_zs_taxonomoy_discovery.py | 13 +- .../learner/taxonomy_discovery/__init__.py | 18 - .../learner/taxonomy_discovery/alexbek.py | 291 +++++- .../learner/taxonomy_discovery/rwthdbis.py | 922 ++++++++++++------ .../learner/taxonomy_discovery/sbunlp.py | 393 +++++--- .../learner/taxonomy_discovery/skhnlp.py | 561 +++++++++-- ontolearner/learner/term_typing/__init__.py | 17 - ontolearner/learner/term_typing/alexbek.py | 665 +++++++++++-- ontolearner/learner/term_typing/rwthdbis.py | 214 +++- ontolearner/learner/term_typing/sbunlp.py | 404 ++++---- ontolearner/learner/text2onto/__init__.py | 16 - ontolearner/learner/text2onto/alexbek.py | 293 ++++-- ontolearner/learner/text2onto/sbunlp.py | 127 ++- 23 files changed, 2956 insertions(+), 1175 deletions(-) delete mode 100644 ontolearner/learner/taxonomy_discovery/__init__.py delete mode 100644 
ontolearner/learner/term_typing/__init__.py delete mode 100644 ontolearner/learner/text2onto/__init__.py diff --git a/examples/llm_learner_alexbek_rag_term_typing.py b/examples/llm_learner_alexbek_rag_term_typing.py index 5723e36..3a3233f 100644 --- a/examples/llm_learner_alexbek_rag_term_typing.py +++ b/examples/llm_learner_alexbek_rag_term_typing.py @@ -1,13 +1,15 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import AlexbekRAGLearner +from ontolearner.learner.term_typing.alexbek import AlexbekRAGLearner # Load the GeoNames ontology. ontology = GeoNames() ontology.load() # Extract labeled items and split into train/test sets for evaluation -train_data, test_data = train_test_split(ontology.extract(), test_size=0.2, random_state=42) +train_data, test_data = train_test_split( + ontology.extract(), test_size=0.2, random_state=42 +) # Configure a Retrieval-Augmented Generation (RAG) term-typing classifier. # - llm_model_id: generator used to predict types from the prompt + retrieved examples @@ -46,5 +48,7 @@ ) # Display the evaluation results and runtime -print("Metrics:", outputs.get("metrics")) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} +print( + "Metrics:", outputs.get("metrics") +) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} print("Elapsed time (s):", outputs.get("elapsed_time")) diff --git a/examples/llm_learner_alexbek_rf_term_typing.py b/examples/llm_learner_alexbek_rf_term_typing.py index c5c7454..28ca94c 100644 --- a/examples/llm_learner_alexbek_rf_term_typing.py +++ b/examples/llm_learner_alexbek_rf_term_typing.py @@ -1,6 +1,8 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import AlexbekRFLearner # A random-forest term-typing learner over text+graph features +from ontolearner.learner.term_typing.alexbek import ( + AlexbekRFLearner, +) # A random-forest term-typing learner over text+graph features # Load the GeoNames ontology and extract labeled term-typing data @@ -10,31 +12,27 @@ data = ontology.extract() # Split the labeled term-typing data into train and test sets -train_data, test_data = train_test_split( - data, - test_size=0.2, - random_state=42 -) +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) # Configure the RF-based learner (embeddings + optional graph features) # - device: "cpu" or "cuda" # - threshold: decision threshold for multi-label assignment # - use_graph_features: include ontology-graph-derived features if available rf_learner = AlexbekRFLearner( - device="cpu", # switch to "cuda" if you have a GPU + device="cpu", # switch to "cuda" if you have a GPU batch_size=16, - max_length=512, # max tokenizer length for embedding model inputs - threshold=0.30, # probability cutoff for assigning each type - use_graph_features=True # set False for pure RF on text embeddings only + max_length=512, # max tokenizer length for embedding model inputs + threshold=0.30, # probability cutoff for assigning each type + use_graph_features=True, # set False for pure RF on text embeddings only ) # Build the pipeline and pass raw structured objects end-to-end. 
pipe = LearnerPipeline( retriever=rf_learner, - retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory - ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} + retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory + ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} device="cpu", - batch_size=16 + batch_size=16, ) # Run the full learning pipeline on the term-typing task diff --git a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py index b78976f..6a42160 100644 --- a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py +++ b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py @@ -1,5 +1,6 @@ from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import AlexbekCrossAttnLearner +from ontolearner.learner.taxonomy_discovery.alexbek import AlexbekCrossAttnLearner + # 1) Load & split ontology = GeoNames() ontology.load() @@ -22,9 +23,9 @@ # 3) Build pipeline pipeline = LearnerPipeline( - llm=cross_learner, # <- our learner - llm_id="cross-attn", # label for bookkeeping - ontologizer_data=False # pass raw ontology objects as in your example + llm=cross_learner, # <- our learner + llm_id="cross-attn", # label for bookkeeping + ontologizer_data=False, # pass raw ontology objects as in your example ) # 4) Train + predict + evaluate diff --git a/examples/llm_learner_alexbek_text2onto.py b/examples/llm_learner_alexbek_text2onto.py index caf4c5b..69282a9 100644 --- a/examples/llm_learner_alexbek_text2onto.py +++ b/examples/llm_learner_alexbek_text2onto.py @@ -9,14 +9,22 @@ DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" # Input paths (already saved) -TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") -TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") -TEST_DOCS_FULL_PATH = os.path.join(DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl") +TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") +TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") +TEST_DOCS_FULL_PATH = os.path.join( + DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl" +) # Output paths -DOC_TERMS_OUT_PATH = os.path.join(DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl") -TERMS2TYPES_OUT_PATH = os.path.join(DATA_DIR, "test", "terms2types_pred_ecology.fast.json") -TYPES2DOCS_OUT_PATH = os.path.join(DATA_DIR, "test", "types2docs_pred_ecology.fast.json") +DOC_TERMS_OUT_PATH = os.path.join( + DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl" +) +TERMS2TYPES_OUT_PATH = os.path.join( + DATA_DIR, "test", "terms2types_pred_ecology.fast.json" +) +TYPES2DOCS_OUT_PATH = os.path.join( + DATA_DIR, "test", "types2docs_pred_ecology.fast.json" +) # Device selection DEVICE = ( @@ -27,7 +35,7 @@ # Model config MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" -LOAD_IN_4BIT = (DEVICE == "cuda") # 4-bit helps on GPU +LOAD_IN_4BIT = DEVICE == "cuda" # 4-bit helps on GPU # 1) Load LLM llm = LocalAutoLLM(device=DEVICE) @@ -52,15 +60,17 @@ # 4) Predict types for extracted terms, using the JSONL we just wrote typing_summary = learner.predict_types_from_terms( - doc_terms_jsonl=DOC_TERMS_OUT_PATH, # read the predictions directly - doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) - model_id=MODEL_ID, # reuse the same small model + doc_terms_jsonl=DOC_TERMS_OUT_PATH, 
# read the predictions directly + doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) + model_id=MODEL_ID, # reuse the same small model out_terms2types=TERMS2TYPES_OUT_PATH, out_types2docs=TYPES2DOCS_OUT_PATH, # use defaults for everything else ) -print(f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types") +print( + f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types" +) print(f"[saved] {TERMS2TYPES_OUT_PATH}") print(f"[saved] {TYPES2DOCS_OUT_PATH}") diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py index fea5539..4412c5f 100644 --- a/examples/llm_learner_rwthdbis_taxonomy_discovery.py +++ b/examples/llm_learner_rwthdbis_taxonomy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library -from ontolearner import LearnerPipeline, train_test_split -from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner +from ontolearner import LearnerPipeline, train_test_split, ChordOntology +from ontolearner.learner.taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner # Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery ontology = ChordOntology() @@ -8,17 +8,16 @@ # Extract typed taxonomic edges and split into train/test while preserving the structured shape train_data, test_data = train_test_split( - ontology.extract(), - test_size=0.2, - random_state=42 + ontology.extract(), test_size=0.2, random_state=42 ) # Initialize a supervised taxonomy classifier (encoder-based fine-tuning) # Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views # Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions -learner = RWTHDBISTaxonomyLearner( +learner = RWTHDBISSFTLearner( model_name="microsoft/deberta-v3-small", output_dir="./results/", + device="cpu", num_train_epochs=1, per_device_train_batch_size=8, gradient_accumulation_steps=4, @@ -48,10 +47,12 @@ ) # Display the evaluation results -print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} +print( + "Metrics:", outputs["metrics"] +) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} # Display total elapsed time for training + prediction + evaluation -print("Elapsed time:", outputs['elapsed_time']) +print("Elapsed time:", outputs["elapsed_time"]) # Print all returned outputs (include predictions) print(outputs) diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py index 67d207f..d9bdc4b 100644 --- a/examples/llm_learner_rwthdbis_term_typing.py +++ b/examples/llm_learner_rwthdbis_term_typing.py @@ -1,8 +1,8 @@ # Import core modules from the OntoLearner library from ontolearner import LearnerPipeline, train_test_split, AgrO -from ontolearner import RWTHDBISTermTypingLearner +from ontolearner.learner.term_typing.rwthdbis import RWTHDBISSFTLearner -#load the AgrO ontology. +# load the AgrO ontology. # AgrO provides term-typing supervision where each term can be annotated with one or more types. ontology = AgrO() ontology.load() @@ -13,9 +13,10 @@ # Configure a supervised encoder-based classifier for term typing. # This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results. 
-learner = RWTHDBISTermTypingLearner( +learner = RWTHDBISSFTLearner( model_name="microsoft/deberta-v3-small", output_dir="./results/deberta-v3", + device="cpu", num_train_epochs=30, per_device_train_batch_size=16, gradient_accumulation_steps=2, @@ -41,10 +42,12 @@ ) # Display the evaluation results -print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} +print( + "Metrics:", outputs["metrics"] +) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} # Display total elapsed time for training + prediction + evaluation -print("Elapsed time:", outputs['elapsed_time']) +print("Elapsed time:", outputs["elapsed_time"]) # Print all returned outputs (include predictions) print(outputs) diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py index 19797a9..2200892 100644 --- a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py +++ b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py @@ -1,19 +1,22 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline + # Import the specific Few-Shot Learner implementation -from ontolearner import SBUNLPFewShotLearner +from ontolearner.learner.taxonomy_discovery.sbunlp import SBUNLPFewShotLearner # Load ontology and split # Load the GeoNames ontology for taxonomy discovery. # GeoNames provides geographic parent-child relationships (is-a hierarchy). ontology = GeoNames() ontology.load() -data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object +data = ( + ontology.extract() +) # Extract the list of taxonomic relationships from the ontology object # Split the taxonomic relationships into train and test sets train_data, test_data = train_test_split( data, - test_size=0.6, # 60% of data used for testing (terms to find relations for) + test_size=0.6, # 60% of data used for testing (terms to find relations for) random_state=42, ) @@ -22,19 +25,17 @@ # This performs in-context learning via N x M batch prompting. 
llm_learner = SBUNLPFewShotLearner( # Model / decoding - model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load - try_4bit=True, # uses 4-bit if bitsandbytes + CUDA available for memory efficiency - max_new_tokens=140, # limit the length of the model's response (for JSON output) - max_input_tokens=1500, # limit the total prompt length (context window) - temperature=0.0, # set to 0.0 for deterministic output (best for structured JSON) - top_p=1.0, # top-p sampling disabled with temperature=0.0 - + model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load + try_4bit=True, # uses 4-bit if bitsandbytes + CUDA available for memory efficiency + max_new_tokens=140, # limit the length of the model's response (for JSON output) + max_input_tokens=1500, # limit the total prompt length (context window) + temperature=0.0, # set to 0.0 for deterministic output (best for structured JSON) + top_p=1.0, # top-p sampling disabled with temperature=0.0 # Grid settings (N x M prompts) - n_train_chunks=7, # N: split training examples (few-shot context) into 7 chunks - m_test_chunks=7, # M: split test terms (vocabulary) into 7 chunks (total 49 prompts) - + n_train_chunks=7, # N: split training examples (few-shot context) into 7 chunks + m_test_chunks=7, # M: split test terms (vocabulary) into 7 chunks (total 49 prompts) # Run controls - limit_prompts=None, # None runs all N x M prompts; set to an integer for a dry-run + limit_prompts=None, # None runs all N x M prompts; set to an integer for a dry-run output_dir="./outputs/taskC_batches", # Optional: dump per-prompt JSON results for debugging ) @@ -43,8 +44,8 @@ pipe = LearnerPipeline( llm=llm_learner, llm_id=llm_learner.model_name, - ontologizer_data=True, # Let the learner flatten structured ontology objects via its tasks_* helpers - device="auto", # automatically select CUDA or CPU + ontologizer_data=True, # Let the learner flatten structured ontology objects via its tasks_* helpers + device="auto", # automatically select CUDA or CPU ) # Run the full learning pipeline on the taxonomy-discovery task diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py index 564f641..cff543c 100644 --- a/examples/llm_learner_sbunlp_text2onto.py +++ b/examples/llm_learner_sbunlp_text2onto.py @@ -1,6 +1,7 @@ import os import torch -#Import all the required classes + +# Import all the required classes from ontolearner import SBUNLPText2OntoLearner from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM @@ -11,8 +12,8 @@ # Ensure the base directories exist # Creates the train and test subdirectories if they don't already exist. -os.makedirs(os.path.join(LOCAL_DATA_DIR, 'train'), exist_ok=True) -os.makedirs(os.path.join(LOCAL_DATA_DIR, 'test'), exist_ok=True) +os.makedirs(os.path.join(LOCAL_DATA_DIR, "train"), exist_ok=True) +os.makedirs(os.path.join(LOCAL_DATA_DIR, "test"), exist_ok=True) # Define local file paths: POINTING TO ALREADY SAVED FILES # These files are used as input for the Fit and Predict phases. @@ -22,10 +23,14 @@ # Output files for predictions (saved directly under LOCAL_DATA_DIR/test) # These files will be created by the predict_terms/types methods. 
-TERMS_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" -TYPES_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" +TERMS_PRED_OUT = ( + "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" +) +TYPES_PRED_OUT = ( + "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" +) -#Initialize and Load Learner --- +# Initialize and Load Learner --- MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Determine the device for inference (GPU or CPU) DEVICE = "cuda" if torch.cuda.is_available() else "cpu" @@ -47,7 +52,7 @@ train_docs_jsonl=DOCS_ALL_PATH, terms2doc_json=TERMS2DOC_PATH, sample_size=28, - seed=123 # Seed for stratified random sampling stability + seed=123, # Seed for stratified random sampling stability ) MAX_NEW_TOKENS = 100 @@ -55,7 +60,7 @@ terms_written = learner.predict_terms( docs_test_jsonl=DOCS_TEST_PATH, out_jsonl=TERMS_PRED_OUT, - max_new_tokens=MAX_NEW_TOKENS + max_new_tokens=MAX_NEW_TOKENS, ) print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.") @@ -63,7 +68,7 @@ types_written = learner.predict_types( docs_test_jsonl=DOCS_TEST_PATH, out_jsonl=TYPES_PRED_OUT, - max_new_tokens=MAX_NEW_TOKENS + max_new_tokens=MAX_NEW_TOKENS, ) print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.") @@ -77,5 +82,7 @@ print(f"Final Type Extraction F1: {f1_type:.4f}") except Exception as e: - # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) - print(f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created.") + # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) + print( + f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created." + ) diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py index 75d01da..54c070c 100644 --- a/examples/llm_learner_sbunlp_zs_term_typing.py +++ b/examples/llm_learner_sbunlp_zs_term_typing.py @@ -1,30 +1,30 @@ # Import core modules from the OntoLearner library from ontolearner import AgrO, train_test_split, LearnerPipeline + # Import the specific Zero-Shot Learner implementation for Term Typing -from ontolearner import SBUNLPZSLearner +from ontolearner.learner.term_typing.sbunlp import SBUNLPZSLearner # Load ontology and split # Load the AgrO ontology for type inventory and test data. ontology = AgrO() ontology.load() -data = ontology.extract() # Extract the full set of relationships/terms +data = ontology.extract() # Extract the full set of relationships/terms # Split the data into train (to learn type inventory) and test (terms to predict) train_data, test_data = train_test_split( data, - test_size=0.6, # 60% of data used for testing + test_size=0.6, # 60% of data used for testing random_state=42, ) # Configure the Qwen Zero-Shot learner (inference-only) # This learner's 'fit' phase learns the vocabulary of allowed type labels. 
llm_learner = SBUNLPZSLearner( - # Model / decoding - model_id="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load - # device= is auto-detected - max_new_tokens=64, # Sufficient length for JSON list of types - temperature=0.0, # Ensures deterministic (greedy) output - # token= None, # Assuming public model access + device="cpu", + max_new_tokens=64, + temperature=0.0, + model_id="Qwen/Qwen2.5-0.5B-Instruct", + token=None, ) # Build pipeline and run @@ -33,7 +33,7 @@ llm=llm_learner, llm_id=llm_learner.model_id, ontologizer_data=False, - device="cpu", # select CUDA or CPU + device="cpu", # select CUDA or CPU ) # Run the full learning pipeline on the Term-Typing task diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py index 3661a5b..5c87925 100644 --- a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py +++ b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import SKHNLPSequentialFTLearner +from ontolearner.learner.taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner # Load ontology and split # Load the GeoNames ontology for taxonomy discovery. @@ -10,11 +10,7 @@ data = ontology.extract() # Split the taxonomic relationships into train and test sets -train_data, test_data = train_test_split( - data, - test_size=0.2, - random_state=42 -) +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) # Configure the learner with user-defined training args + device # Configure the supervised BERT SFT Learner for taxonomy discovery. @@ -23,7 +19,7 @@ model_name="bert-large-uncased", n_prompts=2, random_state=1403, - device="cpu", # Note: CPU training for BERT-Large is very slow. + device="cpu", # Note: CPU training for BERT-Large is very slow. output_dir="./results/", num_train_epochs=1, per_device_train_batch_size=8, diff --git a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py index 90391f5..fec0ddd 100644 --- a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py +++ b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py @@ -1,7 +1,8 @@ # Import core modules from the OntoLearner library -from ontolearner import GeoNames, train_test_split, LearnerPipeline, SKHNLPZSLearner +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.taxonomy_discovery.skhnlp import SKHNLPZSLearner -#Load ontology and split data +# Load ontology and split data # The GeoNames ontology provides geographic term types and relationships. ontology = GeoNames() ontology.load() @@ -16,11 +17,11 @@ # This model uses a fixed prompt and string normalization (Levenshtein) to classify terms. 
llm_learner = SKHNLPZSLearner( model_name="Qwen/Qwen2.5-0.5B-Instruct", - device="cpu", # use "cuda" if you have a GPU + device="cpu", # use "cuda" if you have a GPU max_new_tokens=16, - save_path="./outputs/", # directory or full file path for CSV + save_path="./outputs/", # directory or full file path for CSV verbose=True, - normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" + normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" ) # Build pipeline and run @@ -33,7 +34,7 @@ # Run the full learning pipeline on the taxonomy-discovery task outputs = pipe( - train_data=train_data, # zero-shot; ignored by the LLM learner + train_data=train_data, # zero-shot; ignored by the LLM learner test_data=test_data, task="taxonomy-discovery", evaluate=True, diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py deleted file mode 100644 index 57a845b..0000000 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .rwthdbis import RWTHDBISSFTLearner -from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner -from .sbunlp import SBUNLPFewShotLearner -from .alexbek import AlexbekCrossAttnLearner diff --git a/ontolearner/learner/taxonomy_discovery/alexbek.py b/ontolearner/learner/taxonomy_discovery/alexbek.py index 616d50f..3623f16 100644 --- a/ontolearner/learner/taxonomy_discovery/alexbek.py +++ b/ontolearner/learner/taxonomy_discovery/alexbek.py @@ -24,33 +24,70 @@ from ...base import AutoLearner + class RMSNorm(nn.Module): """Root Mean Square normalization with learnable scale. - Computes: y = weight * x / sqrt(mean(x^2) + eps) + Computes per-position normalization: + y = weight * x / sqrt(mean(x^2) + eps) + + This variant normalizes over the last dimension and keeps scale as a + learnable parameter, similar to RMSNorm used in modern transformer stacks. """ def __init__(self, dim: int, eps: float = 1e-6): + """Initialize the RMSNorm layer. + + Args: + dim: Size of the last (feature) dimension to normalize over. + eps: Small constant added inside the square root for numerical + stability. + """ super().__init__() self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply RMS normalization. + + Args: + x: Input tensor of shape (..., dim). + + Returns: + Tensor of the same shape as `x`, RMS-normalized over the last axis. + """ rms_inv = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) return self.weight * (x * rms_inv) + class CrossAttentionHead(nn.Module): """Minimal multi-head *pair* scorer using cross-attention-style projections. - Given child vector c and parent vector p: - q = Wq * c, k = Wk * p - per-head score = (q_h · k_h) / sqrt(d_head) - aggregate by mean across heads, then sigmoid to get probability. 
+ Given child vector `c` and parent vector `p`: + q = W_q * c, k = W_k * p + score_head = (q_h · k_h) / sqrt(d_head) + + We average the per-head scores and apply a sigmoid to produce a probability. + This is not a full attention block—just a learnable similarity function. """ - def __init__(self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6): + def __init__( + self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6 + ): + """Initialize projections and per-stream normalizers. + + Args: + hidden_size: Dimensionality of input embeddings (child/parent). + num_heads: Number of subspaces to split the projection into. + rms_norm_eps: Epsilon for RMSNorm stability. + + Raises: + AssertionError: If `hidden_size` is not divisible by `num_heads`. + """ super().__init__() - assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads" + assert hidden_size % num_heads == 0, ( + "hidden_size must be divisible by num_heads" + ) self.hidden_size = hidden_size self.num_heads = num_heads self.dim_per_head = hidden_size // num_heads @@ -67,14 +104,17 @@ def __init__(self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1 nn.init.xavier_uniform_(self.query_projection.weight) nn.init.xavier_uniform_(self.key_projection.weight) - def forward(self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor) -> torch.Tensor: + def forward( + self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor + ) -> torch.Tensor: """Score (child, parent) pairs. Args: - child_embeddings: Tensor of shape (batch, hidden_size) - parent_embeddings: Tensor of shape (batch, hidden_size) + child_embeddings: Tensor of shape (batch, hidden_size). + parent_embeddings: Tensor of shape (batch, hidden_size). + Returns: - Tensor of probabilities with shape (batch,) + Tensor of probabilities with shape (batch,), each in [0, 1]. """ batch_size, _ = child_embeddings.shape @@ -95,14 +135,17 @@ def forward(self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tenso # Map to probability return torch.sigmoid(mean_score) + class AlexbekCrossAttnLearner(AutoLearner): """Cross-Attention Taxonomy Learner (inherits AutoLearner). - - Encodes type strings with a SentenceTransformer. - - Trains a small cross-attention head to score (parent, child) edges. - - Predicts probabilities for provided pairs. + Workflow + - Encode terms with a SentenceTransformer. + - Train a compact cross-attention head on (parent, child) pairs + (positives + sampled negatives) using BCE loss. + - Inference returns probabilities per pair; edges with prob >= 0.5 are + labeled as positive. - Helper functions live in this same module (below), *not* as class methods. """ def __init__( @@ -122,11 +165,26 @@ def __init__( ): """Configure the learner. - All configuration is kept directly on the learner (no separate Config class). + Args: + embedding_model: SentenceTransformer model id/path for term encoding. + device: 'cuda' or 'cpu'. If 'cuda' is requested but unavailable, CPU + is used. + num_heads: Number of heads in the cross-attention scorer. + lr: Learning rate for AdamW. + weight_decay: Weight decay for AdamW. + num_epochs: Number of epochs to train the head. + batch_size: Minibatch size for training and scoring loops. + neg_ratio: Number of sampled negatives per positive during training. + output_dir: Directory to store artifacts (reserved for future use). + seed: Random seed for reproducibility. + **kwargs: Passed through to `AutoLearner` base init. 
+ + Side Effects: + Creates `output_dir` if missing and seeds Python/Torch RNGs. """ super().__init__(**kwargs) - # ----- hyperparameters / settings ----- + # hyperparameters / settings self.embedding_model_id = embedding_model self.requested_device = device self.num_heads = num_heads @@ -157,25 +215,62 @@ def __init__( torch.manual_seed(self.seed) def load(self, **kwargs: Any): - """Load the sentence embedding model and initialize the cross-attention head.""" + """Load the sentence embedding model and initialize the cross-attention head. + + Args: + **kwargs: Optional override, supports `embedding_model`. + + Side Effects: + - Initializes `self.embedder` on the configured device. + - Probes and stores `self.embedding_dim`. + - Constructs `self.cross_attn_head` with the probed dimensionality. + """ model_id = kwargs.get("embedding_model", self.embedding_model_id) - self.embedder = SentenceTransformer(model_id, trust_remote_code=True, device=str(self.device)) + self.embedder = SentenceTransformer( + model_id, trust_remote_code=True, device=str(self.device) + ) # Probe output dimensionality using a dummy encode - probe_embedding = self.embedder.encode(["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False) + probe_embedding = self.embedder.encode( + ["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False + ) self.embedding_dim = int(probe_embedding.shape[-1]) # Initialize the cross-attention head - self.cross_attn_head = CrossAttentionHead(hidden_size=self.embedding_dim, num_heads=self.num_heads).to( - self.device - ) + self.cross_attn_head = CrossAttentionHead( + hidden_size=self.embedding_dim, num_heads=self.num_heads + ).to(self.device) def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """Train or infer taxonomy edges according to the AutoLearner contract. + + Training (`test=False`) + - Extract positives (parent, child) and the unique term set from `data`. + - Build/extend the term embedding cache. + - Sample negatives at ratio `self.negative_ratio`. + - Train the cross-attention head with BCE loss. + + Inference (`test=True`) + - Ensure embeddings exist for all terms. + - Score candidate pairs and return per-pair probabilities and labels. + + Args: + data: Ontology-like object exposing `type_taxonomies.taxonomies`, + where each item has `.parent` and `.child` string-like fields. + test: If True, perform inference instead of training. + + Returns: + - `None` on training. + - On inference: List of dicts + `{"parent": str, "child": str, "score": float, "label": int}`. 
+ """ if self.embedder is None or self.cross_attn_head is None: self.load() if not test: - positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms( + data + ) self._ensure_term_embeddings(unique_terms) negative_pairs = self._sample_negative_pairs( positive_pairs, unique_terms, ratio=self.negative_ratio, seed=self.seed @@ -183,27 +278,42 @@ def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: self._train_cross_attn_head(positive_pairs, negative_pairs) return None else: - candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms( + data + ) self._ensure_term_embeddings(unique_terms, append_only=True) probabilities = self._score_parent_child_pairs(candidate_pairs) predictions = [ - {"parent": parent, "child": child, "score": float(prob), "label": int(prob >= 0.5)} + { + "parent": parent, + "child": child, + "score": float(prob), + "label": int(prob >= 0.5), + } for (parent, child), prob in zip(candidate_pairs, probabilities) ] return predictions - def _ensure_term_embeddings(self, terms: List[str], append_only: bool = False) -> None: + def _ensure_term_embeddings( + self, terms: List[str], append_only: bool = False + ) -> None: """Encode terms with the sentence embedder and store in cache. Args: - terms: list of unique strings to embed - append_only: if True, only embed terms missing from cache + terms: List of unique term strings to embed. + append_only: If True, only embed terms missing from the cache; + otherwise (re)encode all provided terms. + + Raises: + RuntimeError: If called before `load()`. """ if self.embedder is None: raise RuntimeError("Call load() before building term embeddings") - terms_to_encode = [t for t in terms if t not in self.term_to_vector] if append_only else terms + terms_to_encode = ( + [t for t in terms if t not in self.term_to_vector] if append_only else terms + ) if not terms_to_encode: return @@ -217,38 +327,78 @@ def _ensure_term_embeddings(self, terms: List[str], append_only: bool = False) - for term, embedding in zip(terms_to_encode, embeddings): self.term_to_vector[term] = embedding.detach().to(self.device) - def _pairs_as_tensors(self, pairs: List[Tuple[str, str]]) -> Tuple[torch.Tensor, torch.Tensor]: - """Turn list of (parent, child) strings into two aligned tensors on device.""" + def _pairs_as_tensors( + self, pairs: List[Tuple[str, str]] + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Convert string pairs into aligned embedding tensors on the correct device. + + Args: + pairs: List of (parent, child) term strings. + + Returns: + Tuple `(child_tensor, parent_tensor)` where each tensor has shape + `(batch, embedding_dim)` and is located on `self.device`. + + Notes: + This function assumes that all terms in `pairs` are present in + `self.term_to_vector`. Use `_ensure_term_embeddings` beforehand. 
+ """ # child embeddings tensor of shape (batch, dim) - child_tensor = torch.stack([self.term_to_vector[child] for (_, child) in pairs], dim=0).to(self.device) + child_tensor = torch.stack( + [self.term_to_vector[child] for (_, child) in pairs], dim=0 + ).to(self.device) # parent embeddings tensor of shape (batch, dim) - parent_tensor = torch.stack([self.term_to_vector[parent] for (parent, _) in pairs], dim=0).to(self.device) + parent_tensor = torch.stack( + [self.term_to_vector[parent] for (parent, _) in pairs], dim=0 + ).to(self.device) return child_tensor, parent_tensor - def _train_cross_attn_head(self, positive_pairs: List[Tuple[str, str]], negative_pairs: List[Tuple[str, str]]) -> None: - """Train the cross-attention head with BCE loss on labeled pairs.""" + def _train_cross_attn_head( + self, + positive_pairs: List[Tuple[str, str]], + negative_pairs: List[Tuple[str, str]], + ) -> None: + """Train the cross-attention head with BCE loss on labeled pairs. + + The dataset is a concatenation of positives (label 1) and sampled + negatives (label 0). The head is optimized with AdamW. + + Args: + positive_pairs: List of ground-truth (parent, child) edges. + negative_pairs: List of sampled non-edges. + + Raises: + RuntimeError: If the head has not been initialized (call `load()`). + """ if self.cross_attn_head is None: raise RuntimeError("Head not initialized. Call load().") self.cross_attn_head.train() optimizer = torch.optim.AdamW( - self.cross_attn_head.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay + self.cross_attn_head.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, ) # Build a simple supervised dataset: 1 for positive, 0 for negative - labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [(1, pc) for pc in positive_pairs] + [ - (0, nc) for nc in negative_pairs - ] + labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [ + (1, pc) for pc in positive_pairs + ] + [(0, nc) for nc in negative_pairs] random.shuffle(labeled_pairs) - def iterate_minibatches(items: List[Tuple[int, Tuple[str, str]]], batch_size: int): + def iterate_minibatches( + items: List[Tuple[int, Tuple[str, str]]], batch_size: int + ): + """Yield contiguous minibatches of size `batch_size` from `items`.""" for start in range(0, len(items), batch_size): yield items[start : start + batch_size] for epoch in range(self.num_epochs): epoch_loss_sum = 0.0 for minibatch in iterate_minibatches(labeled_pairs, self.batch_size): - labels = torch.tensor([y for y, _ in minibatch], dtype=torch.float32, device=self.device) + labels = torch.tensor( + [y for y, _ in minibatch], dtype=torch.float32, device=self.device + ) string_pairs = [pc for _, pc in minibatch] child_tensor, parent_tensor = self._pairs_as_tensors(string_pairs) @@ -261,9 +411,18 @@ def iterate_minibatches(items: List[Tuple[int, Tuple[str, str]]], batch_size: in epoch_loss_sum += float(loss.item()) * len(minibatch) - def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float]: - """Compute probability scores for (parent, child) pairs.""" + """Compute probability scores for (parent, child) pairs. + + Args: + pairs: List of candidate (parent, child) edges to score. + + Returns: + List of floats in [0, 1] corresponding to the input order. + + Raises: + RuntimeError: If the head has not been initialized (call `load()`). + """ if self.cross_attn_head is None: raise RuntimeError("Head not initialized. 
Call load().") @@ -277,8 +436,23 @@ def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float] scores.extend(prob.detach().cpu().tolist()) return scores - def _extract_parent_child_pairs_and_terms(self, data): - parent_child_pairs = [] + def _extract_parent_child_pairs_and_terms( + self, data: Any + ) -> Tuple[List[Tuple[str, str]], List[str]]: + """Extract (parent, child) edges and the set of unique terms from an ontology-like object. + + The function expects `data.type_taxonomies.taxonomies` to be an iterable + of objects with `.parent` and `.child` string-like attributes. + + Args: + data: Ontology-like container. + + Returns: + A tuple `(pairs, terms)` where: + - `pairs` is a list of (parent, child) strings, + - `terms` is a sorted list of unique term strings (parents ∪ children). + """ + parent_child_pairs: List[Tuple[str, str]] = [] unique_terms = set() for edge in getattr(data, "type_taxonomies").taxonomies: parent, child = str(edge.parent), str(edge.child) @@ -287,11 +461,32 @@ def _extract_parent_child_pairs_and_terms(self, data): unique_terms.add(child) return parent_child_pairs, sorted(unique_terms) - def _sample_negative_pairs(self, positive_pairs, terms, ratio: float = 1.0, seed: int = 42): + def _sample_negative_pairs( + self, + positive_pairs: List[Tuple[str, str]], + terms: List[str], + ratio: float = 1.0, + seed: int = 42, + ) -> List[Tuple[str, str]]: + """Sample random negative (parent, child) pairs not present in positives. + + Sampling is uniform over the Cartesian product of `terms` excluding + (x, x) self-pairs and any pair found in `positive_pairs`. + + Args: + positive_pairs: Known positive edges to exclude. + terms: Candidate vocabulary (parents ∪ children). + ratio: Number of negatives per positive to draw. + seed: RNG seed used for reproducible sampling. + + Returns: + A list of sampled negative pairs of approximate length + `int(len(positive_pairs) * ratio)`. + """ random.seed(seed) term_list = list(terms) positive_set = set(positive_pairs) - negatives = [] + negatives: List[Tuple[str, str]] = [] target_negative_count = int(len(positive_pairs) * ratio) while len(negatives) < target_negative_count: parent = random.choice(term_list) diff --git a/ontolearner/learner/taxonomy_discovery/rwthdbis.py b/ontolearner/learner/taxonomy_discovery/rwthdbis.py index 47989c5..c535016 100644 --- a/ontolearner/learner/taxonomy_discovery/rwthdbis.py +++ b/ontolearner/learner/taxonomy_discovery/rwthdbis.py @@ -16,9 +16,7 @@ import os import random import re -import time import platform -import multiprocessing from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Callable @@ -39,16 +37,45 @@ from ...base import AutoLearner + class RWTHDBISSFTLearner(AutoLearner): """ Supervised classifier for (parent, child) taxonomy edges. Model input format: - " ## " - - If no `context_json_path` is provided, the class precomputes a - context file ({ontology_name}_processed.json) directly from the ontology - object. + " ## " + + Context building: + If no `context_json_path` is provided, the learner precomputes a fixed-name + context file `rwthdbis_onto_processed.json` under `output_dir/context/` + from the ontology terms and stores the path in `self.context_json_path`. + + Attributes: + model_name: Hugging Face model identifier. + output_dir: Directory where checkpoints and tokenizer are saved/loaded. 
+ min_predictions: If no candidate is predicted positive, return the top-k + by positive probability (k = min_predictions). + max_length: Maximum tokenized length for inputs. + per_device_train_batch_size: Micro-batch size per device. + gradient_accumulation_steps: Gradient accumulation steps. + num_train_epochs: Number of training epochs. + learning_rate: Optimizer LR. + weight_decay: Weight decay for AdamW. + logging_steps: Logging interval for Trainer. + save_strategy: HF saving strategy (e.g., 'epoch'). + save_total_limit: Max checkpoints to keep. + fp16: Enable FP16 mixed precision. + bf16: Enable BF16 mixed precision (on supported hardware). + seed: Random seed for reproducibility. + negative_ratio: Number of negatives per positive during training. + bidirectional_templates: If True, also add reversed template examples. + context_json_path: Path to the preprocessed term-context JSON. If None, + the file is generated with the fixed prefix `rwthdbis_onto_*`. + ontology_name: Logical dataset/domain label used in prompts and filtering + (filenames still use the fixed `rwthdbis_onto_*` prefix). + device: user-defined argument as 'cuda' or 'cpu'. + model: Loaded/initialized `AutoModelForSequenceClassification`. + tokenizer: Loaded/initialized `AutoTokenizer`. """ # Sentences containing any of these phrases are pruned from term_info. @@ -78,7 +105,8 @@ def __init__( self, min_predictions: int = 1, model_name: str = "distilroberta-base", - output_dir: str = "./results/{model_name}", + output_dir: str = "./results/taxonomy-discovery", + device: str = "cpu", max_length: int = 256, per_device_train_batch_size: int = 8, gradient_accumulation_steps: int = 4, @@ -94,56 +122,176 @@ def __init__( negative_ratio: int = 5, bidirectional_templates: bool = True, context_json_path: Optional[str] = None, - ontology_name: str = "Geonames" + ontology_name: str = "Geonames", ) -> None: + """ + Initialize the taxonomy-edge learner and set training/inference knobs. + + Notes: + - Output artifacts are written under `output_dir`, including + the model weights and tokenizer (for later `from_pretrained` loads). + - If `context_json_path` is not provided, a new context file named + `rwthdbis_onto_processed.json` is generated under `output_dir/context/`. + """ super().__init__() self.model_name = model_name - self.safe_model_name = model_name.replace("/", "__") + safe_model_name = model_name.replace("/", "__") - resolved_output = output_dir.format(model_name=self.safe_model_name) + resolved_output = output_dir.format(model_name=safe_model_name) self.output_dir = str(Path(resolved_output)) Path(self.output_dir).mkdir(parents=True, exist_ok=True) - self.min_predictions = int(min_predictions) - self.max_length = int(max_length) - self.per_device_train_batch_size = int(per_device_train_batch_size) - self.gradient_accumulation_steps = int(gradient_accumulation_steps) - self.num_train_epochs = float(num_train_epochs) - self.learning_rate = float(learning_rate) - self.weight_decay = float(weight_decay) - self.logging_steps = int(logging_steps) - self.save_strategy = str(save_strategy) - self.save_total_limit = int(save_total_limit) - self.fp16 = bool(fp16) - self.bf16 = bool(bf16) - self.seed = int(seed) - - self.negative_ratio = int(negative_ratio) - self.bidirectional_templates = bool(bidirectional_templates) + # Store provided argument values as-is (types are enforced by callers). 
+ self.min_predictions = min_predictions + self.max_length = max_length + self.per_device_train_batch_size = per_device_train_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.num_train_epochs = num_train_epochs + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.logging_steps = logging_steps + self.save_strategy = save_strategy + self.save_total_limit = save_total_limit + self.fp16 = fp16 + self.bf16 = bf16 + self.seed = seed + + self.negative_ratio = negative_ratio + self.bidirectional_templates = bidirectional_templates self.context_json_path = context_json_path self.ontology_name = ontology_name - self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device self.model: Optional[AutoModelForSequenceClassification] = None self.tokenizer: Optional[AutoTokenizer] = None - os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") - os.environ.setdefault("WANDB_DISABLED", "true") - os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") + # Context caches built from the context JSON. + self._context_exact: Dict[str, str] = {} # lower(term) -> info + self._context_rows: List[ + Dict[str, str] + ] = [] # [{'term': str, 'term_info': str}, ...] + + def _is_windows(self) -> bool: + """Return True if the current OS is Windows (NT).""" + return (os.name == "nt") or (platform.system().lower() == "windows") + + def _normalize_text(self, raw_text: str, *, drop_questions: bool = False) -> str: + """ + Normalize plain text consistently across the pipeline. + + Operations: + - Remove markdown-like link patterns (e.g., '[[1]](http://...)'). + - Replace newlines with spaces; collapse repeated spaces. + - Optionally drop sentences containing '?' (useful for model generations). + + Args: + raw_text: Input text to normalize. + drop_questions: If True, filter out sentences with '?'. + + Returns: + str: Cleaned single-line string. + """ + if raw_text is None: + return "" + text = str(raw_text) + + # Remove simple markdown link artifacts like [[1]](http://...) + text = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", text) + + # Replace newlines with spaces and collapse multiple spaces + text = text.replace("\n", " ") + text = re.sub(r"\s{2,}", " ", text) + + if drop_questions: + sentences = [s.strip() for s in text.split(".")] + sentences = [s for s in sentences if s and "?" not in s] + text = ". ".join(sentences) + + return text.strip() + + def _default_gpt_inference_with_dataset(self, term: str, dataset_name: str) -> str: + """ + Generate a plain-text description for `term`, conditioned on `dataset_name`, + via g4f (best-effort). Falls back to an empty string on failure. + + The raw output is then normalized with `_normalize_text(drop_questions=True)`. + + Args: + term: Term to describe. + dataset_name: Ontology/domain name used in the prompt. + + Returns: + str: Cleaned paragraph describing the term, or "" on failure. + """ + prompt = ( + f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, " + "Provide as detailed a definition of this term as possible in plain text.without any markdown format." + "No reference link in result. " + "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n" + "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n" + "Output: Plain text paragraphs only, neutral and factual." 
+ f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result." + ) + + try: + client = _G4FClient() + response = client.chat.completions.create( + model=g4f.models.default, + messages=[{"role": "user", "content": prompt}], + ) + raw_text = ( + response.choices[0].message.content + if response and response.choices + else "" + ) + except Exception: + raw_text = "" # best-effort fallback - self._context_exact: Dict[str, str] = {} # lower(term) -> info - self._context_rows: List[Dict[str, str]] = [] # [{'term': str, 'term_info': str}, ...] + return self._normalize_text(raw_text, drop_questions=True) def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """ + AutoLearner hook: route to training or prediction. + + Args: + data: Ontology-like object (has `.taxonomies` or `.type_taxonomies.taxonomies`). + test: If True, run inference; otherwise, train a model. + + Returns: + If test=True, a list of accepted edges as dicts with keys `parent` and `child`; + otherwise None. + """ return self._predict_pairs(data) if test else self._train_from_pairs(data) def _train_from_pairs(self, train_data: Any) -> None: + """ + Train a binary classifier from ontology pairs. + + Steps: + 1) (Re)build the term-context JSON unless `context_json_path` is set. + 2) Extract positive (parent, child) edges from `train_data`. + 3) Sample negatives at `negative_ratio`. + 4) Tokenize, instantiate HF Trainer, train, and save. + + Args: + train_data: Ontology-like object with `.type_taxonomies.taxonomies` + (preferred) or `.taxonomies`, each item providing `parent` and `child`. + + Raises: + ValueError: If no positive pairs are found. + + Side Effects: + - Writes a trained model to `self.output_dir` (via `trainer.save_model`). + - Writes the tokenizer to `self.output_dir` (via `save_pretrained`). + - Sets `self.context_json_path` if it was previously unset. + The generated context file is named `rwthdbis_onto_processed.json`. 
+ """ # Always (re)build context from ontology unless an explicit file is provided if not self.context_json_path: context_dir = Path(self.output_dir) / "context" context_dir.mkdir(parents=True, exist_ok=True) - processed_context_file = context_dir / f"{self.ontology_name}_processed.json" + processed_context_file = context_dir / "rwthdbis_onto_processed.json" # Remove stale file then regenerate if processed_context_file.exists(): @@ -157,10 +305,12 @@ def _train_from_pairs(self, train_data: Any) -> None: processed_dir=context_dir, dataset_name=self.ontology_name, num_workers=max(1, min(os.cpu_count() or 2, 4)), - provider=partial(self._default_gpt_inference_with_dataset, dataset_name=self.ontology_name), + provider=partial( + self._default_gpt_inference_with_dataset, + dataset_name=self.ontology_name, + ), max_retries=5, ) - self.context_json_path = str(processed_context_file) # Reproducibility @@ -175,19 +325,23 @@ def _train_from_pairs(self, train_data: Any) -> None: if not positive_pairs: raise ValueError("No positive (parent, child) pairs found in train_data.") - entity_names = sorted({parent for parent, _ in positive_pairs} | {child for _, child in positive_pairs}) + entity_names = sorted( + {parent for parent, _ in positive_pairs} + | {child for _, child in positive_pairs} + ) negative_pairs = self._generate_negatives( positives=positive_pairs, entities=entity_names, ratio=self.negative_ratio, ) - labels, texts = self._build_text_dataset(positive_pairs, negative_pairs) - - - datasets = DatasetDict({"train": Dataset.from_dict({"label": labels, "text": texts})}) + labels, input_texts = self._build_text_dataset(positive_pairs, negative_pairs) + dataset_dict = DatasetDict( + {"train": Dataset.from_dict({"label": labels, "text": input_texts})} + ) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + # Ensure a pad token exists for robust padding across models. if self.tokenizer.pad_token is None: self.tokenizer.pad_token = ( getattr(self.tokenizer, "eos_token", None) @@ -196,10 +350,15 @@ def _train_from_pairs(self, train_data: Any) -> None: ) def tokenize_batch(batch: Dict[str, List[str]]): - return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + """Tokenize a batch of input texts for HF Datasets mapping.""" + return self.tokenizer( + batch["text"], truncation=True, max_length=self.max_length + ) - tokenized = datasets.map(tokenize_batch, batched=True, remove_columns=["text"]) - collator = DataCollatorWithPadding(self.tokenizer) + tokenized_dataset = dataset_dict.map( + tokenize_batch, batched=True, remove_columns=["text"] + ) + data_collator = DataCollatorWithPadding(self.tokenizer) self.model = AutoModelForSequenceClassification.from_pretrained( self.model_name, @@ -207,10 +366,14 @@ def tokenize_batch(batch: Dict[str, List[str]]): id2label={0: "incorrect", 1: "correct"}, label2id={"incorrect": 0, "correct": 1}, ) - if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + # Ensure model has a pad_token_id if tokenizer provides one. 
+ if ( + getattr(self.model.config, "pad_token_id", None) is None + and self.tokenizer.pad_token_id is not None + ): self.model.config.pad_token_id = self.tokenizer.pad_token_id - train_args = TrainingArguments( + training_args = TrainingArguments( output_dir=self.output_dir, learning_rate=self.learning_rate, per_device_train_batch_size=self.per_device_train_batch_size, @@ -220,7 +383,7 @@ def tokenize_batch(batch: Dict[str, List[str]]): save_strategy=self.save_strategy, save_total_limit=self.save_total_limit, logging_steps=self.logging_steps, - dataloader_pin_memory = bool(torch.cuda.is_available()), + dataloader_pin_memory=bool(torch.cuda.is_available()), fp16=self.fp16, bf16=self.bf16, report_to="none", @@ -229,16 +392,30 @@ def tokenize_batch(batch: Dict[str, List[str]]): trainer = Trainer( model=self.model, - args=train_args, - train_dataset=tokenized["train"], + args=training_args, + train_dataset=tokenized_dataset["train"], tokenizer=self.tokenizer, - data_collator=collator, + data_collator=data_collator, ) trainer.train() - trainer.save_model(self.output_dir) + trainer.save_model() + # Persist tokenizer alongside the model for from_pretrained() loads. self.tokenizer.save_pretrained(self.output_dir) def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: + """ + Score candidate pairs and return those predicted as positive. + + If no pair is predicted positive but `min_predictions` > 0, the top-k + pairs by positive probability are returned. + + Args: + eval_data: Ontology-like object with either `.pairs` (preferred) or + `.type_taxonomies.taxonomies` / `.taxonomies`. + + Returns: + list[dict]: Each dict has keys `parent` and `child`. + """ import torch.nn.functional as F self._ensure_loaded_for_inference() @@ -247,55 +424,90 @@ def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: if not candidate_pairs: return [] - accepted: List[Dict[str, str]] = [] + accepted_pairs: List[Dict[str, str]] = [] scored_candidates: List[Tuple[float, str, str, int]] = [] self.model.eval() with torch.no_grad(): for parent_term, child_term in candidate_pairs: input_text = self._format_input(parent_term, child_term) - inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, max_length=self.max_length) - inputs = {k: v.to(self.device) for k, v in inputs.items()} + inputs = self.tokenizer( + input_text, + return_tensors="pt", + truncation=True, + max_length=self.max_length, + ) + inputs = {key: tensor.to(self.device) for key, tensor in inputs.items()} logits = self.model(**inputs).logits - probs = F.softmax(logits, dim=-1).squeeze(0) - p_positive = float(probs[1].item()) + probabilities = F.softmax(logits, dim=-1).squeeze(0) + p_positive = float(probabilities[1].item()) predicted_label = int(torch.argmax(logits, dim=-1).item()) - scored_candidates.append((p_positive, parent_term, child_term, predicted_label)) + scored_candidates.append( + (p_positive, parent_term, child_term, predicted_label) + ) if predicted_label == 1: - accepted.append({"parent": parent_term, "child": child_term}) + accepted_pairs.append({"parent": parent_term, "child": child_term}) - if accepted: - return accepted + if accepted_pairs: + return accepted_pairs top_k = max(0, int(self.min_predictions)) if top_k == 0: return [] scored_candidates.sort(key=lambda item: item[0], reverse=True) - return [{"parent": parent_term, "child": child_term} - for (_prob, parent_term, child_term, _pred) in scored_candidates[:top_k]] + return [ + {"parent": parent_term, "child": child_term} + for (_prob, parent_term, 
child_term, _pred) in scored_candidates[:top_k] + ] def _ensure_loaded_for_inference(self) -> None: + """ + Load model and tokenizer from `self.output_dir` if not already loaded. + + Side Effects: + - Sets `self.model` and `self.tokenizer`. + - Moves the model to `self.device`. + - Ensures `tokenizer.pad_token_id` is set if model config provides one. + """ if self.model is not None and self.tokenizer is not None: return - self.model = AutoModelForSequenceClassification.from_pretrained(self.output_dir).to(self.device) + self.model = AutoModelForSequenceClassification.from_pretrained( + self.output_dir + ).to(self.device) self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir) - if self.tokenizer.pad_token_id is None and getattr(self.model.config, "pad_token_id", None) is not None: + if ( + self.tokenizer.pad_token_id is None + and getattr(self.model.config, "pad_token_id", None) is not None + ): self.tokenizer.pad_token_id = self.model.config.pad_token_id def _load_context_map(self) -> None: - """Build exact and fuzzy maps from {ontology_name}_processed.json.""" - if not (self.context_json_path): + """ + Populate in-memory maps from the context JSON (`self.context_json_path`). + + Builds: + - `_context_exact`: dict mapping lowercased term → term_info. + - `_context_rows`: list of dict rows with 'term' and 'term_info'. + + If `context_json_path` is falsy or loading fails, both structures become empty. + """ + if not self.context_json_path: self._context_exact = {} self._context_rows = [] return try: rows = json.load(open(self.context_json_path, "r", encoding="utf-8")) self._context_exact = { - str(row.get("term", "")).strip().lower(): str(row.get("term_info", "")).strip() + str(row.get("term", "")).strip().lower(): str( + row.get("term_info", "") + ).strip() for row in rows } self._context_rows = [ - {"term": str(row.get("term", "")), "term_info": str(row.get("term_info", ""))} + { + "term": str(row.get("term", "")), + "term_info": str(row.get("term_info", "")), + } for row in rows ] except Exception: @@ -304,8 +516,17 @@ def _load_context_map(self) -> None: def _lookup_context_info(self, raw_term: str) -> str: """ - Loose context lookup: split by commas, strip whitespace, case-insensitive - substring match against any row['term']. Join hits with '.'. + Retrieve textual context for a term using exact and simple fuzzy matching. + + - Exact: lowercased term lookup in `_context_exact`. + - Fuzzy: split `raw_term` by commas, strip whitespace; treat each piece + as a case-insensitive substring against row['term']. + + Args: + raw_term: Original term string (possibly comma-separated). + + Returns: + str: Concatenated matches' term_info ('.' joined). Empty string if none. """ if not raw_term: return "" @@ -329,27 +550,62 @@ def _lookup_context_info(self, raw_term: str) -> str: def _extract_positive_pairs(self, ontology_obj: Any) -> List[Tuple[str, str]]: """ - Read pairs from ontology_obj.type_taxonomies.taxonomies (or fallback to .taxonomies). - Each item must provide 'parent' and 'child' attributes/keys. + Extract positive (parent, child) edges from an ontology-like object. + + Reads from `ontology_obj.type_taxonomies.taxonomies` (preferred) or + falls back to `ontology_obj.taxonomies`. Each item must expose `parent` + and `child` as attributes or dict keys. + + Returns: + list[tuple[str, str]]: (parent, child) pairs (may be empty). 
""" type_taxonomies = getattr(ontology_obj, "type_taxonomies", None) - items = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology_obj, "taxonomies", None) + items = ( + getattr(type_taxonomies, "taxonomies", None) + if type_taxonomies is not None + else getattr(ontology_obj, "taxonomies", None) + ) pairs: List[Tuple[str, str]] = [] if items: for item in items: - parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") - child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + parent_term = ( + getattr(item, "parent", None) + if not isinstance(item, dict) + else item.get("parent") + ) + child_term = ( + getattr(item, "child", None) + if not isinstance(item, dict) + else item.get("child") + ) if parent_term and child_term: pairs.append((str(parent_term), str(child_term))) return pairs def _extract_pairs_for_eval(self, ontology_obj: Any) -> List[Tuple[str, str]]: + """ + Extract candidate pairs for evaluation. + + Prefers `ontology_obj.pairs` if present; otherwise falls back to the + positive pairs from the ontology (see `_extract_positive_pairs`). + + Returns: + list[tuple[str, str]]: Candidate (parent, child) pairs. + """ candidate_pairs = getattr(ontology_obj, "pairs", None) if candidate_pairs: pairs: List[Tuple[str, str]] = [] for item in candidate_pairs: - parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") - child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + parent_term = ( + getattr(item, "parent", None) + if not isinstance(item, dict) + else item.get("parent") + ) + child_term = ( + getattr(item, "child", None) + if not isinstance(item, dict) + else item.get("child") + ) if parent_term and child_term: pairs.append((str(parent_term), str(child_term))) return pairs @@ -361,29 +617,66 @@ def _generate_negatives( entities: List[str], ratio: int, ) -> List[Tuple[str, str]]: + """ + Sample negative edges by excluding known positives and self-pairs. + + Constructs the cartesian product of entities (excluding (x, x)), + removes all known positives, and samples up to `ratio * len(positives)` + negatives uniformly at random. + + Args: + positives: Known positive edges. + entities: Unique set/list of entity terms. + ratio: Target negatives per positive (lower-bounded by 1×). + + Returns: + list[tuple[str, str]]: Sampled negative pairs (may be smaller). + """ positive_set = set(positives) - all_possible = {(parent_term, child_term) for parent_term in entities for child_term in entities if parent_term != child_term} + all_possible = { + (parent, child) + for parent in entities + for child in entities + if parent != child + } negative_candidates = list(all_possible - positive_set) target_count = max(len(positive_set) * max(1, ratio), len(positive_set)) sample_count = min(target_count, len(negative_candidates)) - return random.sample(negative_candidates, k=sample_count) if sample_count > 0 else [] + return ( + random.sample(negative_candidates, k=sample_count) + if sample_count > 0 + else [] + ) def _build_text_dataset( self, positives: List[Tuple[str, str]], negatives: List[Tuple[str, str]], ) -> Tuple[List[int], List[str]]: + """ + Create parallel lists of labels and input texts for HF Datasets. + + Builds formatted inputs using `_format_input`, and duplicates examples in + the reverse direction if `bidirectional_templates` is True. 
+ + Returns: + tuple[list[int], list[str]]: (labels, input_texts) where labels are + 1 for positive and 0 for negative. + """ self._load_context_map() labels: List[int] = [] input_texts: List[str] = [] def add_example(parent_term: str, child_term: str, label_value: int) -> None: + """Append one (and optionally reversed) example to the dataset.""" input_texts.append(self._format_input(parent_term, child_term)) labels.append(label_value) if self.bidirectional_templates: - input_texts.append(self._format_input(child_term, parent_term, reverse=True)) + input_texts.append( + self._format_input(child_term, parent_term, reverse=True) + ) labels.append(label_value) for parent_term, child_term in positives: @@ -393,7 +686,15 @@ def add_example(parent_term: str, child_term: str, label_value: int) -> None: return labels, input_texts - def _format_input(self, parent_term: str, child_term: str, reverse: bool = False) -> str: + def _format_input( + self, parent_term: str, child_term: str, reverse: bool = False + ) -> str: + """ + Format a (parent, child) pair into relation text + optional context. + + Returns: + str: " [## Context. 'parent': ... 'child': ...]" + """ relation_text = ( f"{child_term} is a subclass / child / subtype / descendant class of {parent_term}" if reverse @@ -405,63 +706,70 @@ def _format_input(self, parent_term: str, child_term: str, reverse: bool = False if not parent_info and not child_info: return relation_text - context_text = f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}" + context_text = ( + f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}" + ) return f"{relation_text} {context_text}" - @staticmethod - def _is_windows() -> bool: - return (os.name == "nt") or (platform.system().lower() == "windows") - - @staticmethod - def _default_gpt_inference_with_dataset(term: str, dataset_name: str) -> str: - """ - Generate a plain-text description for `term`, tailored by `dataset_name`. - Uses g4f if available; otherwise returns an empty string. + def _fill_bucket_threaded( + self, bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str] + ) -> None: """ - prompt = ( - f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, " - "Provide as detailed a definition of this term as possible in plain text.without any markdown format." - "No reference link in result. " - "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n" - "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n" - "Output: Plain text paragraphs only, neutral and factual." - f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result." - ) + Populate a shard with provider-generated `term_info` using threads. + Resumes from `output_path` if it already exists, periodically writes + progress (every ~10 items), and finally dumps the full bucket to disk. 
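+
+        Example (illustrative sketch; ``learner`` denotes an instance of this
+        class, and the path and toy provider are placeholders):
+
+            rows = [{"id": 0, "term": "major chord", "term_info": ""}]
+            learner._fill_bucket_threaded(
+                rows, Path("shard0.json"), provider=lambda term: f"{term}: ..."
+            )
+            # rows[0]["term_info"] is now "major chord: ..." and shard0.json
+            # contains the completed bucket.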
+ """ + start_index = 0 try: - client = _G4FClient() - response = client.chat.completions.create( - model=g4f.models.default, - messages=[{"role": "user", "content": prompt}], - ) - raw_text = response.choices[0].message.content if response and response.choices else "" + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + start_index = len(existing_rows) except Exception: - raw_text = "" # or some deterministic fallback - - # Clean up - cleaned = re.sub(r"[\*\-\#]", "", raw_text) - cleaned = re.sub(r"\n\s*\n", " ", cleaned) - cleaned = cleaned.replace("\n", " ") - cleaned = re.sub(r"\s{2,}", " ", cleaned) - cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", cleaned) - sentences = [sentence for sentence in cleaned.split(".") if "?" not in sentence] - return ".".join(sentences).strip() - - @staticmethod - def _clean_term_info(raw_text: str) -> str: - """Normalize whitespace and remove link artifacts.""" - cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", str(raw_text)) - cleaned = re.sub(r"\s+", " ", cleaned).strip() - return cleaned - - @classmethod - def _merge_part_files(cls, dataset_name: str, merged_path: Path, part_paths: List[Path]) -> None: + pass + + for row_index in range(start_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider( + bucket_rows[row_index]["term"] + ) + except Exception: + bucket_rows[row_index]["term_info"] = "" + if row_index % 10 == 1: + json.dump( + bucket_rows[: row_index + 1], + open(output_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=2, + ) + + json.dump( + bucket_rows, + open(output_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=2, + ) + + def _merge_part_files( + self, dataset_name: str, merged_path: Path, shard_paths: List[Path] + ) -> None: + """ + Merge shard files into one JSON and filter boilerplate sentences. + + - Reads shard lists/dicts from `shard_paths`. + - Drops sentences that contain markers in `_CONTEXT_REMOVALS` or the + `dataset_name` string. + - Normalizes the remaining text via `_normalize_text`. + - Writes merged JSON to `merged_path`, then best-effort deletes shards. 
+ """ merged_rows: List[dict] = [] - for part_path in part_paths: + for shard_path in shard_paths: try: - if not part_path.is_file(): + if not shard_path.is_file(): continue - part_content = json.load(open(part_path, "r", encoding="utf-8")) + part_content = json.load(open(shard_path, "r", encoding="utf-8")) if isinstance(part_content, list): merged_rows.extend(part_content) elif isinstance(part_content, dict): @@ -469,165 +777,111 @@ def _merge_part_files(cls, dataset_name: str, merged_path: Path, part_paths: Lis except Exception: continue - removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + removal_markers = list(self._CONTEXT_REMOVALS) + [dataset_name] for row in merged_rows: term_info_raw = str(row.get("term_info", "")) kept_sentences: List[str] = [] for sentence in term_info_raw.split("."): - sentence_no_links = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence) + sentence_no_links = re.sub( + r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence + ) if any(marker in sentence_no_links for marker in removal_markers): continue kept_sentences.append(sentence_no_links) - row["term_info"] = cls._clean_term_info(".".join(kept_sentences)) + row["term_info"] = self._normalize_text( + ".".join(kept_sentences), drop_questions=False + ) merged_path.parent.mkdir(parents=True, exist_ok=True) - json.dump(merged_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + json.dump( + merged_rows, + open(merged_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=4, + ) # best-effort cleanup - for part_path in part_paths: + for shard_path in shard_paths: try: - os.remove(part_path) + os.remove(shard_path) except Exception: pass - @staticmethod - def _fill_bucket_threaded(bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str]) -> None: - start_index = 0 - try: - if output_path.is_file(): - existing_rows = json.load(open(output_path, "r", encoding="utf-8")) - if isinstance(existing_rows, list) and existing_rows: - bucket_rows[: len(existing_rows)] = existing_rows - start_index = len(existing_rows) - except Exception: - pass - - for row_index in range(start_index, len(bucket_rows)): - try: - bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) - except Exception: - bucket_rows[row_index]["term_info"] = "" - if row_index % 10 == 1: - json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) - - json.dump(bucket_rows, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) - - @staticmethod - def _fill_bucket_process( - worker_id: int, - bucket_rows: List[dict], - output_path: Path, - provider: Callable[[str], str], - progress_map: "multiprocessing.managers.DictProxy", - ) -> None: - current_index = 0 - try: - if output_path.is_file(): - existing_rows = json.load(open(output_path, "r", encoding="utf-8")) - if isinstance(existing_rows, list) and existing_rows: - bucket_rows[: len(existing_rows)] = existing_rows - current_index = len(existing_rows) - except Exception: - pass - - progress_map[worker_id] = current_index - - for row_index in range(current_index, len(bucket_rows)): - try: - bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) - except Exception: - bucket_rows[row_index]["term_info"] = "" - progress_map[worker_id] = row_index + 1 - if row_index % 10 == 1: - json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) - - json.dump(bucket_rows, open(output_path, "w", 
encoding="utf-8"), ensure_ascii=False, indent=2) - progress_map[worker_id] = len(bucket_rows) - - @classmethod def _execute_for_terms( - cls, + self, terms: List[str], merged_path: Path, - part_paths: List[Path], + shard_paths: List[Path], provider: Callable[[str], str], dataset_name: str, num_workers: int = 2, ) -> None: """ - Generate context for `terms`, writing shards to `part_paths`, then merge. - Threads on Windows; processes on POSIX. + Generate context for `terms`, writing shards to `shard_paths`, then merge. + + Always uses threads (pickling-safe for instance methods). + Shows a tqdm progress bar and merges shards at the end. """ worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) - all_rows = [{"id": row_index, "term": term, "term_info": ""} for row_index, term in enumerate(terms)] + all_rows = [ + {"id": index, "term": term, "term_info": ""} + for index, term in enumerate(terms) + ] buckets: List[List[dict]] = [[] for _ in range(worker_count)] for reversed_index, row in enumerate(reversed(all_rows)): buckets[reversed_index % worker_count].append(row) - if cls._is_windows(): - total_rows = len(terms) - progress_bar = tqdm(total=total_rows, desc=f"{dataset_name} generation (threads)") - - def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: - cls._fill_bucket_threaded(bucket_rows, out_path, provider) - return len(bucket_rows) - - with ThreadPoolExecutor(max_workers=worker_count) as pool: - futures = [pool.submit(run_bucket, buckets[bucket_index], part_paths[bucket_index]) - for bucket_index in range(worker_count)] - for future in as_completed(futures): - completed_count = future.result() - if progress_bar: - progress_bar.update(completed_count) - if progress_bar: - progress_bar.close() - else: - manager = multiprocessing.Manager() - progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) - - processes: List[multiprocessing.Process] = [] - for worker_index, bucket_rows in enumerate(buckets): - process = multiprocessing.Process( - target=cls._fill_bucket_process, - args=(worker_index, bucket_rows, part_paths[worker_index], provider, progress_map), + total_rows = len(terms) + progress_bar = tqdm( + total=total_rows, desc=f"{dataset_name} generation (threads)" + ) + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + self._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [ + pool.submit( + run_bucket, buckets[bucket_index], shard_paths[bucket_index] ) - processes.append(process) - process.start() - - total_rows = len(terms) - with tqdm(total=total_rows, desc=f"{dataset_name} generation") as progress_bar: - previous_total = 0 - while any(process.is_alive() for process in processes): - current_total = int(sum(progress_map.values())) - progress_bar.update(current_total - previous_total) - previous_total = current_total - time.sleep(0.5) - current_total = int(sum(progress_map.values())) - if current_total > previous_total: - progress_bar.update(current_total - previous_total) - - for process in processes: - process.join() - - cls._merge_part_files(dataset_name, merged_path, part_paths) - - @classmethod + for bucket_index in range(worker_count) + ] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + + self._merge_part_files(dataset_name, merged_path, shard_paths) + def _re_infer_short_entries( 
- cls, + self, merged_path: Path, - re_part_paths: List[Path], + re_shard_paths: List[Path], re_merged_path: Path, provider: Callable[[str], str], dataset_name: str, num_workers: int, ) -> int: """ - Re-query terms with too-short term_info (< 50 chars). Returns remaining count. + Re-query terms whose `term_info` is too short (< 50 chars). + + Process: + - Read `merged_path`. + - Filter boilerplate using `_CONTEXT_REMOVALS` and `dataset_name`. + - Split into short/long groups by length 50. + - Regenerate short group with `provider` in parallel (threads). + - Merge regenerated + long back into `merged_path`. + + Returns: + int: Count of rows still < 50 chars after re-inference. """ merged_rows = json.load(open(merged_path, "r", encoding="utf-8")) - removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + removal_markers = list(self._CONTEXT_REMOVALS) + [dataset_name] short_rows: List[dict] = [] long_rows: List[dict] = [] @@ -635,9 +889,14 @@ def _re_infer_short_entries( term_info_raw = str(row.get("term_info", "")) sentences = term_info_raw.split(".") for marker in removal_markers: - sentences = [sentence if marker not in sentence else "" for sentence in sentences] - filtered_info = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", ".".join(sentences)) + sentences = [ + sentence if marker not in sentence else "" for sentence in sentences + ] + filtered_info = self._normalize_text( + ".".join(sentences), drop_questions=False + ) row["term_info"] = filtered_info + (short_rows if len(filtered_info) < 50 else long_rows).append(row) worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) @@ -645,77 +904,83 @@ def _re_infer_short_entries( for row_index, row in enumerate(short_rows): buckets[row_index % worker_count].append(row) - # clean old re-inference shards - for path in re_part_paths: + # Clean old re-inference shards + for path in re_shard_paths: try: os.remove(path) except Exception: pass total_candidates = len(short_rows) - if cls._is_windows(): - progress_bar = tqdm(total=total_candidates, desc=f"{dataset_name} re-inference (threads)") - - def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: - cls._fill_bucket_threaded(bucket_rows, out_path, provider) - return len(bucket_rows) - - with ThreadPoolExecutor(max_workers=worker_count) as pool: - futures = [pool.submit(run_bucket, buckets[bucket_index], re_part_paths[bucket_index]) - for bucket_index in range(worker_count)] - for future in as_completed(futures): - completed_count = future.result() - if progress_bar: - progress_bar.update(completed_count) - if progress_bar: - progress_bar.close() - else: - manager = multiprocessing.Manager() - progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) - - processes: List[multiprocessing.Process] = [] - for worker_index, bucket_rows in enumerate(buckets): - process = multiprocessing.Process( - target=cls._fill_bucket_process, - args=(worker_index, bucket_rows, re_part_paths[worker_index], provider, progress_map), + progress_bar = tqdm( + total=total_candidates, desc=f"{dataset_name} re-inference (threads)" + ) + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + self._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [ + pool.submit( + run_bucket, buckets[bucket_index], re_shard_paths[bucket_index] ) - processes.append(process) - process.start() - - with tqdm(total=total_candidates, desc=f"{dataset_name} re-inference") as 
progress_bar: - previous_total = 0 - while any(process.is_alive() for process in processes): - current_total = int(sum(progress_map.values())) - progress_bar.update(current_total - previous_total) - previous_total = current_total - time.sleep(1) - if progress_bar.n < total_candidates: - progress_bar.update(total_candidates - progress_bar.n) - - for process in processes: - process.join() - - # merge and write back - cls._merge_part_files(dataset_name, re_merged_path, re_part_paths) - new_rows = json.load(open(re_merged_path, "r", encoding="utf-8")) if re_merged_path.is_file() else [] + for bucket_index in range(worker_count) + ] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + + # Merge and write back + self._merge_part_files(dataset_name, re_merged_path, re_shard_paths) + new_rows = ( + json.load(open(re_merged_path, "r", encoding="utf-8")) + if re_merged_path.is_file() + else [] + ) final_rows = long_rows + new_rows - json.dump(final_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + json.dump( + final_rows, + open(merged_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=4, + ) - remaining_short = sum(1 for row in final_rows if len(str(row.get("term_info", ""))) < 50) + remaining_short = sum( + 1 for row in final_rows if len(str(row.get("term_info", ""))) < 50 + ) return remaining_short - @staticmethod - def _extract_terms_from_ontology(ontology: Any) -> List[str]: + def _extract_terms_from_ontology(self, ontology: Any) -> List[str]: """ - Collect unique term names from ontology.type_taxonomies.taxonomies. + Collect unique term names from `ontology.type_taxonomies.taxonomies`, + falling back to `ontology.taxonomies` if needed. + + Returns: + list[str]: Sorted unique term list. """ type_taxonomies = getattr(ontology, "type_taxonomies", None) - taxonomies = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology, "taxonomies", None) + taxonomies = ( + getattr(type_taxonomies, "taxonomies", None) + if type_taxonomies is not None + else getattr(ontology, "taxonomies", None) + ) unique_terms: set[str] = set() if taxonomies: for row in taxonomies: - parent_term = getattr(row, "parent", None) if not isinstance(row, dict) else row.get("parent") - child_term = getattr(row, "child", None) if not isinstance(row, dict) else row.get("child") + parent_term = ( + getattr(row, "parent", None) + if not isinstance(row, dict) + else row.get("parent") + ) + child_term = ( + getattr(row, "child", None) + if not isinstance(row, dict) + else row.get("child") + ) if parent_term: unique_terms.add(str(parent_term)) if child_term: @@ -732,15 +997,32 @@ def preprocess_context_from_ontology( max_retries: int = 5, ) -> Path: """ - Build {id, term, term_info} from an ontology object. - Always regenerates {dataset_name}_processed.json. + Build `{id, term, term_info}` rows from an ontology object. + + Always regenerates the fixed-name file `rwthdbis_onto_processed.json`, + performing: + - Parallel generation of term_info in shards (`_execute_for_terms`), + - Re-inference rounds for short entries (`_re_infer_short_entries`), + - Final merge and cleanup, + - Updates `self.context_json_path`. 
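+
+        A typical call is sketched below; the dataset name and directory are
+        placeholders, and the ontology object is assumed to be passed as the
+        first argument:
+
+            learner.preprocess_context_from_ontology(
+                ontology,
+                dataset_name="chord",
+                processed_dir="./context",
+            )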
+ + Filenames under `processed_dir`: + - merged: `rwthdbis_onto_processed.json` + - shards: `rwthdbis_onto_type_part{idx}.json` + - re-infer shards: `rwthdbis_onto_re_inference{idx}.json` + - re-infer merged: `rwthdbis_onto_Types_re_inference.json` + + Returns: + Path: The merged context JSON path (`rwthdbis_onto_processed.json`). """ - provider = provider or provider or partial(self._default_gpt_inference_with_dataset, dataset_name=dataset_name) + provider = provider or partial( + self._default_gpt_inference_with_dataset, dataset_name=dataset_name + ) processed_dir = Path(processed_dir) processed_dir.mkdir(parents=True, exist_ok=True) - merged_path = processed_dir / f"{dataset_name}_processed.json" + merged_path = processed_dir / "rwthdbis_onto_processed.json" if merged_path.exists(): try: merged_path.unlink() @@ -748,12 +1030,18 @@ def preprocess_context_from_ontology( pass worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) - shard_paths = [processed_dir / f"{dataset_name}_type_part{shard_index}.json" for shard_index in range(worker_count)] - reinf_paths = [processed_dir / f"{dataset_name}_re_inference{shard_index}.json" for shard_index in range(worker_count)] - reinf_merged_path = processed_dir / f"{dataset_name}_Types_re_inference.json" - - # remove any leftover shards - for path in shard_paths + reinf_paths + [reinf_merged_path]: + shard_paths = [ + processed_dir / f"rwthdbis_onto_type_part{index}.json" + for index in range(worker_count) + ] + re_shard_paths = [ + processed_dir / f"rwthdbis_onto_re_inference{index}.json" + for index in range(worker_count) + ] + re_merged_path = processed_dir / "rwthdbis_onto_Types_re_inference.json" + + # Remove any leftover shards + for path in shard_paths + re_shard_paths + [re_merged_path]: try: if path.exists(): path.unlink() @@ -766,7 +1054,7 @@ def preprocess_context_from_ontology( self._execute_for_terms( terms=unique_terms, merged_path=merged_path, - part_paths=shard_paths, + shard_paths=shard_paths, provider=provider, dataset_name=dataset_name, num_workers=worker_count, @@ -776,13 +1064,15 @@ def preprocess_context_from_ontology( while retry_round < max_retries: remaining_count = self._re_infer_short_entries( merged_path=merged_path, - re_part_paths=reinf_paths, - re_merged_path=reinf_merged_path, + re_shard_paths=re_shard_paths, + re_merged_path=re_merged_path, provider=provider, dataset_name=dataset_name, num_workers=worker_count, ) - print(f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}") + print( + f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}" + ) retry_round += 1 if remaining_count == 0: break diff --git a/ontolearner/learner/taxonomy_discovery/sbunlp.py b/ontolearner/learner/taxonomy_discovery/sbunlp.py index 9fc520d..660ec6e 100644 --- a/ontolearner/learner/taxonomy_discovery/sbunlp.py +++ b/ontolearner/learner/taxonomy_discovery/sbunlp.py @@ -1,45 +1,33 @@ # Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# License: MIT import os import re import json -import importlib.util -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import torch -from transformers import AutoTokenizer, AutoModelForCausalLM - +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig from ...base import AutoLearner + class SBUNLPFewShotLearner(AutoLearner): """ - Taxonomy-discovery via N×M batch prompting with a small Qwen model. - - Lifecycle - --------- - fit(): - Cache + clean training parent–child pairs. - predict(): - Chunk (train pairs × test terms), prompt per chunk pair, parse, merge, - and deduplicate predicted relations. + Few-shot taxonomy discovery via N×M batch prompting. + + This learner: + - Caches & cleans gold parent–child pairs during `fit`. + - Splits (train pairs × test terms) into a grid of chunks. + - Builds an instruction prompt per grid cell with few-shot JSON examples. + - Generates and parses model outputs as JSON relations. + - Merges & deduplicates all predicted edges. """ def __init__( self, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", try_4bit: bool = True, + device: str = "cpu", num_train_chunks: int = 7, num_test_chunks: int = 7, max_new_tokens: int = 140, @@ -50,88 +38,117 @@ def __init__( output_dir: Optional[str] = None, **kwargs: Any, ) -> None: + """ + Initialize the learner and core generation / batching settings. + + Args: + model_name: HF id/path of the causal LLM (e.g., Qwen Instruct). + try_4bit: If True and on CUDA, load with 4-bit NF4 quantization. + device: "cpu" or "cuda" for model execution. + num_train_chunks: Number of chunks for the gold (parent, child) bank. + num_test_chunks: Number of chunks for the test term list. + max_new_tokens: Max new tokens to generate per prompt call. + max_input_tokens: Clip the *input* prompt to this many tokens (tail kept). + temperature: Sampling temperature; 0.0 uses greedy decoding. + top_p: Nucleus sampling parameter (used when temperature > 0). + limit_num_prompts: Optional hard cap on prompts issued (debug/cost). + output_dir: Optional directory to save per-batch JSON predictions. + **kwargs: Forwarded to the base class. + """ super().__init__(**kwargs) self.model_name = model_name self.try_4bit = try_4bit + self.device = device self.num_train_chunks = num_train_chunks self.num_test_chunks = num_test_chunks - self.max_new_tokens = max_new_tokens self.max_input_tokens = max_input_tokens self.temperature = temperature self.top_p = top_p self.limit_num_prompts = limit_num_prompts - self.output_dir = output_dir self.tokenizer: Optional[AutoTokenizer] = None self.model: Optional[AutoModelForCausalLM] = None - self.device = "cuda" if torch.cuda.is_available() else "cpu" - self.train_pairs_clean: List[Dict[str, str]] = [] - # ----------------------- small helpers ---------------------- - def _clean_pairs(pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: - """ - Normalize, drop empty or self-relations, and deduplicate by (parent, child). + def _clean_pairs(self, pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: """ - cleaned_pairs: List[Dict[str, str]] = [] - seen_parent_child: set[Tuple[str, str]] = set() + Normalize, filter, and deduplicate relation pairs. - for pair_record in pair_rows or []: - if not isinstance(pair_record, dict): - continue + Operations: + - Cast 'parent'/'child' to strings and strip whitespace. + - Drop rows with empty values. + - Drop self-relations (case-insensitive parent == child). 
+ - Deduplicate by lowercase (parent, child). - parent_label = str(pair_record.get("parent", "")).strip() - child_label = str(pair_record.get("child", "")).strip() - if not parent_label or not child_label: - continue + Args: + pair_rows: Raw list of dicts with at least 'parent' and 'child'. - normalized_key = (parent_label.lower(), child_label.lower()) - if normalized_key[0] == normalized_key[1]: # parent==child + Returns: + Cleaned list of {'parent','child'} dicts. + """ + cleaned, seen = [], set() + for rec in pair_rows or []: + if not isinstance(rec, dict): + continue + p = str(rec.get("parent", "")).strip() + c = str(rec.get("child", "")).strip() + if not p or not c: continue - if normalized_key in seen_parent_child: + key = (p.lower(), c.lower()) + if key[0] == key[1] or key in seen: continue + seen.add(key) + cleaned.append({"parent": p, "child": c}) + return cleaned - seen_parent_child.add(normalized_key) - cleaned_pairs.append({"parent": parent_label, "child": child_label}) + def _chunk_list(self, items: List[Any], num_chunks: int) -> List[List[Any]]: + """ + Split a list into `num_chunks` near-equal contiguous parts. - return cleaned_pairs + Args: + items: Sequence to split. + num_chunks: Number of chunks to produce; if <= 0, returns [items]. - def _chunk_list(items: List[Any], num_chunks: int) -> List[List[Any]]: - """ - Split `items` into `num_chunks` near-equal parts. Some chunks may be empty. + Returns: + List of chunks (some may be empty if len(items) < num_chunks). """ if num_chunks <= 0: return [items] - total_items = len(items) - base_size, remainder = divmod(total_items, num_chunks) - - chunks: List[List[Any]] = [] - start_index = 0 - for chunk_index in range(num_chunks): - current_size = base_size + (1 if chunk_index < remainder else 0) - end_index = start_index + current_size - chunks.append(items[start_index:end_index]) - start_index = end_index - return chunks - - def _ensure_dir(self, maybe_path: Optional[str]) -> None: - if maybe_path: - os.makedirs(maybe_path, exist_ok=True) - - # ---------------------- model load/gen ---------------------- - def load(self, **_: Any) -> None: + n = len(items) + base, rem = divmod(n, num_chunks) + out, start = [], 0 + for i in range(num_chunks): + size = base + (1 if i < rem else 0) + out.append(items[start : start + size]) + start += size + return out + + def _ensure_dir(self, path: Optional[str]) -> None: """ - Load tokenizer/model; use 4-bit nf4 on CUDA if available + requested. + Create a directory if `path` is a non-empty string. + + Args: + path: Directory to create (recursively). Ignored if falsy. """ - bnb_available = importlib.util.find_spec("bitsandbytes") is not None - use_4bit_quant = bool(self.try_4bit and bnb_available and self.device == "cuda") + if path: + os.makedirs(path, exist_ok=True) + def load(self, **_: Any) -> None: + """ + Load tokenizer and model; optionally enable 4-bit quantization. + + Assumes bitsandbytes is available if `try_4bit=True` on CUDA. + Sets tokenizer pad token if missing. Places model on GPU (device_map='auto') + when `device='cuda'`, otherwise on CPU. + + Args: + **_: Unused kwargs for interface compatibility. 
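+
+        Example (sketch of a CPU-only setup, so no quantization is attempted):
+
+            learner = SBUNLPFewShotLearner(
+                model_name="Qwen/Qwen2.5-0.5B-Instruct", try_4bit=False, device="cpu"
+            )
+            learner.load()  # tokenizer and model are now ready for generation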
+ """ quant_config = None - if use_4bit_quant: - from transformers import BitsAndBytesConfig + if self.try_4bit and self.device == "cuda": quant_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, @@ -140,8 +157,11 @@ def load(self, **_: Any) -> None: ) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token = self.tokenizer.eos_token + if getattr(self.tokenizer, "pad_token_id", None) is None: + if getattr(self.tokenizer, "eos_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.eos_token + elif getattr(self.tokenizer, "unk_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.unk_token self.model = AutoModelForCausalLM.from_pretrained( self.model_name, @@ -149,12 +169,26 @@ def load(self, **_: Any) -> None: torch_dtype=(torch.float16 if self.device == "cuda" else torch.float32), quantization_config=quant_config, ) + if self.device == "cpu": + self.model.to("cpu") def _format_chat(self, user_text: str) -> str: """ - Wrap user text with the model's chat template (if present). + Wrap plain text with the model's chat template, if provided. + + Many instruction-tuned models expose `tokenizer.chat_template`. + If available, use it to construct a proper chat prompt; otherwise, + return the text unchanged. + + Args: + user_text: Content of the user message. + + Returns: + A generation-ready prompt string. """ - if hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None): + if hasattr(self.tokenizer, "apply_chat_template") and getattr( + self.tokenizer, "chat_template", None + ): return self.tokenizer.apply_chat_template( [{"role": "user", "content": user_text}], tokenize=False, @@ -165,17 +199,31 @@ def _format_chat(self, user_text: str) -> str: @torch.no_grad() def _generate(self, prompt_text: str) -> str: """ - Single prompt → model text. Clips *input* tokens to avoid overflow. - """ - formatted_prompt = self._format_chat(prompt_text) - prompt_token_ids = self.tokenizer(formatted_prompt, add_special_tokens=False, return_tensors=None)["input_ids"] - if len(prompt_token_ids) > self.max_input_tokens: - prompt_token_ids = prompt_token_ids[-self.max_input_tokens:] + Generate text for a single prompt, guarding input length. + + Steps: + 1) Format prompt via chat template (if present). + 2) Tokenize and clip the *input* to `max_input_tokens` (tail kept). + 3) Call `model.generate` with configured decoding params. + 4) Strip the echoed prompt from the decoded output (if present). - prompt_tensor = torch.tensor([prompt_token_ids]).to(self.model.device) + Args: + prompt_text: Textual prompt to feed the model. - generation = self.model.generate( - input_ids=prompt_tensor, + Returns: + Model continuation string (prompt-echo stripped when applicable). 
+ """ + formatted = self._format_chat(prompt_text) + ids = self.tokenizer(formatted, add_special_tokens=False, return_tensors=None)[ + "input_ids" + ] + if len(ids) > self.max_input_tokens: + ids = ids[-self.max_input_tokens :] + device = next(self.model.parameters()).device + input_ids = torch.tensor([ids], device=device) + + out = self.model.generate( + input_ids=input_ids, max_new_tokens=self.max_new_tokens, do_sample=(self.temperature > 0.0), temperature=self.temperature, @@ -185,20 +233,37 @@ def _generate(self, prompt_text: str) -> str: use_cache=True, ) - decoded_full = self.tokenizer.decode(generation[0], skip_special_tokens=True) - decoded_prompt = self.tokenizer.decode(prompt_tensor[0], skip_special_tokens=True) - return decoded_full[len(decoded_prompt):].strip() if decoded_full.startswith(decoded_prompt) else decoded_full.strip() + decoded_full = self.tokenizer.decode(out[0], skip_special_tokens=True) + decoded_prompt = self.tokenizer.decode(input_ids[0], skip_special_tokens=True) + return ( + decoded_full[len(decoded_prompt) :].strip() + if decoded_full.startswith(decoded_prompt) + else decoded_full.strip() + ) - # ------------------ prompt build & parsing ------------------ - def _build_prompt(train_pairs_chunk: List[Dict[str, str]], - test_terms_chunk: List[str]) -> str: + def _build_prompt( + self, + train_pairs_chunk: List[Dict[str, str]], + test_terms_chunk: List[str], + ) -> str: """ - Few-shot with JSON examples + a block of test terms. - The model must return ONLY a JSON array of {parent, child}. + Construct a few-shot prompt with JSON examples and test terms. + + The prompt: + - Shows several gold (parent, child) examples in JSON. + - Lists the test terms (one per line) between [PAIR] tags. + - Instructs to return ONLY a JSON array of {'parent','child'}. + + Args: + train_pairs_chunk: Cleaned training relations for examples. + test_terms_chunk: The current chunk of test terms. + + Returns: + The fully formatted prompt string. """ examples_json = json.dumps(train_pairs_chunk, ensure_ascii=False, indent=2) - test_types_block = "\n".join(test_terms_chunk) - return ( + test_block = "\n".join(test_terms_chunk) + prompt = ( "From this file, extract all parent–child relations like in the examples.\n" "Return ONLY a JSON array of objects with keys 'parent' and 'child'.\n" "Output format:\n" @@ -210,108 +275,128 @@ def _build_prompt(train_pairs_chunk: List[Dict[str, str]], f"{examples_json}\n\n" "TEST TYPES (between [PAIR] tags):\n" "[PAIR]\n" - f"{test_types_block}\n" + f"{test_block}\n" "[PAIR]\n" "Return only JSON." ) + return prompt - def _parse_pairs(model_text: str) -> List[Dict[str, str]]: - """ - Parse a model response into a list of {'parent','child'} dicts. + def _parse_pairs(self, text: str) -> List[Dict[str, str]]: """ - def deduplicate_and_normalize(dict_list: List[Dict[str, str]]) -> List[Dict[str, str]]: - return SBUNLPFewShotLearner._clean_pairs(dict_list) + Parse a generation string into a list of relation dicts. - response_text = model_text.strip() + Parsing strategy: + 1) Try to parse the entire string as JSON; expect a list. + 2) Else, regex-extract the outermost JSON-like array and parse that. + 3) On failure, return an empty list. - # 1) Direct JSON list + Args: + text: Raw model output. + + Returns: + Cleaned list of {'parent','child'} dicts (possibly empty). 
+ """ + text = text.strip() try: - maybe_json = json.loads(response_text) - if isinstance(maybe_json, list): - return deduplicate_and_normalize(maybe_json) + obj = json.loads(text) + if isinstance(obj, list): + return self._clean_pairs(obj) except Exception: pass - - # 2) Find outermost [ ... ] and parse that - outer_list_match = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", response_text) - if outer_list_match: + m = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", text) + if m: try: - array_json = json.loads(outer_list_match.group(0)) - if isinstance(array_json, list): - return deduplicate_and_normalize(array_json) + obj = json.loads(m.group(0)) + if isinstance(obj, list): + return self._clean_pairs(obj) except Exception: pass - - # 3) Nothing parsable return [] - # --------------------- AutoLearner hooks -------------------- def fit(self, train_data: Any, task: str, ontologizer: bool = True): """ - Build the training example bank (parent–child pairs). + Cache and clean gold relations for few-shot prompting. + + For `task == "taxonomy-discovery"`: + - If `ontologizer=True`, convert ontology-like input into + a list of {'parent','child'} via the base helper. + - Otherwise, accept a user-provided list directly. + - Store a cleaned, deduplicated bank in `self.train_pairs_clean`. + + Args: + train_data: Ontology-like object or list of relation dicts. + task: Task selector (expects "taxonomy-discovery"). + ontologizer: Whether to transform ontology inputs. + + Returns: + None. (State is stored on the instance.) """ if task != "taxonomy-discovery": return super().fit(train_data, task, ontologizer) - if ontologizer: - # Convert ontology object → list of {"parent","child"} gold pairs - gold_pairs_from_ontology = self.tasks_ground_truth_former( - train_data, task="taxonomy-discovery" - ) - self.train_pairs_clean = self._clean_pairs(gold_pairs_from_ontology) + gold = self.tasks_ground_truth_former(train_data, task="taxonomy-discovery") + self.train_pairs_clean = self._clean_pairs(gold) else: - # Already a Python list of dicts self.train_pairs_clean = self._clean_pairs(train_data) - def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + def _taxonomy_discovery( + self, data: Any, test: bool = False + ) -> Optional[List[Dict[str, str]]]: """ - Main prediction path. Returns a deduplicated list of relations. + Run few-shot inference (test=True) or no-op during training. + + Inference steps: + - Ensure tokenizer/model are loaded. + - Normalize `data` to a list of test terms (via base helper if needed). + - Create the N×M grid across (train_pairs_chunk × test_terms_chunk). + - For each cell: build prompt → generate → parse → (optionally) save. + - Merge and deduplicate all predicted pairs before returning. + + Args: + data: Test input (ontology-like, list of strings, or mixed). + test: If True, perform prediction; otherwise return None. + + Returns: + On `test=True`: deduplicated list of {'parent','child'}. + On `test=False`: None. 
""" if not test: return None - if self.model is None or self.tokenizer is None: self.load() - # Build test vocabulary of types/terms if isinstance(data, list) and (len(data) == 0 or isinstance(data[0], str)): - test_type_list: List[str] = data + test_terms: List[str] = data else: - test_type_list = super().tasks_data_former( + test_terms = super().tasks_data_former( data=data, task="taxonomy-discovery", test=True ) - # Create N×M grid train_chunks = self._chunk_list(self.train_pairs_clean, self.num_train_chunks) - test_chunks = self._chunk_list(test_type_list, self.num_test_chunks) + test_chunks = self._chunk_list(test_terms, self.num_test_chunks) self._ensure_dir(self.output_dir) - merged_predicted_pairs: List[Dict[str, str]] = [] - issued_prompt_count = 0 + merged: List[Dict[str, str]] = [] + issued = 0 - for train_chunk_index, train_pairs_chunk in enumerate(train_chunks, start=1): - for test_chunk_index, test_terms_chunk in enumerate(test_chunks, start=1): - issued_prompt_count += 1 - if self.limit_num_prompts and issued_prompt_count > self.limit_num_prompts: + for ti, tr in enumerate(train_chunks, 1): + for si, ts in enumerate(test_chunks, 1): + issued += 1 + if self.limit_num_prompts and issued > self.limit_num_prompts: break + prompt = self._build_prompt(tr, ts) + resp = self._generate(prompt) + pairs = self._parse_pairs(resp) - prompt_text = self._build_prompt(train_pairs_chunk, test_terms_chunk) - model_response = self._generate(prompt_text) - parsed_relation_pairs = self._parse_pairs(model_response) - - # Optional per-batch dump for debugging if self.output_dir: - batch_json_path = os.path.join( - self.output_dir, f"pairs_T{train_chunk_index}_S{test_chunk_index}.json" - ) - with open(batch_json_path, "w", encoding="utf-8") as fp: - json.dump(parsed_relation_pairs, fp, ensure_ascii=False, indent=2) + path = os.path.join(self.output_dir, f"pairs_T{ti}_S{si}.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(pairs, f, ensure_ascii=False, indent=2) - merged_predicted_pairs.extend(parsed_relation_pairs) + merged.extend(pairs) - if self.limit_num_prompts and issued_prompt_count >= (self.limit_num_prompts or 0): + if self.limit_num_prompts and issued >= (self.limit_num_prompts or 0): break - # Deduplicate final list - return self._clean_pairs(merged_predicted_pairs) + return self._clean_pairs(merged) diff --git a/ontolearner/learner/taxonomy_discovery/skhnlp.py b/ontolearner/learner/taxonomy_discovery/skhnlp.py index fbe53b4..c242aab 100644 --- a/ontolearner/learner/taxonomy_discovery/skhnlp.py +++ b/ontolearner/learner/taxonomy_discovery/skhnlp.py @@ -23,6 +23,7 @@ from typing import Any, Optional, List, Tuple, Dict from transformers import ( AutoTokenizer, + AutoModelForSequenceClassification, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification, @@ -35,10 +36,20 @@ from ...utils import taxonomy_split, train_test_split as ontology_split from ...data_structure import OntologyData, TaxonomicRelation + class SKHNLPTaxonomyPrompts(AutoPrompt): - """Builds the 7 taxonomy prompts used during fine-tuning / inference.""" + """Builds the 7 taxonomy prompts used during fine-tuning / inference. + + The class stores a small inventory of prompt templates that verbalize the + (parent, child) relationship using different phrasings. Each template ends + with a masked token slot intended for True/False classification. + """ + def __init__(self) -> None: - super().__init__(prompt_template="{parent} is the superclass of {child}. 
This statement is [MASK].") + """Initialize prompt templates and the default prompt in the base class.""" + super().__init__( + prompt_template="{parent} is the superclass of {child}. This statement is [MASK]." + ) self.templates: List[str] = [ "{parent} is the superclass of {child}. This statement is [MASK].", "{child} is a subclass of {parent}. This statement is [MASK].", @@ -49,7 +60,17 @@ def __init__(self) -> None: "{parent} is an ancestor class of {child}. This statement is [MASK].", ] - def make(self, parent: str, child: str, template_idx: int) -> str: + def format(self, parent: str, child: str, template_idx: int) -> str: + """Render a prompt for a (parent, child) pair using a specific template. + + Args: + parent: The parent/superclass label. + child: The child/subclass label. + template_idx: Index into the internal `templates` list. + + Returns: + The fully formatted prompt string. + """ return self.templates[template_idx].format(parent=parent, child=child) @@ -66,20 +87,18 @@ class SKHNLPSequentialFTLearner(AutoLearner): * PREDICT/TEST: pairwise binary classification (returns label + score). """ - # Fixed constants defining data split size and the proportional mix of - # negative sample types (reversed vs. manipulated) for balancing. - _EVAL_FRACTION: float = 0.16 - _NEG_RATIO_REVERSED: float = 1/3 - _NEG_RATIO_MANIPULATED: float = 2/3 - def __init__( self, # core model_name: str = "bert-large-uncased", n_prompts: int = 7, random_state: int = 1403, - device: Optional[str] = None, # "cuda" | "cpu" | None (auto) - + num_labels: int = 2, + device: str = "cpu", # "cuda" | "cpu" | None (auto) + # data split & negative sampling (now configurable) + eval_fraction: float = 0.16, + neg_ratio_reversed: float = 1 / 3, + neg_ratio_manipulated: float = 2 / 3, # ---- expose TrainingArguments as individual user-defined args ---- output_dir: str = "./results/", num_train_epochs: int = 1, @@ -92,12 +111,52 @@ def __init__( eval_strategy: str = "epoch", save_strategy: str = "epoch", load_best_model_at_end: bool = True, + use_fast_tokenizer: Optional[bool] = None, + trust_remote_code: bool = False, ) -> None: + """Configure the sequential fine-tuning learner. + + Args: + model_name: HF model id or local path for the BERT backbone. + n_prompts: Number of prompt variants to iterate over sequentially. + random_state: RNG seed for shuffling/sampling steps. + num_labels: Number of classes for the classifier head. + device: Force device ('cuda' or 'cpu'). If None, auto-detects CUDA. + eval_fraction: Fraction of positives to hold out for evaluation. + neg_ratio_reversed: Proportion of reversed-parent negatives vs positives. + neg_ratio_manipulated: Proportion of random-parent negatives vs positives. + output_dir: Directory where HF Trainer writes checkpoints/outputs. + num_train_epochs: Number of epochs per prompt. + per_device_train_batch_size: Training batch size per device. + per_device_eval_batch_size: Evaluation batch size per device. + warmup_steps: Linear warmup steps for LR scheduler. + weight_decay: Weight decay coefficient. + logging_dir: Directory for Trainer logs. + logging_steps: Interval for log events (in steps). + eval_strategy: Evaluation schedule ('no', 'steps', 'epoch'). + save_strategy: Checkpoint save schedule ('no', 'steps', 'epoch'). + load_best_model_at_end: Whether to restore the best checkpoint. + use_fast_tokenizer: Force fast/slow tokenizer. If None, try fast then fallback to slow. + Notes: + The model is fine-tuned *sequentially* across prompt columns. 
+ You can control the eval split and negative sampling mix via + `eval_fraction`, `neg_ratio_reversed`, and `neg_ratio_manipulated`. + """ super().__init__() self.model_name = model_name self.n_prompts = n_prompts self.random_state = random_state - self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.num_labels = num_labels + self.device = device + + # user-tunable ratios / split + self._eval_fraction = float(eval_fraction) + self._neg_ratio_reversed = float(neg_ratio_reversed) + self._neg_ratio_manipulated = float(neg_ratio_manipulated) + if not (0.0 < self._eval_fraction < 1.0): + raise ValueError("eval_fraction must be in (0, 1).") + if self._neg_ratio_reversed < 0 or self._neg_ratio_manipulated < 0: + raise ValueError("neg_ratio_* must be >= 0.") self.tokenizer: Optional[BertTokenizer] = None self.model: Optional[BertForSequenceClassification] = None @@ -109,6 +168,8 @@ def __init__( # Keep last train/eval tables for inspection self._last_train: Optional[pd.DataFrame] = None self._last_eval: Optional[pd.DataFrame] = None + self.trust_remote_code = bool(trust_remote_code) + self.use_fast_tokenizer = use_fast_tokenizer random.seed(self.random_state) @@ -128,19 +189,77 @@ def __init__( ) def load(self, model_id: Optional[str] = None, **_: Any) -> None: - """Load tokenizer and model; move model to the requested device.""" + """Load tokenizer & model in a backbone-agnostic way; move model to self.device.""" model_id = model_id or self.model_name - self.tokenizer = BertTokenizer.from_pretrained(model_id) - self.model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2) + + # ---- Tokenizer (robust fast→slow fallback unless explicitly set) ---- + if self.use_fast_tokenizer is None: + try: + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=self.trust_remote_code + ) + except Exception as fast_err: + print( + f"[tokenizer] Fast tokenizer failed: {fast_err}. Falling back to slow tokenizer..." 
+ ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=self.trust_remote_code + ) + else: + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, + use_fast=self.use_fast_tokenizer, + trust_remote_code=self.trust_remote_code, + ) + + # Ensure pad token exists (some models lack it) + if getattr(self.tokenizer, "pad_token", None) is None: + # Try sensible fallbacks + fallback = ( + getattr(self.tokenizer, "eos_token", None) + or getattr(self.tokenizer, "sep_token", None) + or getattr(self.tokenizer, "cls_token", None) + ) + if fallback is not None: + self.tokenizer.pad_token = fallback + + # ---- Model (classifier head sized to self.num_labels) ---- + self.model = AutoModelForSequenceClassification.from_pretrained( + model_id, + num_labels=self.num_labels, + trust_remote_code=self.trust_remote_code, + # Allows swapping in a new head size even if the checkpoint differs + ignore_mismatched_sizes=True, + ) + + # Make sure padding ids line up + if ( + getattr(self.model.config, "pad_token_id", None) is None + and getattr(self.tokenizer, "pad_token_id", None) is not None + ): + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + # Set problem type (single-label classification by default) + # If you plan multi-label, you'd switch to "multi_label_classification" self.model.config.problem_type = "single_label_classification" - # place on device chosen by user (or auto) - target_device = self.device - if target_device not in {"cuda", "cpu"}: - target_device = "cuda" if torch.cuda.is_available() else "cpu" - self.model.to(target_device) + # Move to target device + self.model.to(self.device) def tasks_ground_truth_former(self, data: Any, task: str) -> Any: + """Normalize ground-truth inputs for 'taxonomy-discovery'. + + Supports DataFrame with columns ['parent','child',('label')], + list of dicts, or falls back to the base class behavior. + + Args: + data: Input object to normalize. + task: Task name, passed from the outer pipeline. + + Returns: + A list of dictionaries with keys 'parent', 'child', and optionally + 'label' when present in the input. + """ if task != "taxonomy-discovery": return super().tasks_ground_truth_former(data, task) @@ -150,15 +269,29 @@ def tasks_ground_truth_former(self, data: Any, task: str) -> Any: {"parent": p, "child": c, "label": bool(lbl)} for p, c, lbl in zip(data["parent"], data["child"], data["label"]) ] - return [{"parent": p, "child": c} for p, c in zip(data["parent"], data["child"])] + return [ + {"parent": p, "child": c} for p, c in zip(data["parent"], data["child"]) + ] if isinstance(data, list): return data return super().tasks_ground_truth_former(data, task) - def _make_negatives(self, positives_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Return (reversed_df, manipulated_df).""" + def _make_negatives( + self, positives_df: pd.DataFrame + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create two types of negatives from a positives table. + + Returns: + A tuple `(reversed_df, manipulated_df)` where: + - `reversed_df`: pairs with parent/child columns swapped, label=False. + - `manipulated_df`: pairs with the parent replaced by a random + *different* parent from the same pool, label=False. + + Notes: + The input DataFrame must contain columns ['parent', 'child']. 
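+
+        Example (illustrative; with only two distinct parents the random
+        replacement is forced, so the outcome shown is deterministic):
+
+            positives = pd.DataFrame([{"parent": "animal", "child": "dog"},
+                                      {"parent": "plant", "child": "oak"}])
+            reversed_df, manipulated_df = learner._make_negatives(positives)
+            # reversed_df:    parents and children swapped, label=False
+            # manipulated_df: ("plant", "dog") and ("animal", "oak"), label=False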
+ """ unique_parents = positives_df["parent"].unique().tolist() def as_reversed(df: pd.DataFrame) -> pd.DataFrame: @@ -171,6 +304,7 @@ def with_random_parent(df: pd.DataFrame) -> pd.DataFrame: def pick_other_parent(p: str) -> str: pool = [x for x in unique_parents if x != p] return random.choice(pool) if pool else p + out = df.copy() out["parent"] = out["parent"].apply(pick_other_parent) out["label"] = False @@ -184,10 +318,23 @@ def _balance_with_negatives( reversed_df: pd.DataFrame, manipulated_df: pd.DataFrame, ) -> pd.DataFrame: - """Combine positives and negatives with the same ratios as before.""" + """Combine positives with negatives using configured ratios. + + Sampling ratios are defined by the instance settings + `self._neg_ratio_reversed` and `self._neg_ratio_manipulated`, + keeping the positives count unchanged. + + Args: + positives_df: Positive pairs with `label=True`. + reversed_df: Negative pairs produced by flipping parent/child. + manipulated_df: Negative pairs with randomly reassigned parents. + + Returns: + A deduplicated, shuffled DataFrame with a class-balanced mix. + """ n_pos = len(positives_df) - n_rev = int(n_pos * self._NEG_RATIO_REVERSED) - n_man = int(n_pos * self._NEG_RATIO_MANIPULATED) + n_rev = int(n_pos * self._neg_ratio_reversed) + n_man = int(n_pos * self._neg_ratio_manipulated) combined = pd.concat( [ @@ -197,26 +344,75 @@ def _balance_with_negatives( ], ignore_index=True, ) - combined = combined.drop_duplicates(subset=["parent", "child", "label"]).reset_index(drop=True) + combined = combined.drop_duplicates( + subset=["parent", "child", "label"] + ).reset_index(drop=True) return combined def _add_prompt_columns(self, df: pd.DataFrame) -> pd.DataFrame: + """Append one column per prompt variant to the given pairs table. + + For each row `(parent, child)`, creates columns `prompt_1 ... prompt_n`. + + Args: + df: Input DataFrame with columns ['parent', 'child', ...]. + + Returns: + A copy of `df` including the newly added prompt columns. + """ out = df.copy() for i in range(self.n_prompts): - out[f"prompt_{i+1}"] = out.apply( - lambda r, k=i: self.prompter.make(r["parent"], r["child"], k), axis=1 + out[f"prompt_{i + 1}"] = out.apply( + lambda r, k=i: self.prompter.format(r["parent"], r["child"], k), axis=1 ) return out - def _df_from_relations(relations: List[TaxonomicRelation], label: bool = True) -> pd.DataFrame: + def _df_from_relations( + self, relations: List[TaxonomicRelation], label: bool = True + ) -> pd.DataFrame: + """Convert a list of `TaxonomicRelation` to a DataFrame. + + Args: + relations: Iterable of `TaxonomicRelation(parent, child)`. + label: Class label to assign to all resulting rows. + + Returns: + DataFrame with columns ['parent', 'child', 'label']. + """ if not relations: return pd.DataFrame(columns=["parent", "child", "label"]) - return pd.DataFrame([{"parent": r.parent, "child": r.child, "label": label} for r in relations]) + return pd.DataFrame( + [{"parent": r.parent, "child": r.child, "label": label} for r in relations] + ) + + def _relations_from_df(self, df: pd.DataFrame) -> List[TaxonomicRelation]: + """Convert a DataFrame to a list of `TaxonomicRelation`. - def _relations_from_df(df: pd.DataFrame) -> List[TaxonomicRelation]: - return [TaxonomicRelation(parent=p, child=c) for p, c in zip(df["parent"], df["child"])] + Args: + df: DataFrame with columns ['parent', 'child']. 
- def _build_masked_prompt(self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]") -> str: + Returns: + List of `TaxonomicRelation` objects in row order. + """ + return [ + TaxonomicRelation(parent=p, child=c) + for p, c in zip(df["parent"], df["child"]) + ] + + def _build_masked_prompt( + self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]" + ) -> str: + """Construct one of several True/False prompts with a mask token. + + Args: + parent: Parent label. + child: Child label. + index_1_based: 1-based index selecting a template. + mask_token: The token used to denote the masked label. + + Returns: + A formatted prompt string. + """ prompts_1based = [ f"{parent} is the superclass of {child}. This statement is {mask_token}.", f"{child} is a subclass of {parent}. This statement is {mask_token}.", @@ -226,18 +422,42 @@ def _build_masked_prompt(self, parent: str, child: str, index_1_based: int, mask f"{child} is a subtype of {parent}. This statement is {mask_token}.", f"{parent} is an ancestor class of {child}. This statement is {mask_token}.", f"{child} is a descendant classs of {child}. This statement is {mask_token}.", - f"\"{parent}\" is the superclass of \"{child}\". This statement is {mask_token}.", + f'"{parent}" is the superclass of "{child}". This statement is {mask_token}.', ] return prompts_1based[index_1_based - 1] @torch.no_grad() def _predict_prompt_true_false(self, sentence: str) -> bool: + """Run a single True/False prediction on a prompt. + + Args: + sentence: Fully formatted prompt text. + + Returns: + True iff the predicted class index is 1 (positive). + """ enc = self.tokenizer(sentence, return_tensors="pt").to(self.model.device) logits = self.model(**enc).logits predicted_label = torch.argmax(logits, dim=1).item() return predicted_label == 1 def _select_parent_via_prompts(self, child: str) -> str: + """Select the most likely parent for a given child via prompt voting. + + The procedure: + 1) Generate prompts for each candidate parent at increasing "levels". + 2) Accumulate votes from the True/False classifier. + 3) Resolve ties by recursing to the next level; after 4 levels, break ties randomly. + + Args: + child: The child label whose parent should be predicted. + + Returns: + The chosen parent string. + + Raises: + AssertionError: If candidate parents were not initialized. + """ assert self._candidate_parents, "Candidate parents not initialized." scores: dict[str, int] = {p: 0 for p in self._candidate_parents} @@ -247,14 +467,18 @@ def prompt_indices_for_level(level: int) -> List[int]: return [2 * level, 2 * level + 1] def recurse(active_parents: List[str], level: int) -> str: - idxs = [i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts] + idxs = [ + i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts + ] if idxs: for parent in active_parents: votes = sum( 1 for idx in idxs if self._predict_prompt_true_false( - self._build_masked_prompt(parent=parent, child=child, index_1_based=idx) + self._build_masked_prompt( + parent=parent, child=child, index_1_based=idx + ) ) ) scores[parent] += votes @@ -277,6 +501,15 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): TEST: - OntologyData -> parent selection: [{'parent': predicted, 'child': child}] - DataFrame/list -> binary pair classification with 'label' + 'score' + + Args: + data: One of {OntologyData, pandas.DataFrame, list[dict], list[tuple]}. + test: If True, run inference; otherwise perform training. 
+ + Returns: + - On training: None (model is fine-tuned in-place). + - On inference with OntologyData: list of {'parent','child'} predictions. + - On inference with pairs: list of dicts including 'label' and 'score'. """ is_ontology_object = isinstance(data, OntologyData) @@ -298,7 +531,9 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): if self._candidate_parents is None: self._candidate_parents = parents_in_call else: - self._candidate_parents = sorted(set(self._candidate_parents).union(parents_in_call)) + self._candidate_parents = sorted( + set(self._candidate_parents).union(parents_in_call) + ) else: if self._candidate_parents is None: self._candidate_parents = parents_in_call @@ -317,7 +552,7 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): true_probs_by_prompt: List[torch.Tensor] = [] for i in range(self.n_prompts): - col = f"prompt_{i+1}" + col = f"prompt_{i + 1}" enc = self.tokenizer( prompts_df[col].tolist(), return_tensors="pt", @@ -333,18 +568,35 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): results: List[dict[str, Any]] = [] for p, c, s, yhat in zip( - pairs_df["parent"], pairs_df["child"], avg_true_prob.tolist(), predicted_bool + pairs_df["parent"], + pairs_df["child"], + avg_true_prob.tolist(), + predicted_bool, ): - results.append({"parent": p, "child": c, "label": int(bool(yhat)), "score": float(s)}) + results.append( + { + "parent": p, + "child": c, + "label": int(bool(yhat)), + "score": float(s), + } + ) return results if isinstance(data, OntologyData): train_onto, eval_onto = ontology_split( - data, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + data, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, ) - train_pos_rel: List[TaxonomicRelation] = getattr(train_onto.type_taxonomies, "taxonomies", []) or [] - eval_pos_rel: List[TaxonomicRelation] = getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + train_pos_rel: List[TaxonomicRelation] = ( + getattr(train_onto.type_taxonomies, "taxonomies", []) or [] + ) + eval_pos_rel: List[TaxonomicRelation] = ( + getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + ) train_pos_df = self._df_from_relations(train_pos_rel, label=True) eval_pos_df = self._df_from_relations(eval_pos_rel, label=True) @@ -360,11 +612,17 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): else: if "label" not in pairs_df.columns or pairs_df["label"].nunique() == 1: - positives_df = pairs_df[pairs_df.get("label", True)][["parent", "child"]].copy() + positives_df = pairs_df[pairs_df.get("label", True)][ + ["parent", "child"] + ].copy() pos_rel = self._relations_from_df(positives_df) tr_rel, ev_rel = taxonomy_split( - pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + pos_rel, + train_terms=None, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, ) train_pos_df = self._df_from_relations(tr_rel, label=True) eval_pos_df = self._df_from_relations(ev_rel, label=True) @@ -372,8 +630,12 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): tr_rev_df, tr_man_df = self._make_negatives(train_pos_df) ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) - train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df) - eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df) + train_df = self._balance_with_negatives( + train_pos_df, tr_rev_df, tr_man_df + ) + eval_df = self._balance_with_negatives( + 
eval_pos_df, ev_rev_df, ev_man_df + ) train_df = self._add_prompt_columns(train_df) eval_df = self._add_prompt_columns(eval_df) @@ -383,16 +645,30 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): pos_rel = self._relations_from_df(positives_df) tr_rel, ev_rel = taxonomy_split( - pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + pos_rel, + train_terms=None, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, ) train_pos_df = self._df_from_relations(tr_rel, label=True) eval_pos_df = self._df_from_relations(ev_rel, label=True) negatives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy() - negatives_df = negatives_df.sample(frac=1.0, random_state=self.random_state).reset_index(drop=True) - - n_eval_neg = max(1, int(len(negatives_df) * self._EVAL_FRACTION)) if len(negatives_df) > 0 else 0 - eval_neg_df = negatives_df.iloc[:n_eval_neg].copy() if n_eval_neg > 0 else negatives_df.iloc[:0].copy() + negatives_df = negatives_df.sample( + frac=1.0, random_state=self.random_state + ).reset_index(drop=True) + + n_eval_neg = ( + max(1, int(len(negatives_df) * self._eval_fraction)) + if len(negatives_df) > 0 + else 0 + ) + eval_neg_df = ( + negatives_df.iloc[:n_eval_neg].copy() + if n_eval_neg > 0 + else negatives_df.iloc[:0].copy() + ) train_neg_df = negatives_df.iloc[n_eval_neg:].copy() train_neg_df["label"] = False @@ -410,21 +686,36 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): # Sequential fine-tuning across prompts for i in range(self.n_prompts): - prompt_col = f"prompt_{i+1}" - train_ds = Dataset.from_pandas(train_df[[prompt_col, "label"]].reset_index(drop=True)) - eval_ds = Dataset.from_pandas(eval_df[[prompt_col, "label"]].reset_index(drop=True)) + prompt_col = f"prompt_{i + 1}" + train_ds = Dataset.from_pandas( + train_df[[prompt_col, "label"]].reset_index(drop=True) + ) + eval_ds = Dataset.from_pandas( + eval_df[[prompt_col, "label"]].reset_index(drop=True) + ) train_ds = train_ds.rename_column("label", "labels") eval_ds = eval_ds.rename_column("label", "labels") def tokenize_batch(batch): - return self.tokenizer(batch[prompt_col], padding="max_length", truncation=True) + """Tokenize a batch for the current prompt column with truncation/padding.""" + return self.tokenizer( + batch[prompt_col], padding="max_length", truncation=True + ) - train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col]) - eval_ds = eval_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col]) + train_ds = train_ds.map( + tokenize_batch, batched=True, remove_columns=[prompt_col] + ) + eval_ds = eval_ds.map( + tokenize_batch, batched=True, remove_columns=[prompt_col] + ) - train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) - eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) + train_ds.set_format( + type="torch", columns=["input_ids", "attention_mask", "labels"] + ) + eval_ds.set_format( + type="torch", columns=["input_ids", "attention_mask", "labels"] + ) trainer = Trainer( model=self.model, @@ -481,13 +772,25 @@ class SKHNLPZSLearner(AutoLearner): def __init__( self, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", - device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) max_new_tokens: int = 16, - save_path: Optional[str] = None, # directory or full path + save_path: Optional[str] = None, # directory or full path verbose: 
bool = True, - normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" + normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" random_state: int = 1403, ) -> None: + """Configure the zero-shot learner. + + Args: + model_name: HF model id/path for the instruction-tuned causal LLM. + device: Force device ('cuda' or 'cpu'), else auto-detect. + max_new_tokens: Generation length budget for each completion. + save_path: Optional CSV path or directory for saving predictions. + verbose: If True, print progress messages. + normalize_mode: Post-processing for class names + ('none' | 'substring' | 'levenshtein' | 'auto'). + random_state: RNG seed for any sampling steps. + """ super().__init__() self.model_name = model_name self.verbose = verbose @@ -502,7 +805,7 @@ def __init__( if device is None: self._has_cuda = torch.cuda.is_available() else: - self._has_cuda = (device == "cuda") + self._has_cuda = device == "cuda" self._pipe_device = 0 if self._has_cuda else -1 self._model_device_map = {"": "cuda"} if self._has_cuda else None @@ -530,6 +833,13 @@ def __init__( def load(self, model_id: str = "") -> None: """ Load tokenizer, model, and text-generation pipeline. + + Args: + model_id: Optional HF id/path override; defaults to `self.model_name`. + + Side Effects: + Initializes the tokenizer and model, configures the generation + pipeline on CPU/GPU, and sets a pad token if absent. """ model_id = model_id or self.model_name if self.verbose: @@ -538,7 +848,10 @@ def load(self, model_id: str = "") -> None: self._tokenizer = AutoTokenizer.from_pretrained(model_id) # Ensure a pad token is set for generation - if self._tokenizer.pad_token_id is None and self._tokenizer.eos_token_id is not None: + if ( + self._tokenizer.pad_token_id is None + and self._tokenizer.eos_token_id is not None + ): self._tokenizer.pad_token = self._tokenizer.eos_token self._model = AutoModelForCausalLM.from_pretrained( @@ -558,10 +871,19 @@ def load(self, model_id: str = "") -> None: print("Device set to use", "cuda" if self._has_cuda else "cpu") print("[ZeroShotTaxonomyLearner] Model loaded.") - def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[List[Dict[str, str]]]: + def _taxonomy_discovery( + self, data: Any, test: bool = False + ) -> Optional[List[Dict[str, str]]]: """ Zero-shot prediction over all incoming rows (no filtering/augmentation). - Returns a list of dictionaries: [{'parent': predicted_label, 'child': child}, ...] + + Args: + data: One of {DataFrame, list[dict], list[tuple], Ontology-like}. + test: If False, training is skipped (zero-shot learner), and None is returned. + + Returns: + On `test=True`, a list of dicts [{'parent': predicted_label, 'child': child}, ...]. + On `test=False`, returns None. """ if not test: if self.verbose: @@ -617,16 +939,22 @@ def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[List[Di eval_df.at[idx, "prediction"] = final_label # Return in the format expected by the pipeline - return [{"parent": p, "child": c} for p, c in zip(eval_df["prediction"], eval_df["child"])] + return [ + {"parent": p, "child": c} + for p, c in zip(eval_df["prediction"], eval_df["child"]) + ] def _generate_and_parse(self, child_term: str) -> (str, str): """ Generate a completion for the given child term and extract the raw predicted class using the strict '#[ ... ]#' pattern. 
- Returns - ------- - (raw_generation_text, parsed_prediction_or_unknown) + Args: + child_term: The child label to classify into one of the fixed classes. + + Returns: + Tuple `(raw_generation_text, parsed_prediction_or_unknown)`, where the second + element is either the text inside '#[ ... ]#' or the string 'unknown'. """ messages = [ {"role": "system", "content": "You are a helpful classifier."}, @@ -654,13 +982,15 @@ def _generate_and_parse(self, child_term: str) -> (str, str): parsed = match.group(1).strip() if match else "unknown" return generation, parsed - # ------------------------------------------------------------------------- - # Normalization helpers - # ------------------------------------------------------------------------- - def _normalize_substring_only(self, text: str) -> str: """ Snap to a label if the string is equal to / contained in / contains a valid label (case-insensitive). + + Args: + text: Raw class text to normalize. + + Returns: + One of `CLASS_LIST` on a match; otherwise 'unknown'. """ if not isinstance(text, str): return "unknown" @@ -670,13 +1000,23 @@ def _normalize_substring_only(self, text: str) -> str: for label in self.CLASS_LIST: label_lower = label.lower() - if lowered == label_lower or lowered in label_lower or label_lower in lowered: + if ( + lowered == label_lower + or lowered in label_lower + or label_lower in lowered + ): return label return "unknown" def _normalize_levenshtein_only(self, text: str) -> str: """ Snap to the nearest label by Levenshtein (edit) distance. + + Args: + text: Raw class text to normalize. + + Returns: + The nearest label in `CLASS_LIST`, or 'unknown' if input is empty/invalid. """ if not isinstance(text, str): return "unknown" @@ -697,37 +1037,59 @@ def _normalize_levenshtein_only(self, text: str) -> str: def _normalize_auto(self, text: str) -> str: """ Cascade: try substring-first; if no match, fall back to Levenshtein snapping. + + Args: + text: Raw class text to normalize. + + Returns: + Normalized label string or 'unknown'. """ snapped = self._normalize_substring_only(text) - return snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + return ( + snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + ) - def _to_dataframe(data: Any) -> pd.DataFrame: + def _to_dataframe(self, data: Any) -> pd.DataFrame: """ - Normalize various input formats into a DataFrame with columns: - ['child', 'parent'] or ['child', 'parent', 'label']. + Normalize various input formats into a DataFrame. + + Supported inputs: + * pandas.DataFrame with columns ['child','parent',('label')] + * list[dict] with keys 'child','parent',('label') + * list of tuples/lists: (child, parent) or (child, parent, label) + * Ontology-like object with `.type_taxonomies.taxonomies` + + Args: + data: The source object to normalize. + + Returns: + A pandas DataFrame with standardized columns. + + Raises: + ValueError: If the input type/shape is not recognized. 
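+
+        Example:
+            A minimal sketch; the rows below are assumed sample values and
+            `learner` stands in for any instance of this class.
+
+            >>> rows = [{"child": "Minor", "parent": "Chord"},
+            ...         {"child": "Major", "parent": "Chord"}]
+            >>> learner._to_dataframe(rows).columns.tolist()
+            ['child', 'parent']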
""" - # Already a DataFrame if isinstance(data, pd.DataFrame): df = data.copy() df.columns = [str(c).lower() for c in df.columns] return df.reset_index(drop=True) - # List[dict] if isinstance(data, list) and data and isinstance(data[0], dict): rows = [{str(k).lower(): v for k, v in d.items()} for d in data] return pd.DataFrame(rows).reset_index(drop=True) - # Iterable of tuples/lists: (child, parent[, label]) if isinstance(data, (list, tuple)) and data: first = data[0] if isinstance(first, (list, tuple)) and not isinstance(first, dict): n = len(first) if n >= 3: - return pd.DataFrame(data, columns=["child", "parent", "label"]).reset_index(drop=True) + return pd.DataFrame( + data, columns=["child", "parent", "label"] + ).reset_index(drop=True) if n == 2: - return pd.DataFrame(data, columns=["child", "parent"]).reset_index(drop=True) + return pd.DataFrame(data, columns=["child", "parent"]).reset_index( + drop=True + ) - # OntoLearner-style object (with .type_taxonomies.taxonomies) try: type_taxonomies = getattr(data, "type_taxonomies", None) if type_taxonomies is not None: @@ -737,9 +1099,15 @@ def _to_dataframe(data: Any) -> pd.DataFrame: for rel in taxonomies: parent = getattr(rel, "parent", None) child = getattr(rel, "child", None) - label = getattr(rel, "label", None) if hasattr(rel, "label") else None + label = ( + getattr(rel, "label", None) + if hasattr(rel, "label") + else None + ) if parent is not None and child is not None: - rows.append({"child": child, "parent": parent, "label": label}) + rows.append( + {"child": child, "parent": parent, "label": label} + ) if rows: return pd.DataFrame(rows).reset_index(drop=True) except Exception: @@ -751,10 +1119,19 @@ def _to_dataframe(data: Any) -> pd.DataFrame: ".type_taxonomies.taxonomies." ) - def _resolve_save_path(save_path: str, default_filename: str) -> str: + def _resolve_save_path(self, save_path: str, default_filename: str) -> str: """ - If `save_path` is a directory, join it with `default_filename`. - If it's a file path, return as-is. + Resolve a target file path from a directory or path-like input. + + If `save_path` points to a directory, joins it with `default_filename`. + If it already looks like a file path (has an extension), returns as-is. + + Args: + save_path: Directory or file path supplied by the caller. + default_filename: Basename to use when `save_path` is a directory. + + Returns: + A concrete file path where outputs can be written. """ base = os.path.basename(save_path) has_ext = os.path.splitext(base)[1] != "" diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py deleted file mode 100644 index a42d716..0000000 --- a/ontolearner/learner/term_typing/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .rwthdbis import RWTHDBISSFTLearner -from .sbunlp import SBUNLPZSLearner -from .alexbek import AlexbekRFLearner, AlexbekRAGLearner diff --git a/ontolearner/learner/term_typing/alexbek.py b/ontolearner/learner/term_typing/alexbek.py index 7aa6033..0db694b 100644 --- a/ontolearner/learner/term_typing/alexbek.py +++ b/ontolearner/learner/term_typing/alexbek.py @@ -12,6 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Learners for supervised and retrieval-augmented *term typing*. + +This module implements two learners: + +- **AlexbekRFLearner** (retriever/classifier): + Encodes terms with a Hugging Face encoder, optionally augments with simple + graph features, and trains a One-vs-Rest RandomForest for multi-label typing. + +- **AlexbekRAGLearner** (retrieval-augmented generation): + Builds an in-memory example index with sentence embeddings, retrieves + nearest examples for each query term, then prompts an instruction-tuned + causal LLM to produce types, parsing the JSON response. + +Both learners conform to the `AutoLearner` / `AutoRetriever` APIs used in +the outer pipeline. +""" + import gc import json import re @@ -31,22 +48,19 @@ from ...base import AutoLearner, AutoRetriever + class AlexbekRFLearner(AutoRetriever): """ Embedding-based multi-label classifier for *term typing*. - Pipeline overview: - 1) Load a Hugging Face encoder (tokenizer + model). - 2) Encode input terms into sentence embeddings. - 3) Optionally augment with simple graph (co-occurrence) features. - 4) Train a One-vs-Rest RandomForest on the concatenated features. - 5) Predict multi-label types with a probability threshold (fallback to top-1). - - API expected by LearnerPipeline: - - load(model_id) - - fit(data, task, ontologizer=True) - - predict(data, task, ontologizer=True) - - tasks_ground_truth_former(data, task) + Pipeline + 1) Load a Hugging Face encoder (tokenizer + model). + 2) Encode input terms into sentence embeddings. + 3) Optionally augment with simple graph (co-occurrence) features. + 4) Train a One-vs-Rest RandomForest on the concatenated features. + 5) Predict multi-label types with a probability threshold (fallback to top-1). + + Implements the `AutoRetriever` interface used by the outer pipeline. """ def __init__( @@ -58,6 +72,23 @@ def __init__( use_graph_features: bool = True, rf_kwargs: Optional[Dict[str, Any]] = None, ): + """Configure the RF-based multi-label learner. + + Parameters + device: + Torch device spec ('cpu' or 'cuda'). + batch_size: + Encoding mini-batch size for the transformer. + max_length: + Maximum input token length for the encoder tokenizer. + threshold: + Per-label probability threshold at prediction time. + use_graph_features: + If True, add simple graph features to embeddings. + rf_kwargs: + Optional RandomForest hyperparameters dictionary. + + """ # Runtime / inference settings self.device = torch.device(device) self.batch_size = batch_size @@ -81,21 +112,50 @@ def __init__( self.term_graph: Optional[nx.Graph] = None def load(self, model_id: str, **_: Any) -> None: - """Load a Hugging Face encoder by model id (tokenizer + base model).""" + """Load a Hugging Face encoder by model id (tokenizer + base model). + + Parameters + model_id: + HF model identifier or local path for an encoder backbone. + + Side Effects + - Sets `self.model_name`, `self.tokenizer`, `self.embedding_model`. + - Puts the model in eval mode and moves it to `self.device`. 
+ """ self.model_name = model_id self.tokenizer = AutoTokenizer.from_pretrained(model_id) self.embedding_model = AutoModel.from_pretrained(model_id) self.embedding_model.eval().to(self.device) def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: - """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features).""" + """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features). + + Parameters + data: + Training payload; supported formats are routed via `_as_term_types_dicts`. + Each example must contain at least `{"term": str, "types": List[str]}`. + task: + Must be `'term-typing'`. + ontologizer: + Unused here; accepted for API compatibility. + **_: + Ignored extra arguments. + + Raises + ValueError + If `task` is not `'term-typing'` or if no valid examples are found. + """ if task != "term-typing": - raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) # Normalize incoming training data into a list of dicts: {term, types, RAG} training_rows = self._as_term_types_dicts(data) if not training_rows: - raise ValueError("No valid training examples found (need 'term' and 'types').") + raise ValueError( + "No valid training examples found (need 'term' and 'types')." + ) # Split out terms and raw labels training_terms: List[str] = [row["term"] for row in training_rows] @@ -110,7 +170,9 @@ def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: # Optionally build a light-weight co-occurrence graph and extract features if self.use_graph_features: self.term_graph = self._create_term_graph(training_rows) - graph_features_train = self._extract_graph_features(self.term_graph, training_terms) + graph_features_train = self._extract_graph_features( + self.term_graph, training_terms + ) X_train = np.hstack([term_embeddings_train, graph_features_train]) else: self.term_graph = None @@ -120,18 +182,48 @@ def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: Y_train = self.label_binarizer.transform(raw_label_lists) # One-vs-Rest RandomForest (one binary RF per label) - self.ovr_random_forest = OneVsRestClassifier(RandomForestClassifier(**self.rf_kwargs)) + self.ovr_random_forest = OneVsRestClassifier( + RandomForestClassifier(**self.rf_kwargs) + ) self.ovr_random_forest.fit(X_train, Y_train) - - def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> List[Dict[str, Any]]: + def predict( + self, data: Any, task: str, ontologizer: bool = True, **_: Any + ) -> List[Dict[str, Any]]: """Predict multi-label types for input terms. - Returns a list of dicts with keys: {id, term, types}. + Parameters + data: + Evaluation payload; formats normalized by `_as_predict_terms_ids`. + task: + Must be `'term-typing'`. + ontologizer: + Unused here; accepted for API compatibility. + **_: + Ignored extra arguments. + + Returns + List[Dict[str, Any]] + A list of dictionaries with keys: + - `id`: Original example id (if provided). + - `term`: Input term string. + - `types`: List of predicted label strings (selected by threshold or top-1). + + Raises + ValueError + If `task` is not `'term-typing'`. + RuntimeError + If `load()` and `fit()` have not been called. 
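+
+        Example
+            A minimal sketch; assumes `learner` has already been loaded and
+            fitted, and 'guitar' is an arbitrary query term.
+
+            >>> preds = learner.predict(["guitar"], task="term-typing")
+            >>> sorted(preds[0].keys())
+            ['id', 'term', 'types']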
""" if task != "term-typing": - raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") - if self.ovr_random_forest is None or self.tokenizer is None or self.embedding_model is None: + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) + if ( + self.ovr_random_forest is None + or self.tokenizer is None + or self.embedding_model is None + ): raise RuntimeError("Call load() and fit() before predict().") # Normalize prediction input into parallel lists of terms and example ids @@ -142,7 +234,9 @@ def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> L # Match feature layout used during training if self.use_graph_features and self.term_graph is not None: - graph_features_test = self._extract_graph_features(self.term_graph, test_terms) + graph_features_test = self._extract_graph_features( + self.term_graph, test_terms + ) X_test = np.hstack([term_embeddings_test, graph_features_test]) else: X_test = term_embeddings_test @@ -160,7 +254,9 @@ def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> L if len(selected_label_indices) == 0: selected_label_indices = [int(np.argmax(label_probabilities))] - predicted_types = [label_names[label_idx] for label_idx in selected_label_indices] + predicted_types = [ + label_names[label_idx] for label_idx in selected_label_indices + ] predictions.append( { @@ -172,20 +268,49 @@ def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> L return predictions def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, Any]]: - """Normalize ground-truth into a list of {id, term, types} dicts for evaluation.""" + """Normalize ground-truth into a list of {id, term, types} dicts for evaluation. + + Parameters + data: + Ground-truth payload; supported formats include objects exposing + `.term_typings`, a list of dicts, or a list of tuples/lists. + task: + Must be `'term-typing'`. + + Returns + List[Dict[str, Any]] + A list of dictionaries with keys `id`, `term`, `types` (list of str). + + Raises + ValueError + If `task` is not `'term-typing'`. + """ if task != "term-typing": - raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) return self._as_gold_id_term_types(data) def _encode(self, texts: List[str]) -> np.ndarray: - """Encode a list of strings into L2-normalized sentence embeddings (NumPy array). + """Encode a list of strings into L2-normalized sentence embeddings. - If no texts are provided, returns an empty array with width equal to the model hidden size. + Parameters + texts: + List of input texts/terms. + + Returns + np.ndarray + Array of shape `(len(texts), hidden_size)` with L2-normalized + embeddings. If `texts` is empty, returns a `(0, hidden_size)` array. """ - assert self.tokenizer is not None and self.embedding_model is not None, "Call load(model_id) first." + assert self.tokenizer is not None and self.embedding_model is not None, ( + "Call load(model_id) first." 
+ ) if not texts: - hidden_size = getattr(getattr(self.embedding_model, "config", None), "hidden_size", 768) + hidden_size = getattr( + getattr(self.embedding_model, "config", None), "hidden_size", 768 + ) return np.zeros((0, hidden_size), dtype=np.float32) batch_embeddings: List[torch.Tensor] = [] @@ -208,11 +333,15 @@ def _encode(self, texts: List[str]) -> np.ndarray: model_output = self.embedding_model(**tokenized_batch) # Prefer dedicated pooler if provided; otherwise pool by last valid token - if hasattr(model_output, "pooler_output") and model_output.pooler_output is not None: + if ( + hasattr(model_output, "pooler_output") + and model_output.pooler_output is not None + ): sentence_embeddings = model_output.pooler_output else: sentence_embeddings = self._last_token_pool( - model_output.last_hidden_state, tokenized_batch["attention_mask"] + model_output.last_hidden_state, + tokenized_batch["attention_mask"], ) # L2-normalize embeddings for stability @@ -230,18 +359,44 @@ def _encode(self, texts: List[str]) -> np.ndarray: # Concatenate all batches and convert to NumPy return torch.cat(batch_embeddings, dim=0).numpy() - def _last_token_pool(self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: - """Select the last *non-padding* token embedding for each sequence in the batch.""" + def _last_token_pool( + self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Select the last *non-padding* token embedding for each sequence. + + Parameters + last_hidden_states: + Tensor of shape `(batch, seq_len, hidden)`. + attention_mask: + Tensor of shape `(batch, seq_len)` with 1 for real tokens. + + Returns + torch.Tensor + Tensor of shape `(batch, hidden)` with per-sequence pooled embeddings. + """ last_valid_token_idx = attention_mask.sum(dim=1) - 1 # (batch,) - batch_row_idx = torch.arange(last_hidden_states.size(0), device=last_hidden_states.device) + batch_row_idx = torch.arange( + last_hidden_states.size(0), device=last_hidden_states.device + ) return last_hidden_states[batch_row_idx, last_valid_token_idx] def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: """Create a simple undirected co-occurrence graph from training rows. - Nodes: terms (with node attribute 'types'). - Edges: between a term and each neighbor from its optional RAG list. - Edge weight = number of shared types (or 0.1 if none shared). + Graph Structure + Nodes + Terms (node attribute `'types'` is stored per term). + Edges + Between a term and each neighbor from its optional RAG list. + Edge weight = number of shared types (or 0.1 if none shared). + + Parameters + training_rows: + Normalized rows with keys: `'term'`, `'types'`, optional `'RAG'`. + + Returns + networkx.Graph + The constructed undirected graph. 
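+
+        Example
+            A minimal sketch; `learner` and the sample row are assumed values.
+
+            >>> rows = [{"term": "piano", "types": ["Instrument"],
+            ...          "RAG": [{"term": "organ", "types": ["Instrument"]}]}]
+            >>> g = learner._create_term_graph(rows)
+            >>> g.number_of_nodes(), g.has_edge("piano", "organ")
+            (2, True)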
""" graph = nx.Graph() @@ -251,7 +406,7 @@ def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: graph.add_node(term, types=term_types) # RAG may be a list of neighbor dicts like {"term": ..., "types": [...]} - for neighbor in (row.get("RAG", []) or []): + for neighbor in row.get("RAG", []) or []: neighbor_term = neighbor.get("term") neighbor_types = neighbor.get("types", []) @@ -263,12 +418,24 @@ def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: return graph - def _extract_graph_features(self, term_graph: nx.Graph, terms: List[str]) -> np.ndarray: + def _extract_graph_features( + self, term_graph: nx.Graph, terms: List[str] + ) -> np.ndarray: """Compute simple per-term graph features. + Feature Vector For each term we compute a 4-dim vector: - [degree, clustering_coefficient, degree_centrality, pagerank_score] - Returns an array of shape [len(terms), 4]. + `[degree, clustering_coefficient, degree_centrality, pagerank_score]` + + Parameters + term_graph: + Graph built over training terms. + terms: + List of term strings to extract features for. + + Returns + np.ndarray + Array of shape `(len(terms), 4)` (dtype float32). """ if len(term_graph): degree_centrality = nx.degree_centrality(term_graph) @@ -293,7 +460,26 @@ def _extract_graph_features(self, term_graph: nx.Graph, terms: List[str]) -> np. return np.asarray(feature_rows, dtype=np.float32) def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: - """Normalize diverse training data formats to a list of dicts: {term, types, RAG}.""" + """Normalize diverse training data formats to a list of dicts: {term, types, RAG}. + + Supported Inputs + - Object with attribute `.term_typings` (iterable of items exposing + `.term`, `.types`, optional `.RAG`). + - List of dicts with keys `term`, `types`, optional `RAG`. + - List/tuple of `(term, types[, RAG])`. + + Parameters + data: + Training payload. + + Returns + List[Dict[str, Any]] + Normalized dictionaries ready for training. + + Raises + ValueError + If `data` is neither a list/tuple nor exposes `.term_typings`. 
+ """ normalized_rows: List[Dict[str, Any]] = [] # Case 1: object with attribute `.term_typings` @@ -308,13 +494,19 @@ def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: if not isinstance(type_list, list): type_list = [type_list] normalized_rows.append( - {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } ) return normalized_rows # Otherwise: must be a list/tuple-like container if not isinstance(data, (list, tuple)): - raise ValueError("Training data must be a list/tuple or expose .term_typings") + raise ValueError( + "Training data must be a list/tuple or expose .term_typings" + ) if not data: return normalized_rows @@ -330,7 +522,11 @@ def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: if not isinstance(type_list, list): type_list = [type_list] normalized_rows.append( - {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } ) return normalized_rows @@ -345,13 +541,36 @@ def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: if not isinstance(type_list, list): type_list = [type_list] normalized_rows.append( - {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } ) return normalized_rows def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: - """Normalize prediction input into parallel lists: (terms, ids).""" + """Normalize prediction input into parallel lists: (terms, ids). + + Supported Inputs + - Object with `.term_typings`. + - List of dicts with `term` and optional `id`. + - List of tuples/lists `(term, id[, ...])`. + - List of plain term strings. + + Parameters + data: + Evaluation payload. + + Returns + Tuple[List[str], List[Any]] + `(terms, example_ids)` lists aligned by index. + + Raises + ValueError + If the input format is unsupported. + """ terms: List[str] = [] example_ids: List[Any] = [] @@ -392,7 +611,20 @@ def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: raise ValueError("Unsupported predict() input format.") def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: - """Normalize gold labels into a list of dicts: {id, term, types}.""" + """Normalize gold labels into a list of dicts: {id, term, types}. + + Supported Inputs + Mirrors `_as_term_types_dicts`, but ensures an `id` is set. + + Parameters + data: + Ground-truth payload. + + Returns + List[Dict[str, Any]] + `{'id': Any, 'term': str, 'types': List[str]}` entries. 
+ + """ gold_rows: List[Dict[str, Any]] = [] # Case 1: object with attribute `.term_typings` @@ -404,7 +636,13 @@ def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: type_list = getattr(item, "types", []) if not isinstance(type_list, list): type_list = [type_list] - gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) return gold_rows # Case 2: list/tuple container @@ -419,7 +657,13 @@ def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: type_list = row.get("types", []) if not isinstance(type_list, list): type_list = [type_list] - gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) return gold_rows # 2b) list of tuples/lists: (term, types[, id]) @@ -432,35 +676,68 @@ def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: gold_id = tuple_row[2] if len(tuple_row) > 2 else i if not isinstance(type_list, list): type_list = [type_list] - gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) return gold_rows - raise ValueError("Unsupported ground-truth input format for tasks_ground_truth_former().") + raise ValueError( + "Unsupported ground-truth input format for tasks_ground_truth_former()." + ) + class AlexbekRAGLearner(AutoLearner): """Retrieval-Augmented Term Typing learner (single task: term-typing). - Flow: - 1) fit: collect (term -> [types]) examples, build an in-memory index - using a sentence-embedding model. - 2) predict: for each new term, retrieve top-k similar examples, compose a - structured prompt, query an instruction-tuned causal LLM, and parse types. + Flow + 1) `fit`: collect (term -> [types]) examples, build an in-memory index + using a sentence-embedding model. + 2) `predict`: for each new term, retrieve top-k similar examples, compose a + structured prompt, query an instruction-tuned causal LLM, and parse types. - Returns a list of dicts: {"term": str, "types": List[str], "id": Optional[str]}. + Returns + List[Dict[str, Any]] + `{"term": str, "types": List[str], "id": Optional[str]}` rows. """ def __init__( self, llm_model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", retriever_model_id: str = "sentence-transformers/all-MiniLM-L6-v2", - device: str = "auto", # "auto" | "cuda" | "cpu" - token: str = "", # HF token if needed + device: str = "auto", # "auto" | "cuda" | "cpu" + token: str = "", # HF token if needed top_k: int = 3, max_new_tokens: int = 256, - gen_batch_size: int = 4, # generation batch size + gen_batch_size: int = 4, # generation batch size enc_batch_size: int = 64, # embedding batch size - **kwargs: Any, # absorb extra pipeline-style args + **kwargs: Any, # absorb extra pipeline-style args ) -> None: + """Configure the RAG learner. + + Parameters + llm_model_id: + HF model id/path for the instruction-tuned causal LLM. + retriever_model_id: + Sentence-embedding model id for retrieval. + device: + Device policy ('auto'|'cuda'|'cpu') for the LLM. + token: + Optional HF token for gated models. + top_k: + Number of nearest examples to retrieve per query term. + max_new_tokens: + Decoding budget for the LLM. + gen_batch_size: + Number of prompts per generation batch. 
+ enc_batch_size: + Number of texts per embedding batch. + **kwargs: + Extra configuration captured for downstream use. + """ super().__init__() # Consolidated configuration for simple serialization @@ -482,7 +759,7 @@ def __init__( # Retriever components self.embedder: Optional[SentenceTransformer] = None - self.indexed_corpus: List[str] = [] # items: " || [...]" + self.indexed_corpus: List[str] = [] # items: " || [...]" self.corpus_embeddings: Optional[torch.Tensor] = None # Training cache of (term, [types]) tuples @@ -497,15 +774,13 @@ def __init__( "2) Be concise. Respond ONLY in JSON using double quotes.\n" 'Format: {"term":"...", "reasoning":"<<=100 words>>", "types":["...", "..."]}\n' ) - self._user_prompt_template: str = ( - """{examples} + self._user_prompt_template: str = """{examples} TERM: {term} TASK: Determine semantic types for the given term based on the domain ontology. Remember: types are generalizing categories, not the term itself. Respond in JSON. """ - ) def load( self, @@ -515,7 +790,21 @@ def load( token: Optional[str] = None, **kwargs: Any, ) -> None: - """Load the LLM and the embedding retriever. Overrides constructor values if provided.""" + """Load the LLM and the embedding retriever. Overrides constructor values if provided. + + Parameters + model_id: + Optional override for the LLM model id. + retriever_id: + Optional override for the embedding model id. + device: + Optional override for device selection policy. + token: + Optional override for HF token. + **kwargs: + Extra values to store in `extra_cfg`. + + """ if model_id is not None: self.cfg["llm_model_id"] = model_id if retriever_id is not None: @@ -556,10 +845,26 @@ def load( generation_cfg.num_beams = 1 # Retriever - self.embedder = SentenceTransformer(self.cfg["retriever_model_id"], trust_remote_code=True) + self.embedder = SentenceTransformer( + self.cfg["retriever_model_id"], trust_remote_code=True + ) def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: - """Prepare the retrieval index from training examples.""" + """Prepare the retrieval index from training examples. + + Parameters + train_data: + Training payload containing terms and their types. + task: + Must be `'term-typing'`; other tasks are forwarded to base. + ontologizer: + Unused flag for API compatibility. + + Side Effects + - Normalizes to a list of `(term, [types])`. + - Builds an indexable text corpus and (if embedder is loaded) + computes embeddings for retrieval. + """ if task != "term-typing": return super().fit(train_data, task, ontologizer) @@ -568,7 +873,8 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: # Build the textual corpus to index self.indexed_corpus = [ - f"{term} || {json.dumps(types, ensure_ascii=False)}" for term, types in self.train_term_types + f"{term} || {json.dumps(types, ensure_ascii=False)}" + for term, types in self.train_term_types ] # Embed the corpus if available; else fall back to zero-shot prompting @@ -578,7 +884,23 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: self.corpus_embeddings = None def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: - """Predict types for evaluation items; returns a list of {term, types, id?}.""" + """Predict types for evaluation items; returns a list of {term, types, id?}. + + Parameters + eval_data: + Evaluation payload to type (terms + optional ids). + task: + Must be `'term-typing'`; other tasks are forwarded to base. 
+ ontologizer: + Unused flag for API compatibility. + + Returns + List[Dict[str, Any]] + For each input term, a dictionary with keys: + - `term`: The input term. + - `types`: A (unique, sorted) list of predicted types. + - `id`: Optional example id (if provided in input). + """ if task != "term-typing": return super().predict(eval_data, task, ontologizer) @@ -588,11 +910,15 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: # Use RAG if we have an indexed corpus & embeddings; otherwise zero-shot rag_available = ( - self.corpus_embeddings is not None and self.embedder is not None and len(self.indexed_corpus) > 0 + self.corpus_embeddings is not None + and self.embedder is not None + and len(self.indexed_corpus) > 0 ) if rag_available: - neighbor_docs_per_query = self._retrieve_batch(eval_terms, top_k=int(self.cfg["top_k"])) + neighbor_docs_per_query = self._retrieve_batch( + eval_terms, top_k=int(self.cfg["top_k"]) + ) else: neighbor_docs_per_query = [[] for _ in eval_terms] @@ -608,7 +934,9 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: # Build standardized results results: List[Dict[str, Any]] = [] - for term, example_id, predicted_types in zip(eval_terms, eval_ids, predicted_types_lists): + for term, example_id, predicted_types in zip( + eval_terms, eval_ids, predicted_types_lists + ): result_row: Dict[str, Any] = { "term": term, "types": sorted({t for t in predicted_types}), # unique + sorted @@ -617,11 +945,28 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: result_row["id"] = example_id results.append(result_row) - assert all(("term" in row and "types" in row) for row in results), "predict() must return term + types" + assert all(("term" in row and "types" in row) for row in results), ( + "predict() must return term + types" + ) return results def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: - """Extract (term, [types]) tuples from supported training payloads.""" + """Extract `(term, [types])` tuples from supported training payloads. + + Supported Inputs + - `data.term_typings` (objects exposing `.term` & `.types`) + - `list[dict]` with keys `'term'` and `'types'` + - `list[str]` → returns empty (nothing to index) + - other formats → empty + + Parameters + data: + Training payload. + + Returns + List[Tuple[str, List[str]]] + (term, types) tuples (types kept as strings). 
+ """ term_typings = getattr(data, "term_typings", None) if term_typings is not None: parsed_pairs: List[Tuple[str, List[str]]] = [] @@ -629,7 +974,9 @@ def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: term = getattr(item, "term", None) types = list(getattr(item, "types", []) or []) if term and types: - parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + parsed_pairs.append( + (term, [t for t in types if isinstance(t, str)]) + ) return parsed_pairs if isinstance(data, list) and data and isinstance(data[0], dict): @@ -638,17 +985,35 @@ def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: term = row.get("term") types = row.get("types") or [] if term and isinstance(types, list) and types: - parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + parsed_pairs.append( + (term, [t for t in types if isinstance(t, str)]) + ) return parsed_pairs # If only a list of strings is provided, there's nothing to index for RAG - if isinstance(data, (list, set, tuple)) and all(isinstance(x, str) for x in data): + if isinstance(data, (list, set, tuple)) and all( + isinstance(x, str) for x in data + ): return [] return [] def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: - """Extract (terms, ids) from supported evaluation payloads.""" + """Extract `(terms, ids)` from supported evaluation payloads. + + Supported Inputs + - `data.term_typings` (objects exposing `.term` & optional `.id`) + - `list[str]` + - `list[dict]` with `term` and optional `id` + + Parameters + data: + Evaluation payload. + + Returns + Tuple[List[str], List[Optional[str]]] + Two lists aligned by index: terms and ids (ids may contain `None`). + """ term_typings = getattr(data, "term_typings", None) if term_typings is not None: terms: List[str] = [] @@ -672,24 +1037,50 @@ def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: return [], [] def _encode_texts(self, texts: List[str]) -> torch.Tensor: - """Encode a batch of texts with the sentence-embedding model.""" + """Encode a batch of texts with the sentence-embedding model. + + Parameters + texts: + List of strings to embed. + + Returns + torch.Tensor + Tensor of shape `(len(texts), hidden_dim)`. If `texts` is empty, + returns an empty tensor with 0 rows. + """ batch_size = int(self.cfg["enc_batch_size"]) batch_embeddings: List[torch.Tensor] = [] for batch_start in range(0, len(texts), batch_size): batch_texts = texts[batch_start : batch_start + batch_size] - embeddings = self.embedder.encode(batch_texts, convert_to_tensor=True, show_progress_bar=False) + embeddings = self.embedder.encode( + batch_texts, convert_to_tensor=True, show_progress_bar=False + ) batch_embeddings.append(embeddings) - return torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + return ( + torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + ) def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: - """Return for each query the top-k most similar corpus entries (as raw text rows).""" + """Return for each query the top-k most similar corpus entries. + + Parameters + queries: + List of query terms. + top_k: + Number of neighbors to retrieve for each query. + + Returns + List[List[str]] + For each query, a list of raw corpus strings formatted as + `" || [\\"type1\\", ...]"`. 
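+
+        Example
+            A minimal sketch; assumes `fit` has already built a non-empty index.
+            The retrieved row shown in the comment is purely illustrative.
+
+            >>> docs = learner._retrieve_batch(["guitar"], top_k=1)
+            >>> # docs[0] holds one raw row such as 'lute || ["Instrument"]'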
+ """ if self.corpus_embeddings is None or not self.indexed_corpus: return [[] for _ in queries] - query_embeddings = self._encode_texts(queries) # [Q, D] - doc_embeddings = self.corpus_embeddings # [N, D] + query_embeddings = self._encode_texts(queries) # [Q, D] + doc_embeddings = self.corpus_embeddings # [N, D] if query_embeddings.shape[-1] != doc_embeddings.shape[-1]: raise ValueError( f"Embedding dim mismatch: {query_embeddings.shape[-1]} vs {doc_embeddings.shape[-1]}" @@ -705,7 +1096,16 @@ def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: return [[self.indexed_corpus[j] for j in row.tolist()] for row in top_indices] def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: - """Parse raw corpus rows ('term || [types]') into (term, [types]) pairs.""" + """Parse raw corpus rows ('term || [types]') into `(term, [types])` pairs. + + Parameters + docs: + Raw strings from the index/corpus. + + Returns + List[Tuple[str, List[str]]] + Parsed (term, types) pairs; malformed rows are skipped. + """ example_pairs: List[Tuple[str, List[str]]] = [] for raw_row in docs: try: @@ -713,13 +1113,24 @@ def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: term = term_raw.strip() types_list = json.loads(types_json.strip()) if isinstance(types_list, list): - example_pairs.append((term, [t for t in types_list if isinstance(t, str)])) + example_pairs.append( + (term, [t for t in types_list if isinstance(t, str)]) + ) except Exception: continue return example_pairs def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: - """Format retrieved example pairs into a compact block for the prompt.""" + """Format retrieved example pairs into a compact block for the prompt. + + Parameters + pairs: + Retrieved `(term, [types])` examples. + + Returns + str + Human-readable lines to provide *light* guidance to the LLM. + """ if not pairs: return "EXAMPLES: (none provided)" lines: List[str] = ["CLASSIFICATION EXAMPLES:"] @@ -730,12 +1141,34 @@ def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: return "\n".join(lines) def _compose_prompt(self, examples_block: str, term: str) -> str: - """Compose the final prompt from system + user blocks.""" - user_block = self._user_prompt_template.format(examples=examples_block, term=term) + """Compose the final prompt from system + user blocks. + + Parameters + examples_block: + Text block with retrieved examples. + term: + The query term to classify. + + Returns + str + Full prompt string passed to the LLM. + """ + user_block = self._user_prompt_template.format( + examples=examples_block, term=term + ) return f"{self._system_prompt}\n\n{user_block}\n" def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: - """Run generation for a batch of prompts and parse the JSON 'types' from outputs.""" + """Run generation for a batch of prompts and parse the JSON `'types'` from outputs. + + Parameters + prompts: + Finalized prompts for the LLM. + + Returns + List[List[str]] + For each prompt, a list of predicted type strings. 
+ """ batch_size = int(self.cfg["gen_batch_size"]) all_predicted_types: List[List[str]] = [] @@ -744,7 +1177,9 @@ def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: # Tokenize and move to the LLM's device model_device = getattr(self.generation_model, "device", None) - encodings = self.tokenizer(prompt_batch, return_tensors="pt", padding=True).to(model_device) + encodings = self.tokenizer( + prompt_batch, return_tensors="pt", padding=True + ).to(model_device) input_token_length = encodings["input_ids"].shape[1] # Deterministic decoding (greedy) @@ -762,9 +1197,14 @@ def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: # Slice off the prompt tokens and decode only newly generated tokens new_token_span = generated_tokens[:, input_token_length:] - decoded_texts = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in new_token_span] - - parsed_types_per_prompt = [self._parse_types(text) for text in decoded_texts] + decoded_texts = [ + self.tokenizer.decode(seq, skip_special_tokens=True) + for seq in new_token_span + ] + + parsed_types_per_prompt = [ + self._parse_types(text) for text in decoded_texts + ] all_predicted_types.extend(parsed_types_per_prompt) return all_predicted_types @@ -772,11 +1212,19 @@ def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: def _parse_types(self, text: str) -> List[str]: """Extract a list of type strings from LLM output. - Attempts (in order): - 1) Strict JSON object with "types". - 2) Regex-extract JSON object containing "types". - 3) Regex-extract first bracketed list. - 4) Comma-split fallback. + Parsing Strategy (in order) + 1) Strict JSON object with `"types"`. + 2) Regex-extract JSON object containing `"types"`. + 3) Regex-extract first bracketed list. + 4) Comma-split fallback. + + Parameters + text: + Raw LLM output to parse. + + Returns + List[str] + Parsed list of type strings (possibly empty if parsing fails). 
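+
+        Example
+            A minimal sketch; the JSON string below is an assumed model output,
+            and `learner` is any instance of this class.
+
+            >>> learner._parse_types('{"types": ["Instrument", "Artifact"]}')
+            ['Instrument', 'Artifact']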
""" try: obj = json.loads(text) @@ -786,7 +1234,9 @@ def _parse_types(self, text: str) -> List[str]: pass try: - obj_match = re.search(r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S) + obj_match = re.search( + r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S + ) if obj_match: obj = json.loads(obj_match.group(0)) types = obj.get("types", []) @@ -795,9 +1245,12 @@ def _parse_types(self, text: str) -> List[str]: pass try: - list_match = re.search(r'\[([^\]]+)\]', text) + list_match = re.search(r"\[([^\]]+)\]", text) if list_match: - items = [x.strip().strip('"').strip("'") for x in list_match.group(1).split(",")] + items = [ + x.strip().strip('"').strip("'") + for x in list_match.group(1).split(",") + ] return [t for t in items if t] except Exception: pass diff --git a/ontolearner/learner/term_typing/rwthdbis.py b/ontolearner/learner/term_typing/rwthdbis.py index f27fd56..c8df797 100644 --- a/ontolearner/learner/term_typing/rwthdbis.py +++ b/ontolearner/learner/term_typing/rwthdbis.py @@ -27,10 +27,10 @@ TrainingArguments, set_seed, ) -from transformers import DebertaV2Tokenizer from ...base import AutoLearner + class RWTHDBISSFTLearner(AutoLearner): """ Supervised term-typing @@ -44,6 +44,7 @@ def __init__( model_name: str = "microsoft/deberta-v3-small", trained_model_path: Optional[str] = None, output_dir: Optional[str] = None, + device: str = "cpu", max_length: int = 64, per_device_train_batch_size: int = 16, gradient_accumulation_steps: int = 2, @@ -55,8 +56,35 @@ def __init__( save_total_limit: int = 1, fp16: bool = False, bf16: bool = False, - seed: int = 42 + seed: int = 42, ) -> None: + """Initialize the term-typing learner and configure training defaults. + + Args: + model_name: Backbone HF model identifier (used if `trained_model_path` is None). + trained_model_path: Optional path to a fine-tuned checkpoint for loading. + output_dir: Directory to write checkpoints and tokenizer; defaults to './term_typing'. + device: user-defined argument as 'cuda' or 'cpu'. + max_length: Maximum tokenized sequence length. + per_device_train_batch_size: Per-device batch size during training. + gradient_accumulation_steps: Number of update accumulation steps. + num_train_epochs: Training epochs. + learning_rate: Optimizer learning rate. + weight_decay: Weight decay coefficient. + logging_steps: Logging interval (steps) for the Trainer. + save_strategy: Checkpoint save strategy (e.g., 'epoch', 'steps', 'no'). + save_total_limit: Maximum number of checkpoints to keep. + fp16: Enable mixed precision (FP16) if supported. + bf16: Enable mixed precision (BF16) if supported. + seed: Random seed for reproducibility. + + Side Effects: + Creates `output_dir` if it does not exist. + + Notes: + The learner predicts exactly one label per term at inference time + (argmax over logits). 
+ """ super().__init__() self.model_name = model_name self.trained_model_path = trained_model_path @@ -76,7 +104,7 @@ def __init__( self.bf16 = bf16 self.seed = seed - self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device self.model: Optional[AutoModelForSequenceClassification] = None self.tokenizer: Optional[AutoTokenizer] = None self.id2label: Dict[int, str] = {} @@ -84,44 +112,53 @@ def __init__( def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: """ - train: expects ontology-like object with .term_typings - test: returns List[{"term": str, "types": [str]}] (for evaluator) - """ - if not test: - return self._train_from_term_typings(train_data=data) + Train or run inference for term typing, depending on `test`. - terms = self._collect_eval_terms(data) - return self._predict_structured_output(terms) + When `test=False`, trains on `data.term_typings`. + When `test=True`, predicts labels for provided terms. - def _load_robust_tokenizer(self, backbone: str) -> AutoTokenizer: - try: - return AutoTokenizer.from_pretrained(backbone, use_fast=True) - except Exception as fast_err: - print(f"[tokenizer] Fast tokenizer failed: {fast_err}. Trying DebertaV2Tokenizer (slow)...") + Args: + data: If training, an object with `.term_typings` where each item has + `term` and `types` (list[str]). If testing, either a `List[str]` + of raw term texts or an object with `.term_typings`. + test: If True, runs inference; otherwise trains. - try: - return DebertaV2Tokenizer.from_pretrained(backbone) - except Exception as slow_err: - print(f"[tokenizer] DebertaV2Tokenizer failed: {slow_err}. Trying AutoTokenizer(use_fast=False)...") + Returns: + If `test=True`: a list of dicts like + `[{"term": "", "types": [""]}, ...]`. + If `test=False`: None. - try: - return AutoTokenizer.from_pretrained(backbone, use_fast=False) - except Exception as final_err: - raise RuntimeError( - "Failed to load a tokenizer for this DeBERTa model.\n" - "Try:\n" - " - pip install --upgrade sentencepiece\n" - " - ensure network access for model files\n" - " - clear your HF cache and retry\n" - " - pin versions: transformers==4.43.*, tokenizers<0.20\n" - f"Original error: {final_err}" - ) + Raises: + ValueError: If required fields are missing from `data`. + """ + if test: + terms = self._collect_eval_terms(data) + return self._predict_structured_output(terms) + else: + self._train_from_term_typings(train_data=data) + return None def _expand_multilabel_training_rows( self, term_typings: List[Any] ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]: """ - From multi-label instances -> (texts, label_ids), and label maps. + Expand multi-label instances into single-label rows and derive label maps. + + Each training instance with fields: + - `term`: str-like + - `types`: list of label strings + is expanded into len(types) rows with the same `term` and individual labels. + + Args: + term_typings: Sequence of objects (e.g., dataclasses) exposing + `.term` and `.types`. + + Returns: + A tuple `(texts, label_ids, id2label, label2id)`: + - texts: Flattened list of term strings (one per label). + - label_ids: Parallel list of integer label ids. + - id2label: Mapping from id -> label string. + - label2id: Mapping from label string -> id. 
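+
+        Example:
+            A minimal sketch using ad-hoc objects; real inputs come from
+            `.term_typings`, and `learner` is any instance of this class.
+
+            >>> from types import SimpleNamespace
+            >>> rows = [SimpleNamespace(term="piano", types=["Instrument", "Artifact"])]
+            >>> out = learner._expand_multilabel_training_rows(rows)
+            >>> out[0]
+            ['piano', 'piano']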
""" label_strings: List[str] = [] for instance in term_typings: @@ -143,18 +180,53 @@ def _expand_multilabel_training_rows( def _collect_eval_terms(self, eval_data: Any) -> List[str]: """ - Accepts List[str] OR object with .term_typings; returns list of term texts. + Collect the list of term texts to predict for evaluation. + + Accepts either: + - A `List[str]` of raw term texts, or + - An object with `.term_typings`, from which `.term` is extracted. + + Args: + eval_data: Input carrier for terms. + + Returns: + List of term strings. + + Raises: + ValueError: If `eval_data` lacks the expected structure. """ if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): terms = eval_data else: term_typings = getattr(eval_data, "term_typings", None) if term_typings is None: - raise ValueError("Provide a List[str] OR an object with .term_typings for test=True.") + raise ValueError( + "Provide a List[str] OR an object with .term_typings for test=True." + ) terms = [str(instance.term) for instance in term_typings] return terms def _train_from_term_typings(self, train_data: Any) -> None: + """Train the term-typing classifier from `.term_typings`. + + Steps: + 1) Seed RNGs for reproducibility. + 2) Expand multi-label examples into single-label rows. + 3) Build HF `DatasetDict`, tokenizer, and data collator. + 4) Initialize `AutoModelForSequenceClassification`. + 5) Train with `Trainer` and save model/tokenizer to `output_dir`. + + Args: + train_data: Object with `.term_typings`; each item exposes + `.term` (text) and `.types` (list[str]). + + Raises: + ValueError: If `train_data` does not provide `.term_typings`. + + Side Effects: + Writes a trained model to `self.output_dir` and updates + `self.id2label` / `self.label2id`. + """ set_seed(self.seed) random.seed(self.seed) torch.manual_seed(self.seed) @@ -165,15 +237,26 @@ def _train_from_term_typings(self, train_data: Any) -> None: if term_typings is None: raise ValueError("train_data must provide .term_typings for term-typing.") - texts, label_ids, self.id2label, self.label2id = self._expand_multilabel_training_rows(term_typings) + texts, label_ids, self.id2label, self.label2id = ( + self._expand_multilabel_training_rows(term_typings) + ) - dataset = DatasetDict({"train": Dataset.from_dict({"labels": label_ids, "text": texts})}) + dataset = DatasetDict( + {"train": Dataset.from_dict({"labels": label_ids, "text": texts})} + ) backbone = self.trained_model_path or self.model_name - self.tokenizer = self._load_robust_tokenizer(backbone) + try: + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) + except Exception: + # fallback if fast tokenizer isn't available + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=False) def tokenize_batch(batch: Dict[str, List[str]]): - return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + """Tokenize a batch of texts with truncation and max length.""" + return self.tokenizer( + batch["text"], truncation=True, max_length=self.max_length + ) tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"]) data_collator = DataCollatorWithPadding(self.tokenizer) @@ -185,7 +268,10 @@ def tokenize_batch(batch: Dict[str, List[str]]): label2id=self.label2id, ) - if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + if ( + getattr(self.model.config, "pad_token_id", None) is None + and self.tokenizer.pad_token_id is not None + ): self.model.config.pad_token_id = 
self.tokenizer.pad_token_id training_args = TrainingArguments( @@ -216,11 +302,20 @@ def tokenize_batch(batch: Dict[str, List[str]]): self.tokenizer.save_pretrained(self.output_dir) def _ensure_loaded_for_inference(self) -> None: + """Load model/tokenizer for inference if not already loaded. + + Loads from `trained_model_path` if set, otherwise from `output_dir`. + Also restores `id2label`/`label2id` from the model config when present, + moves the model to the configured device, and sets eval mode. + """ if self.model is not None and self.tokenizer is not None: return model_path = self.trained_model_path or self.output_dir self.model = AutoModelForSequenceClassification.from_pretrained(model_path) - self.tokenizer = self._load_robust_tokenizer(model_path) + try: + self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + except Exception: + self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) cfg = self.model.config if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"): @@ -230,20 +325,49 @@ def _ensure_loaded_for_inference(self) -> None: self.model.to(self.device).eval() def _predict_label_ids(self, terms: List[str]) -> List[int]: + """Predict label ids (argmax) for a list of term strings. + + Ensures model/tokenizer are loaded, then performs forward passes + term-by-term and collects the argmax label id. + + Args: + terms: List of raw term texts. + + Returns: + List of integer label ids corresponding to `terms`. + """ self._ensure_loaded_for_inference() predictions: List[int] = [] - for term_text in tqdm(terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"): - inputs = self.tokenizer(term_text, return_tensors="pt", truncation=True, max_length=self.max_length) + for term_text in tqdm( + terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}" + ): + inputs = self.tokenizer( + term_text, + return_tensors="pt", + truncation=True, + max_length=self.max_length, + ) inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()} with torch.no_grad(): logits = self.model(**inputs).logits predictions.append(int(torch.argmax(logits, dim=-1).item())) return predictions - def _predict_structured_output(self, terms: List[str]) -> List[Dict[str, List[str]]]: + def _predict_structured_output( + self, terms: List[str] + ) -> List[Dict[str, List[str]]]: """ - Convert predicted IDs into evaluator format: - [{"term": "", "types": [""]}, ...] + Convert predicted label IDs into evaluator-friendly structured outputs. + + The output format is: + [{"term": "", "types": [""]}, ...] + + Args: + terms: Raw term texts to classify. + + Returns: + List of dicts mapping each input term to a list with its predicted + label string. Falls back to stringified id if label mapping is absent. """ label_ids = self._predict_label_ids(terms) id2label_map = self.id2label or {} # fallback handled below diff --git a/ontolearner/learner/term_typing/sbunlp.py b/ontolearner/learner/term_typing/sbunlp.py index f838bd0..d5c0114 100644 --- a/ontolearner/learner/term_typing/sbunlp.py +++ b/ontolearner/learner/term_typing/sbunlp.py @@ -20,123 +20,152 @@ from ...base import AutoLearner + class SBUNLPZSLearner(AutoLearner): """ Qwen-based blind term typing learner (Task B), implemented as an AutoLearner. - This class reproduces the notebook logic: - - Fit phase learns the *allowed type inventory* from training data. - - Predict phase performs blind prompting per term using the learned type list. 
- - Outputs are restricted to the allowed types and returned as [{"id", "types"}]. - - Expected I/O (recommended): - - fit(train_data, task="term-typing", ontologizer=True): - The framework's AutoLearner.tasks_data_former() provides a unique list of - type labels; we store it to `self.allowed_types`. - - predict(eval_data, task="term-typing", ontologizer=False): - Pass a list of dicts with keys {"id": str, "term": str} so IDs are preserved. - Returns a list of dicts [{"id": ..., "types": [...] }]. + Lifecycle: + • `fit(...)` learns/records the allowed type inventory from the training payload. + • `load(...)` explicitly loads the tokenizer/model (pass `model_id`/`token` here). + • `predict(...)` prompts the model per term and returns normalized types limited + to the learned inventory. """ def __init__( self, - model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", - device: Optional[str] = None, + device: str = "cpu", max_new_tokens: int = 64, temperature: float = 0.0, + model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", token: Optional[str] = None, ) -> None: """ + Configure runtime knobs. Model identity and auth are provided to `load(...)`. + Args: - model_id: HF model id for Qwen. - device: "cuda", "mps", or "cpu". Auto-detected if None. - max_new_tokens: Generation cap per prompt. - temperature: Not used for greedy decoding (kept for future). - token: HF token if the model is gated. + device: Torch device policy ("cuda", "mps", or "cpu"). + max_new_tokens: Max tokens to generate per prompt (greedy decoding). + temperature: Reserved for future sampling; generation is greedy here. + model_id: Fallback model id/path used if `load()` is called without args. + token: Fallback HF token used if `load()` is called without args. + + Side Effects: + Initializes runtime configuration, instance defaults for `load()`, + and placeholders for `tokenizer`, `model`, and `allowed_types`. """ super().__init__() - - # Basic configuration - self.model_id = model_id - # default device detection: prefer CUDA if available - self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.device = device self.max_new_tokens = max_new_tokens self.temperature = temperature + + # Defaults that load() may use when its args are None + self.model_id = model_id self.token = token - # Model/tokenizer placeholders (populated by load()) + # Placeholders populated by load() self.tokenizer: Optional[AutoTokenizer] = None self.model: Optional[AutoModelForCausalLM] = None - # Learned inventory of allowed type labels (populated by fit()) + # Learned inventory self.allowed_types: List[str] = [] - # Regex used to extract quoted strings from model output (e.g. "type") + # Regex used to extract quoted strings from model output (e.g., "type") self._quoted_re = re.compile(r'"([^"]+)"') - def load(self, **kwargs: Any): + def load( + self, + model_id: Optional[str] = None, + token: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + ): """ - Load Qwen model and tokenizer. + Load tokenizer and model weights explicitly. - NOTE: - - The HF arguments used here mirror your original code (`token=...`). - You may see a deprecation warning for `torch_dtype` (older transformers); - switching to `dtype=` is recommended but I did not change behavior here. - """ - # Respect overrides from kwargs if provided - model_id = kwargs.get("model_id", self.model_id) - token = kwargs.get("token", self.token) + Argument precedence: + 1) Use `model_id` / `token` passed to this method (if provided). 
+ 2) Else fall back to `self.model_id` / `self.token`. + + Device & dtype: + • If `dtype` is None, the default is float16 on CUDA/MPS and float32 on CPU. + • `device_map` is `"auto"` for non-CPU devices, `"cpu"` otherwise. + + Args: + model_id: HF model id/path to load. If None, uses `self.model_id`. + token: HF token if the model is gated. If None, uses `self.token`. + dtype: Optional torch dtype override (e.g., `torch.float16`). - # Load tokenizer. If the model is gated, pass token (original code uses `token`). - # If your environment requires `use_auth_token=` replace here. - self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) + Returns: + self + """ + resolved_model_id = model_id or self.model_id + resolved_token = token if token is not None else self.token - # Ensure tokenizer has a pad token (some models omit it) + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + resolved_model_id, token=resolved_token + ) if self.tokenizer.pad_token is None: + # Prefer EOS as pad if available self.tokenizer.pad_token = self.tokenizer.eos_token - # Device mapping for from_pretrained -> keep same behavior as original code + # Device & dtype + if dtype is None: + if self.device == "cpu": + resolved_dtype = torch.float32 + else: + # Works for CUDA and Apple MPS + resolved_dtype = torch.float16 + else: + resolved_dtype = dtype + device_map = "auto" if self.device != "cpu" else "cpu" - # original code used torch_dtype; left as-is to avoid behavioral change - torch_dtype = torch.float16 if self.device != "cpu" else torch.float32 - # Load the model weights. This can be heavy; keep same params as original. self.model = AutoModelForCausalLM.from_pretrained( - model_id, + resolved_model_id, device_map=device_map, - torch_dtype=torch_dtype, - token=token, + torch_dtype=resolved_dtype, # keep torch_dtype for broad Transformers compatibility + token=resolved_token, ) return self - # ------------------------------------------------------------------------- - # Fit / Predict interface - # ------------------------------------------------------------------------- def fit(self, train_data: Any, task: str, ontologizer: bool = True): """ Learn the allowed type inventory from the training data. - Expected behavior: - - If `tasks_data_former(..., test=False)` returns a list of strings, - set allowed_types to that list (deduped & sorted). - - If it returns a list of dicts (relationships), extract unique 'parent' - fields and use those as the allowed type inventory. + Normalization rules: + • If `ontologizer=True`, the framework's `tasks_data_former(..., test=False)` + is used to normalize `train_data`. + • If a container exposes `.term_typings`, types are collected from there. + • If the normalized data is a list of dicts with `"parent"`, unique parents + become the allowed types. + • If it's a list of strings, that unique set becomes the allowed types. - This method contains a tolerant branch for the framework's custom container: - If the returned `train_fmt` is not a list but has a `.term_typings` attribute - (e.g., OntologyData object used by the framework), iterate that attribute - and collect any `types` values found. + Args: + train_data: Training payload provided by the pipeline. + task: Must be `"term-typing"`. + ontologizer: If True, normalize via `tasks_data_former()` first. + + Returns: + self + + Raises: + ValueError: If `task` is not `"term-typing"`. + TypeError: If the training data cannot be normalized to a list of + strings or relationship dicts. 
""" - train_fmt = self.tasks_data_former(data=train_data, task=task, test=False) if ontologizer else train_data + train_fmt = ( + self.tasks_data_former(data=train_data, task=task, test=False) + if ontologizer + else train_data + ) if task != "term-typing": raise ValueError("SBUNLPZSLearner only implements 'term-typing'.") # If framework passed a container with `.term_typings`, extract types from there if not isinstance(train_fmt, list): - # handle OntologyData-like object with attribute 'term_typings' if hasattr(train_fmt, "term_typings"): try: - # term_typings is expected to be an iterable of objects with attribute `types` collected = set() for tt in getattr(train_fmt, "term_typings") or []: # tt.types could be list[str] or a single str @@ -147,7 +176,6 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True): else: tvals = None - # Normalize both list and single-string cases if isinstance(tvals, (list, tuple, set)): for x in tvals: if isinstance(x, str): @@ -155,145 +183,180 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True): elif isinstance(tvals, str): collected.add(tvals) - # If we successfully collected types, set allowed_types and return if collected: self.allowed_types = sorted(collected) return self - # else fall through to error below (no types found) except Exception: - # If anything unexpected occurs while iterating term_typings, - # gracefully fall through and raise the original TypeError below. + # Fall through to error below if unexpected issues occur. pass - # not a supported non-list type -> keep original behavior (raise) raise TypeError("For term-typing, expected a list of type labels at fit().") # At this point train_fmt is a list (original logic preserved) if train_fmt and isinstance(train_fmt[0], dict) and "parent" in train_fmt[0]: # Case A: Received raw relationships/pairs (e.g., from train_test_split). - # Extract unique parent types from the relationship records. unique_types = set(r.get("parent") for r in train_fmt if r.get("parent")) self.allowed_types = sorted(unique_types) elif all(isinstance(x, str) for x in train_fmt): # Case B: Received a clean list of type labels (List[str]). self.allowed_types = sorted(set(train_fmt)) else: - # The input is a list but not in either expected format -> raise - raise TypeError("For term-typing, input data format for fit() is invalid. Expected list of strings (types) or list of relationships (dicts).") + raise TypeError( + "For term-typing, input data format for fit() is invalid. " + "Expected list of strings (types) or list of relationships (dicts)." + ) return self def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: """ - Predict types for each term. + Predict types for each term and return standardized rows. Expected inputs: - - With ontologizer=True: a list[str] of term strings (IDs are autogenerated). - - With ontologizer=False: a list[dict] where each dict has keys {'id','term'}. + • With `ontologizer=True`: a `list[str]` of terms (IDs are auto-generated), + or a container exposing `.term_typings` from which `{'id','term'}` pairs + can be extracted. + • With `ontologizer=False`: a `list[dict]` of `{'id','term'}` to preserve IDs. + + Args: + eval_data: Evaluation payload as described above. + task: Must be `"term-typing"`. + ontologizer: If True, normalize through the pipeline’s data former. 
- This method tolerantly converts common framework containers (e.g., an - OntologyData object exposing `.term_typings`) into the expected list[dict] - shape so that the internal _term_typing() can run unchanged. + Returns: + A list of dictionaries: + `{"id": str, "term": str, "types": List[str]}`. """ if task != "term-typing": # Delegate to base for other tasks (not implemented here) return super().predict(eval_data, task, ontologizer=ontologizer) - def _extract_list_of_dicts_from_term_typings(obj) -> Optional[List[Dict[str, str]]]: - """ - Helper: try to produce a list of {"id","term"} dicts from objects - exposing a `term_typings` iterable. Supports either object-like - TermTyping (attributes) or dict-style entries. - """ + def _extract_list_of_dicts_from_term_typings( + obj, + ) -> Optional[List[Dict[str, str]]]: + """Try to derive `[{id, term}, ...]` from an object with `.term_typings`.""" tts = getattr(obj, "term_typings", None) if tts is None: return None out = [] for tt in tts: - # support object-style TermTyping (attributes) and dict-style if isinstance(tt, dict): - # try several common key names for ID tid = tt.get("ID") or tt.get("id") or tt.get("Id") or tt.get("ID_") tterm = tt.get("term") or tt.get("label") or tt.get("name") else: - # object-style access - tid = getattr(tt, "ID", None) or getattr(tt, "id", None) or getattr(tt, "Id", None) - tterm = getattr(tt, "term", None) or getattr(tt, "label", None) or getattr(tt, "name", None) + tid = ( + getattr(tt, "ID", None) + or getattr(tt, "id", None) + or getattr(tt, "Id", None) + ) + tterm = ( + getattr(tt, "term", None) + or getattr(tt, "label", None) + or getattr(tt, "name", None) + ) if tid is None or tterm is None: - # skip malformed entry - this is defensive so downstream code has valid inputs continue out.append({"id": str(tid), "term": str(tterm)}) return out if out else None # Case A: ontologizer=True -> framework often provides list[str] if ontologizer: - if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): - # Simple case: convert list of terms to list of dicts with generated IDs - eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data)] + if isinstance(eval_data, list) and all( + isinstance(x, str) for x in eval_data + ): + eval_pack = [ + {"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data) + ] else: - # Try to extract from a framework container (e.g., OntologyData) maybe = _extract_list_of_dicts_from_term_typings(eval_data) if maybe is not None: eval_pack = maybe else: - # Last resort: if eval_data is some iterable of strings, convert it - try: - if hasattr(eval_data, "__iter__") and not isinstance(eval_data, (str, bytes)): - lst = list(eval_data) - if all(isinstance(x, str) for x in lst): - eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(lst)] - else: - raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") + # Last resort: attempt to coerce iterables of str + if hasattr(eval_data, "__iter__") and not isinstance( + eval_data, (str, bytes) + ): + lst = list(eval_data) + if all(isinstance(x, str) for x in lst): + eval_pack = [ + {"id": f"TT_{i:06d}", "term": t} + for i, t in enumerate(lst) + ] else: - raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") - except TypeError: - # re-raise to preserve original error semantics - raise - # Delegate to internal inference routine + raise TypeError( + "With ontologizer=True, eval_data must be list[str] of terms." 
+ ) + else: + raise TypeError( + "With ontologizer=True, eval_data must be list[str] of terms." + ) return self._term_typing(eval_pack, test=True) - # Case B: ontologizer=False -> we expect list[dict], but tolerate common containers + # Case B: ontologizer=False -> expect list[dict], but tolerate containers else: - if isinstance(eval_data, list) and all(isinstance(x, dict) for x in eval_data): + if isinstance(eval_data, list) and all( + isinstance(x, dict) for x in eval_data + ): eval_pack = eval_data else: - # Try to extract from framework container (term_typings) maybe = _extract_list_of_dicts_from_term_typings(eval_data) if maybe is not None: eval_pack = maybe else: - # As a final attempt, allow eval_data to be a dict with a list under some known keys if isinstance(eval_data, dict): for key in ("term_typings", "terms", "items"): - if key in eval_data and isinstance(eval_data[key], (list, tuple)): + if key in eval_data and isinstance( + eval_data[key], (list, tuple) + ): converted = [] for x in eval_data[key]: - # Accept dict-style entries that include id and term/name - if isinstance(x, dict) and ("id" in x or "ID" in x) and ("term" in x or "name" in x): + if ( + isinstance(x, dict) + and ("id" in x or "ID" in x) + and ("term" in x or "name" in x) + ): tid = x.get("ID") or x.get("id") tterm = x.get("term") or x.get("name") - converted.append({"id": str(tid), "term": str(tterm)}) + converted.append( + {"id": str(tid), "term": str(tterm)} + ) if converted: eval_pack = converted break else: - # Could not convert; raise same TypeError as before - raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") + raise TypeError( + "With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}." + ) else: - # Not a supported container -> raise - raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") - # Delegate to internal inference routine + raise TypeError( + "With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}." + ) return self._term_typing(eval_pack, test=True) - - # ------------------------------------------------------------------------- - # Internal task implementations (AutoLearner hooks) - # ------------------------------------------------------------------------- def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: """ - Core implementation: - - training mode (test=False): `data` is a list of allowed type labels -> store them. - - inference mode (test=True): `data` is a list of {"id","term"} -> produce [{"id","types"}]. + Internal implementation of the *term-typing* task. + + Training mode (`test=False`): + • Expects a `list[str]` of allowed types. Stores a sorted unique copy. + + Inference mode (`test=True`): + • Expects a `list[dict]` of `{"id","term"}` items. + • Requires `load()` to have been called (model/tokenizer available). + • Builds a blind prompt per item, generates text, parses quoted + candidates, and filters them to `self.allowed_types`. + + Args: + data: See the mode-specific expectations above. + test: Set `True` to run inference; `False` to store the type inventory. + + Returns: + • `None` in training mode. + • `list[dict]` with `{"id","term","types":[...]}` in inference mode. + + Raises: + TypeError: If `data` is not in the expected shape for the mode. + RuntimeError: If model/tokenizer are not loaded at inference time. 
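+
+        Example:
+            An inference-mode sketch (assumes `load()` and `fit()` have been
+            called; the term and returned types are illustrative):
+
+                rows = [{"id": "TT_000001", "term": "example term"}]
+                learner._term_typing(rows, test=True)
+                # -> [{"id": "TT_000001", "term": "example term",
+                #      "types": [...subset of self.allowed_types...]}]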
""" if not test: # training: expect a list of strings (type labels) @@ -304,49 +367,58 @@ def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: # Inference path if not isinstance(data, list) or not all(isinstance(x, dict) for x in data): - raise TypeError("At prediction time, expected a list of {'id','term'} dicts.") + raise TypeError( + "At prediction time, expected a list of {'id','term'} dicts." + ) - # Ensure model and tokenizer are loaded if self.model is None or self.tokenizer is None: - raise RuntimeError("Model/tokenizer not loaded. Call .load() before predict().") + raise RuntimeError( + "Model/tokenizer not loaded. Call .load() before predict()." + ) results = [] for item in data: - # preserve incoming IDs and terms term_id = item["id"] term_text = item["term"] - - # build the blind JSON-prompt that instructs the model to output types prompt = self._build_blind_prompt(term_id, term_text, self.allowed_types) - - # generate and parse model output into allowed types types = self._generate_and_parse_types(prompt) - - # append result for this term (keep original id) - # include the original term so downstream evaluation (and any consumers) can match by term results.append({"id": term_id, "term": term_text, "types": types}) return results - # ------------------------------------------------------------------------- - # Prompting + parsing - # ------------------------------------------------------------------------- - - def _format_types_inline(allowed: List[str]) -> str: + def _format_types_inline(self, allowed: List[str]) -> str: """ - Format allowed types as comma-separated quoted strings for insertion into the prompt. - Example: '"type1", "type2", "type3"' + Format the allowed types for inline inclusion in prompts. + + Args: + allowed: List of allowed type labels. + + Returns: + A comma-separated string of quoted types, e.g.: + `"type1", "type2", "type3"`. Returns an empty string for an empty list. """ - return ", ".join(f'"{t}"' for t in allowed) + if not allowed: + return "" + return ", ".join(f'"{t}"' for t in allowed if isinstance(t, str) and t.strip()) - def _build_blind_prompt(self, term_id: str, term: str, allowed_types: List[str]) -> str: + def _build_blind_prompt( + self, term_id: str, term: str, allowed_types: List[str] + ) -> str: """ - Construct the prompt given a single term. The prompt: - - Instructs the model to produce a JSON array of {id, types} objects. - - Provides the allowed types list (so the model should only use those). - - Includes the single input item for which the model must decide types. + Construct the blind JSON prompt for a single term. + + The prompt: + • Instructs the model to produce ONLY a JSON array of `{id, types}` objects. + • Provides the allowed types list so the model should only use those. + • Includes the single input item for which the model must decide types. + + Args: + term_id: Identifier to carry through to the output JSON. + term: The input term string to classify. + allowed_types: Inventory used to constrain outputs. - Note: This is the same blind-prompting approach used in the original notebook. + Returns: + The full prompt string to feed to the LLM. """ allowed_str = self._format_types_inline(allowed_types) return ( @@ -367,14 +439,22 @@ def _build_blind_prompt(self, term_id: str, term: str, allowed_types: List[str]) def _generate_and_parse_types(self, prompt: str) -> List[str]: """ - Greedy generate, then extract quoted strings and filter by allowed types. 
- - Important details: - - We assert model/tokenizer presence before calling. - - Tokenized inputs are moved to the model device (original code uses .to(self.model.device)). - - The decoded text is scanned for quoted substrings using self._quoted_re. - - Only quoted strings that are present in self.allowed_types are kept. - - Returned list is unique & sorted for deterministic ordering. + Greedy-generate text, extract candidate types, and filter to the inventory. + + Workflow: + 1) Tokenize the prompt and generate deterministically (greedy). + 2) Decode and extract quoted substrings via regex (e.g., `"type"`). + 3) Keep only those candidates that exist in `self.allowed_types`. + 4) Return a unique, sorted list (stable across runs). + + Args: + prompt: Fully formatted prompt string. + + Returns: + List of predicted type labels (possibly empty if none found). + + Raises: + AssertionError: If `model` or `tokenizer` are unexpectedly `None`. """ assert self.model is not None and self.tokenizer is not None @@ -393,8 +473,6 @@ def _generate_and_parse_types(self, prompt: str) -> List[str]: text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) candidates = self._quoted_re.findall(text) - # Filter candidates to the allowed inventory + # Filter candidates to the allowed inventory and stabilize order. filtered = [c for c in candidates if c in self.allowed_types] - - # Return unique & sorted for stability across runs return sorted(set(filtered)) diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py deleted file mode 100644 index 6408881..0000000 --- a/ontolearner/learner/text2onto/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .sbunlp import SBUNLPFewShotLearner -from .alexbek import AlexbekFewShotLearner diff --git a/ontolearner/learner/text2onto/alexbek.py b/ontolearner/learner/text2onto/alexbek.py index 5760dca..f1692f7 100644 --- a/ontolearner/learner/text2onto/alexbek.py +++ b/ontolearner/learner/text2onto/alexbek.py @@ -31,6 +31,7 @@ class _PredictedTypesSchema(BaseModel): """Schema used when generating structured JSON { "types": [...] }.""" + types: List[str] OUTLINES_AVAILABLE: bool = True @@ -41,6 +42,7 @@ class _PredictedTypesSchema(BaseModel): OutlinesTFModel = None outlines_generate_json = None + class LocalAutoLLM(AutoLLM): """ Minimal local LLM helper. 
@@ -101,11 +103,15 @@ def load(self, model_id: str, *, load_in_4bit: bool = False) -> None: token=self.token, ) else: - device_map = "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + device_map = ( + "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + ) self.model = AutoModelForCausalLM.from_pretrained( model_id, device_map=device_map, - torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + torch_dtype=torch.bfloat16 + if torch.cuda.is_available() + else torch.float32, token=self.token, ) @@ -134,11 +140,17 @@ def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: Decoded new-token texts (no special tokens, stripped). """ if self.model is None or self.tokenizer is None: - raise RuntimeError("Call .load(model_id) on LocalAutoLLM before generate().") + raise RuntimeError( + "Call .load(model_id) on LocalAutoLLM before generate()." + ) - tokenized_batch = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True) + tokenized_batch = self.tokenizer( + prompts, return_tensors="pt", padding=True, truncation=True + ) input_seq_len = tokenized_batch["input_ids"].shape[1] - tokenized_batch = {k: v.to(self.model.device) for k, v in tokenized_batch.items()} + tokenized_batch = { + k: v.to(self.model.device) for k, v in tokenized_batch.items() + } with torch.no_grad(): outputs = self.model.generate( @@ -151,7 +163,11 @@ def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: # Only return the newly generated part for each row in the batch continuation_token_ids = outputs[:, input_seq_len:] - return [self.tokenizer.decode(row, skip_special_tokens=True).strip() for row in continuation_token_ids] + return [ + self.tokenizer.decode(row, skip_special_tokens=True).strip() + for row in continuation_token_ids + ] + class AlexbekFewShotLearner(AutoLearner): """ @@ -168,6 +184,7 @@ class AlexbekFewShotLearner(AutoLearner): Reads your A1 results (docs→terms), predicts types for each term, and writes two files: terms2types_pred.json + types2docs_pred.json """ + def __init__(self, model: LocalAutoLLM, device: str = "cpu", **_: Any) -> None: """ Initialize learner state and canned prompts. @@ -243,7 +260,9 @@ def fit( # Load item -> [doc_ids] item_to_docs_map = self._load_json(terms2doc_json) if not isinstance(item_to_docs_map, dict): - raise ValueError(f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]") + raise ValueError( + f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]" + ) # Reverse mapping: doc_id -> [items] doc_id_to_items_map: Dict[str, List[str]] = {} @@ -258,17 +277,25 @@ def fit( if not doc_row: continue doc_title = str(doc_row.get("title", "")) # be defensive (may be None) - doc_text = self._to_text(doc_row.get("text", "")) # string-ify list if needed + doc_text = self._to_text( + doc_row.get("text", "") + ) # string-ify list if needed if not doc_text: continue - gold_items = self._unique_preserve([s for s in labeled_items if isinstance(s, str)]) + gold_items = self._unique_preserve( + [s for s in labeled_items if isinstance(s, str)] + ) if gold_items: exemplar_candidates.append((doc_title, doc_text, gold_items)) if not exemplar_candidates: - raise RuntimeError("No candidate docs with items found to build few-shot exemplars.") + raise RuntimeError( + "No candidate docs with items found to build few-shot exemplars." 
+ ) - chosen_exemplars = rng.sample(exemplar_candidates, k=min(sample_size, len(exemplar_candidates))) + chosen_exemplars = rng.sample( + exemplar_candidates, k=min(sample_size, len(exemplar_candidates)) + ) # Reuse exemplars for both docs→terms and docs→types prompting self._fewshot_terms_docs = chosen_exemplars self._fewshot_types_docs = chosen_exemplars @@ -315,7 +342,10 @@ def predict_terms( text = self._to_text(document_row.get("text", "")) fewshot_block = self._format_fewshot_block( - self._system_prompt_terms, self._fewshot_terms_docs, key="terms", k=few_shot_k + self._system_prompt_terms, + self._fewshot_terms_docs, + key="terms", + k=few_shot_k, ) user_block = self._format_user_block(title, text) @@ -323,7 +353,9 @@ def predict_terms( document_order.append(document_id) generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) - parsed_term_lists = [self._parse_json_list(generated, key="terms") for generated in generations] + parsed_term_lists = [ + self._parse_json_list(generated, key="terms") for generated in generations + ] os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) lines_written = 0 @@ -334,7 +366,6 @@ def predict_terms( lines_written += 1 return lines_written - def predict_types( self, *, @@ -377,7 +408,10 @@ def predict_types( text = self._to_text(document_row.get("text", "")) fewshot_block = self._format_fewshot_block( - self._system_prompt_types, self._fewshot_types_docs, key="types", k=few_shot_k + self._system_prompt_types, + self._fewshot_types_docs, + key="types", + k=few_shot_k, ) user_block = self._format_user_block(title, text) @@ -385,7 +419,9 @@ def predict_types( document_order.append(document_id) generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) - parsed_type_lists = [self._parse_json_list(generated, key="types") for generated in generations] + parsed_type_lists = [ + self._parse_json_list(generated, key="types") for generated in generations + ] os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) lines_written = 0 @@ -426,7 +462,9 @@ def evaluate_extraction_f1( gold_doc_to_items: Dict[str, set] = {} for item_label, doc_id_list in item_to_doc_ids.items(): for document_id in doc_id_list: - gold_doc_to_items.setdefault(document_id, set()).add(self._norm(item_label)) + gold_doc_to_items.setdefault(document_id, set()).add( + self._norm(item_label) + ) # Build predictions: doc_id -> set(items) pred_doc_to_items: Dict[str, set] = {} @@ -435,7 +473,9 @@ def evaluate_extraction_f1( row = json.loads(line.strip()) document_id = str(row.get("id", "")) items_list = row.get("terms" if key == "term" else "types", []) - pred_doc_to_items[document_id] = {self._norm(x) for x in items_list if isinstance(x, str)} + pred_doc_to_items[document_id] = { + self._norm(x) for x in items_list if isinstance(x, str) + } # Micro counts true_positive = false_positive = false_negative = 0 @@ -447,18 +487,34 @@ def evaluate_extraction_f1( false_positive += len(pred_set - gold_set) false_negative += len(gold_set - pred_set) - precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0 - recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0 - f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0 + precision = ( + true_positive / (true_positive + false_positive) + if (true_positive + false_positive) + else 0.0 + ) + recall = ( + true_positive / (true_positive + false_negative) + if (true_positive + 
false_negative) + else 0.0 + ) + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) + else 0.0 + ) return f1 def predict_types_from_terms( self, *, - doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl - doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list - few_shot_jsonl: Optional[str] = None, # JSONL lines: {"term":"...", "types":[...]} - rag_terms_json: Optional[str] = None, # JSON list; items may contain "term" and "RAG":[...] + doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl + doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list + few_shot_jsonl: Optional[ + str + ] = None, # JSONL lines: {"term":"...", "types":[...]} + rag_terms_json: Optional[ + str + ] = None, # JSON list; items may contain "term" and "RAG":[...] random_few_shot: Optional[int] = 3, model_id: str = "Qwen/Qwen2.5-1.5B-Instruct", use_structured_output: bool = True, @@ -507,7 +563,9 @@ def predict_types_from_terms( in_memory_results=doc_terms_list, ) if not doc_term_extractions: - raise ValueError("No document→terms results provided (doc_terms_jsonl/doc_terms_list).") + raise ValueError( + "No document→terms results provided (doc_terms_jsonl/doc_terms_list)." + ) # Prepare unique term list and term→doc occurrences unique_terms = self._collect_unique_terms_from_extractions(doc_term_extractions) @@ -525,7 +583,11 @@ def predict_types_from_terms( json_obj = json.loads(raw_line) except Exception: continue - if isinstance(json_obj, dict) and "term" in json_obj and "types" in json_obj: + if ( + isinstance(json_obj, dict) + and "term" in json_obj + and "types" in json_obj + ): global_few_shot_examples.append(json_obj) # Optional per-term RAG examples: {normalized_term -> [examples]} @@ -536,8 +598,12 @@ def predict_types_from_terms( if isinstance(rag_payload, list): for rag_item in rag_payload: if isinstance(rag_item, dict): - normalized_term = self._normalize_term(rag_item.get("term", "")) - rag_examples_lookup[normalized_term] = rag_item.get("RAG", []) + normalized_term = self._normalize_term( + rag_item.get("term", "") + ) + rag_examples_lookup[normalized_term] = rag_item.get( + "RAG", [] + ) except Exception: pass @@ -550,7 +616,10 @@ def predict_types_from_terms( normalized_term = self._normalize_term(term_text) # Prefer per-term RAG for this term, else use global few-shot - few_shot_examples_for_term = rag_examples_lookup.get(normalized_term, None) or global_few_shot_examples + few_shot_examples_for_term = ( + rag_examples_lookup.get(normalized_term, None) + or global_few_shot_examples + ) # Build conversation and prompt conversation_messages = self._build_conv_for_type_infer( @@ -558,28 +627,51 @@ def predict_types_from_terms( few_shot_examples=few_shot_examples_for_term, random_k=random_few_shot, ) - typing_prompt_string = self._apply_chat_template_safe_types(typing_tokenizer, conversation_messages) + typing_prompt_string = self._apply_chat_template_safe_types( + typing_tokenizer, conversation_messages + ) predicted_types: List[str] = [] raw_generation_text: str = "" # Structured JSON path (if requested and available) - if use_structured_output and OUTLINES_AVAILABLE and _PredictedTypesSchema is not None: + if ( + use_structured_output + and OUTLINES_AVAILABLE + and _PredictedTypesSchema is not None + ): try: outlines_model = OutlinesTFModel(typing_model, typing_tokenizer) # type: ignore - generator = outlines_generate_json(outlines_model, _PredictedTypesSchema) # type: ignore + generator = outlines_generate_json( + 
outlines_model, _PredictedTypesSchema + ) # type: ignore structured = generator(typing_prompt_string, max_tokens=512) - predicted_types = [label for label in structured.types if isinstance(label, str)] - raw_generation_text = json.dumps({"types": predicted_types}, ensure_ascii=False) + predicted_types = [ + label for label in structured.types if isinstance(label, str) + ] + raw_generation_text = json.dumps( + {"types": predicted_types}, ensure_ascii=False + ) except Exception: # Fall back to greedy decoding use_structured_output = False # Greedy decode fallback - if not use_structured_output or not OUTLINES_AVAILABLE or _PredictedTypesSchema is None: - tokenized_prompt = typing_tokenizer(typing_prompt_string, return_tensors="pt", truncation=True, max_length=2048) + if ( + not use_structured_output + or not OUTLINES_AVAILABLE + or _PredictedTypesSchema is None + ): + tokenized_prompt = typing_tokenizer( + typing_prompt_string, + return_tensors="pt", + truncation=True, + max_length=2048, + ) if torch.cuda.is_available(): - tokenized_prompt = {name: tensor.cuda() for name, tensor in tokenized_prompt.items()} + tokenized_prompt = { + name: tensor.cuda() for name, tensor in tokenized_prompt.items() + } with torch.no_grad(): output_ids = typing_model.generate( **tokenized_prompt, @@ -588,14 +680,18 @@ def predict_types_from_terms( num_beams=1, pad_token_id=typing_tokenizer.eos_token_id, ) - new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1]:] - raw_generation_text = typing_tokenizer.decode(new_token_span, skip_special_tokens=True) + new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1] :] + raw_generation_text = typing_tokenizer.decode( + new_token_span, skip_special_tokens=True + ) predicted_types = self._extract_types_from_text(raw_generation_text) - term_to_predicted_types_list.append({ - "term": term_text, - "predicted_types": sorted(set(predicted_types)), - }) + term_to_predicted_types_list.append( + { + "term": term_text, + "predicted_types": sorted(set(predicted_types)), + } + ) # 7) Build types→docs from (term→types) and (term→docs) types_to_doc_id_set: Dict[str, set] = {} @@ -603,16 +699,24 @@ def predict_types_from_terms( normalized_term = self._normalize_term(term_prediction["term"]) doc_ids_for_term = term_to_doc_ids_map.get(normalized_term, []) for type_label in term_prediction.get("predicted_types", []): - types_to_doc_id_set.setdefault(type_label, set()).update(doc_ids_for_term) + types_to_doc_id_set.setdefault(type_label, set()).update( + doc_ids_for_term + ) types_to_doc_ids: Dict[str, List[str]] = { - type_label: sorted(doc_id_set) for type_label, doc_id_set in types_to_doc_id_set.items() + type_label: sorted(doc_id_set) + for type_label, doc_id_set in types_to_doc_id_set.items() } # 8) Save outputs os.makedirs(os.path.dirname(out_terms2types) or ".", exist_ok=True) with open(out_terms2types, "w", encoding="utf-8") as fp_terms2types: - json.dump(term_to_predicted_types_list, fp_terms2types, ensure_ascii=False, indent=2) + json.dump( + term_to_predicted_types_list, + fp_terms2types, + ensure_ascii=False, + indent=2, + ) os.makedirs(os.path.dirname(out_types2docs) or ".", exist_ok=True) with open(out_types2docs, "w", encoding="utf-8") as fp_types2docs: @@ -635,7 +739,6 @@ def _load_json(self, path: str) -> Dict[str, Any]: with open(path, "r", encoding="utf-8") as file_obj: return json.load(file_obj) - def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: """ Iterate over *all* JSON objects found inside a string. 
@@ -669,7 +772,6 @@ def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: yield json_obj cursor_index = end_index - def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: """ Robust reader that supports: @@ -727,7 +829,6 @@ def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: return documents_by_id - def _to_text(self, text_field: Any) -> str: """ Convert a 'text' field into a single string (handles list-of-strings). @@ -748,7 +849,6 @@ def _to_text(self, text_field: Any) -> str: return " ".join(str(part) for part in text_field) return str(text_field) if text_field is not None else "" - def _unique_preserve(self, values: List[str]) -> List[str]: """ Deduplicate values while preserving the original order. @@ -771,7 +871,6 @@ def _unique_preserve(self, values: List[str]) -> List[str]: ordered_values.append(candidate) return ordered_values - def _norm(self, text: str) -> str: """ Lowercased, single-spaced normalization (for comparisons). @@ -788,7 +887,6 @@ def _norm(self, text: str) -> str: """ return " ".join(text.lower().split()) - def _normalize_term(self, term: str) -> str: """ Normalization tailored for term keys / lookups. @@ -805,7 +903,6 @@ def _normalize_term(self, term: str) -> str: """ return " ".join(str(term).strip().split()).lower() - def _format_fewshot_block( self, system_prompt: str, @@ -846,10 +943,13 @@ def _format_fewshot_block( for example_title, example_text, gold_list in fewshot_examples[:k]: lines.append("### Example") lines.append(f"User:\nTitle: {example_title}\n{example_text}") - lines.append(f'Assistant:\n{{"{key}": ' + json.dumps(gold_list, ensure_ascii=False) + "}") + lines.append( + f'Assistant:\n{{"{key}": ' + + json.dumps(gold_list, ensure_ascii=False) + + "}" + ) return "\n".join(lines) - def _format_user_block(self, title: str, text: str) -> str: """ Format the 'Task' block for the current document. @@ -868,7 +968,6 @@ def _format_user_block(self, title: str, text: str) -> str: """ return f"### Task\nUser:\nTitle: {title}\n{text}" - def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: """ Extract a list from model output, trying: @@ -911,23 +1010,34 @@ def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: # 3) Fallback: comma-split (last resort) if "," in generated_text: - return [part.strip().strip('"').strip("'") for part in generated_text.split(",") if part.strip()] + return [ + part.strip().strip('"').strip("'") + for part in generated_text.split(",") + if part.strip() + ] return [] - - def _apply_chat_template_safe_types(self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]]) -> str: + def _apply_chat_template_safe_types( + self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]] + ) -> str: """ Safely build a prompt string for chat models. Uses the model's chat template when available; otherwise falls back to a simple concatenation. 
""" try: - return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + return tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) except Exception: - system_text = next((m["content"] for m in messages if m.get("role") == "system"), "") - last_user_text = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + system_text = next( + (m["content"] for m in messages if m.get("role") == "system"), "" + ) + last_user_text = next( + (m["content"] for m in reversed(messages) if m.get("role") == "user"), + "", + ) return f"{system_text}\n\nUser:\n{last_user_text}\n\nAssistant:" - def _build_conv_for_type_infer( self, term: str, @@ -938,20 +1048,27 @@ def _build_conv_for_type_infer( Create a chat-style conversation for a single term→types query, optionally prepending few-shot examples. """ - messages: List[Dict[str, str]] = [{"role": "system", "content": self._system_prompt_term_to_types}] + messages: List[Dict[str, str]] = [ + {"role": "system", "content": self._system_prompt_term_to_types} + ] examples = list(few_shot_examples or []) if random_k and len(examples) > random_k: import random as _rnd + examples = _rnd.sample(examples, random_k) for exemplar in examples: example_term = exemplar.get("term", "") example_types = exemplar.get("types", []) messages.append({"role": "user", "content": f"Term: {example_term}"}) - messages.append({"role": "assistant", "content": json.dumps({"types": example_types}, ensure_ascii=False)}) + messages.append( + { + "role": "assistant", + "content": json.dumps({"types": example_types}, ensure_ascii=False), + } + ) messages.append({"role": "user", "content": f"Term: {term}"}) return messages - def _extract_types_from_text(self, generated_text: str) -> List[str]: """ Parse {"types":[...]} from a free-form generation. @@ -961,13 +1078,18 @@ def _extract_types_from_text(self, generated_text: str) -> List[str]: if object_match: json_obj = json.loads(object_match.group(0)) types_array = json_obj.get("types", []) - return [type_label for type_label in types_array if isinstance(type_label, str)] + return [ + type_label + for type_label in types_array + if isinstance(type_label, str) + ] except Exception: pass return [] - - def _load_llm_for_types(self, model_id: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + def _load_llm_for_types( + self, model_id: str + ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: """ Load a *separate* small chat model for Term→Types (keeps LocalAutoLLM untouched). 
""" @@ -981,7 +1103,6 @@ def _load_llm_for_types(self, model_id: str) -> Tuple[AutoModelForCausalLM, Auto ) return model, tokenizer - def _load_doc_term_extractions( self, *, @@ -1002,17 +1123,26 @@ def _load_doc_term_extractions( normalized_records: List[Dict] = [] def _coerce_to_record(source_row: Dict) -> Optional[Dict]: - document_id = str(source_row.get("id", "")) or str(source_row.get("doc_id", "")) + document_id = str(source_row.get("id", "")) or str( + source_row.get("doc_id", "") + ) if not document_id: return None terms = source_row.get("extracted_terms") if terms is None: terms = source_row.get("terms") - if terms is None and "payload" in source_row and isinstance(source_row["payload"], dict): + if ( + terms is None + and "payload" in source_row + and isinstance(source_row["payload"], dict) + ): terms = source_row["payload"].get("terms") if not isinstance(terms, list): terms = [] - return {"id": document_id, "extracted_terms": [t for t in terms if isinstance(t, str)]} + return { + "id": document_id, + "extracted_terms": [t for t in terms if isinstance(t, str)], + } if in_memory_results is not None: for source_row in in_memory_results: @@ -1053,8 +1183,9 @@ def _coerce_to_record(source_row: Dict) -> Optional[Dict]: return normalized_records - - def _collect_unique_terms_from_extractions(self, doc_term_extractions: List[Dict]) -> List[str]: + def _collect_unique_terms_from_extractions( + self, doc_term_extractions: List[Dict] + ) -> List[str]: """ Collect unique terms (original casing) from normalized document→terms results. """ @@ -1068,8 +1199,9 @@ def _collect_unique_terms_from_extractions(self, doc_term_extractions: List[Dict ordered_unique_terms.append(term_text.strip()) return ordered_unique_terms - - def _build_term_to_doc_ids(self, doc_term_extractions: List[Dict]) -> Dict[str, List[str]]: + def _build_term_to_doc_ids( + self, doc_term_extractions: List[Dict] + ) -> Dict[str, List[str]]: """ Build lookup: normalized_term -> sorted unique list of doc_ids. """ @@ -1081,4 +1213,7 @@ def _build_term_to_doc_ids(self, doc_term_extractions: List[Dict]) -> Dict[str, if not normalized or not document_id: continue term_to_doc_set.setdefault(normalized, set()).add(document_id) - return {normalized_term: sorted(doc_ids) for normalized_term, doc_ids in term_to_doc_set.items()} + return { + normalized_term: sorted(doc_ids) + for normalized_term, doc_ids in term_to_doc_set.items() + } diff --git a/ontolearner/learner/text2onto/sbunlp.py b/ontolearner/learner/text2onto/sbunlp.py index 8ab617d..49067e2 100644 --- a/ontolearner/learner/text2onto/sbunlp.py +++ b/ontolearner/learner/text2onto/sbunlp.py @@ -25,6 +25,7 @@ from ...base import AutoLearner, AutoLLM + # ----------------------------------------------------------------------------- # Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface # ----------------------------------------------------------------------------- @@ -34,19 +35,29 @@ class LocalAutoLLM(AutoLLM): Uses 4-bit quantization for efficiency and greedy decoding by default. 
""" - def __init__(self, label_mapper: Any = None, device: str = "cpu", token: str = "") -> None: + def __init__( + self, label_mapper: Any = None, device: str = "cpu", token: str = "" + ) -> None: super().__init__(label_mapper=label_mapper, device=device, token=token) self.model = None self.tokenizer = None - def load(self, model_id: str, load_in_4bit: bool = False, dtype: str = "auto", trust_remote_code: bool = True): + def load( + self, + model_id: str, + load_in_4bit: bool = False, + dtype: str = "auto", + trust_remote_code: bool = True, + ): """Load tokenizer + model, applying 4-bit quantization if specified and possible.""" # Determine the target data type (default to float32 for CPU, float16 for GPU) - torch_dtype_val = (torch.float16 if torch.cuda.is_available() else torch.float32) + torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32 # Load the tokenizer - self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=trust_remote_code + ) if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token @@ -78,7 +89,13 @@ def load(self, model_id: str, load_in_4bit: bool = False, dtype: str = "auto", t if self.device == "cpu": self.model.to("cpu") - def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: float = 0.0, top_p: float = 1.0) -> List[str]: + def generate( + self, + inputs: List[str], + max_new_tokens: int = 64, + temperature: float = 0.0, + top_p: float = 1.0, + ) -> List[str]: """Generate continuations for a list of prompts, returning only the generated part.""" if self.model is None or self.tokenizer is None: raise RuntimeError("Model/tokenizer not loaded. 
Call .load() first.") @@ -100,7 +117,9 @@ def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: flo input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, - do_sample=(temperature > 0.0), # Use greedy decoding if temperature is 0.0 + do_sample=( + temperature > 0.0 + ), # Use greedy decoding if temperature is 0.0 temperature=temperature, top_p=top_p, pad_token_id=self.tokenizer.eos_token_id, @@ -109,20 +128,25 @@ def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: flo # --- Post-processing: Extract only the generated tail --- decoded_outputs: List[str] = [] for i, output_ids in enumerate(outputs): - full_decoded_text = self.tokenizer.decode(output_ids, skip_special_tokens=True) + full_decoded_text = self.tokenizer.decode( + output_ids, skip_special_tokens=True + ) prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True) # Safely strip the prompt text from the full output if full_decoded_text.startswith(prompt_text): - generated_tail = full_decoded_text[len(prompt_text):].strip() + generated_tail = full_decoded_text[len(prompt_text) :].strip() else: # Fallback extraction (less robust if padding affects token indices) prompt_len = input_ids.shape[1] - generated_tail = self.tokenizer.decode(output_ids[prompt_len:], skip_special_tokens=True).strip() + generated_tail = self.tokenizer.decode( + output_ids[prompt_len:], skip_special_tokens=True + ).strip() decoded_outputs.append(generated_tail) return decoded_outputs + # ----------------------------------------------------------------------------- # Main Learner: SBUNLPFewShotLearner (Task A Text2Onto) # ----------------------------------------------------------------------------- @@ -195,7 +219,11 @@ def build_stratified_fewshot_prompt( num_to_sample_from_stratum = int(num_sample_docs * proportion) if num_to_sample_from_stratum > 0: - sampled_documents.extend(random.sample(stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs))) + sampled_documents.extend( + random.sample( + stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs) + ) + ) # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size' unique_docs_by_id = {} @@ -207,8 +235,12 @@ def build_stratified_fewshot_prompt( if len(final_sample_docs) > num_sample_docs: final_sample_docs = random.sample(final_sample_docs, num_sample_docs) elif len(final_sample_docs) < num_sample_docs: - remaining_docs = [d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id] - needed_count = min(num_sample_docs - len(final_sample_docs), len(remaining_docs)) + remaining_docs = [ + d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id + ] + needed_count = min( + num_sample_docs - len(final_sample_docs), len(remaining_docs) + ) final_sample_docs.extend(random.sample(remaining_docs, needed_count)) # Format the few-shot exemplar text block @@ -299,21 +331,31 @@ def build_types_fewshot_block( picked_count += 1 if picked_count >= sample_per_term: - break # Move to the next term + break # Move to the next term prompt_block = "\n".join(prompt_lines) self.fewshot_types_block = prompt_block return prompt_block - def fit(self, train_docs_jsonl: str, terms2doc_json: str, sample_size: int = 28, seed: int = 123) -> None: + def fit( + self, + train_docs_jsonl: str, + terms2doc_json: str, + sample_size: int = 28, + seed: int = 123, + ) -> None: """ Fit phase: Builds and caches the few-shot prompt blocks from the training files. 
No model training occurs (Few-Shot/In-Context Learning). """ # Build prompt block for Term extraction - _ = self.build_stratified_fewshot_prompt(train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed) + _ = self.build_stratified_fewshot_prompt( + train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed + ) # Build prompt block for Type extraction - _ = self.build_types_fewshot_block(train_docs_jsonl, terms2doc_json, sample_per_term=1) + _ = self.build_types_fewshot_block( + train_docs_jsonl, terms2doc_json, sample_per_term=1 + ) # ------------------------- # Inference helpers (prompt construction and output parsing) @@ -376,10 +418,18 @@ def _parse_list_like(self, raw_string: str) -> List[str]: def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str: """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output.""" # self.model is an instance of LocalAutoLLM - model_output = self.model.generate([prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0) + model_output = self.model.generate( + [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0 + ) return model_output[0] if model_output else "" - def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + def predict_terms( + self, + docs_test_jsonl: str, + out_jsonl: str, + max_lines: int = -1, + max_new_tokens: int = 120, + ) -> int: """ Runs Term Extraction on the test documents and saves results to a JSONL file. Returns: The count of individual terms written. @@ -388,7 +438,10 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - raise RuntimeError("Few-shot block for terms is empty. Call fit() first.") num_written_terms = 0 - with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + with ( + open(docs_test_jsonl, "r", encoding="utf-8") as file_in, + open(out_jsonl, "w", encoding="utf-8") as file_out, + ): for line_index, line in enumerate(file_in, start=1): if 0 < max_lines < line_index: break @@ -396,7 +449,7 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - try: document = json.loads(line.strip()) except Exception: - continue # Skip malformed JSON lines + continue # Skip malformed JSON lines doc_id = document.get("id", "unknown") title = document.get("title", "") @@ -410,7 +463,10 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - # Write extracted terms for term_or_type in predicted_terms: if isinstance(term_or_type, str) and term_or_type.strip(): - file_out.write(json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + "\n") + file_out.write( + json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + + "\n" + ) num_written_terms += 1 # Lightweight memory management for long runs @@ -421,7 +477,13 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - return num_written_terms - def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + def predict_types( + self, + docs_test_jsonl: str, + out_jsonl: str, + max_lines: int = -1, + max_new_tokens: int = 120, + ) -> int: """ Runs Type Extraction on the test documents and saves results to a JSONL file. Returns: The count of individual types written. 
@@ -430,7 +492,10 @@ def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - raise RuntimeError("Few-shot block for types is empty. Call fit() first.") num_written_types = 0 - with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + with ( + open(docs_test_jsonl, "r", encoding="utf-8") as file_in, + open(out_jsonl, "w", encoding="utf-8") as file_out, + ): for line_index, line in enumerate(file_in, start=1): if 0 < max_lines < line_index: break @@ -438,7 +503,7 @@ def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - try: document = json.loads(line.strip()) except Exception: - continue # Skip malformed JSON lines + continue # Skip malformed JSON lines doc_id = document.get("id", "unknown") title = document.get("title", "") @@ -452,7 +517,10 @@ def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - # Write extracted types for term_or_type in predicted_types: if isinstance(term_or_type, str) and term_or_type.strip(): - file_out.write(json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + "\n") + file_out.write( + json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + + "\n" + ) num_written_types += 1 if line_index % 50 == 0: @@ -475,7 +543,9 @@ def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]: gold_pairs.add((doc_id, clean_term)) return gold_pairs - def load_predicted_pairs(self, predicted_jsonl_path: str, key: str = "term") -> Set[Tuple[str, str]]: + def load_predicted_pairs( + self, predicted_jsonl_path: str, key: str = "term" + ) -> Set[Tuple[str, str]]: """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased.""" predicted_pairs = set() with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle: @@ -490,7 +560,9 @@ def load_predicted_pairs(self, predicted_jsonl_path: str, key: str = "term") -> predicted_pairs.add((doc_id, value.strip().lower())) return predicted_pairs - def evaluate_extraction_f1(self, terms2doc_path: str, predicted_jsonl: str, key: str = "term") -> float: + def evaluate_extraction_f1( + self, terms2doc_path: str, predicted_jsonl: str, key: str = "term" + ) -> float: """ Computes set-based binary Precision, Recall, and F1 score against the gold pairs. 
""" @@ -507,6 +579,7 @@ def evaluate_extraction_f1(self, terms2doc_path: str, predicted_jsonl: str, key: # Use scikit-learn for metric calculation from sklearn.metrics import precision_recall_fscore_support + precision, recall, f1, _ = precision_recall_fscore_support( y_true, y_pred, average="binary", zero_division=0 ) From ec2313528859e11ad28401de93a9797e4c353f2d Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Tue, 11 Nov 2025 10:00:08 +0100 Subject: [PATCH 06/19] removing changes from __init__.py files --- ontolearner/__init__.py | 24 +----------------------- ontolearner/learner/__init__.py | 9 --------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 5ebd3f6..a1b5d5a 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -29,18 +29,7 @@ AutoRetrieverLearner, AutoRAGLearner, StandardizedPrompting, - LabelMapper, - RWTHDBISTaxonomyLearner, - RWTHDBISTermTypingLearner, - SKHNLPZSLearner, - SKHNLPSequentialFTLearner, - SBUNLPFewShotLearner, - SBUNLPZSLearner, - SBUNLPText2OntoLearner, - AlexbekCrossAttnLearner, - AlexbekRFLearner, - AlexbekRAGLearner, - AlexbekFewShotLearner) + LabelMapper,) from ._learner import LearnerPipeline from .processor import Processor @@ -58,17 +47,6 @@ "LabelMapper", "LearnerPipeline", "Processor", - "RWTHDBISTaxonomyLearner", - "RWTHDBISTermTypingLearner", - "SKHNLPZSLearner", - "SKHNLPSequentialFTLearner", - "SBUNLPFewShotLearner", - "SBUNLPZSLearner", - "SBUNLPText2OntoLearner", - "AlexbekCrossAttnLearner", - "AlexbekRFLearner", - "AlexbekRAGLearner", - "AlexbekFewShotLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 71020e8..0baf580 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -17,12 +17,3 @@ from .rag import AutoRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper -from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner -from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner -from .taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner -from .taxonomy_discovery.sbunlp import SBUNLPFewShotLearner -from .term_typing.sbunlp import SBUNLPZSLearner -from .text2onto import SBUNLPFewShotLearner as SBUNLPText2OntoLearner -from .taxonomy_discovery.alexbek import AlexbekCrossAttnLearner -from .term_typing.alexbek import AlexbekRFLearner, AlexbekRAGLearner -from .text2onto.alexbek import AlexbekFewShotLearner From 2d49d94e2a42c3afd49ff5ee0907be123fcc3dcc Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Tue, 11 Nov 2025 13:17:44 +0100 Subject: [PATCH 07/19] Changes removed from requirements.txt --- requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 28a92bb..3ce19f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,3 @@ sentence-transformers~=5.1.0 scikit-learn~=1.6.1 bitsandbytes~=0.45.1 mistral-common[sentencepiece]~=1.8.5 -g4f -protobuf<5 -accelerate>=0.26.0 -Levenshtein From df6513d9571676c8492582c6cd6cf14dc1b653d5 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 12 Nov 2025 10:25:30 +0100 Subject: [PATCH 08/19] updated __init__.py files and dependencies --- ...r_alexbek_cross_attn_taxonomy_discovery.py} | 2 +- .../llm_learner_alexbek_rag_term_typing.py | 2 +- examples/llm_learner_alexbek_rf_term_typing.py | 4 +--- .../llm_learner_rwthdbis_taxonomy_discovery.py 
| 2 +- examples/llm_learner_rwthdbis_term_typing.py | 2 +- ...llm_learner_sbunlp_fs_taxonomy_discovery.py | 2 +- examples/llm_learner_sbunlp_zs_term_typing.py | 2 +- ...m_learner_skhnlp_sft_taxonomoy_discovery.py | 2 +- ...lm_learner_skhnlp_zs_taxonomoy_discovery.py | 2 +- ontolearner/__init__.py | 2 +- ontolearner/learner/__init__.py | 10 ++++++++++ .../learner/taxonomy_discovery/__init__.py | 18 ++++++++++++++++++ ontolearner/learner/term_typing/__init__.py | 17 +++++++++++++++++ ontolearner/learner/text2onto/__init__.py | 16 ++++++++++++++++ pyproject.toml | 5 ++++- requirements.txt | 3 +++ setup.py | 5 ++++- 17 files changed, 82 insertions(+), 14 deletions(-) rename examples/{llm_learner_alexbek_self_attn_taxonomy_discovery.py => llm_learner_alexbek_cross_attn_taxonomy_discovery.py} (93%) create mode 100644 ontolearner/learner/taxonomy_discovery/__init__.py create mode 100644 ontolearner/learner/term_typing/__init__.py create mode 100644 ontolearner/learner/text2onto/__init__.py diff --git a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py b/examples/llm_learner_alexbek_cross_attn_taxonomy_discovery.py similarity index 93% rename from examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py rename to examples/llm_learner_alexbek_cross_attn_taxonomy_discovery.py index 6a42160..c3bc73f 100644 --- a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py +++ b/examples/llm_learner_alexbek_cross_attn_taxonomy_discovery.py @@ -1,5 +1,5 @@ from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner.learner.taxonomy_discovery.alexbek import AlexbekCrossAttnLearner +from ontolearner.learner.taxonomy_discovery import AlexbekCrossAttnLearner # 1) Load & split ontology = GeoNames() diff --git a/examples/llm_learner_alexbek_rag_term_typing.py b/examples/llm_learner_alexbek_rag_term_typing.py index 3a3233f..17becc2 100644 --- a/examples/llm_learner_alexbek_rag_term_typing.py +++ b/examples/llm_learner_alexbek_rag_term_typing.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner.learner.term_typing.alexbek import AlexbekRAGLearner +from ontolearner.learner.term_typing import AlexbekRAGLearner # Load the GeoNames ontology. 
ontology = GeoNames() diff --git a/examples/llm_learner_alexbek_rf_term_typing.py b/examples/llm_learner_alexbek_rf_term_typing.py index 28ca94c..75e7ea2 100644 --- a/examples/llm_learner_alexbek_rf_term_typing.py +++ b/examples/llm_learner_alexbek_rf_term_typing.py @@ -1,8 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner.learner.term_typing.alexbek import ( - AlexbekRFLearner, -) # A random-forest term-typing learner over text+graph features +from ontolearner.learner.term_typing import AlexbekRFLearner # A random-forest term-typing learner over text+graph features # Load the GeoNames ontology and extract labeled term-typing data diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py index 4412c5f..9efdb6d 100644 --- a/examples/llm_learner_rwthdbis_taxonomy_discovery.py +++ b/examples/llm_learner_rwthdbis_taxonomy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import LearnerPipeline, train_test_split, ChordOntology -from ontolearner.learner.taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner +from ontolearner.learner.taxonomy_discovery import RWTHDBISSFTLearner # Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery ontology = ChordOntology() diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py index d9bdc4b..90d2b55 100644 --- a/examples/llm_learner_rwthdbis_term_typing.py +++ b/examples/llm_learner_rwthdbis_term_typing.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import LearnerPipeline, train_test_split, AgrO -from ontolearner.learner.term_typing.rwthdbis import RWTHDBISSFTLearner +from ontolearner.learner.term_typing import RWTHDBISSFTLearner # load the AgrO ontology. # AgrO provides term-typing supervision where each term can be annotated with one or more types. diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py index 2200892..4c9c779 100644 --- a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py +++ b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py @@ -2,7 +2,7 @@ from ontolearner import GeoNames, train_test_split, LearnerPipeline # Import the specific Few-Shot Learner implementation -from ontolearner.learner.taxonomy_discovery.sbunlp import SBUNLPFewShotLearner +from ontolearner.learner.taxonomy_discovery import SBUNLPFewShotLearner # Load ontology and split # Load the GeoNames ontology for taxonomy discovery. diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py index 54c070c..24e4de2 100644 --- a/examples/llm_learner_sbunlp_zs_term_typing.py +++ b/examples/llm_learner_sbunlp_zs_term_typing.py @@ -2,7 +2,7 @@ from ontolearner import AgrO, train_test_split, LearnerPipeline # Import the specific Zero-Shot Learner implementation for Term Typing -from ontolearner.learner.term_typing.sbunlp import SBUNLPZSLearner +from ontolearner.learner.term_typing import SBUNLPZSLearner # Load ontology and split # Load the AgrO ontology for type inventory and test data. 
diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py index 5c87925..5431d6f 100644 --- a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py +++ b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner.learner.taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner +from ontolearner.learner.taxonomy_discovery import SKHNLPSequentialFTLearner # Load ontology and split # Load the GeoNames ontology for taxonomy discovery. diff --git a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py index fec0ddd..f2bca1e 100644 --- a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py +++ b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner.learner.taxonomy_discovery.skhnlp import SKHNLPZSLearner +from ontolearner.learner.taxonomy_discovery import SKHNLPZSLearner # Load ontology and split data # The GeoNames ontology provides geographic term types and relationships. diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index a1b5d5a..237bee8 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -29,7 +29,7 @@ AutoRetrieverLearner, AutoRAGLearner, StandardizedPrompting, - LabelMapper,) + LabelMapper) from ._learner import LearnerPipeline from .processor import Processor diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 0baf580..d1c358a 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -17,3 +17,13 @@ from .rag import AutoRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper +from .taxonomy_discovery import (RWTHDBISSFTLearner, + SKHNLPSequentialFTLearner, + AlexbekCrossAttnLearner, + SBUNLPFewShotLearner, + SKHNLPZSLearner) +from .term_typing import (RWTHDBISSFTLearner, + SBUNLPZSLearner, + AlexbekRFLearner, + AlexbekRAGLearner) +from .text2onto import AlexbekFewShotLearner, SBUNLPFewShotLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py new file mode 100644 index 0000000..ec6f2f4 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .alexbek import AlexbekCrossAttnLearner +from .rwthdbis import RWTHDBISSFTLearner +from .sbunlp import SBUNLPFewShotLearner +from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py new file mode 100644 index 0000000..dec8b9f --- /dev/null +++ b/ontolearner/learner/term_typing/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .alexbek import AlexbekRAGLearner, AlexbekRFLearner +from .rwthdbis import RWTHDBISSFTLearner +from .sbunlp import SBUNLPZSLearner diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py new file mode 100644 index 0000000..489853b --- /dev/null +++ b/ontolearner/learner/text2onto/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .alexbek import AlexbekFewShotLearner +from .sbunlp import SBUNLPFewShotLearner diff --git a/pyproject.toml b/pyproject.toml index 4422243..cdb71a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ numpy = "*" pandas = "*" openpyxl = "*" tqdm = "*" +g4f = "*" pydantic = "2.11.3" pathlib = "1.0.1" python-dotenv = "*" @@ -29,7 +30,9 @@ transformers = "^4.56.0" sentence-transformers = "^5.1.0" dspy = "^2.6.14" bitsandbytes="^0.45.1" -mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] } +mistral-common = { version = "^1.8.5", extras = ["sentencepiece"]} +protobuf = "<5" +Levenshtein = "*" [tool.poetry.dev-dependencies] ruff = "*" diff --git a/requirements.txt b/requirements.txt index 3ce19f7..494f7d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ pandas openpyxl matplotlib tqdm +g4f python-dotenv rdflib~=7.1.4 networkx~=3.4.2 @@ -20,3 +21,5 @@ sentence-transformers~=5.1.0 scikit-learn~=1.6.1 bitsandbytes~=0.45.1 mistral-common[sentencepiece]~=1.8.5 +protobuf<5 +Levenshtein diff --git a/setup.py b/setup.py index 6ae94bb..1dd046b 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ "pandas", "matplotlib", "tqdm", + "g4f", "python-dotenv", "rdflib==7.1.1", "networkx==3.2.1", @@ -32,7 +33,9 @@ "transformers>=4.56.0,<5.0.0", "sentence-transformers>=5.1.0,<6.0.0", "scikit-learn>=1.6.1,<2.0.0", - "bitsandbytes>=0.45.1,<1.0.0" + "bitsandbytes>=0.45.1,<1.0.0", + "protobuf<5", + "Levenshtein" ], classifiers=[ "Development Status :: 5 - Production/Stable", From 47fd865ed4e330ba17ee8cb30024331b0b3f88d3 Mon Sep 17 00:00:00 2001 From: Krishna Rani <105223454+Krishna-Rani-t@users.noreply.github.com> Date: Wed, 12 Nov 2025 10:27:48 +0100 Subject: [PATCH 09/19] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdb71a1..72d4ac1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ transformers = "^4.56.0" sentence-transformers = "^5.1.0" dspy = "^2.6.14" bitsandbytes="^0.45.1" -mistral-common = { version = "^1.8.5", extras = ["sentencepiece"]} +mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] } protobuf = "<5" Levenshtein = "*" From f658055489e996c1029fdc17ec76aa472707208e Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Thu, 13 Nov 2025 18:42:34 +0100 Subject: [PATCH 10/19] removed unnecessary changes from __init__.py --- ontolearner/learner/__init__.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index d1c358a..0baf580 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -17,13 +17,3 @@ from .rag import AutoRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper -from .taxonomy_discovery import (RWTHDBISSFTLearner, - SKHNLPSequentialFTLearner, - AlexbekCrossAttnLearner, - SBUNLPFewShotLearner, - SKHNLPZSLearner) -from .term_typing import (RWTHDBISSFTLearner, - SBUNLPZSLearner, - AlexbekRFLearner, - AlexbekRAGLearner) -from .text2onto import AlexbekFewShotLearner, SBUNLPFewShotLearner From b8432187c369bd14c7a1aefcac5c52ca612b9cd1 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Sun, 21 Dec 2025 18:46:26 +0100 Subject: [PATCH 11/19] Add Text2onto learner models with documentation --- docs/source/learners/llms4ol.rst | 2 +- .../llms4ol_challenge/alexbek_learner.rst | 144 ++ .../llms4ol_challenge/sbunlp_learner.rst | 146 ++ examples/llm_learner_alexbek_text2onto.py | 159 +- 
examples/llm_learner_sbunlp_text2onto.py | 176 +- examples/text2onto.py | 41 +- ontolearner/base/learner.py | 31 + ontolearner/evaluation/metrics.py | 104 +- ontolearner/learner/text2onto/__init__.py | 2 +- ontolearner/learner/text2onto/alexbek.py | 1589 +++++------------ ontolearner/learner/text2onto/sbunlp.py | 991 +++++----- ontolearner/text2onto/splitter.py | 75 +- 12 files changed, 1664 insertions(+), 1796 deletions(-) diff --git a/docs/source/learners/llms4ol.rst b/docs/source/learners/llms4ol.rst index 58cd23e..820171a 100644 --- a/docs/source/learners/llms4ol.rst +++ b/docs/source/learners/llms4ol.rst @@ -31,7 +31,7 @@ LLMs4OL is a community development initiative collocated with the International - **Text2Onto** - Extract ontological terms and types from unstructured text. - **ID**: ``text-to-onto`` + **ID**: ``text2onto`` **Info**: This task focuses on extracting foundational elements (Terms and Types) from unstructured text documents to build the initial structure of an ontology. It involves recognizing domain-relevant vocabulary (Term Extraction, SubTask 1) and categorizing it appropriately (Type Extraction, SubTask 2). It bridges the gap between natural language and structured knowledge representation. diff --git a/docs/source/learners/llms4ol_challenge/alexbek_learner.rst b/docs/source/learners/llms4ol_challenge/alexbek_learner.rst index 321e280..b564596 100644 --- a/docs/source/learners/llms4ol_challenge/alexbek_learner.rst +++ b/docs/source/learners/llms4ol_challenge/alexbek_learner.rst @@ -250,3 +250,147 @@ Learn and Predict truth = cross_learner.tasks_ground_truth_former(data=test_data, task=task) metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) print(metrics) + +Text2Onto +------------------ + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~ + +For the Text2Onto task, we load an ontology (via ``OM``), extract its structured content, and then generate synthetic pseudo-sentences using an LLM-backed generator (DSPy + Ollama in this example). + +.. 
code-block:: python + + import os + import dspy + + # Ontology loader/manager + from ontolearner.ontology import OM + + # Text2Onto utilities: synthetic generation + dataset splitting + from ontolearner.text2onto import SyntheticGenerator, SyntheticDataSplitter + + # ---- DSPy -> Ollama (LiteLLM-style) ---- + LLM_MODEL_ID = "ollama/llama3.2:3b" # use your pulled Ollama model + LLM_API_KEY = "NA" # local Ollama doesn't use a key + LLM_BASE_URL = "http://localhost:11434" # default Ollama endpoint + + dspy_llm = dspy.LM( + model=LLM_MODEL_ID, + cache=True, + max_tokens=4000, + temperature=0, + api_key=LLM_API_KEY, + base_url=LLM_BASE_URL, + ) + dspy.configure(lm=dspy_llm) + + # ---- Synthetic generation configuration ---- + pseudo_sentence_batch_size = int(os.getenv("TEXT2ONTO_BATCH", "10")) + max_worker_count_for_llm_calls = int(os.getenv("TEXT2ONTO_WORKERS", "1")) + + text2onto_synthetic_generator = SyntheticGenerator( + batch_size=pseudo_sentence_batch_size, + worker_count=max_worker_count_for_llm_calls, + ) + + # ---- Load ontology and extract structured data ---- + ontology = OM() + ontology.load() + ontological_data = ontology.extract() + + print(f"term types: {len(ontological_data.term_typings)}") + print(f"taxonomic relations: {len(ontological_data.type_taxonomies.taxonomies)}") + print(f"non-taxonomic relations: {len(ontological_data.type_non_taxonomic_relations.non_taxonomies)}") + + # ---- Generate synthetic Text2Onto samples ---- + synthetic_data = text2onto_synthetic_generator.generate( + ontological_data=ontological_data, + topic=ontology.domain, + ) + +Split Synthetic Data +~~~~~~~~~~~~~~~~~~~~ + +We split the synthetic dataset into train/val/test sets using ``SyntheticDataSplitter``. +Each split is a dict with keys: + +- ``documents`` +- ``terms`` +- ``types`` +- ``terms2docs`` +- ``terms2types`` + +.. code-block:: python + + splitter = SyntheticDataSplitter( + synthetic_data=synthetic_data, + onto_name=ontology.ontology_id, + ) + + train_data, val_data, test_data = splitter.train_test_val_split( + train=0.8, + val=0.0, + test=0.2, + ) + + print("TRAIN sizes:") + print(" documents:", len(train_data.get("documents", []))) + print(" terms:", len(train_data.get("terms", []))) + print(" types:", len(train_data.get("types", []))) + print(" terms2docs:", len(train_data.get("terms2docs", {}))) + print(" terms2types:", len(train_data.get("terms2types", {}))) + + print("TEST sizes:") + print(" documents:", len(test_data.get("documents", []))) + print(" terms:", len(test_data.get("terms", []))) + print(" types:", len(test_data.get("types", []))) + print(" terms2docs:", len(test_data.get("terms2docs", {}))) + print(" terms2types:", len(test_data.get("terms2types", {}))) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +We configure a retrieval-augmented few-shot learner for the Text2Onto task. +The learner retrieves relevant synthetic examples and uses an LLM to predict structured outputs. + +.. code-block:: python + + from ontolearner.learner.text2onto import AlexbekRAGFewShotLearner + + text2onto_learner = AlexbekRAGFewShotLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id="sentence-transformers/all-MiniLM-L6-v2", + device="cpu", # set "cuda" if available + top_k=3, + max_new_tokens=256, + use_tfidf=True, + ) + +Learn and Predict +~~~~~~~~~~~~~~~~~ + +We run the end-to-end pipeline (train -> predict -> evaluate) with ``LearnerPipeline`` using the ``text2onto`` task id. + +.. 
code-block:: python + + from ontolearner import LearnerPipeline + + task = "text2onto" + + pipe = LearnerPipeline( + llm=text2onto_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=False, + ) + + outputs = pipe( + train_data=train_data, + test_data=test_data, + task=task, + evaluate=True, + ontologizer_data=False, + ) + + print("Metrics:", outputs.get("metrics")) + print("Elapsed time:", outputs.get("elapsed_time")) diff --git a/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst b/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst index 860c3a4..bef83d2 100644 --- a/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst +++ b/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst @@ -31,6 +31,8 @@ Methodological Summary: - For **Taxonomy Discovery**, the focus was on detecting parent–child relationships between ontology terms. Due to the relational nature of this task, batch prompting was employed to efficiently handle multiple type pairs per inference, enabling the model to consider several candidate relations jointly. +- For **Text2Onto**, the objective was to extract ontology construction signals from text-like inputs: generating/using documents, identifying candidate terms, assigning types, and producing supporting mappings such as term–document and term–type associations. In OntoLearner, this is implemented by first generating synthetic pseudo-documents from an ontology (using an LLM-backed synthetic generator), then applying the SBU-NLP prompting strategy to infer structured outputs without any fine-tuning. Dataset splitting and optional Ontologizer-style processing are used to support reproducible evaluation and artifact generation. + Term Typing ----------------------- @@ -179,3 +181,147 @@ Learn and Predict # Evaluate taxonomy discovery performance metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) print(metrics) + +Text2Onto +------------------ + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~ + +For the Text2Onto task, we load an ontology (via ``OM``), extract its structured content, and generate synthetic pseudo-sentences using an LLM-backed generator (DSPy + Ollama in this example). + +.. 
code-block:: python + + import os + import dspy + + # Import ontology loader/manager and Text2Onto utilities + from ontolearner.ontology import OM + from ontolearner.text2onto import SyntheticGenerator, SyntheticDataSplitter + + # ---- DSPy -> Ollama (LiteLLM-style) ---- + LLM_MODEL_ID = "ollama/llama3.2:3b" + LLM_API_KEY = "NA" # local Ollama doesn't use a key + LLM_BASE_URL = "http://localhost:11434" # default Ollama endpoint + + dspy_llm = dspy.LM( + model=LLM_MODEL_ID, + cache=True, + max_tokens=4000, + temperature=0, + api_key=LLM_API_KEY, + base_url=LLM_BASE_URL, + ) + dspy.configure(lm=dspy_llm) + + # ---- Synthetic generation configuration ---- + batch_size = int(os.getenv("TEXT2ONTO_BATCH", "10")) + worker_count = int(os.getenv("TEXT2ONTO_WORKERS", "1")) + + text2onto_synthetic_generator = SyntheticGenerator( + batch_size=batch_size, + worker_count=worker_count, + ) + + # ---- Load ontology and extract structured data ---- + ontology = OM() + ontology.load() + ontological_data = ontology.extract() + + # Optional sanity checks to verify what was extracted from the ontology + print(f"term types: {len(ontological_data.term_typings)}") + print(f"taxonomic relations: {len(ontological_data.type_taxonomies.taxonomies)}") + print(f"non-taxonomic relations: {len(ontological_data.type_non_taxonomic_relations.non_taxonomies)}") + + # ---- Generate synthetic Text2Onto samples ---- + synthetic_data = text2onto_synthetic_generator.generate( + ontological_data=ontological_data, + topic=ontology.domain, + ) + +Split Synthetic Data +~~~~~~~~~~~~~~~~~~~~ + +We split the synthetic dataset into train/val/test sets using ``SyntheticDataSplitter``. +Each split is a dict with keys: + +- ``documents`` +- ``terms`` +- ``types`` +- ``terms2docs`` +- ``terms2types`` + +.. code-block:: python + + splitter = SyntheticDataSplitter( + synthetic_data=synthetic_data, + onto_name=ontology.ontology_id, + ) + + train_data, val_data, test_data = splitter.train_test_val_split( + train=0.8, + val=0.0, + test=0.2, + ) + + print("TRAIN sizes:") + print(" documents:", len(train_data.get("documents", []))) + print(" terms:", len(train_data.get("terms", []))) + print(" types:", len(train_data.get("types", []))) + print(" terms2docs:", len(train_data.get("terms2docs", {}))) + print(" terms2types:", len(train_data.get("terms2types", {}))) + + print("TEST sizes:") + print(" documents:", len(test_data.get("documents", []))) + print(" terms:", len(test_data.get("terms", []))) + print(" types:", len(test_data.get("types", []))) + print(" terms2docs:", len(test_data.get("terms2docs", {}))) + print(" terms2types:", len(test_data.get("terms2types", {}))) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +We configure the SBU-NLP few-shot learner for the Text2Onto task. +This learner uses an LLM to produce predictions from the synthetic Text2Onto-style samples. + +.. code-block:: python + + from ontolearner.learner.text2onto import SBUNLPFewShotLearner + + text2onto_learner = SBUNLPFewShotLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + device="cpu", # set "cuda" if available + max_new_tokens=256, + output_dir="./results/", + ) + +Learn and Predict +~~~~~~~~~~~~~~~~~ + +We run the end-to-end pipeline (train -> predict -> evaluate) with ``LearnerPipeline`` using the ``text2onto`` task id. + +.. 
code-block:: python + + from ontolearner import LearnerPipeline + + task = "text2onto" + + pipe = LearnerPipeline( + llm=text2onto_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=False, + ) + + outputs = pipe( + train_data=train_data, + test_data=test_data, + task=task, + evaluate=True, + ontologizer_data=True, + ) + + print("Metrics:", outputs.get("metrics")) + print("Elapsed time:", outputs.get("elapsed_time")) + + # Print all returned outputs (often includes predictions/artifacts/logs) + print(outputs) diff --git a/examples/llm_learner_alexbek_text2onto.py b/examples/llm_learner_alexbek_text2onto.py index 69282a9..fe36bec 100644 --- a/examples/llm_learner_alexbek_text2onto.py +++ b/examples/llm_learner_alexbek_text2onto.py @@ -1,84 +1,111 @@ import os -import json -import torch +import dspy -# LocalAutoLLM handles model loading/generation; AlexbekFewShotLearner provides fit/predict APIs -from ontolearner.learner.text2onto.alexbek import LocalAutoLLM, AlexbekFewShotLearner +# Import ontology loader/manager +from ontolearner.ontology import OM -# Local folder where the dataset is stored (relative to this script) -DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" +# Import Text2Onto utilities: synthetic sample generation + dataset splitting +from ontolearner.text2onto import SyntheticGenerator, SyntheticDataSplitter -# Input paths (already saved) -TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") -TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") -TEST_DOCS_FULL_PATH = os.path.join( - DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl" -) +# Import pipeline orchestrator + the specific Few-Shot learner you want to run +from ontolearner import LearnerPipeline +from ontolearner.learner.text2onto import AlexbekRAGFewShotLearner + +# ---- DSPy -> Ollama (LiteLLM-style) ---- +# Configure DSPy to send prompts to a locally running Ollama server (via LiteLLM-compatible args). +LLM_MODEL_ID = "ollama/llama3.2:3b" # use your pulled Ollama model +LLM_API_KEY = "NA" # Ollama local doesn't use a key; kept for interface compatibility +LLM_BASE_URL = "http://localhost:11434" # default Ollama server endpoint -# Output paths -DOC_TERMS_OUT_PATH = os.path.join( - DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl" +# Create the DSPy language model wrapper. +# Note: DSPy uses LiteLLM-style parameters under the hood when given model/base_url/api_key. +dspy_llm = dspy.LM( + model=LLM_MODEL_ID, + cache=True, # cache generations to speed up repeated runs + max_tokens=4000, # generous context for synthetic generation prompts + temperature=0, # deterministic output; helpful for reproducibility + api_key=LLM_API_KEY, + base_url=LLM_BASE_URL, ) -TERMS2TYPES_OUT_PATH = os.path.join( - DATA_DIR, "test", "terms2types_pred_ecology.fast.json" + +# Register the LM globally so DSPy modules (and generator internals) use it. +dspy.configure(lm=dspy_llm) + +# ---- Synthetic generation configuration ---- +# Allow scaling generation without editing code by using environment variables: +# TEXT2ONTO_BATCH=20 TEXT2ONTO_WORKERS=2 python script.py +pseudo_sentence_batch_size = int(os.getenv("TEXT2ONTO_BATCH", "10")) +max_worker_count_for_llm_calls = int(os.getenv("TEXT2ONTO_WORKERS", "1")) + +# Instantiate the generator that turns ontology structures into pseudo-text samples. 
+text2onto_synthetic_generator = SyntheticGenerator( + batch_size=pseudo_sentence_batch_size, # number of samples requested per batch + worker_count=max_worker_count_for_llm_calls, # parallel LLM calls (increase if your machine can handle it) ) -TYPES2DOCS_OUT_PATH = os.path.join( - DATA_DIR, "test", "types2docs_pred_ecology.fast.json" + +# ---- Load ontology and extract structured data ---- +# OM loads the ontology configured in your OntoLearner setup and exposes domain metadata. +ontology = OM() +ontology.load() +ontological_data = ontology.extract() # structured: term typings, taxonomies, relations, etc. + +# ---- Generate synthetic Text2Onto samples ---- +# Uses the extracted ontology structures + domain/topic to create synthetic training examples. +synthetic_data = text2onto_synthetic_generator.generate( + ontological_data=ontological_data, + topic=ontology.domain ) -# Device selection -DEVICE = ( - "cuda" - if torch.cuda.is_available() - else ("mps" if torch.backends.mps.is_available() else "cpu") +# ---- Dataset splitter ---- +# Wrap the synthetic dataset with a splitter utility for reproducible partitioning. +splitter = SyntheticDataSplitter( + synthetic_data=synthetic_data, + onto_name=ontology.ontology_id # used to tag/identify outputs for this ontology ) -# Model config -MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" -LOAD_IN_4BIT = DEVICE == "cuda" # 4-bit helps on GPU +# Optional sanity checks to verify what was extracted from the ontology. +print(f"term types: {len(ontological_data.term_typings)}") +print(f"taxonomic relations: {len(ontological_data.type_taxonomies.taxonomies)}") +print(f"non-taxonomic relations: {len(ontological_data.type_non_taxonomic_relations.non_taxonomies)}") -# 1) Load LLM -llm = LocalAutoLLM(device=DEVICE) -llm.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) +# ---- Split into train/val/test ---- +# val=0.0 keeps the API consistent while skipping validation split for this run. +train_data, val_data, test_data = splitter.train_test_val_split(train=0.8, val=0.0, test=0.2) -# 2) Build few-shot exemplars from training split -learner = AlexbekFewShotLearner(model=llm, device=DEVICE) -learner.fit( - train_docs_jsonl=TRAIN_DOCS_PATH, - terms2doc_json=TRAIN_TERMS2DOCS_PATH, - # use defaults for sample size/seed +# ---- Configure the Few-Shot learner for Text2Onto ---- +# This learner will be used by the pipeline to learn/predict from Text2Onto-style samples. 
+text2ontolearner = AlexbekRAGFewShotLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", # generator model used inside the learner + retriever_model_id="sentence-transformers/all-MiniLM-L6-v2", # embedding model for retrieval + device="cpu", # set "cuda" if you have GPU support + top_k=3, # number of retrieved examples/chunks + max_new_tokens=256, # response length for the learner's generator + use_tfidf=True, # optional lexical retrieval alongside embeddings ) -# 3) Predict terms per test document -os.makedirs(os.path.dirname(DOC_TERMS_OUT_PATH), exist_ok=True) -num_written_doc_terms = learner.predict_terms( - docs_test_jsonl=TEST_DOCS_FULL_PATH, - out_jsonl=DOC_TERMS_OUT_PATH, - # use defaults for max_new_tokens and few_shot_k -) -print(f"[terms] wrote {num_written_doc_terms} lines → {DOC_TERMS_OUT_PATH}") - -# 4) Predict types for extracted terms, using the JSONL we just wrote -typing_summary = learner.predict_types_from_terms( - doc_terms_jsonl=DOC_TERMS_OUT_PATH, # read the predictions directly - doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) - model_id=MODEL_ID, # reuse the same small model - out_terms2types=TERMS2TYPES_OUT_PATH, - out_types2docs=TYPES2DOCS_OUT_PATH, - # use defaults for everything else +# ---- Build pipeline ---- +# LearnerPipeline orchestrates training/prediction/evaluation for the chosen task. +pipe = LearnerPipeline( + llm=text2ontolearner, # the learner implementation used by the pipeline + llm_id="Qwen/Qwen2.5-0.5B-Instruct", # label/id recorded with results + ontologizer_data=False, # whether to run Ontologizer-related processing ) -print( - f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types" +# ---- Run end-to-end (train -> predict -> evaluate) ---- +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="text2onto", + evaluate=True, # compute evaluation metrics on the test set + ontologizer_data=False, # keep consistent with pipeline setting above ) -print(f"[saved] {TERMS2TYPES_OUT_PATH}") -print(f"[saved] {TYPES2DOCS_OUT_PATH}") - -# 5) Small preview of term→types -try: - with open(TERMS2TYPES_OUT_PATH, "r", encoding="utf-8") as fin: - preview = json.load(fin)[:3] - print("[preview] first 3:") - print(json.dumps(preview, ensure_ascii=False, indent=2)) -except Exception as e: - print(f"[preview] skipped: {e}") + +# ---- Display results ---- +# Metrics typically include task-specific scores (depends on OntoLearner implementation). +print("Metrics:", outputs.get("metrics")) + +# Total elapsed time for training + prediction + evaluation. +print("Elapsed time:", outputs["elapsed_time"]) + +# Print everything returned (often includes predictions, logs, artifacts, etc.) +print(outputs) diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py index cff543c..03cba2b 100644 --- a/examples/llm_learner_sbunlp_text2onto.py +++ b/examples/llm_learner_sbunlp_text2onto.py @@ -1,88 +1,108 @@ import os -import torch - -# Import all the required classes -from ontolearner import SBUNLPText2OntoLearner -from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM - -# Local folder where the dataset is stored -# This path is relative to the directory where the script is executed -# (e.g., E:\OntoLearner\examples) -LOCAL_DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" - -# Ensure the base directories exist -# Creates the train and test subdirectories if they don't already exist. 
-os.makedirs(os.path.join(LOCAL_DATA_DIR, "train"), exist_ok=True) -os.makedirs(os.path.join(LOCAL_DATA_DIR, "test"), exist_ok=True) - -# Define local file paths: POINTING TO ALREADY SAVED FILES -# These files are used as input for the Fit and Predict phases. -DOCS_ALL_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/documents.jsonl" -TERMS2DOC_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/terms2docs.json" -DOCS_TEST_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/text2onto_ecology_test_documents.jsonl" - -# Output files for predictions (saved directly under LOCAL_DATA_DIR/test) -# These files will be created by the predict_terms/types methods. -TERMS_PRED_OUT = ( - "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" +import dspy + +# Import ontology loader/manager and Text2Onto utilities +from ontolearner.ontology import OM +from ontolearner.text2onto import SyntheticGenerator, SyntheticDataSplitter + +# Import the pipeline orchestrator and the specific Few-Shot learner for Text2Onto +from ontolearner import LearnerPipeline +from ontolearner.learner.text2onto import SBUNLPFewShotLearner + +# ---- DSPy -> Ollama (LiteLLM-style) ---- +# Configure DSPy to send prompts to a locally running Ollama server. +LLM_MODEL_ID = "ollama/llama3.2:3b" +LLM_API_KEY = "NA" # Ollama local doesn't use a key; kept for interface compatibility. +LLM_BASE_URL = "http://localhost:11434" # default Ollama endpoint + +# Create the DSPy language model wrapper (LiteLLM-compatible settings) +dspy_llm = dspy.LM( + model=LLM_MODEL_ID, + cache=True, # cache generations to speed up iterative runs + max_tokens=4000, + temperature=0, # deterministic output; useful for reproducible synthetic data + api_key=LLM_API_KEY, + base_url=LLM_BASE_URL, ) -TYPES_PRED_OUT = ( - "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" + +# Register the LM globally so DSPy modules (and generator internals) use it +dspy.configure(lm=dspy_llm) + +# ---- Synthetic generation configuration ---- +# Allow scaling generation without code edits via environment variables: +# TEXT2ONTO_BATCH=20 TEXT2ONTO_WORKERS=2 python script.py +batch_size = int(os.getenv("TEXT2ONTO_BATCH", "10")) +worker_count = int(os.getenv("TEXT2ONTO_WORKERS", "1")) + +# Instantiate the generator that turns ontology structures into pseudo-text samples +text2onto_synthetic_generator = SyntheticGenerator( + batch_size=batch_size, # number of samples requested per batch + worker_count=worker_count, # parallel LLM calls (increase if your machine can handle it) +) + +# ---- Load ontology and extract structured data ---- +# OM loads the ontology configured in your OntoLearner setup and exposes its domain metadata. +ontology = OM() +ontology.load() +ontological_data = ontology.extract() # structured: term typings, taxonomies, relations, etc. + +# ---- Generate synthetic Text2Onto samples ---- +# Uses the ontology's extracted structures + domain/topic to create synthetic training examples. 
+synthetic_data = text2onto_synthetic_generator.generate( + ontological_data=ontological_data, + topic=ontology.domain, +) + +# Optional sanity checks to verify what was extracted from the ontology +print(f"term types: {len(ontological_data.term_typings)}") +print(f"taxonomic relations: {len(ontological_data.type_taxonomies.taxonomies)}") +print(f"non-taxonomic relations: {len(ontological_data.type_non_taxonomic_relations.non_taxonomies)}") + +# ---- Split into train/val/test ---- +# Wrap the synthetic dataset with a splitter utility for reproducible partitioning. +splitter = SyntheticDataSplitter( + synthetic_data=synthetic_data, + onto_name=ontology.ontology_id, # used to tag/identify outputs for this ontology ) -# Initialize and Load Learner --- -MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" -# Determine the device for inference (GPU or CPU) -DEVICE = "cuda" if torch.cuda.is_available() else "cpu" - -# Instantiate the underlying LLM helper -# (LocalAutoLLM handles model loading and generation) -llm_model_helper = LocalAutoLLM(device=DEVICE) - -# Instantiate the main learner class, passing the LLM helper to its constructor -learner = SBUNLPText2OntoLearner(model=llm_model_helper, device=DEVICE) - -# Load the model (This calls llm_model_helper.load) -LOAD_IN_4BIT = torch.cuda.is_available() -learner.model.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) - -# Build Few-Shot Exemplars (Fit Phase) -# The fit method uses the local data paths to build the in-context learning prompts. -learner.fit( - train_docs_jsonl=DOCS_ALL_PATH, - terms2doc_json=TERMS2DOC_PATH, - sample_size=28, - seed=123, # Seed for stratified random sampling stability +# Create splits for training and evaluation. +# val=0.0 keeps the API consistent while skipping validation split in this run. +train_data, val_data, test_data = splitter.train_test_val_split( + train=0.8, + val=0.0, + test=0.2, ) -MAX_NEW_TOKENS = 100 +# ---- Configure the Few-Shot learner for Text2Onto ---- +# This learner will be used by the pipeline to learn/predict from Text2Onto-style samples. +text2ontolearner = SBUNLPFewShotLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + device="cpu", + max_new_tokens=256, +) -terms_written = learner.predict_terms( - docs_test_jsonl=DOCS_TEST_PATH, - out_jsonl=TERMS_PRED_OUT, - max_new_tokens=MAX_NEW_TOKENS, +# Build pipeline and run +# Build the pipeline, passing the Few-Shot Learner. +pipe = LearnerPipeline( + llm=text2ontolearner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=False, ) -print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.") -# Type Extraction subtask -types_written = learner.predict_types( - docs_test_jsonl=DOCS_TEST_PATH, - out_jsonl=TYPES_PRED_OUT, - max_new_tokens=MAX_NEW_TOKENS, +# Run the full learning pipeline on the text2onto task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="text2onto", + evaluate=True, + ontologizer_data=True, ) -print(f"✅ Type Extraction Complete. 
Wrote {types_written} prediction lines.") - -try: - # Evaluate Term Extraction using the custom F1 function and gold data - f1_term = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TERMS_PRED_OUT, key="term") - print(f"Final Term Extraction F1: {f1_term:.4f}") - - # Evaluate Type Extraction - f1_type = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TYPES_PRED_OUT, key="type") - print(f"Final Type Extraction F1: {f1_type:.4f}") - -except Exception as e: - # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) - print( - f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created." - ) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/text2onto.py b/examples/text2onto.py index c67bb5f..03190d5 100644 --- a/examples/text2onto.py +++ b/examples/text2onto.py @@ -58,20 +58,33 @@ onto_name=ontology.ontology_id ) -# Split the synthetic data into train/val/test for each component -terms, types, docs, types2docs = splitter.split(train=0.8, val=0.1, test=0.1) +# split the train, val, test +train_data, val_data, test_data = splitter.train_test_val_split( + train=0.8, + val=0.0, + test=0.2, +) -# Print how many items exist in each split for terms -print("Terms:") -for split in terms: - print(f" {split}: {len(terms[split])}") +# print train split +print("\nTRAIN split:") +print(" docs:", len(train_data.get("documents", []))) +print(" terms:", len(train_data.get("terms", []))) +print(" types:", len(train_data.get("types", []))) +print(" terms2docs:", len(train_data.get("terms2docs", {}))) +print(" terms2types:", len(train_data.get("terms2types", {}))) -# Print how many items exist in each split for types -print("Types:") -for split in types: - print(f" {split}: {len(types[split])}") +# print val split +print("\nVAL split:") +print(" docs:", len(val_data.get("documents", []))) +print(" terms:", len(val_data.get("terms", []))) +print(" types:", len(val_data.get("types", []))) +print(" terms2docs:", len(val_data.get("terms2docs", {}))) +print(" terms2types:", len(val_data.get("terms2types", {}))) -# Print how many items exist in each split for docs -print("Docs:") -for split in docs: - print(f" {split}: {len(docs[split])}") +# print test split +print("\nTEST split:") +print(" docs:", len(test_data.get("documents", []))) +print(" terms:", len(test_data.get("terms", []))) +print(" types:", len(test_data.get("types", []))) +print(" terms2docs:", len(test_data.get("terms2docs", {}))) +print(" terms2types:", len(test_data.get("terms2types", {}))) diff --git a/ontolearner/base/learner.py b/ontolearner/base/learner.py index c410915..46acd88 100644 --- a/ontolearner/base/learner.py +++ b/ontolearner/base/learner.py @@ -18,6 +18,7 @@ import torch import torch.nn.functional as F from sentence_transformers import SentenceTransformer +from collections import defaultdict class AutoLearner(ABC): """ @@ -70,6 +71,7 @@ def fit(self, train_data: Any, task: str, ontologizer: bool=True): - "term-typing": Predict semantic types for terms - "taxonomy-discovery": Identify hierarchical relationships - "non-taxonomy-discovery": Identify non-hierarchical relationships + - "text2onto" : Extract ontology terms and their semantic types from documents Raises: NotImplementedError: If not implemented by concrete class. 
@@ -81,6 +83,8 @@ def fit(self, train_data: Any, task: str, ontologizer: bool=True): self._taxonomy_discovery(train_data, test=False) elif task == 'non-taxonomic-re': self._non_taxonomic_re(train_data, test=False) + elif task == 'text2onto': + self._text2onto(train_data, test=False) else: raise ValueError(f"{task} is not a valid task.") @@ -103,6 +107,7 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool=True) -> Any: - term-typing: List of predicted types for each term - taxonomy-discovery: Boolean predictions for relationships - non-taxonomy-discovery: Predicted relation types + - text2onto : Extract ontology terms and their semantic types from documents Raises: NotImplementedError: If not implemented by concrete class. @@ -115,6 +120,8 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool=True) -> Any: return self._taxonomy_discovery(eval_data, test=True) elif task == 'non-taxonomic-re': return self._non_taxonomic_re(eval_data, test=True) + elif task == 'text2onto': + return self._text2onto(eval_data, test=True) else: raise ValueError(f"{task} is not a valid task.") @@ -147,6 +154,9 @@ def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]: pass + def _text2onto(self, data: Any, test: bool = False) -> Optional[Any]: + pass + def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[str | Dict[str, str]]: formatted_data = [] if task == "term-typing": @@ -171,6 +181,7 @@ def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[st non_taxonomic_types = list(set(non_taxonomic_types)) non_taxonomic_res = list(set(non_taxonomic_res)) formatted_data = {"types": non_taxonomic_types, "relations": non_taxonomic_res} + return formatted_data def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str]]: @@ -186,6 +197,26 @@ def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str] formatted_data.append({"head": non_taxonomic_triplets.head, "tail": non_taxonomic_triplets.tail, "relation": non_taxonomic_triplets.relation}) + if task == "text2onto": + terms2docs = data.get("terms2docs", {}) or {} + terms2types = data.get("terms2types", {}) or {} + + # gold doc→terms + gold_terms = [] + for term, doc_ids in terms2docs.items(): + for doc_id in doc_ids or []: + gold_terms.append({"doc_id": doc_id, "term": term}) + + # gold doc→types derived via doc→terms + term→types + doc2types = defaultdict(set) + for term, doc_ids in terms2docs.items(): + for doc_id in doc_ids or []: + for ty in (terms2types.get(term, []) or []): + if isinstance(ty, str) and ty.strip(): + doc2types[doc_id].add(ty.strip()) + gold_types = [{"doc_id": doc_id, "type": ty} for doc_id, tys in doc2types.items() for ty in tys] + return {"terms": gold_terms, "types": gold_types} + return formatted_data class AutoLLM(ABC): diff --git a/ontolearner/evaluation/metrics.py b/ontolearner/evaluation/metrics.py index 57b2d66..52340ce 100644 --- a/ontolearner/evaluation/metrics.py +++ b/ontolearner/evaluation/metrics.py @@ -11,44 +11,84 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Dict, Tuple, Set +from typing import List, Dict, Tuple, Set, Any, Union SYMMETRIC_RELATIONS = {"equivalentclass", "sameas", "disjointwith"} -def text2onto_metrics(y_true: List[str], y_pred: List[str], similarity_threshold: float = 0.8) -> Dict[str, float | int]: - def jaccard_similarity(a: str, b: str) -> float: - set_a = set(a.lower().split()) - set_b = set(b.lower().split()) - if not set_a and not set_b: +def text2onto_metrics( + y_true: Dict[str, Any], + y_pred: Dict[str, Any], + similarity_threshold: float = 0.8 +) -> Dict[str, Any]: + """ + Expects: + y_true = {"terms": [{"doc_id": str, "term": str}, ...], + "types": [{"doc_id": str, "type": str}, ...]} + y_pred = same shape + + Returns: + {"terms": {...}, "types": {...}} + """ + + def jaccard_similarity(text_a: str, text_b: str) -> float: + tokens_a = set(text_a.lower().split()) + tokens_b = set(text_b.lower().split()) + if not tokens_a and not tokens_b: return 1.0 - return len(set_a & set_b) / len(set_a | set_b) - - matched_gt_indices = set() - matched_pred_indices = set() - for i, pred_label in enumerate(y_pred): - for j, gt_label in enumerate(y_true): - if j in matched_gt_indices: - continue - sim = jaccard_similarity(pred_label, gt_label) - if sim >= similarity_threshold: - matched_pred_indices.add(i) - matched_gt_indices.add(j) - break # each gt matched once - - total_correct = len(matched_pred_indices) - total_predicted = len(y_pred) - total_ground_truth = len(y_true) + return len(tokens_a & tokens_b) / len(tokens_a | tokens_b) + + def pairs_to_strings(rows: List[Dict[str, str]], value_key: str) -> List[str]: + paired_strings: List[str] = [] + for row in rows or []: + doc_id = (row.get("doc_id") or "").strip() + value = (row.get(value_key) or "").strip() + if doc_id and value: + # keep doc association + allow token Jaccard + paired_strings.append(f"{doc_id} {value}") + return paired_strings + + def score_list(ground_truth_items: List[str], predicted_items: List[str]) -> Dict[str, Union[float, int]]: + matched_ground_truth_indices: Set[int] = set() + matched_predicted_indices: Set[int] = set() + + for predicted_index, predicted_item in enumerate(predicted_items): + for ground_truth_index, ground_truth_item in enumerate(ground_truth_items): + if ground_truth_index in matched_ground_truth_indices: + continue + + if jaccard_similarity(predicted_item, ground_truth_item) >= similarity_threshold: + matched_predicted_indices.add(predicted_index) + matched_ground_truth_indices.add(ground_truth_index) + break + + total_correct = len(matched_predicted_indices) + total_predicted = len(predicted_items) + total_ground_truth = len(ground_truth_items) + + precision = total_correct / total_predicted if total_predicted else 0.0 + recall = total_correct / total_ground_truth if total_ground_truth else 0.0 + f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0 + + return { + "f1_score": f1, + "precision": precision, + "recall": recall, + "total_correct": total_correct, + "total_predicted": total_predicted, + "total_ground_truth": total_ground_truth, + } + + ground_truth_terms = pairs_to_strings(y_true.get("terms", []), "term") + predicted_terms = pairs_to_strings(y_pred.get("terms", []), "term") + ground_truth_types = pairs_to_strings(y_true.get("types", []), "type") + predicted_types = pairs_to_strings(y_pred.get("types", []), "type") + + terms_metrics = score_list(ground_truth_terms, predicted_terms) + types_metrics = score_list(ground_truth_types, predicted_types) - precision = 
total_correct / total_predicted if total_predicted > 0 else 0 - recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0 - f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 return { - "f1_score": f1_score, - "precision": precision, - "recall": recall, - "total_correct": total_correct, - "total_predicted": total_predicted, - "total_ground_truth": total_ground_truth + "terms": terms_metrics, + "types": types_metrics, } def term_typing_metrics(y_true: List[Dict[str, List[str]]], y_pred: List[Dict[str, List[str]]]) -> Dict[str, float | int]: diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py index 489853b..af31523 100644 --- a/ontolearner/learner/text2onto/__init__.py +++ b/ontolearner/learner/text2onto/__init__.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .alexbek import AlexbekFewShotLearner +from .alexbek import AlexbekRAGFewShotLearner from .sbunlp import SBUNLPFewShotLearner diff --git a/ontolearner/learner/text2onto/alexbek.py b/ontolearner/learner/text2onto/alexbek.py index f1692f7..8dee17a 100644 --- a/ontolearner/learner/text2onto/alexbek.py +++ b/ontolearner/learner/text2onto/alexbek.py @@ -12,1208 +12,587 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, List, Optional, Tuple, Iterable import json -from json.decoder import JSONDecodeError -import os -import random import re +from typing import Any, Dict, List, Optional +from collections import defaultdict import torch from transformers import AutoTokenizer, AutoModelForCausalLM -from ...base import AutoLearner, AutoLLM +from ...base import AutoLearner, AutoRetriever -try: - from outlines.models import Transformers as OutlinesTFModel - from outlines.generate import json as outlines_generate_json - from pydantic import BaseModel - - class _PredictedTypesSchema(BaseModel): - """Schema used when generating structured JSON { "types": [...] }.""" - - types: List[str] - - OUTLINES_AVAILABLE: bool = True -except Exception: - # If outlines is unavailable, we will fall back to greedy decoding + regex parsing. - OUTLINES_AVAILABLE = False - _PredictedTypesSchema = None - OutlinesTFModel = None - outlines_generate_json = None - - -class LocalAutoLLM(AutoLLM): +class AlexbekRAGFewShotLearner(AutoLearner): """ - Minimal local LLM helper. - - - Inherits AutoLLM but overrides load/generate to avoid label_mapper. - - Optional 4-bit loading with `load_in_4bit=True` in .load(). - - Greedy decoding by default (deterministic). 
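# Usage sketch for the reworked text2onto_metrics above: it now takes the paired
# {"terms": [...], "types": [...]} shape instead of flat string lists and scores
# the two lists separately. The import path follows the file touched by this
# hunk (ontolearner/evaluation/metrics.py); the example data is made up.
from ontolearner.evaluation.metrics import text2onto_metrics

y_true = {
    "terms": [{"doc_id": "d1", "term": "neural network"},
              {"doc_id": "d1", "term": "gradient descent"}],
    "types": [{"doc_id": "d1", "type": "Algorithm"}],
}
y_pred = {
    "terms": [{"doc_id": "d1", "term": "neural network"},   # exact match, counted
              {"doc_id": "d1", "term": "loss function"}],   # no gold counterpart
    "types": [{"doc_id": "d1", "type": "Algorithm"}],
}

report = text2onto_metrics(y_true, y_pred, similarity_threshold=0.8)
# report["terms"] -> precision/recall/f1_score of 0.5 (one of two pairs matched)
# report["types"] -> 1.0 across the board (the single type pair matches exactly)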
+ What it does (2-stage): + 1) doc -> terms + - retrieve top-k similar TRAIN documents (each has gold OL terms) + - build a few-shot chat prompt: (doc -> {"terms":[...]}) examples + target doc + - generate JSON {"terms":[...]} and parse it + + 2) term -> types + - retrieve top-k similar TRAIN terms (each has gold types) + - build a few-shot chat prompt: (term -> {"types":[...]}) examples + target term + - generate JSON {"types":[...]} and parse it + + Training behavior (fit): + - builds two retrieval indices: + * doc_retriever index over JSON strings of train docs (with "OL" field = gold terms) + * term_retriever index over JSON strings of train term->types examples + + Prediction behavior (predict): + - returns a dict compatible with OntoLearner evaluation_report: + { + "terms": [{"doc_id": "...", "term": "..."}, ...], + "types": [{"doc_id": "...", "type": "..."}, ...], + } + + Expected data format for task="text2onto": + data = { + "documents": [ {"id"/"doc_id": str, "title": str, "text": str, ...}, ... ], + "terms2docs": { term(str): [doc_id(str), ...], ... } + "terms2types": { term(str): [type(str), ...], ... } + } + + IMPORTANT: + - LearnerPipeline calls learner.load(model_id=llm_id). We accept that and override llm_model_id. + - We override tasks_data_former() so AutoLearner.fit/predict does NOT rewrite text2onto dicts. + - Device placement: we put the model exactly on the device string the user provides + ("cpu", "cuda", "cuda:0", "cuda:1", ...). No device_map="auto". """ - def __init__(self, device: str = "cpu", token: str = "") -> None: + TERM2TYPES_SYSTEM_PROMPT = ( + "You are an expert in ontology and semantic type classification. Your task is to predict " + "the semantic types for given terms based on their context and similar examples.\n\n" + "Given a term, you should predict its semantic types from the domain-specific ontology. " + "Use the provided examples to understand the patterns and relationships between terms and their types.\n\n" + "Output your response as a JSON object with the following structure:\n" + '{\n "types": ["type1", "type2", ...]\n}\n\n' + "The types should be relevant semantic categories that best describe the given term." 
+ ) + + DOC2TERMS_SYSTEM_PROMPT = ( + "You are an expert in ontology term extraction.\n\n" + "TASK: Extract specific, relevant ontology terms from scientific documents.\n\n" + "INSTRUCTIONS:\n" + "- The following conversation contains few-shot examples showing correct term extraction patterns\n" + "- Study these examples carefully to understand the extraction style and approach\n" + "- Follow the EXACT same pattern and style demonstrated in the examples\n" + "- Extract only terms that actually appear in the document text\n" + "- Focus on domain-specific terminology, concepts, and technical terms\n\n" + "- The first three user-assistant conversation pairs serve as few-shot examples\n" + "- Each example shows: user provides a document, assistant extracts relevant terms\n" + "- Pay attention to the extraction patterns and term selection criteria in these examples\n\n" + "DO:\n" + "- Extract terms that are EXPLICITLY mentioned in the LAST document\n" + "- Follow the SAME extraction pattern as shown in examples\n" + "- Return unique terms without duplicates\n" + "- Use the same JSON format as demonstrated\n\n" + "DON'T:\n" + "- Hallucinate or invent terms not present in last the document\n" + "- Repeat the same term multiple times\n" + "- Deviate from the extraction style shown in examples\n\n" + "OUTPUT FORMAT: Return a JSON object with a single field 'terms' containing a list of extracted terms." + ) + + def __init__( + self, + llm_model_id: str, + retriever_model_id: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "cpu", + top_k: int = 3, + max_new_tokens: int = 256, + max_input_length: int = 2048, + use_tfidf: bool = False, + seed: int = 42, + restrict_to_known_types: bool = True, + hf_token: str = "", + local_files_only: bool = False, + **kwargs: Any, + ): """ - Initialize the local LLM holder. - Parameters ---------- - device : str - Execution device: "cpu" or "cuda". - token : str - Optional auth token for private model hubs. - """ - super().__init__(label_mapper=None, device=device, token=token) + llm_model_id: + HuggingFace model id OR local path to a downloaded model directory. + retriever_model_id: + SentenceTransformer model id OR local path to a downloaded SBERT directory. + device: + Exact device string to place model on ("cpu", "cuda", "cuda:0", ...). + top_k: + Number of retrieved examples for few-shot prompting in each stage. + max_new_tokens: + Max tokens to generate for each prompt. + max_input_length: + Max prompt length before truncation. + use_tfidf: + If docs include TF-IDF suggestions (key "TF-IDF" or "tfidf_terms"), include them in prompts. + seed: + Seed for reproducibility. + restrict_to_known_types: + If True, append allowed type label list (from training) to system prompt in term->types stage. + This helps exact-match evaluation by discouraging invented labels. + hf_token: + HuggingFace token for gated models (optional). + local_files_only: + If True, Transformers will not try to reach the internet (requires local cache / local path). 
+ """ + super().__init__(**kwargs) + + self.llm_model_id: str = llm_model_id + self.retriever_model_id: str = retriever_model_id + self.device: str = device + self.top_k: int = int(top_k) + self.max_new_tokens: int = int(max_new_tokens) + self.max_input_length: int = int(max_input_length) + self.use_tfidf: bool = bool(use_tfidf) + self.seed: int = int(seed) + self.restrict_to_known_types: bool = bool(restrict_to_known_types) + self.hf_token: str = hf_token or "" + self.local_files_only: bool = bool(local_files_only) + self.model: Optional[AutoModelForCausalLM] = None self.tokenizer: Optional[AutoTokenizer] = None + self._loaded: bool = False - def load(self, model_id: str, *, load_in_4bit: bool = False) -> None: - """ - Load a Hugging Face causal model + tokenizer and set deterministic - generation defaults. - - Parameters - ---------- - model_id : str - Model identifier resolvable by HF `from_pretrained`. - load_in_4bit : bool - If True and bitsandbytes is available, load using 4-bit quantization. - """ - # Tokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - model_id, padding_side="left", token=self.token - ) - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Model (optionally quantized) - if load_in_4bit: - from transformers import BitsAndBytesConfig + # Internal retrievers (always used in method-1, even in "llm-only" pipeline mode) + self.doc_retriever = AutoRetriever() + self.term_retriever = AutoRetriever() - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_use_double_quant=True, - bnb_4bit_compute_dtype=torch.bfloat16, - ) - self.model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map="auto", - quantization_config=quantization_config, - token=self.token, - ) - else: - device_map = ( - "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None - ) - self.model = AutoModelForCausalLM.from_pretrained( - model_id, - device_map=device_map, - torch_dtype=torch.bfloat16 - if torch.cuda.is_available() - else torch.float32, - token=self.token, - ) + # Indexed corpora as JSON strings + self._doc_examples_json: List[str] = [] + self._term_examples_json: List[str] = [] - # Deterministic generation defaults - generation_cfg = self.model.generation_config - generation_cfg.do_sample = False - generation_cfg.temperature = None - generation_cfg.top_k = None - generation_cfg.top_p = None - generation_cfg.num_beams = 1 + # Cached allowed type labels (for optional restriction) + self._allowed_types: List[str] = [] - def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: + def tasks_data_former(self, data: Any, task: str, test: bool = False): """ - Greedy-generate continuations for a list of prompts. + Override base formatter: for task='text2onto' return data unchanged. + """ + if task == "text2onto": + return data + return super().tasks_data_former(data=data, task=task, test=test) - Parameters - ---------- - prompts : List[str] - Prompts to generate for (batched). - max_new_tokens : int - Maximum number of new tokens per continuation. - - Returns - ------- - List[str] - Decoded new-token texts (no special tokens, stripped). + def load(self, **kwargs: Any): """ - if self.model is None or self.tokenizer is None: - raise RuntimeError( - "Call .load(model_id) on LocalAutoLLM before generate()." 
- ) + Called by LearnerPipeline as: learner.load(model_id=llm_id) - tokenized_batch = self.tokenizer( - prompts, return_tensors="pt", padding=True, truncation=True - ) - input_seq_len = tokenized_batch["input_ids"].shape[1] - tokenized_batch = { - k: v.to(self.model.device) for k, v in tokenized_batch.items() - } + We accept overrides via kwargs: + - model_id / llm_model_id + - device, top_k, max_new_tokens, max_input_length, use_tfidf, seed, restrict_to_known_types + - hf_token, local_files_only + """ + model_id = kwargs.get("model_id") or kwargs.get("llm_model_id") + if model_id: + self.llm_model_id = str(model_id) - with torch.no_grad(): - outputs = self.model.generate( - **tokenized_batch, - max_new_tokens=max_new_tokens, - pad_token_id=self.tokenizer.eos_token_id, - do_sample=False, - num_beams=1, - ) + for k in [ + "device", + "top_k", + "max_new_tokens", + "max_input_length", + "use_tfidf", + "seed", + "restrict_to_known_types", + "hf_token", + "local_files_only", + "retriever_model_id", + ]: + if k in kwargs: + setattr(self, k, kwargs[k]) - # Only return the newly generated part for each row in the batch - continuation_token_ids = outputs[:, input_seq_len:] - return [ - self.tokenizer.decode(row, skip_special_tokens=True).strip() - for row in continuation_token_ids - ] + if self._loaded: + return + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) -class AlexbekFewShotLearner(AutoLearner): - """ - Text2Onto learner for LLMS4OL Task A (term & type extraction). - - Public API (A1 + convenience): - - fit(train_docs_jsonl, terms2doc_json, sample_size=24, seed=42) - - predict_terms(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int - - predict_types(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int - - evaluate_extraction_f1(gold_item2docs_json, preds_jsonl, key="term"|"type") -> float - - Option A (A2, term→types) bridge: - - predict_types_from_terms_option_a(...) - Reads your A1 results (docs→terms), predicts types for each term, and - writes two files: terms2types_pred.json + types2docs_pred.json - """ + dev = str(self.device).strip() + if dev.startswith("cuda") and not torch.cuda.is_available(): + raise RuntimeError(f"Device was set to '{dev}', but CUDA is not available.") - def __init__(self, model: LocalAutoLLM, device: str = "cpu", **_: Any) -> None: - """ - Initialize learner state and canned prompts. + dtype = torch.bfloat16 if dev.startswith("cuda") else torch.float32 - Parameters - ---------- - model : LocalAutoLLM - Loaded local LLM helper instance. - device : str - Device name ("cpu" or "cuda"). 
- """ - super().__init__(**_) - self.model = model - self.device = device - - # Few-shot exemplars for A1 (Docs→Terms) and for Docs→Types: - # Each exemplar is a tuple: (title, text, gold_list) - self._fewshot_terms_docs: List[Tuple[str, str, List[str]]] = [] - self._fewshot_types_docs: List[Tuple[str, str, List[str]]] = [] - - # System prompts - self._system_prompt_terms = ( - "You are an expert in ontology term extraction.\n" - "Extract only terms that explicitly appear in the document.\n" - 'Answer strictly as JSON: {"terms": ["..."]}\n' - ) - self._system_prompt_types = ( - "You are an expert in ontology type classification.\n" - "List ontology *types* that characterize the document’s terminology.\n" - 'Answer strictly as JSON: {"types": ["..."]}\n' - ) + tok_kwargs: Dict[str, Any] = {"local_files_only": self.local_files_only} + if self.hf_token: + tok_kwargs["token"] = self.hf_token + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_id, **tok_kwargs) + except TypeError: + tok_kwargs.pop("token", None) + if self.hf_token: + tok_kwargs["use_auth_token"] = self.hf_token + self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_id, **tok_kwargs) - # Compiled regex for robust JSON extraction from LLM outputs - self._json_object_regex = re.compile(r"\{[^{}]*\}", re.S) - self._json_array_regex = re.compile(r"\[[^\]]*\]", re.S) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token - # Term→Types (Option A) specific prompt - self._system_prompt_term_to_types = ( - "You are an expert in ontology and semantic type classification.\n" - "Given a term, predict its semantic types from the domain-specific ontology.\n" - 'Answer strictly as JSON:\n{"types": ["type1", "type2", "..."]}' - ) - def fit( - self, - *, - train_docs_jsonl: str, - terms2doc_json: str, - sample_size: int = 24, - seed: int = 42, - ) -> None: - """ - Build internal few-shot exemplars from a labeled training split. + model_kwargs: Dict[str, Any] = {"local_files_only": self.local_files_only} + if self.hf_token: + model_kwargs["token"] = self.hf_token - Parameters - ---------- - train_docs_jsonl : str - Path to JSONL (or tolerant JSON/JSONL) with train documents. - terms2doc_json : str - JSON mapping item -> [doc_id,...]; "item" can be a term or type. - sample_size : int - Number of exemplar documents to keep for few-shot prompting. - seed : int - RNG seed for reproducible sampling. 
- """ - rng = random.Random(seed) - - # Load documents and map doc_id -> row - document_map = self._load_documents_jsonl(train_docs_jsonl) - if not document_map: - raise FileNotFoundError(f"No documents found in: {train_docs_jsonl}") - - # Load item -> [doc_ids] - item_to_docs_map = self._load_json(terms2doc_json) - if not isinstance(item_to_docs_map, dict): - raise ValueError( - f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]" + try: + self.model = AutoModelForCausalLM.from_pretrained( + self.llm_model_id, + dtype=dtype, + **model_kwargs, ) - - # Reverse mapping: doc_id -> [items] - doc_id_to_items_map: Dict[str, List[str]] = {} - for item_label, doc_id_list in item_to_docs_map.items(): - for doc_id in doc_id_list: - doc_id_to_items_map.setdefault(doc_id, []).append(item_label) - - # Build candidate exemplars (title, text, gold_list) - exemplar_candidates: List[Tuple[str, str, List[str]]] = [] - for doc_id, labeled_items in doc_id_to_items_map.items(): - doc_row = document_map.get(doc_id) - if not doc_row: - continue - doc_title = str(doc_row.get("title", "")) # be defensive (may be None) - doc_text = self._to_text( - doc_row.get("text", "") - ) # string-ify list if needed - if not doc_text: - continue - gold_items = self._unique_preserve( - [s for s in labeled_items if isinstance(s, str)] + except TypeError: + model_kwargs.pop("token", None) + if self.hf_token: + model_kwargs["use_auth_token"] = self.hf_token + self.model = AutoModelForCausalLM.from_pretrained( + self.llm_model_id, + torch_dtype=dtype, + **model_kwargs, ) - if gold_items: - exemplar_candidates.append((doc_title, doc_text, gold_items)) - if not exemplar_candidates: - raise RuntimeError( - "No candidate docs with items found to build few-shot exemplars." - ) + self.model = self.model.to(dev) - chosen_exemplars = rng.sample( - exemplar_candidates, k=min(sample_size, len(exemplar_candidates)) - ) - # Reuse exemplars for both docs→terms and docs→types prompting - self._fewshot_terms_docs = chosen_exemplars - self._fewshot_types_docs = chosen_exemplars + self.doc_retriever.load(self.retriever_model_id) + self.term_retriever.load(self.retriever_model_id) - def predict_terms( - self, - *, - docs_test_jsonl: str, - out_jsonl: str, - max_new_tokens: int = 128, - few_shot_k: int = 6, - ) -> int: - """ - Extract terms that explicitly appear in each document. + self._loaded = True - Writes one JSON object per line: - {"id": "", "terms": ["...", "...", ...]} - Parameters - ---------- - docs_test_jsonl : str - Path to test/dev documents in JSONL or tolerant JSON/JSONL. - out_jsonl : str - Output JSONL path where predictions are written (one line per doc). - max_new_tokens : int - Max generation length. - few_shot_k : int - Number of few-shot exemplars to prepend per prompt. - - Returns - ------- - int - Number of lines written (i.e., number of processed documents). 
+ def _format_doc(self, title: str, text: str, tfidf: Optional[List[str]] = None) -> str: """ - if self.model is None or self.model.model is None: - raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") - - test_documents = self._load_documents_jsonl(docs_test_jsonl) - prompts: List[str] = [] - document_order: List[str] = [] - - for document_id, document_row in test_documents.items(): - title = str(document_row.get("title", "")) - text = self._to_text(document_row.get("text", "")) - - fewshot_block = self._format_fewshot_block( - self._system_prompt_terms, - self._fewshot_terms_docs, - key="terms", - k=few_shot_k, - ) - user_block = self._format_user_block(title, text) - - prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") - document_order.append(document_id) - - generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) - parsed_term_lists = [ - self._parse_json_list(generated, key="terms") for generated in generations - ] - - os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) - lines_written = 0 - with open(out_jsonl, "w", encoding="utf-8") as fp_out: - for document_id, term_list in zip(document_order, parsed_term_lists): - payload = {"id": document_id, "terms": self._unique_preserve(term_list)} - fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") - lines_written += 1 - return lines_written - - def predict_types( - self, - *, - docs_test_jsonl: str, - out_jsonl: str, - max_new_tokens: int = 128, - few_shot_k: int = 6, - ) -> int: + Format doc as the retriever query and as the user prompt content. """ - Predict ontology types that characterize each document’s terminology. - - Writes one JSON object per line: - {"id": "", "types": ["...", "...", ...]} + s = f"Title: {title}\n\nText:\n{text}" + if tfidf: + s += f"\n\nTF-IDF based suggestions: {tfidf}" + return s - Parameters - ---------- - docs_test_jsonl : str - Path to test/dev documents in JSONL or tolerant JSON/JSONL. - out_jsonl : str - Output JSONL path where predictions are written (one line per doc). - max_new_tokens : int - Max generation length. - few_shot_k : int - Number of few-shot exemplars to prepend per prompt. - - Returns - ------- - int - Number of lines written (i.e., number of processed documents). 
+ def _apply_chat_template(self, conversation: List[Dict[str, str]]) -> str: """ - if self.model is None or self.model.model is None: - raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") - - test_documents = self._load_documents_jsonl(docs_test_jsonl) - prompts: List[str] = [] - document_order: List[str] = [] - - for document_id, document_row in test_documents.items(): - title = str(document_row.get("title", "")) - text = self._to_text(document_row.get("text", "")) - - fewshot_block = self._format_fewshot_block( - self._system_prompt_types, - self._fewshot_types_docs, - key="types", - k=few_shot_k, - ) - user_block = self._format_user_block(title, text) - - prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") - document_order.append(document_id) - - generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) - parsed_type_lists = [ - self._parse_json_list(generated, key="types") for generated in generations - ] - - os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) - lines_written = 0 - with open(out_jsonl, "w", encoding="utf-8") as fp_out: - for document_id, type_list in zip(document_order, parsed_type_lists): - payload = {"id": document_id, "types": self._unique_preserve(type_list)} - fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") - lines_written += 1 - return lines_written - - def evaluate_extraction_f1( - self, - gold_item2docs_json: str, - preds_jsonl: str, - *, - key: str = "term", - ) -> float: + Convert conversation into a single prompt string using the tokenizer's chat template if available. """ - Compute micro-F1 over (doc_id, item) pairs. - - Parameters - ---------- - gold_item2docs_json : str - JSON mapping item -> [doc_ids]. - preds_jsonl : str - JSONL lines like {"id": "...", "terms":[...]} or {"id":"...","types":[...]}. - key : str - "term" or "type" depending on what you are evaluating. - - Returns - ------- - float - Micro-averaged F1 score. 
- """ - item_to_doc_ids: Dict[str, List[str]] = self._load_json(gold_item2docs_json) - - # Build gold: doc_id -> set(items) - gold_doc_to_items: Dict[str, set] = {} - for item_label, doc_id_list in item_to_doc_ids.items(): - for document_id in doc_id_list: - gold_doc_to_items.setdefault(document_id, set()).add( - self._norm(item_label) - ) - - # Build predictions: doc_id -> set(items) - pred_doc_to_items: Dict[str, set] = {} - with open(preds_jsonl, "r", encoding="utf-8") as fp_in: - for line in fp_in: - row = json.loads(line.strip()) - document_id = str(row.get("id", "")) - items_list = row.get("terms" if key == "term" else "types", []) - pred_doc_to_items[document_id] = { - self._norm(x) for x in items_list if isinstance(x, str) - } + assert self.tokenizer is not None + if hasattr(self.tokenizer, "apply_chat_template"): + return self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) - # Micro counts - true_positive = false_positive = false_negative = 0 - all_document_ids = set(gold_doc_to_items.keys()) | set(pred_doc_to_items.keys()) - for document_id in all_document_ids: - gold_set = gold_doc_to_items.get(document_id, set()) - pred_set = pred_doc_to_items.get(document_id, set()) - true_positive += len(gold_set & pred_set) - false_positive += len(pred_set - gold_set) - false_negative += len(gold_set - pred_set) - - precision = ( - true_positive / (true_positive + false_positive) - if (true_positive + false_positive) - else 0.0 - ) - recall = ( - true_positive / (true_positive + false_negative) - if (true_positive + false_negative) - else 0.0 - ) - f1 = ( - 2 * precision * recall / (precision + recall) - if (precision + recall) - else 0.0 - ) - return f1 + parts = [] + for t in conversation: + parts.append(f"{t['role'].upper()}:\n{t['content']}\n") + parts.append("ASSISTANT:\n") + return "\n".join(parts) + + def _extract_first_json_obj(self, text: str) -> Optional[dict]: + """ + Extract the first valid JSON object from generated text by scanning balanced {...}. + """ + starts = [i for i, ch in enumerate(text) if ch == "{"] + + for s in starts: + depth = 0 + for e in range(s, len(text)): + if text[e] == "{": + depth += 1 + elif text[e] == "}": + depth -= 1 + if depth == 0: + candidate = text[s : e + 1].strip().replace("\n", " ") + try: + return json.loads(candidate) + except Exception: + try: + candidate2 = re.sub(r"'", '"', candidate) + return json.loads(candidate2) + except Exception: + pass + break + return None + + def _dedup_clean(self, items: List[str]) -> List[str]: + """ + Normalize and deduplicate strings (case-insensitive). + """ + out: List[str] = [] + seen = set() + for x in items or []: + if not isinstance(x, str): + continue + x2 = re.sub(r"\s+", " ", x.strip()) + if not x2: + continue + k = x2.lower() + if k in seen: + continue + seen.add(k) + out.append(x2) + return out - def predict_types_from_terms( - self, - *, - doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl - doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list - few_shot_jsonl: Optional[ - str - ] = None, # JSONL lines: {"term":"...", "types":[...]} - rag_terms_json: Optional[ - str - ] = None, # JSON list; items may contain "term" and "RAG":[...] 
- random_few_shot: Optional[int] = 3, - model_id: str = "Qwen/Qwen2.5-1.5B-Instruct", - use_structured_output: bool = True, - seed: int = 42, - out_terms2types: str = "terms2types_pred.json", - out_types2docs: str = "types2docs_pred.json", - ) -> Dict[str, Any]: + def _doc_id(self, d: Dict[str, Any]) -> str: + """ + Extract doc_id from common keys: doc_id, id, docid. """ - Predict types for each unique term extracted per document and derive a types→docs map. + return str(d.get("doc_id") or d.get("id") or d.get("docid") or "") - Parameters - ---------- - doc_terms_jsonl : Optional[str] - Path to JSONL with lines like {"id": "...", "terms": [...]} or a JSON with {"results":[...]}. - doc_terms_list : Optional[List[Dict]] - In-memory results like [{"id":"...","extracted_terms":[...]}] or {"id":"...","terms":[...]}. - few_shot_jsonl : Optional[str] - Global few-shot exemplars: one JSON object per line with {"term": "...", "types":[...]}. - rag_terms_json : Optional[str] - Optional per-term RAG exemplars: a JSON list of {"term": "...", "RAG":[{"term": "...", "types":[...]}]}. - random_few_shot : Optional[int] - If provided, randomly select up to this many few-shot examples for each prediction. - model_id : str - HF model id used specifically for term→types predictions. - use_structured_output : bool - If True and outlines is available, enforce structured {"types":[...]} output. - seed : int - Random seed for reproducibility. - out_terms2types : str - Output JSON path for list of {"term": "...", "predicted_types":[...]}. - out_types2docs : str - Output JSON path for dict {"TYPE":[doc_ids,...], ...}. - - Returns - ------- - Dict[str, Any] - Summary with predictions and counts. + def _extract_documents(self, data: Any) -> List[Dict[str, Any]]: """ - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) + Accept list-of-docs OR dict with 'documents'/'docs'. + """ + if isinstance(data, list): + return data + if isinstance(data, dict): + if isinstance(data.get("documents"), list): + return data["documents"] + if isinstance(data.get("docs"), list): + return data["docs"] + raise ValueError("Expected dict with 'documents' (or 'docs'), or a list of docs.") - # Load normalized document→terms results - doc_term_extractions = self._load_doc_term_extractions( - results_json_path=doc_terms_jsonl, - in_memory_results=doc_terms_list, - ) - if not doc_term_extractions: - raise ValueError( - "No document→terms results provided (doc_terms_jsonl/doc_terms_list)." - ) + def _normalize_terms2docs(self, raw_terms2docs: Any, docs: List[Dict[str, Any]]) -> Dict[str, List[str]]: + """ + Normalize mapping to: term -> [doc_id, ...]. 
- # Prepare unique term list and term→doc occurrences - unique_terms = self._collect_unique_terms_from_extractions(doc_term_extractions) - term_to_doc_ids_map = self._build_term_to_doc_ids(doc_term_extractions) - - # Load optional global few-shot examples - global_few_shot_examples: List[Dict] = [] - if few_shot_jsonl and os.path.exists(few_shot_jsonl): - with open(few_shot_jsonl, "r", encoding="utf-8") as few_shot_file: - for raw_line in few_shot_file: - raw_line = raw_line.strip() - if not raw_line: - continue - try: - json_obj = json.loads(raw_line) - except Exception: - continue - if ( - isinstance(json_obj, dict) - and "term" in json_obj - and "types" in json_obj - ): - global_few_shot_examples.append(json_obj) - - # Optional per-term RAG examples: {normalized_term -> [examples]} - rag_examples_lookup: Dict[str, List[Dict]] = {} - if rag_terms_json and os.path.exists(rag_terms_json): - try: - rag_payload = self._load_json(rag_terms_json) - if isinstance(rag_payload, list): - for rag_item in rag_payload: - if isinstance(rag_item, dict): - normalized_term = self._normalize_term( - rag_item.get("term", "") - ) - rag_examples_lookup[normalized_term] = rag_item.get( - "RAG", [] - ) - except Exception: - pass + If caller accidentally provides inverted mapping: doc_id -> [term, ...], + we detect it (keys mostly match doc_ids) and invert it. + """ + if not isinstance(raw_terms2docs, dict) or not raw_terms2docs: + return {} - # Load a small chat LLM dedicated to Term→Types - typing_model, typing_tokenizer = self._load_llm_for_types(model_id) + doc_ids = {self._doc_id(d) for d in docs} + doc_ids.discard("") - # Predict types per term - term_to_predicted_types_list: List[Dict] = [] - for term_text in unique_terms: - normalized_term = self._normalize_term(term_text) + keys = list(raw_terms2docs.keys()) + sample = keys[:200] + hits = sum(1 for k in sample if str(k) in doc_ids) - # Prefer per-term RAG for this term, else use global few-shot - few_shot_examples_for_term = ( - rag_examples_lookup.get(normalized_term, None) - or global_few_shot_examples - ) + if sample and hits >= int(0.6 * len(sample)): + term2docs: Dict[str, List[str]] = defaultdict(list) + for did, terms in raw_terms2docs.items(): + did = str(did) + if did not in doc_ids: + continue + for t in (terms or []): + if isinstance(t, str) and t.strip(): + term2docs[t.strip()].append(did) + return {t: self._dedup_clean(ds) for t, ds in term2docs.items()} + + norm: Dict[str, List[str]] = {} + for term, doc_list in raw_terms2docs.items(): + if not isinstance(term, str) or not term.strip(): + continue + docs_norm = [str(d) for d in (doc_list or []) if str(d)] + if docs_norm: + norm[term.strip()] = self._dedup_clean(docs_norm) + return norm - # Build conversation and prompt - conversation_messages = self._build_conv_for_type_infer( - term=term_text, - few_shot_examples=few_shot_examples_for_term, - random_k=random_few_shot, - ) - typing_prompt_string = self._apply_chat_template_safe_types( - typing_tokenizer, conversation_messages - ) + def _generate(self, prompt: str) -> str: + """ + Deterministic single-prompt generation (no sampling). + Returns decoded completion only. 
+ """ + assert self.model is not None and self.tokenizer is not None - predicted_types: List[str] = [] - raw_generation_text: str = "" - - # Structured JSON path (if requested and available) - if ( - use_structured_output - and OUTLINES_AVAILABLE - and _PredictedTypesSchema is not None - ): - try: - outlines_model = OutlinesTFModel(typing_model, typing_tokenizer) # type: ignore - generator = outlines_generate_json( - outlines_model, _PredictedTypesSchema - ) # type: ignore - structured = generator(typing_prompt_string, max_tokens=512) - predicted_types = [ - label for label in structured.types if isinstance(label, str) - ] - raw_generation_text = json.dumps( - {"types": predicted_types}, ensure_ascii=False - ) - except Exception: - # Fall back to greedy decoding - use_structured_output = False - - # Greedy decode fallback - if ( - not use_structured_output - or not OUTLINES_AVAILABLE - or _PredictedTypesSchema is None - ): - tokenized_prompt = typing_tokenizer( - typing_prompt_string, - return_tensors="pt", - truncation=True, - max_length=2048, - ) - if torch.cuda.is_available(): - tokenized_prompt = { - name: tensor.cuda() for name, tensor in tokenized_prompt.items() - } - with torch.no_grad(): - output_ids = typing_model.generate( - **tokenized_prompt, - max_new_tokens=256, - do_sample=False, - num_beams=1, - pad_token_id=typing_tokenizer.eos_token_id, - ) - new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1] :] - raw_generation_text = typing_tokenizer.decode( - new_token_span, skip_special_tokens=True - ) - predicted_types = self._extract_types_from_text(raw_generation_text) - - term_to_predicted_types_list.append( - { - "term": term_text, - "predicted_types": sorted(set(predicted_types)), - } - ) + enc = self.tokenizer( + prompt, + return_tensors="pt", + truncation=True, + max_length=self.max_input_length, + ) + enc = {k: v.to(self.model.device) for k, v in enc.items()} - # 7) Build types→docs from (term→types) and (term→docs) - types_to_doc_id_set: Dict[str, set] = {} - for term_prediction in term_to_predicted_types_list: - normalized_term = self._normalize_term(term_prediction["term"]) - doc_ids_for_term = term_to_doc_ids_map.get(normalized_term, []) - for type_label in term_prediction.get("predicted_types", []): - types_to_doc_id_set.setdefault(type_label, set()).update( - doc_ids_for_term - ) - - types_to_doc_ids: Dict[str, List[str]] = { - type_label: sorted(doc_id_set) - for type_label, doc_id_set in types_to_doc_id_set.items() - } - - # 8) Save outputs - os.makedirs(os.path.dirname(out_terms2types) or ".", exist_ok=True) - with open(out_terms2types, "w", encoding="utf-8") as fp_terms2types: - json.dump( - term_to_predicted_types_list, - fp_terms2types, - ensure_ascii=False, - indent=2, + with torch.no_grad(): + out = self.model.generate( + **enc, + max_new_tokens=self.max_new_tokens, + do_sample=False, + num_beams=1, + pad_token_id=self.tokenizer.eos_token_id, ) - os.makedirs(os.path.dirname(out_types2docs) or ".", exist_ok=True) - with open(out_types2docs, "w", encoding="utf-8") as fp_types2docs: - json.dump(types_to_doc_ids, fp_types2docs, ensure_ascii=False, indent=2) + gen_tokens = out[0][enc["input_ids"].shape[1] :] + return self.tokenizer.decode(gen_tokens, skip_special_tokens=True).strip() - # Cleanup VRAM if any - del typing_model, typing_tokenizer - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - return { - "terms2types_pred": term_to_predicted_types_list, - "types2docs_pred": types_to_doc_ids, - "unique_terms": len(unique_terms), - 
"types_count": len(types_to_doc_ids), - } - - def _load_json(self, path: str) -> Dict[str, Any]: - """Load a JSON file from disk and return its parsed object.""" - with open(path, "r", encoding="utf-8") as file_obj: - return json.load(file_obj) - - def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: + def _retrieve_doc_fewshot(self, doc: Dict[str, Any]) -> List[Dict[str, Any]]: """ - Iterate over *all* JSON objects found inside a string. - - Supports cases where multiple JSON objects are concatenated back-to-back - in a single line. It skips stray commas/whitespace between objects. - - Parameters - ---------- - blob : str - A string that may contain one or more JSON objects. - - Yields - ------ - Dict[str, Any] - Each parsed JSON object. + Retrieve top-k doc examples (JSON dicts) for few-shot doc->terms prompting. """ - json_decoder = json.JSONDecoder() - cursor_index, text_length = 0, len(blob) - while cursor_index < text_length: - # Skip whitespace/commas between objects - while cursor_index < text_length and blob[cursor_index] in " \t\r\n,": - cursor_index += 1 - if cursor_index >= text_length: - break + q = self._format_doc(doc.get("title", ""), doc.get("text", "")) + hits = self.doc_retriever.retrieve([q], top_k=self.top_k)[0] + + out: List[Dict[str, Any]] = [] + for h in hits: try: - json_obj, end_index = json_decoder.raw_decode(blob, idx=cursor_index) - except JSONDecodeError: - # Can't decode from this position; stop scanning this chunk - break - yield json_obj - cursor_index = end_index - - def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: + out.append(json.loads(h)) + except Exception: + continue + return out + + def _retrieve_term_fewshot(self, term: str) -> List[Dict[str, Any]]: """ - Robust reader that supports: - • True JSONL (one object per line) - • Lines with multiple concatenated JSON objects - • Whole file as a JSON array - - Returns - ------- - Dict[str, Dict[str, Any]] - Mapping doc_id -> full document row. + Retrieve top-k term examples (JSON dicts) for few-shot term->types prompting. """ - documents_by_id: Dict[str, Dict[str, Any]] = {} + hits = self.term_retriever.retrieve([term], top_k=self.top_k)[0] - with open(path, "r", encoding="utf-8") as file_obj: - content = file_obj.read().strip() - - # Case A: whole-file JSON array - if content.startswith("["): + out: List[Dict[str, Any]] = [] + for h in hits: try: - json_array = json.loads(content) - if isinstance(json_array, list): - for record in json_array: - if not isinstance(record, dict): - continue - document_id = str( - record.get("id") - or record.get("doc_id") - or (record.get("doc") or {}).get("id") - or "" - ) - if document_id: - documents_by_id[document_id] = record - return documents_by_id + out.append(json.loads(h)) except Exception: - # Fall back to line-wise handling if array parsing fails - pass - - # Case B: treat as JSONL-ish; parse *all* objects per line - for raw_line in content.splitlines(): - line = raw_line.strip() - if not line: continue - for record in self._iter_json_objects(line): - if not isinstance(record, dict): - continue - document_id = str( - record.get("id") - or record.get("doc_id") - or (record.get("doc") or {}).get("id") - or "" - ) - if document_id: - documents_by_id[document_id] = record - - return documents_by_id - - def _to_text(self, text_field: Any) -> str: - """ - Convert a 'text' field into a single string (handles list-of-strings). - - Parameters - ---------- - text_field : Any - The value found under "text" in the dataset row. 
+ return out - Returns - ------- - str - A single-string representation of the text. + def _doc_to_terms(self, doc: Dict[str, Any]) -> List[str]: """ - if isinstance(text_field, str): - return text_field - if isinstance(text_field, list): - return " ".join(str(part) for part in text_field) - return str(text_field) if text_field is not None else "" - - def _unique_preserve(self, values: List[str]) -> List[str]: + Predict terms for a document using few-shot prompting + doc retrieval. """ - Deduplicate values while preserving the original order. + fewshot = self._retrieve_doc_fewshot(doc) - Parameters - ---------- - values : List[str] - Sequence possibly containing duplicates. + convo: List[Dict[str, str]] = [{"role": "system", "content": self.DOC2TERMS_SYSTEM_PROMPT}] - Returns - ------- - List[str] - Sequence without duplicates, order preserved. - """ - seen_values: set = set() - ordered_values: List[str] = [] - for candidate in values: - if candidate not in seen_values: - seen_values.add(candidate) - ordered_values.append(candidate) - return ordered_values - - def _norm(self, text: str) -> str: - """ - Lowercased, single-spaced normalization (for comparisons). + for ex in fewshot: + ex_tfidf = ex.get("TF-IDF") or ex.get("tfidf_terms") or [] + convo += [ + { + "role": "user", + "content": self._format_doc( + ex.get("title", ""), + ex.get("text", ""), + ex_tfidf if self.use_tfidf else None, + ), + }, + { + "role": "assistant", + "content": json.dumps({"terms": ex.get("OL", [])}, ensure_ascii=False), + }, + ] - Parameters - ---------- - text : str - Input string. + tfidf = doc.get("TF-IDF") or doc.get("tfidf_terms") or [] + convo.append( + { + "role": "user", + "content": self._format_doc( + doc.get("title", ""), + doc.get("text", ""), + tfidf if self.use_tfidf else None, + ), + } + ) - Returns - ------- - str - Normalized string. - """ - return " ".join(text.lower().split()) + prompt = self._apply_chat_template(convo) + gen = self._generate(prompt) + parsed = self._extract_first_json_obj(gen) or {} + return self._dedup_clean(parsed.get("terms", [])) - def _normalize_term(self, term: str) -> str: + def _term_to_types(self, term: str) -> List[str]: + """ + Predict types for a term using few-shot prompting + term retrieval. """ - Normalization tailored for term keys / lookups. + fewshot = self._retrieve_term_fewshot(term) - Parameters - ---------- - term : str - Term to normalize. + system = self.TERM2TYPES_SYSTEM_PROMPT + if self.restrict_to_known_types and self._allowed_types: + allowed_block = "\n".join(f"- {t}" for t in self._allowed_types) + system = ( + system + + "\n\nIMPORTANT CONSTRAINT:\n" + + "Choose ONLY from the following valid ontology types (do not invent new labels):\n" + + allowed_block + ) - Returns - ------- - str - Lowercased, trimmed and single-spaced term. - """ - return " ".join(str(term).strip().split()).lower() + convo: List[Dict[str, str]] = [{"role": "system", "content": system}] - def _format_fewshot_block( - self, - system_prompt: str, - fewshot_examples: List[Tuple[str, str, List[str]]], - *, - key: str, - k: int = 6, - ) -> str: - """ - Render a few-shot block like: + for ex in fewshot: + convo += [ + {"role": "user", "content": f"Term: {ex.get('term','')}"}, + { + "role": "assistant", + "content": json.dumps({"types": ex.get("types", [])}, ensure_ascii=False), + }, + ] - + convo.append({"role": "user", "content": f"Term: {term}"}) - ### Example - User: - Title: ... 
- - Assistant: - {"terms": [...]} or {"types": [...]} + prompt = self._apply_chat_template(convo) + gen = self._generate(prompt) + parsed = self._extract_first_json_obj(gen) or {} + return self._dedup_clean(parsed.get("types", [])) - Parameters - ---------- - system_prompt : str - Instructional system text to prepend. - fewshot_examples : List[Tuple[str, str, List[str]]] - Examples as (title, text, labels_list). - key : str - Either "terms" or "types" depending on the task. - k : int - Number of examples to include. - - Returns - ------- - str - Formatted few-shot block text. + def _text2onto(self, data: Any, test: bool = False) -> Optional[Any]: """ - lines: List[str] = [system_prompt.strip(), ""] - for example_title, example_text, gold_list in fewshot_examples[:k]: - lines.append("### Example") - lines.append(f"User:\nTitle: {example_title}\n{example_text}") - lines.append( - f'Assistant:\n{{"{key}": ' - + json.dumps(gold_list, ensure_ascii=False) - + "}" - ) - return "\n".join(lines) + Train or predict for task="text2onto". - def _format_user_block(self, title: str, text: str) -> str: + Returns: + - training: None + - prediction: {"terms": [...], "types": [...]} """ - Format the 'Task' block for the current document. + if not self._loaded: + self.load(model_id=self.llm_model_id, device=self.device) - Parameters - ---------- - title : str - Document title. - text : str - Document text (single string). - - Returns - ------- - str - Formatted user block. - """ - return f"### Task\nUser:\nTitle: {title}\n{text}" + if not isinstance(data, dict): + raise ValueError("text2onto expects a dict with documents + mappings.") - def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: - """ - Extract a list from model output, trying: - 1) JSON object with the key ({"terms":[...]} or {"types":[...]}). - 2) Any top-level JSON array. - 3) Fallback: comma-split. + docs = self._extract_documents(data) - Parameters - ---------- - generated_text : str - Raw generation text to parse. - key : str - "terms" or "types". - - Returns - ------- - List[str] - Parsed strings (best-effort). - """ - # 1) Try a JSON object and read key - try: - object_match = self._json_object_regex.search(generated_text) - if object_match: - json_obj = json.loads(object_match.group(0)) - json_array = json_obj.get(key) - if isinstance(json_array, list): - return [value for value in json_array if isinstance(value, str)] - except Exception: - pass - - # 2) Any JSON array - try: - array_match = self._json_array_regex.search(generated_text) - if array_match: - json_array = json.loads(array_match.group(0)) - if isinstance(json_array, list): - return [value for value in json_array if isinstance(value, str)] - except Exception: - pass - - # 3) Fallback: comma-split (last resort) - if "," in generated_text: - return [ - part.strip().strip('"').strip("'") - for part in generated_text.split(",") - if part.strip() - ] - return [] + raw_terms2docs = data.get("terms2docs") or data.get("term2docs") or {} + terms2types = data.get("terms2types") or data.get("term2types") or {} - def _apply_chat_template_safe_types( - self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]] - ) -> str: - """ - Safely build a prompt string for chat models. Uses the model's chat template - when available; otherwise falls back to a simple concatenation. 
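# For reference, a minimal sketch of the chat structure that _doc_to_terms (and,
# analogously, _term_to_types) assembles before _apply_chat_template: a system
# prompt, one (user -> assistant JSON) pair per retrieved training example, then
# the target as the final user turn. The contents below are invented.
import json

conversation = [
    {"role": "system", "content": "You are an expert in ontology term extraction. ..."},
    # one pair per retrieved training document (up to top_k of them)
    {"role": "user", "content": "Title: Example doc\n\nText:\nConvolutional networks for vision."},
    {"role": "assistant", "content": json.dumps({"terms": ["convolutional network"]})},
    # the target document; the model is expected to reply with {"terms": [...]}
    {"role": "user", "content": "Title: Target doc\n\nText:\nTransformers rely on self-attention."},
]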
- """ - try: - return tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - except Exception: - system_text = next( - (m["content"] for m in messages if m.get("role") == "system"), "" - ) - last_user_text = next( - (m["content"] for m in reversed(messages) if m.get("role") == "user"), - "", - ) - return f"{system_text}\n\nUser:\n{last_user_text}\n\nAssistant:" + terms2docs = self._normalize_terms2docs(raw_terms2docs, docs) - def _build_conv_for_type_infer( - self, - term: str, - few_shot_examples: Optional[List[Dict]] = None, - random_k: Optional[int] = None, - ) -> List[Dict[str, str]]: - """ - Create a chat-style conversation for a single term→types query, - optionally prepending few-shot examples. - """ - messages: List[Dict[str, str]] = [ - {"role": "system", "content": self._system_prompt_term_to_types} - ] - examples = list(few_shot_examples or []) - if random_k and len(examples) > random_k: - import random as _rnd - - examples = _rnd.sample(examples, random_k) - for exemplar in examples: - example_term = exemplar.get("term", "") - example_types = exemplar.get("types", []) - messages.append({"role": "user", "content": f"Term: {example_term}"}) - messages.append( + if not test: + self._allowed_types = sorted( { - "role": "assistant", - "content": json.dumps({"types": example_types}, ensure_ascii=False), + ty.strip() + for tys in (terms2types or {}).values() + for ty in (tys or []) + if isinstance(ty, str) and ty.strip() } ) - messages.append({"role": "user", "content": f"Term: {term}"}) - return messages - def _extract_types_from_text(self, generated_text: str) -> List[str]: - """ - Parse {"types":[...]} from a free-form generation. - """ - try: - object_match = re.search(r'\{[^}]*"types"[^}]*\}', generated_text) - if object_match: - json_obj = json.loads(object_match.group(0)) - types_array = json_obj.get("types", []) - return [ - type_label - for type_label in types_array - if isinstance(type_label, str) - ] - except Exception: - pass - return [] - - def _load_llm_for_types( - self, model_id: str - ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: - """ - Load a *separate* small chat model for Term→Types (keeps LocalAutoLLM untouched). 
- """ - tokenizer = AutoTokenizer.from_pretrained(model_id) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForCausalLM.from_pretrained( - model_id, - torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, - device_map="auto" if torch.cuda.is_available() else None, - ) - return model, tokenizer + # build doc->terms from term->docs + doc2terms: Dict[str, List[str]] = defaultdict(list) + for term, doc_ids in (terms2docs or {}).items(): + for did in (doc_ids or []): + doc2terms[str(did)].append(term) + + # doc few-shot corpus: doc + gold OL terms + doc_examples: List[Dict[str, Any]] = [] + for d in docs: + did = self._doc_id(d) + ex = dict(d) + ex["doc_id"] = did + ex["OL"] = self._dedup_clean(doc2terms.get(did, [])) + doc_examples.append(ex) + + # term few-shot corpus: term + gold types + term_examples = [ + {"term": t, "types": self._dedup_clean(tys)} + for t, tys in (terms2types or {}).items() + ] - def _load_doc_term_extractions( - self, - *, - results_json_path: Optional[str] = None, - in_memory_results: Optional[List[Dict]] = None, - ) -> List[Dict]: - """ - Normalize document→terms outputs to a list of: - {"id": "", "extracted_terms": ["...", ...]} - - Accepts either: - - in_memory_results (list of dicts) - - results_json_path pointing to: - • a JSONL file with lines: {"id": "...", "terms": [...]} - • OR a JSON file with {"results":[{"id":..., "extracted_terms": [...]}, ...]} - • OR a JSON list of dicts - """ - normalized_records: List[Dict] = [] + # store as JSON strings so retrievers return parseable strings + self._doc_examples_json = [json.dumps(ex, ensure_ascii=False) for ex in doc_examples] + self._term_examples_json = [json.dumps(ex, ensure_ascii=False) for ex in term_examples] - def _coerce_to_record(source_row: Dict) -> Optional[Dict]: - document_id = str(source_row.get("id", "")) or str( - source_row.get("doc_id", "") - ) - if not document_id: - return None - terms = source_row.get("extracted_terms") - if terms is None: - terms = source_row.get("terms") - if ( - terms is None - and "payload" in source_row - and isinstance(source_row["payload"], dict) - ): - terms = source_row["payload"].get("terms") - if not isinstance(terms, list): - terms = [] - return { - "id": document_id, - "extracted_terms": [t for t in terms if isinstance(t, str)], - } + # index retrievers + self.doc_retriever.index(self._doc_examples_json) + self.term_retriever.index(self._term_examples_json) + return None - if in_memory_results is not None: - for source_row in in_memory_results: - coerced_record = _coerce_to_record(source_row) - if coerced_record: - normalized_records.append(coerced_record) - return normalized_records - - if not results_json_path: - raise ValueError("Provide results_json_path or in_memory_results") - - # Detect JSON vs JSONL by extension (best-effort) - if results_json_path.endswith(".jsonl"): - with open(results_json_path, "r", encoding="utf-8") as file_in: - for raw_line in file_in: - raw_line = raw_line.strip() - if not raw_line: - continue - # Multiple concatenated objects per line? Iterate them all. 
- for json_obj in self._iter_json_objects(raw_line): - if isinstance(json_obj, dict): - coerced_record = _coerce_to_record(json_obj) - if coerced_record: - normalized_records.append(coerced_record) - else: - payload_obj = self._load_json(results_json_path) - if isinstance(payload_obj, dict) and "results" in payload_obj: - for source_row in payload_obj["results"]: - coerced_record = _coerce_to_record(source_row) - if coerced_record: - normalized_records.append(coerced_record) - elif isinstance(payload_obj, list): - for source_row in payload_obj: - if isinstance(source_row, dict): - coerced_record = _coerce_to_record(source_row) - if coerced_record: - normalized_records.append(coerced_record) - - return normalized_records - - def _collect_unique_terms_from_extractions( - self, doc_term_extractions: List[Dict] - ) -> List[str]: - """ - Collect unique terms (original casing) from normalized document→terms results. - """ - seen_normalized_terms: set = set() - ordered_unique_terms: List[str] = [] - for record in doc_term_extractions: - for term_text in record.get("extracted_terms", []): - normalized = self._normalize_term(term_text) - if normalized and normalized not in seen_normalized_terms: - seen_normalized_terms.add(normalized) - ordered_unique_terms.append(term_text.strip()) - return ordered_unique_terms - - def _build_term_to_doc_ids( - self, doc_term_extractions: List[Dict] - ) -> Dict[str, List[str]]: - """ - Build lookup: normalized_term -> sorted unique list of doc_ids. - """ - term_to_doc_set: Dict[str, set] = {} - for record in doc_term_extractions: - document_id = str(record.get("id", "")) - for term_text in record.get("extracted_terms", []): - normalized = self._normalize_term(term_text) - if not normalized or not document_id: - continue - term_to_doc_set.setdefault(normalized, set()).add(document_id) - return { - normalized_term: sorted(doc_ids) - for normalized_term, doc_ids in term_to_doc_set.items() - } + doc2terms_pred: Dict[str, List[str]] = {} + for d in docs: + did = self._doc_id(d) + doc2terms_pred[did] = self._doc_to_terms(d) + + unique_terms = sorted({t for ts in doc2terms_pred.values() for t in ts}) + term2types_pred: Dict[str, List[str]] = {t: self._term_to_types(t) for t in unique_terms} + + doc2types_pred: Dict[str, List[str]] = {} + for did, terms in doc2terms_pred.items(): + tys: List[str] = [] + for t in terms: + tys.extend(term2types_pred.get(t, [])) + doc2types_pred[did] = self._dedup_clean(tys) + + pred_terms = [{"doc_id": did, "term": t} for did, ts in doc2terms_pred.items() for t in ts] + pred_types = [{"doc_id": did, "type": ty} for did, tys in doc2types_pred.items() for ty in tys] + + return {"terms": pred_terms, "types": pred_types} diff --git a/ontolearner/learner/text2onto/sbunlp.py b/ontolearner/learner/text2onto/sbunlp.py index 49067e2..a7f598e 100644 --- a/ontolearner/learner/text2onto/sbunlp.py +++ b/ontolearner/learner/text2onto/sbunlp.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -#      https://opensource.org/licenses/MIT +# https://opensource.org/licenses/MIT # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,587 +12,592 @@ # See the License for the specific language governing permissions and # limitations under the License. 
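# End-to-end sketch for AlexbekRAGFewShotLearner._text2onto as wired above:
# fit() indexes the training documents/terms, predict() returns the paired
# {"terms": [...], "types": [...]} payload that text2onto_metrics consumes.
# The tiny bundles are invented; _text2onto loads the LLM lazily if needed.
from ontolearner.learner.text2onto import AlexbekRAGFewShotLearner
from ontolearner.evaluation.metrics import text2onto_metrics

train_data = {
    "documents": [{"id": "d1", "title": "CNNs", "text": "Convolutional networks ..."}],
    "terms2docs": {"convolutional network": ["d1"]},
    "terms2types": {"convolutional network": ["Model"]},
}
test_data = {
    "documents": [{"id": "d2", "title": "RNNs", "text": "Recurrent networks ..."}],
    "terms2docs": {"recurrent network": ["d2"]},
    "terms2types": {"recurrent network": ["Model"]},
}

learner = AlexbekRAGFewShotLearner(llm_model_id="Qwen/Qwen2.5-1.5B-Instruct", device="cpu")
learner.fit(train_data, task="text2onto")                 # builds both retrieval indices
predictions = learner.predict(test_data, task="text2onto")
gold = learner.tasks_ground_truth_former(test_data, task="text2onto")
print(text2onto_metrics(gold, predictions))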
-import json -import random -import re import ast import gc -from typing import Any, Dict, List, Optional, Set, Tuple +import random +import re from collections import defaultdict +from typing import Any, DefaultDict, Dict, List, Optional, Set import torch -from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -from ...base import AutoLearner, AutoLLM +from ...base import AutoLearner - -# ----------------------------------------------------------------------------- -# Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface -# ----------------------------------------------------------------------------- -class LocalAutoLLM(AutoLLM): +class SBUNLPFewShotLearner(AutoLearner): """ - Handles loading and generation for a Hugging Face Causal Language Model (Qwen/TinyLlama). - Uses 4-bit quantization for efficiency and greedy decoding by default. + Public API expected by the pipeline: + - `load(model_id=...)` + - `fit(train_data, task=..., ontologizer=...)` + - `predict(test_data, task=..., ontologizer=...)` + + Expected input bundle format (train/test): + - "documents": list of dicts, each with keys: {"id", "title", "text"} + - "terms2docs": dict mapping term -> list of doc_ids + - "terms2types": optional dict mapping term -> list of types + + Prediction output payload (pipeline wraps this): + - {"terms": [{"doc_id": str, "term": str}, ...], + "types": [{"doc_id": str, "type": str}, ...]} """ def __init__( - self, label_mapper: Any = None, device: str = "cpu", token: str = "" - ) -> None: - super().__init__(label_mapper=label_mapper, device=device, token=token) - self.model = None - self.tokenizer = None - - def load( self, - model_id: str, + llm_model_id: Optional[str] = None, + device: str = "cpu", load_in_4bit: bool = False, - dtype: str = "auto", + max_new_tokens: int = 256, trust_remote_code: bool = True, - ): - """Load tokenizer + model, applying 4-bit quantization if specified and possible.""" + ) -> None: + """ + Initialize the few-shot learner. + + Args: + llm_model_id: Default HF model id to load if `load()` is called without one. + device: "cpu" or a CUDA device identifier (e.g. "cuda"). + load_in_4bit: Whether to attempt 4-bit quantized loading (bitsandbytes). + max_new_tokens: Maximum tokens to generate per prompt. + retriever_model_id: Unused (kept for compatibility). + top_k: Unused (kept for compatibility). + trust_remote_code: Forwarded to HF loaders (use with caution). 
+ """ + super().__init__() + self.device = device + self.max_new_tokens = int(max_new_tokens) - # Determine the target data type (default to float32 for CPU, float16 for GPU) - torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32 + self._default_model_id = llm_model_id + self._load_in_4bit_default = bool(load_in_4bit) + self._trust_remote_code_default = bool(trust_remote_code) - # Load the tokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - model_id, trust_remote_code=trust_remote_code - ) - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token = self.tokenizer.eos_token + # HF objects + self.model: Optional[AutoModelForCausalLM] = None + self.tokenizer: Optional[AutoTokenizer] = None + + self._is_loaded = False + self._loaded_model_id: Optional[str] = None - quant_config = None + # Cached few-shot example blocks built during `fit()` + self.few_shot_terms_block: str = "" + self.few_shot_types_block: str = "" + + def load(self, model_id: Optional[str] = None, **kwargs: Any) -> None: + """ + Load the underlying HF causal LM and tokenizer. + + LearnerPipeline typically calls: `learner.load(model_id=llm_id)`. + + Args: + model_id: HF model id. If None, uses `llm_model_id` from __init__. + **kwargs: + load_in_4bit: override default 4-bit loading. + trust_remote_code: override default trust_remote_code. + """ + resolved_model_id = model_id or self._default_model_id + if not resolved_model_id: + raise ValueError( + f"No model_id provided to {self.__class__.__name__}.load() and no llm_model_id in __init__." + ) + + load_in_4bit = bool(kwargs.get("load_in_4bit", self._load_in_4bit_default)) + trust_remote_code = bool(kwargs.get("trust_remote_code", self._trust_remote_code_default)) + + # Avoid re-loading same model + if self._is_loaded and self._loaded_model_id == resolved_model_id: + return + + torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + + tokenizer = AutoTokenizer.from_pretrained(resolved_model_id, trust_remote_code=trust_remote_code) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + self.tokenizer = tokenizer + + quantization_config = None if load_in_4bit: - # Configure BitsAndBytes for 4-bit loading - quant_config = BitsAndBytesConfig( + quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", ) - if torch_dtype_val is None: - torch_dtype_val = torch.float16 + torch_dtype = torch.float16 - # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise) device_map = "auto" if (self.device != "cpu") else {"": "cpu"} - # Load the Causal Language Model - self.model = AutoModelForCausalLM.from_pretrained( - model_id, + model = AutoModelForCausalLM.from_pretrained( + resolved_model_id, device_map=device_map, - torch_dtype=torch_dtype_val, - quantization_config=quant_config, + torch_dtype=torch_dtype, + quantization_config=quantization_config, trust_remote_code=trust_remote_code, ) - # Ensure model is on the correct device (redundant if device_map="auto" but safe) if self.device == "cpu": - self.model.to("cpu") + model.to("cpu") - def generate( - self, - inputs: List[str], - max_new_tokens: int = 64, - temperature: float = 0.0, - top_p: float = 1.0, - ) -> List[str]: - """Generate continuations for a list of prompts, returning only the generated part.""" - if self.model is None or self.tokenizer is None: - raise RuntimeError("Model/tokenizer not loaded. 
Call .load() first.") + self.model = model + self._is_loaded = True + self._loaded_model_id = resolved_model_id + + def _invert_terms_to_docs_mapping(self, terms_to_documents: Dict[str, List[str]]) -> Dict[str, List[str]]: + """ + Convert term->docs mapping to doc->terms mapping. - # --- Generation Setup --- - # Tokenize batch (padding is essential for batch inference) - enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True) - input_ids = enc["input_ids"] - attention_mask = enc["attention_mask"] + Args: + terms_to_documents: Mapping from term to list of document IDs. - # Move tensors to the model's device (e.g., cuda:0) - model_device = next(self.model.parameters()).device - input_ids = input_ids.to(model_device) - attention_mask = attention_mask.to(model_device) + Returns: + Mapping from document ID to list of terms associated with it. + """ + document_to_terms: DefaultDict[str, List[str]] = defaultdict(list) + for term, document_ids in (terms_to_documents or {}).items(): + for document_id in document_ids or []: + document_to_terms[str(document_id)].append(str(term)) + return dict(document_to_terms) - # --- Generate --- - with torch.no_grad(): - outputs = self.model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - max_new_tokens=max_new_tokens, - do_sample=( - temperature > 0.0 - ), # Use greedy decoding if temperature is 0.0 - temperature=temperature, - top_p=top_p, - pad_token_id=self.tokenizer.eos_token_id, - ) + def _derive_document_to_types( + self, + terms_to_documents: Dict[str, List[str]], + terms_to_types: Optional[Dict[str, List[str]]], + ) -> Dict[str, List[str]]: + """ + Derive doc->types mapping using (terms->docs) and (terms->types). - # --- Post-processing: Extract only the generated tail --- - decoded_outputs: List[str] = [] - for i, output_ids in enumerate(outputs): - full_decoded_text = self.tokenizer.decode( - output_ids, skip_special_tokens=True - ) - prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True) + Args: + terms_to_documents: term -> [doc_id...] + terms_to_types: term -> [type...] - # Safely strip the prompt text from the full output - if full_decoded_text.startswith(prompt_text): - generated_tail = full_decoded_text[len(prompt_text) :].strip() - else: - # Fallback extraction (less robust if padding affects token indices) - prompt_len = input_ids.shape[1] - generated_tail = self.tokenizer.decode( - output_ids[prompt_len:], skip_special_tokens=True - ).strip() - decoded_outputs.append(generated_tail) + Returns: + doc_id -> sorted list of unique types. + """ + if not terms_to_types: + return {} - return decoded_outputs + document_to_types: DefaultDict[str, Set[str]] = defaultdict(set) + for term, document_ids in (terms_to_documents or {}).items(): + candidate_types = terms_to_types.get(term, []) or [] + for document_id in document_ids or []: + for candidate_type in candidate_types: + if isinstance(candidate_type, str) and candidate_type.strip(): + document_to_types[str(document_id)].add(candidate_type.strip()) -# ----------------------------------------------------------------------------- -# Main Learner: SBUNLPFewShotLearner (Task A Text2Onto) -# ----------------------------------------------------------------------------- -class SBUNLPFewShotLearner(AutoLearner): - """ - Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction). - It uses Few-Shot prompts generated from training data for inference. 
- """ + return {doc_id: sorted(list(type_set)) for doc_id, type_set in document_to_types.items()} - def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"): - super().__init__() - # self.model is an instance of LocalAutoLLM - self.model = model or LocalAutoLLM(device=device) - self.device = device - # Cached in-memory prompt blocks built during the fit phase - self.fewshot_terms_block: str = "" - self.fewshot_types_block: str = "" + def _truncate_text(self, text: str, max_chars: int) -> str: + """ + Truncate text to a maximum number of characters (adds an ellipsis when truncated). + + Args: + text: Input text. + max_chars: Maximum characters to keep. If <= 0, returns the original text. + + Returns: + Truncated or original text. + """ + if not max_chars or max_chars <= 0 or not text: + return text or "" + return (text[:max_chars] + "…") if len(text) > max_chars else text - # --- Few-shot construction (terms) --- - def build_stratified_fewshot_prompt( + def build_few_shot_terms_block( self, - documents_path: str, - terms_path: str, + documents: List[Dict[str, Any]], + terms_to_documents: Dict[str, List[str]], sample_size: int = 28, seed: int = 123, max_chars_per_text: int = 1200, ) -> str: """ - Builds the few-shot exemplar block for Term Extraction using stratified sampling. + Build and cache the few-shot block for term extraction. + + Strategy: + - Create strata by associated terms (doc -> associated term list). + - Sample proportionally across strata. + - Deduplicate by document id and top up from remaining docs if needed. + + Args: + documents: Documents with keys: {"id","title","text"}. + terms_to_documents: Mapping term -> list of doc IDs. + sample_size: Desired number of examples in the block. + seed: RNG seed (local to this call). + max_chars_per_text: Text truncation limit per example. + + Returns: + The formatted few-shot example block string. 
""" - random.seed(seed) - - # Read documents (JSONL) into a list - corpus_documents: List[Dict[str, Any]] = [] - with open(documents_path, "r", encoding="utf-8") as file_handle: - for line in file_handle: - if line.strip(): - corpus_documents.append(json.loads(line)) - - num_total_docs = len(corpus_documents) - num_sample_docs = min(sample_size, num_total_docs) - - # Load the map of term -> [list of document IDs] - with open(terms_path, "r", encoding="utf-8") as file_handle: - term_to_doc_map = json.load(file_handle) - - # Invert map: document ID -> [list of terms] - doc_id_to_terms_map = defaultdict(list) - for term, doc_ids in term_to_doc_map.items(): - for doc_id in doc_ids: - doc_id_to_terms_map[doc_id].append(term) - - # Define strata (groups of documents associated with specific terms) - strata_map = defaultdict(list) - for doc in corpus_documents: - doc_id = doc.get("id", "") - associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"]) + rng = random.Random(seed) + + document_to_terms = self._invert_terms_to_docs_mapping(terms_to_documents) + total_documents = len(documents) + target_sample_count = min(int(sample_size), total_documents) + + strata: DefaultDict[str, List[Dict[str, Any]]] = defaultdict(list) + for document in documents: + document_id = str(document.get("id", "")) + associated_terms = document_to_terms.get(document_id, ["no_term"]) for term in associated_terms: - strata_map[term].append(doc) + strata[str(term)].append(document) - # Perform proportional sampling across strata sampled_documents: List[Dict[str, Any]] = [] - for term_str, stratum_docs in strata_map.items(): - num_stratum_docs = len(stratum_docs) - if num_stratum_docs == 0: + for docs_in_stratum in strata.values(): + if not docs_in_stratum: continue - - # Calculate proportional sample size - proportion = num_stratum_docs / num_total_docs - num_to_sample_from_stratum = int(num_sample_docs * proportion) - - if num_to_sample_from_stratum > 0: - sampled_documents.extend( - random.sample( - stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs) - ) + proportion = len(docs_in_stratum) / max(1, total_documents) + stratum_quota = int(target_sample_count * proportion) + if stratum_quota > 0: + sampled_documents.extend(rng.sample(docs_in_stratum, min(stratum_quota, len(docs_in_stratum)))) + + sampled_by_id = {str(d.get("id", "")): d for d in sampled_documents if d.get("id", "")} + final_documents = list(sampled_by_id.values()) + + if len(final_documents) > target_sample_count: + final_documents = rng.sample(final_documents, target_sample_count) + elif len(final_documents) < target_sample_count: + remaining_documents = [d for d in documents if str(d.get("id", "")) not in sampled_by_id] + additional_needed = min(target_sample_count - len(final_documents), len(remaining_documents)) + if additional_needed > 0: + final_documents.extend(rng.sample(remaining_documents, additional_needed)) + + lines: List[str] = [] + for document in final_documents: + document_id = str(document.get("id", "")) + title = str(document.get("title", "")) + text = self._truncate_text(str(document.get("text", "")), max_chars_per_text) + associated_terms = document_to_terms.get(document_id, []) + + lines.append( + "Document ID: {doc_id}\n" + "Title: {title}\n" + "Text: {text}\n" + "Associated Terms: {terms}\n" + "----------------------------------------".format( + doc_id=document_id, + title=title, + text=text, + terms=associated_terms, ) - - # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size' - 
unique_docs_by_id = {} - for doc in sampled_documents: - unique_docs_by_id[doc.get("id", "")] = doc - - final_sample_docs = list(unique_docs_by_id.values()) - - if len(final_sample_docs) > num_sample_docs: - final_sample_docs = random.sample(final_sample_docs, num_sample_docs) - elif len(final_sample_docs) < num_sample_docs: - remaining_docs = [ - d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id - ] - needed_count = min( - num_sample_docs - len(final_sample_docs), len(remaining_docs) - ) - final_sample_docs.extend(random.sample(remaining_docs, needed_count)) - - # Format the few-shot exemplar text block - prompt_lines: List[str] = [] - for doc in final_sample_docs: - doc_id = doc.get("id", "") - title = doc.get("title", "") - text = doc.get("text", "") - - # Truncate text if it exceeds the maximum character limit - if max_chars_per_text and len(text) > max_chars_per_text: - text = text[:max_chars_per_text] + "…" - - associated_terms = doc_id_to_terms_map.get(doc_id, []) - prompt_lines.append( - f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------" ) - prompt_block = "\n".join(prompt_lines) - self.fewshot_terms_block = prompt_block - return prompt_block + self.few_shot_terms_block = "\n".join(lines) + return self.few_shot_terms_block - # --- Few-shot construction (types) --- - def build_types_fewshot_block( + def build_few_shot_types_block( self, - docs_jsonl: str, - terms2doc_json: str, - sample_per_term: int = 1, - full_word: bool = True, - case_sensitive: bool = True, + documents: List[Dict[str, Any]], + terms_to_documents: Dict[str, List[str]], + terms_to_types: Optional[Dict[str, List[str]]] = None, + sample_size: int = 28, + seed: int = 123, max_chars_per_text: int = 800, ) -> str: """ - Builds the few-shot block for Type Extraction. - This method samples documents based on finding an associated term/type within the text. + Build and cache the few-shot block for type (class) extraction. + + Prefers doc->types derived from `terms_to_types`; if absent, falls back to treating + associated terms as "types" for stratification (behavior-preserving fallback). + + Args: + documents: Documents with keys: {"id","title","text"}. + terms_to_documents: Mapping term -> list of doc IDs. + terms_to_types: Optional mapping term -> list of types. + sample_size: Desired number of examples in the block. + seed: RNG seed (local to this call). + max_chars_per_text: Text truncation limit per example. + + Returns: + The formatted few-shot example block string. """ - # Load documents into dict by ID - docs_by_id = {} - with open(docs_jsonl, "r", encoding="utf-8") as file_handle: - for line in file_handle: - line_stripped = line.strip() - if line_stripped: - try: - doc = json.loads(line_stripped) - doc_id = doc.get("id", "") - if doc_id: - docs_by_id[doc_id] = doc - except json.JSONDecodeError: - continue - - # Load term -> [doc_id,...] 
map - with open(terms2doc_json, "r", encoding="utf-8") as file_handle: - term_to_doc_map = json.load(file_handle) - - flags = 0 if case_sensitive else re.IGNORECASE - prompt_lines: List[str] = [] - - # Iterate over terms (which act as types in this context) - for term, doc_ids in term_to_doc_map.items(): - escaped_term = re.escape(term) - # Create regex pattern for matching the term in the text - pattern = rf"\b{escaped_term}\b" if full_word else escaped_term - term_regex = re.compile(pattern, flags=flags) - - picked_count = 0 - for doc_id in doc_ids: - doc = docs_by_id.get(doc_id) - if not doc: - continue - - title = doc.get("title", "") - text = doc.get("text", "") - - # Check if the term/type is actually present in the document text/title - if term_regex.search(f"{title} {text}"): - text_content = text - - # Truncate text if necessary - if max_chars_per_text and len(text_content) > max_chars_per_text: - text_content = text_content[:max_chars_per_text] + "…" - - # Escape single quotes in the term for Python list formatting in the prompt - term_for_prompt = term.replace("'", "\\'") - - prompt_lines.append( - f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------" - ) - picked_count += 1 - - if picked_count >= sample_per_term: - break # Move to the next term - - prompt_block = "\n".join(prompt_lines) - self.fewshot_types_block = prompt_block - return prompt_block + rng = random.Random(seed) - def fit( - self, - train_docs_jsonl: str, - terms2doc_json: str, - sample_size: int = 28, - seed: int = 123, - ) -> None: + documents_by_id = {str(d.get("id", "")): d for d in documents if d.get("id", "")} + + document_to_types = self._derive_document_to_types(terms_to_documents, terms_to_types) + if not document_to_types: + document_to_types = self._invert_terms_to_docs_mapping(terms_to_documents) + + type_to_documents: DefaultDict[str, List[Dict[str, Any]]] = defaultdict(list) + for document_id, candidate_types in document_to_types.items(): + document = documents_by_id.get(document_id) + if not document: + continue + for candidate_type in candidate_types: + type_to_documents[str(candidate_type)].append(document) + + total_documents = len(documents) + target_sample_count = min(int(sample_size), total_documents) + + sampled_documents: List[Dict[str, Any]] = [] + for docs_in_stratum in type_to_documents.values(): + if not docs_in_stratum: + continue + proportion = len(docs_in_stratum) / max(1, total_documents) + stratum_quota = int(target_sample_count * proportion) + if stratum_quota > 0: + sampled_documents.extend(rng.sample(docs_in_stratum, min(stratum_quota, len(docs_in_stratum)))) + + sampled_by_id = {str(d.get("id", "")): d for d in sampled_documents if d.get("id", "")} + final_documents = list(sampled_by_id.values()) + + if len(final_documents) > target_sample_count: + final_documents = rng.sample(final_documents, target_sample_count) + elif len(final_documents) < target_sample_count: + remaining_documents = [d for d in documents if str(d.get("id", "")) not in sampled_by_id] + additional_needed = min(target_sample_count - len(final_documents), len(remaining_documents)) + if additional_needed > 0: + final_documents.extend(rng.sample(remaining_documents, additional_needed)) + + lines: List[str] = [] + for document in final_documents: + document_id = str(document.get("id", "")) + title = str(document.get("title", "")) + text = self._truncate_text(str(document.get("text", "")), max_chars_per_text) + + 
associated_types = document_to_types.get(document_id, []) + associated_types_escaped = [t.replace("'", "\\'") for t in associated_types] + + lines.append( + "Document ID: {doc_id}\n" + "Title: {title}\n" + "Text: {text}\n" + "Associated Types: {types}\n" + "----------------------------------------".format( + doc_id=document_id, + title=title, + text=text, + types=associated_types_escaped, + ) + ) + + self.few_shot_types_block = "\n".join(lines) + return self.few_shot_types_block + + def _format_term_prompt(self, example_block: str, title: str, text: str) -> str: """ - Fit phase: Builds and caches the few-shot prompt blocks from the training files. - No model training occurs (Few-Shot/In-Context Learning). + Format a prompt for term extraction. + + Args: + example_block: Few-shot examples block. + title: Document title. + text: Document text. + + Returns: + Prompt string. """ - # Build prompt block for Term extraction - _ = self.build_stratified_fewshot_prompt( - train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed + return ( + f"{example_block}\n" + "[var]\n" + f"Title: {title}\n" + f"Text: {text}\n" + "[var]\n" + "Extract all relevant terms that could form the basis of an ontology from the above document.\n" + "Return ONLY a Python list like ['term1', 'term2', ...] and nothing else.\n" + "If no terms are found, return [].\n" ) - # Build prompt block for Type extraction - _ = self.build_types_fewshot_block( - train_docs_jsonl, terms2doc_json, sample_per_term=1 + + def _format_type_prompt(self, example_block: str, title: str, text: str) -> str: + """ + Format a prompt for type (class) extraction. + + Args: + example_block: Few-shot examples block. + title: Document title. + text: Document text. + + Returns: + Prompt string. + """ + return ( + f"{example_block}\n" + "[var]\n" + f"Title: {title}\n" + f"Text: {text}\n" + "[var]\n" + "Extract all relevant TYPES mentioned in the above document that could serve as ontology classes.\n" + "Only consider content inside the [var] ... [var] block.\n" + "Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return [].\n" ) - # ------------------------- - # Inference helpers (prompt construction and output parsing) - # ------------------------- - def _build_term_prompt(self, example_block: str, title: str, text: str) -> str: - """Constructs the full prompt for Term Extraction.""" - return f"""{example_block} - [var] - Title: {title} - Text: {text} - [var] - Extract all relevant terms that could form the basis of an ontology from the above document. - Return ONLY a Python list like ['term1', 'term2', ...] and nothing else. - If no terms are found, return []. - """ - - def _build_type_prompt(self, example_block: str, title: str, text: str) -> str: - """Constructs the full prompt for Type Extraction.""" - return f"""{example_block} - [var] - Title: {title} - Text: {text} - [var] - Extract all relevant TYPES mentioned in the above document that could serve as ontology classes. - Only consider content inside the [var] ... [var] block. - Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return []. - """ - - def _parse_list_like(self, raw_string: str) -> List[str]: - """Try to extract a Python list of strings from model output robustly.""" - processed_string = raw_string.strip() - if processed_string in ("[]", ""): + def _parse_python_list_of_strings(self, raw_text: str) -> List[str]: + """ + Parse an LLM response intended to be a Python list of strings. 
+ + This parser is intentionally tolerant: + 1) Try literal_eval on the full string + 2) Else extract the first [...] block and literal_eval it + 3) Else fallback to extracting quoted strings + + Args: + raw_text: Model output. + + Returns: + List of strings (possibly empty). + """ + stripped = (raw_text or "").strip() + if stripped in ("", "[]"): return [] - # 1. Try direct evaluation try: - parsed_value = ast.literal_eval(processed_string) - if isinstance(parsed_value, list): - # Filter to ensure only strings are returned - return [item for item in parsed_value if isinstance(item, str)] + parsed = ast.literal_eval(stripped) + if isinstance(parsed, list): + return [item for item in parsed if isinstance(item, str)] except Exception: pass - # 2. Try finding and evaluating text within outermost brackets [ ... ] - bracket_match = re.search(r"\[[\s\S]*?\]", processed_string) - if bracket_match: + match = re.search(r"\[[\s\S]*?\]", stripped) + if match: try: - parsed_value = ast.literal_eval(bracket_match.group(0)) - if isinstance(parsed_value, list): - return [item for item in parsed_value if isinstance(item, str)] + parsed = ast.literal_eval(match.group(0)) + if isinstance(parsed, list): + return [item for item in parsed if isinstance(item, str)] except Exception: pass - # 3. Fallback: Find comma-separated quoted substrings (less robust, but catches errors) - # Finds content inside either single quotes ('...') or double quotes ("...") - quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string) - flattened_list = [a_match or b_match for a_match, b_match in quoted_matches] - return flattened_list - - def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str: - """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output.""" - # self.model is an instance of LocalAutoLLM - model_output = self.model.generate( - [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0 - ) - return model_output[0] if model_output else "" + quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", stripped) + return [a or b for a, b in quoted] - def predict_terms( - self, - docs_test_jsonl: str, - out_jsonl: str, - max_lines: int = -1, - max_new_tokens: int = 120, - ) -> int: + def _generate_completion(self, prompt_text: str) -> str: """ - Runs Term Extraction on the test documents and saves results to a JSONL file. - Returns: The count of individual terms written. + Generate a completion for a single prompt (deterministic decoding). + + Args: + prompt_text: Full prompt to send to the model. + + Returns: + The generated completion text (prompt stripped where possible). """ - if not self.fewshot_terms_block: - raise RuntimeError("Few-shot block for terms is empty. 
Call fit() first.") - - num_written_terms = 0 - with ( - open(docs_test_jsonl, "r", encoding="utf-8") as file_in, - open(out_jsonl, "w", encoding="utf-8") as file_out, - ): - for line_index, line in enumerate(file_in, start=1): - if 0 < max_lines < line_index: - break - - try: - document = json.loads(line.strip()) - except Exception: - continue # Skip malformed JSON lines - - doc_id = document.get("id", "unknown") - title = document.get("title", "") - text = document.get("text", "") - - # Construct and call model - prompt = self._build_term_prompt(self.fewshot_terms_block, title, text) - raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) - predicted_terms = self._parse_list_like(raw_output) - - # Write extracted terms - for term_or_type in predicted_terms: - if isinstance(term_or_type, str) and term_or_type.strip(): - file_out.write( - json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) - + "\n" - ) - num_written_terms += 1 - - # Lightweight memory management for long runs - if line_index % 50 == 0: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - return num_written_terms - - def predict_types( + if self.model is None or self.tokenizer is None: + raise RuntimeError("Model/tokenizer not loaded. Call .load() first.") + + encoded = self.tokenizer([prompt_text], return_tensors="pt", padding=True, truncation=True) + input_ids = encoded["input_ids"] + attention_mask = encoded["attention_mask"] + + model_device = next(self.model.parameters()).device + input_ids = input_ids.to(model_device) + attention_mask = attention_mask.to(model_device) + + with torch.no_grad(): + output_ids = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=self.max_new_tokens, + do_sample=False, + temperature=0.0, + top_p=1.0, + pad_token_id=self.tokenizer.eos_token_id, + )[0] + + decoded_full = self.tokenizer.decode(output_ids, skip_special_tokens=True) + decoded_prompt = self.tokenizer.decode(input_ids[0], skip_special_tokens=True) + + if decoded_full.startswith(decoded_prompt): + return decoded_full[len(decoded_prompt) :].strip() + + prompt_token_count = int(attention_mask[0].sum().item()) + return self.tokenizer.decode(output_ids[prompt_token_count:], skip_special_tokens=True).strip() + + def fit( self, - docs_test_jsonl: str, - out_jsonl: str, - max_lines: int = -1, - max_new_tokens: int = 120, - ) -> int: - """ - Runs Type Extraction on the test documents and saves results to a JSONL file. - Returns: The count of individual types written. - """ - if not self.fewshot_types_block: - raise RuntimeError("Few-shot block for types is empty. 
Call fit() first.") - - num_written_types = 0 - with ( - open(docs_test_jsonl, "r", encoding="utf-8") as file_in, - open(out_jsonl, "w", encoding="utf-8") as file_out, - ): - for line_index, line in enumerate(file_in, start=1): - if 0 < max_lines < line_index: - break - - try: - document = json.loads(line.strip()) - except Exception: - continue # Skip malformed JSON lines - - doc_id = document.get("id", "unknown") - title = document.get("title", "") - text = document.get("text", "") - - # Construct and call model using the dedicated type prompt block - prompt = self._build_type_prompt(self.fewshot_types_block, title, text) - raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) - predicted_types = self._parse_list_like(raw_output) - - # Write extracted types - for term_or_type in predicted_types: - if isinstance(term_or_type, str) and term_or_type.strip(): - file_out.write( - json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) - + "\n" - ) - num_written_types += 1 - - if line_index % 50 == 0: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - return num_written_types - - # --- Evaluation utilities (unchanged from prior definition, added docstrings) --- - def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]: - """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased.""" - gold_pairs = set() - with open(terms2doc_path, "r", encoding="utf-8") as file_handle: - term_to_doc_map = json.load(file_handle) - - for term, doc_ids in term_to_doc_map.items(): - clean_term = term.strip().lower() - for doc_id in doc_ids: - gold_pairs.add((doc_id, clean_term)) - return gold_pairs - - def load_predicted_pairs( - self, predicted_jsonl_path: str, key: str = "term" - ) -> Set[Tuple[str, str]]: - """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased.""" - predicted_pairs = set() - with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle: - for line in file_handle: - try: - entry = json.loads(line.strip()) - except Exception: - continue - doc_id = entry.get("doc_id") - value = entry.get(key) - if doc_id and value: - predicted_pairs.add((doc_id, value.strip().lower())) - return predicted_pairs - - def evaluate_extraction_f1( - self, terms2doc_path: str, predicted_jsonl: str, key: str = "term" - ) -> float: + train_data: Any, + task: str = "text2onto", + ontologizer: bool = False, + **kwargs: Any, + ) -> None: """ - Computes set-based binary Precision, Recall, and F1 score against the gold pairs. + Build and cache few-shot blocks from the training split. + + Args: + train_data: A split bundle dict. Must contain "documents" and "terms2docs". + task: Must be "text2onto". + ontologizer: Unused here (kept for signature compatibility). + **kwargs: + sample_size: Few-shot sample size per block (default 28). + seed: RNG seed (default 123). 
""" - # Load the ground truth and predictions - gold_set = self.load_gold_pairs(terms2doc_path) - predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key) + if task != "text2onto": + raise ValueError(f"{self.__class__.__name__} only supports task='text2onto' (got {task!r}).") - # Build combined universe of all pairs for score calculation - all_pairs = sorted(gold_set | predicted_set) + if not self._is_loaded: + self.load(model_id=self._default_model_id) - # Create binary labels (1=present, 0=absent) - y_true = [1 if pair in gold_set else 0 for pair in all_pairs] - y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs] + documents: List[Dict[str, Any]] = train_data.get("documents", []) or [] + terms_to_documents: Dict[str, List[str]] = train_data.get("terms2docs", {}) or {} + terms_to_types: Optional[Dict[str, List[str]]] = train_data.get("terms2types", None) - # Use scikit-learn for metric calculation - from sklearn.metrics import precision_recall_fscore_support + sample_size = int(kwargs.get("sample_size", 28)) + seed = int(kwargs.get("seed", 123)) - precision, recall, f1, _ = precision_recall_fscore_support( - y_true, y_pred, average="binary", zero_division=0 + self.build_few_shot_terms_block( + documents=documents, + terms_to_documents=terms_to_documents, + sample_size=sample_size, + seed=seed, + ) + self.build_few_shot_types_block( + documents=documents, + terms_to_documents=terms_to_documents, + terms_to_types=terms_to_types, + sample_size=sample_size, + seed=seed, ) - # Display results - num_true_positives = len(gold_set & predicted_set) + def predict( + self, + test_data: Any, + task: str = "text2onto", + ontologizer: bool = False, + **kwargs: Any, + ) -> Dict[str, Any]: + """ + Run term/type extraction over test documents. - print("\n📊 Evaluation Results:") - print(f" ✅ Precision: {precision:.4f}") - print(f" ✅ Recall: {recall:.4f}") - print(f" ✅ F1 Score: {f1:.4f}") - print(f" 📌 Gold pairs: {len(gold_set)}") - print(f" 📌 Predicted pairs:{len(predicted_set)}") - print(f" 🎯 True Positives: {num_true_positives}") + Args: + test_data: A split bundle dict. Must contain "documents". + task: Must be "text2onto". + ontologizer: Unused here (kept for signature compatibility). + **kwargs: + max_docs: If > 0, limit number of docs processed. - return float(f1) + Returns: + Prediction payload dict: {"terms": [...], "types": [...]}. + """ + if task != "text2onto": + raise ValueError(f"{self.__class__.__name__} only supports task='text2onto' (got {task!r}).") + + if not self.few_shot_terms_block or not self.few_shot_types_block: + raise RuntimeError("Few-shot blocks are empty. 
Pipeline should call fit() before predict().") + + max_docs = int(kwargs.get("max_docs", -1)) + documents: List[Dict[str, Any]] = test_data.get("documents", []) or [] + if max_docs > 0: + documents = documents[:max_docs] + + term_predictions: List[Dict[str, str]] = [] + type_predictions: List[Dict[str, str]] = [] + + for doc_index, document in enumerate(documents, start=1): + document_id = str(document.get("id", "unknown")) + title = str(document.get("title", "")) + text = str(document.get("text", "")) + + term_prompt = self._format_term_prompt(self.few_shot_terms_block, title, text) + extracted_terms = self._parse_python_list_of_strings(self._generate_completion(term_prompt)) + for term in extracted_terms: + normalized_term = (term or "").strip() + if normalized_term: + term_predictions.append({"doc_id": document_id, "term": normalized_term}) + + type_prompt = self._format_type_prompt(self.few_shot_types_block, title, text) + extracted_types = self._parse_python_list_of_strings(self._generate_completion(type_prompt)) + for extracted_type in extracted_types: + normalized_type = (extracted_type or "").strip() + if normalized_type: + type_predictions.append({"doc_id": document_id, "type": normalized_type}) + + if doc_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # IMPORTANT: return only the prediction payload; LearnerPipeline wraps it. + return {"terms": term_predictions, "types": type_predictions} diff --git a/ontolearner/text2onto/splitter.py b/ontolearner/text2onto/splitter.py index 3555511..cdc9e15 100644 --- a/ontolearner/text2onto/splitter.py +++ b/ontolearner/text2onto/splitter.py @@ -200,10 +200,73 @@ def generate_split_artefacts(self, split_docs): return terms_splits, types_splits, docs_split, types2docs_splits - def split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1): - split_targets, split_docs_targets = self.set_train_val_test_sizes(train_percentage=train, - val_percentage=val, - test_percentage=test) + def split_fine_grained(self, doc_ids): + """ + Build a single split bundle containing only: + - docs + - terms + - types + - terms2docs + - terms2types + """ + # normalize to string ids (constructor uses str(row.id)) + doc_ids = {str(d) for d in (doc_ids or [])} + + # docs + collect terms/types from docs + docs = [] + terms_set = set() + types_set = set() + + for doc_id in doc_ids: + doc = self.doc_id_to_doc[doc_id] + docs.append({"id": str(doc.id), "title": doc.title, "text": doc.text}) + + terms_set.update(self.doc_id_to_terms[doc_id]) + types_set.update(self.doc_id_to_types[doc_id]) + + terms = sorted(terms_set) + types = sorted(types_set) + + # terms2docs: use the constructor-built mapping and restrict to this split's doc_ids + terms2docs = { + term: sorted(list(self.term_to_doc_id.get(term, set()) & doc_ids)) + for term in terms + } + + # terms2types: ontology lookup (term -> parent types) + terms2types = {term: self.child_to_parent.get(term, []) for term in terms} + + return { + "documents": docs, + "terms": terms, + "types": types, + "terms2docs": terms2docs, + "terms2types": terms2types, + } + + def train_test_val_split(self, train: float = 0.8, val: float = 0.1, test: float = 0.1): + """ + Returns: + train_split, val_split, test_split + + Each split is a dict with keys: + - "docs" + - "terms" + - "types" + - "terms2docs" + - "terms2types" + """ + # compute which docs go to which split + split_targets, split_docs_targets = self.set_train_val_test_sizes( + train_percentage=train, + val_percentage=val, + 
test_percentage=test, + ) split_docs = self.create_train_val_test_splits(split_targets, split_docs_targets) - terms, types, docs, types2docs = self.generate_split_artefacts(split_docs) - return terms, types, docs, types2docs + # split_docs: {"train": set(doc_ids), "val": set(doc_ids), "test": set(doc_ids)} + + train_split = self.split_fine_grained(split_docs.get("train", set())) + val_split = self.split_fine_grained(split_docs.get("val", set())) + test_split = self.split_fine_grained(split_docs.get("test", set())) + + return train_split, val_split, test_split From 686b3d22e4c1bfb098e1498fc16f4d844cb8baaa Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Fri, 2 Jan 2026 14:39:26 +0100 Subject: [PATCH 12/19] :recycle: minor refactoring --- ontolearner/base/learner.py | 4 +-- ontolearner/learner/label_mapper.py | 2 +- ontolearner/learner/prompt.py | 45 +++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/ontolearner/base/learner.py b/ontolearner/base/learner.py index c410915..b018e25 100644 --- a/ontolearner/base/learner.py +++ b/ontolearner/base/learner.py @@ -296,8 +296,8 @@ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]: # Decode only the generated part decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens] - print(decoded_outputs) - print(self.label_mapper.predict(decoded_outputs)) + # print(decoded_outputs) + # print(self.label_mapper.predict(decoded_outputs)) # Map the decoded text to labels return self.label_mapper.predict(decoded_outputs) diff --git a/ontolearner/learner/label_mapper.py b/ontolearner/learner/label_mapper.py index c7d279d..686bd3e 100644 --- a/ontolearner/learner/label_mapper.py +++ b/ontolearner/learner/label_mapper.py @@ -31,7 +31,7 @@ def __init__(self, ngram_range: Tuple=(1, 1), label_dict: Dict[str, List[str]]=None, analyzer: str = 'word', - iterator_no: int = 100): + iterator_no: int = 1000): """ Initializes the TFIDFLabelMapper with a specified classifier and TF-IDF configuration. diff --git a/ontolearner/learner/prompt.py b/ontolearner/learner/prompt.py index a9b8f7c..6b16346 100644 --- a/ontolearner/learner/prompt.py +++ b/ontolearner/learner/prompt.py @@ -17,15 +17,50 @@ class StandardizedPrompting(AutoPrompt): def __init__(self, task: str = None): if task == "term-typing": - prompt_template = """Determine whether the given term can be categorized as an instance of the specified high-level type. Answer with `yes` if it is otherwise answer with `no`. Do not explain. + prompt_template = """You are performing term typing. + +Determine whether the given term is a clear and unambiguous instance of the specified high-level type. + +Rules: +- Answer "yes" only if the term commonly and directly belongs to the type. +- Answer "no" if the term does not belong to the type, is ambiguous, or only weakly related. +- Use the most common meaning of the term. +- Do not explain your answer. + Term: {term} Type: {type} -Answer: """ +Answer (yes or no):""" elif task == "taxonomy-discovery": - prompt_template = """Is {parent} a direct or indirect superclass (or parent concept) of {child} in a conceptual hierarchy? Answer with yes or no. -Answer: """ + prompt_template = """You are identifying taxonomic (is-a) relationships. + +Question: +Is "{parent}" a superclass (direct or indirect) of "{child}" in a standard conceptual or ontological hierarchy? + +Rules: +- A superclass means: "{child}" is a type or instance of "{parent}". 
+- Answer "yes" only if the relationship is a true is-a relationship. +- Answer "no" for part-of, related-to, or associative relationships. +- Use general world knowledge. +- Do not explain. + +Parent: {parent} +Child: {child} +Answer (yes or no):""" elif task == "non-taxonomic-re": - prompt_template = """Given the conceptual types `{head}` and `{tail}`, does a `{relation}` relation exist between them? Respond with "yes" if it does, otherwise respond with "no".""" + prompt_template = """You are identifying non-taxonomic conceptual relationships. + +Given two conceptual types, determine whether the specified relation typically holds between them. + +Rules: +- Answer "yes" only if the relation commonly and meaningfully applies. +- Answer "no" if the relation is rare, indirect, or context-dependent. +- Do not infer relations that require specific situations. +- Do not explain. + +Head type: {head} +Tail type: {tail} +Relation: {relation} +Answer (yes or no):""" else: raise ValueError("Unknown task! Current tasks are: 'term-typing', 'taxonomy-discovery', 'non-taxonomic-re'") super().__init__(prompt_template) From a333d8e4dac350bbe4f9e34207c8fa00657985f1 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Sat, 3 Jan 2026 18:52:55 +0100 Subject: [PATCH 13/19] :recycle: refactor augmented learners --- ontolearner/base/learner.py | 24 ++--- ontolearner/learner/__init__.py | 2 +- ontolearner/learner/rag/__init__.py | 14 +++ ontolearner/learner/{ => rag}/rag.py | 9 +- ontolearner/learner/retriever/__init__.py | 2 +- ...lm_retriever.py => augmented_retriever.py} | 87 ++++++++++--------- ontolearner/learner/retriever/learner.py | 7 +- 7 files changed, 81 insertions(+), 64 deletions(-) create mode 100644 ontolearner/learner/rag/__init__.py rename ontolearner/learner/{ => rag}/rag.py (95%) rename ontolearner/learner/retriever/{llm_retriever.py => augmented_retriever.py} (83%) diff --git a/ontolearner/base/learner.py b/ontolearner/base/learner.py index b018e25..d283e71 100644 --- a/ontolearner/base/learner.py +++ b/ontolearner/base/learner.py @@ -201,7 +201,7 @@ class AutoLLM(ABC): tokenizer: The tokenizer associated with the model. """ - def __init__(self, label_mapper: Any, device: str='cpu', token: str="") -> None: + def __init__(self, label_mapper: Any, device: str='cpu', token: str="", max_length: int = 256) -> None: """ Initialize the LLM component. @@ -213,6 +213,7 @@ def __init__(self, label_mapper: Any, device: str='cpu', token: str="") -> None: self.device=device self.model: Optional[Any] = None self.tokenizer: Optional[Any] = None + self.max_length = max_length def load(self, model_id: str) -> None: @@ -236,10 +237,8 @@ def load(self, model_id: str) -> None: self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token) self.tokenizer.pad_token = self.tokenizer.eos_token if self.device == "cpu": - # device_map = "cpu" self.model = AutoModelForCausalLM.from_pretrained( model_id, - # device_map=device_map, torch_dtype=torch.bfloat16, token=self.token ) @@ -248,8 +247,8 @@ def load(self, model_id: str) -> None: self.model = AutoModelForCausalLM.from_pretrained( model_id, device_map=device_map, - torch_dtype=torch.bfloat16, - token=self.token + token=self.token, + trust_remote_code=True, ) self.label_mapper.fit() @@ -276,29 +275,20 @@ def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]: List of generated text responses, one for each input prompt. Responses include the original input plus generated continuation. 
""" - # Tokenize inputs and move to device encoded_inputs = self.tokenizer(inputs, return_tensors="pt", - padding=True, + max_length=self.max_length, truncation=True).to(self.model.device) input_ids = encoded_inputs["input_ids"] input_length = input_ids.shape[1] - - # Generate output outputs = self.model.generate( **encoded_inputs, max_new_tokens=max_new_tokens, - pad_token_id=self.tokenizer.eos_token_id + pad_token_id=self.tokenizer.eos_token_id, + eos_token_id=self.tokenizer.eos_token_id ) - - # Extract only the newly generated tokens (excluding prompt) generated_tokens = outputs[:, input_length:] - - # Decode only the generated part decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens] - # print(decoded_outputs) - # print(self.label_mapper.predict(decoded_outputs)) - # Map the decoded text to labels return self.label_mapper.predict(decoded_outputs) class AutoRetriever(ABC): diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index f44daab..3f1fb1e 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -14,6 +14,6 @@ from .llm import AutoLLMLearner, FalconLLM, MistralLLM from .retriever import AutoRetrieverLearner, LLMAugmentedRetrieverLearner -from .rag import AutoRAGLearner +from .rag import AutoRAGLearner, LLMAugmentedRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper diff --git a/ontolearner/learner/rag/__init__.py b/ontolearner/learner/rag/__init__.py new file mode 100644 index 0000000..422c641 --- /dev/null +++ b/ontolearner/learner/rag/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .rag import AutoRAGLearner, LLMAugmentedRAGLearner diff --git a/ontolearner/learner/rag.py b/ontolearner/learner/rag/rag.py similarity index 95% rename from ontolearner/learner/rag.py rename to ontolearner/learner/rag/rag.py index 5930aae..fee9b7c 100644 --- a/ontolearner/learner/rag.py +++ b/ontolearner/learner/rag/rag.py @@ -14,8 +14,7 @@ import warnings from typing import Any -from ..base import AutoLearner - +from ...base import AutoLearner class AutoRAGLearner(AutoLearner): def __init__(self, @@ -87,3 +86,9 @@ def _non_taxonomic_re(self, data: Any, test: bool = False) -> Any: return self.llm._non_taxonomic_re_predict(dataset=dataset) else: warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the fit as well.") + + +class LLMAugmentedRAGLearner(AutoRAGLearner): + + def set_augmenter(self, augmenter): + self.retriever.set_augmenter(augmenter=augmenter) diff --git a/ontolearner/learner/retriever/__init__.py b/ontolearner/learner/retriever/__init__.py index 65c0e03..c7d65c5 100644 --- a/ontolearner/learner/retriever/__init__.py +++ b/ontolearner/learner/retriever/__init__.py @@ -16,4 +16,4 @@ from .embedding import GloveRetriever, Word2VecRetriever from .ngram import NgramRetriever from .learner import AutoRetrieverLearner, LLMAugmentedRetrieverLearner -from .llm_retriever import LLMAugmenterGenerator, LLMAugmenter, LLMAugmentedRetriever +from .augmented_retriever import LLMAugmenterGenerator, LLMAugmenter, LLMAugmentedRetriever diff --git a/ontolearner/learner/retriever/llm_retriever.py b/ontolearner/learner/retriever/augmented_retriever.py similarity index 83% rename from ontolearner/learner/retriever/llm_retriever.py rename to ontolearner/learner/retriever/augmented_retriever.py index 0671cc2..ede4414 100644 --- a/ontolearner/learner/retriever/llm_retriever.py +++ b/ontolearner/learner/retriever/augmented_retriever.py @@ -17,6 +17,8 @@ from openai import OpenAI import time from tqdm import tqdm +import torch +import torch.nn.functional as F from ...base import AutoRetriever from ...utils import load_json @@ -125,7 +127,6 @@ def generate(self, conversation, function): except Exception: print("sleep for 5 seconds") time.sleep(5) - return inference def tasks_data_former(self, data: Any, task: str) -> List[str] | Dict[str, List[str]]: @@ -298,21 +299,12 @@ class LLMAugmentedRetriever(AutoRetriever): Attributes: augmenter: An augmenter instance that provides transform() and top_n_candidate. """ - - def __init__(self) -> None: - """ - Initialize the augmented retriever with no augmenter attached. - """ + def __init__(self, threshold: float = 0.0, cutoff_rate: float = 100.0) -> None: super().__init__() - self.augmenter = None + self.threshold = threshold + self.cutoff_rate = cutoff_rate def set_augmenter(self, augmenter): - """ - Attach an augmenter instance. - - Args: - augmenter: An object providing `transform(query, task)` and `top_n_candidate`. - """ self.augmenter = augmenter def retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: str = None) -> List[List[str]]: @@ -328,29 +320,46 @@ def retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: Returns: list[list[str]]: A list of document lists, one per input query. 
""" - parent_retrieve = super(LLMAugmentedRetriever, self).retrieve - - if task == 'taxonomy-discovery': - query_sets = [] - for idx in range(self.augmenter.top_n_candidate): - query_set = [] - for qu in query: - query_set.append(self.augmenter.transform(qu, task=task)[idx]) - query_sets.append(query_set) - - retrieves = [ - parent_retrieve(query=query_set, top_k=top_k, batch_size=batch_size) - for query_set in query_sets - ] - - results = [] - for qu_idx, qu in enumerate(query): - qu_result = [] - for top_idx in range(self.augmenter.top_n_candidate): - qu_result += retrieves[top_idx][qu_idx] - results.append(list(set(qu_result))) - - return results - - else: - return parent_retrieve(query=query, top_k=top_k, batch_size=batch_size) + if task != 'taxonomy-discovery': + return super().retrieve(query=query, top_k=top_k, batch_size=batch_size) + return self.augmented_retrieve(query, top_k=top_k, batch_size=batch_size, task=task) + + def augmented_retrieve(self, query: List[str], top_k: int = 5, batch_size: int = -1, task: str = None): + if self.embeddings is None: + raise RuntimeError("Retriever model must index documents before prediction.") + + augmented_queries, index_map = [], [] + for qu_idx, qu in enumerate(query): + augmented = self.augmenter.transform(qu, task=task) + for aug in augmented: + augmented_queries.append(aug) + index_map.append(qu_idx) + + doc_norm = F.normalize(self.embeddings, p=2, dim=1) + results = [dict() for _ in range(len(query))] + + if batch_size == -1: + batch_size = len(augmented_queries) + + for start in range(0, len(augmented_queries), batch_size): + batch_aug = augmented_queries[start:start + batch_size] + batch_embeddings = self.embedding_model.encode(batch_aug, convert_to_tensor=True) + batch_norm = F.normalize(batch_embeddings, p=2, dim=1) + similarity_matrix = torch.matmul(batch_norm, doc_norm.T) + current_top_k = min(top_k, len(self.documents)) + topk_similarities, topk_indices = torch.topk(similarity_matrix, k=current_top_k, dim=1) + + for i, (doc_indices, sim_scores) in enumerate(zip(topk_indices, topk_similarities)): + original_query_idx = index_map[start + i] + + for doc_idx, score in zip(doc_indices.tolist(), sim_scores.tolist()): + if score >= self.threshold: + doc = self.documents[doc_idx] + prev = results[original_query_idx].get(doc, 0.0) + results[original_query_idx][doc] = prev + score + + final_results = [] + for doc_score_map in results: + sorted_docs = sorted(doc_score_map.items(), key=lambda x: x[1], reverse=True) + final_results.append([doc for doc, _ in sorted_docs]) + return final_results diff --git a/ontolearner/learner/retriever/learner.py b/ontolearner/learner/retriever/learner.py index 389e542..fcdd706 100644 --- a/ontolearner/learner/retriever/learner.py +++ b/ontolearner/learner/retriever/learner.py @@ -122,7 +122,6 @@ def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]: warnings.warn("No requirement for fiting the non-taxonomic RE model, the predict module will use the input data to do the fit as well..") - class LLMAugmentedRetrieverLearner(AutoRetrieverLearner): def set_augmenter(self, augmenter): @@ -160,9 +159,9 @@ def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: taxonomic_pairs = [{"parent": candidate, "child": query} for query, candidates in zip(data, candidates_lst) for candidate in candidates if candidate.lower() != query.lower()] - taxonomic_pairs += [{"parent": query, "child": candidate} - for query, candidates in zip(data, candidates_lst) - for candidate in candidates 
if candidate.lower() != query.lower()] + # taxonomic_pairs += [{"parent": query, "child": candidate} + # for query, candidates in zip(data, candidates_lst) + # for candidate in candidates if candidate.lower() != query.lower()] unique_taxonomic_pairs, seen = [], set() for pair in taxonomic_pairs: key = (pair["parent"].lower(), pair["child"].lower()) # Directional key (parent, child) From f2191549f443a397e6a5ff8dd5e87fbe301d0970 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Sat, 3 Jan 2026 18:54:14 +0100 Subject: [PATCH 14/19] :sparkles: added OS compatibility test CI/CD --- .github/workflows/test-os-compatibility.yml | 49 +++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 .github/workflows/test-os-compatibility.yml diff --git a/.github/workflows/test-os-compatibility.yml b/.github/workflows/test-os-compatibility.yml new file mode 100644 index 0000000..2f0231f --- /dev/null +++ b/.github/workflows/test-os-compatibility.yml @@ -0,0 +1,49 @@ +name: Cross-platform Compatibility Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + os-compatibility-tests: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Poetry + shell: bash + run: | + curl -sSL https://install.python-poetry.org | python - + echo "$HOME/.local/bin" >> $GITHUB_PATH + echo "$APPDATA/Python/Scripts" >> $GITHUB_PATH + + - name: Configure Poetry and install plugin + shell: bash + run: | + poetry --version + poetry config virtualenvs.create false + poetry self add "poetry-dynamic-versioning[plugin]" + + - name: Install dependencies + shell: bash + run: | + poetry install --no-interaction --no-ansi + + - name: Run tests + shell: bash + run: | + poetry run pytest From 76d07db3c6f088281bad50021db32f0c68551144 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Mon, 5 Jan 2026 11:48:59 +0100 Subject: [PATCH 15/19] :bookmark: v1.4.11 --- CHANGELOG.md | 5 +++++ ontolearner/VERSION | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b6d2fe..421323d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ ## Changelog +### v1.4.11 (Janouary 5, 2026) +- Add `text2onto` component for challenge learners with their documentation. 
+- Code refactoring +- OS compatibility CI/CD + ### v1.4.10 (December 8, 2025) - add complexity score - add documentation for metrics diff --git a/ontolearner/VERSION b/ontolearner/VERSION index ac9f79c..079d7f6 100644 --- a/ontolearner/VERSION +++ b/ontolearner/VERSION @@ -1 +1 @@ -1.4.10 +1.4.11 From 883b254507a4605f85baba828961cd457868a94e Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Mon, 5 Jan 2026 11:53:35 +0100 Subject: [PATCH 16/19] :bug: bitsandbytes version fix --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 76bb5d7..9ab23c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ torch = "^2.8.0" transformers = "^4.56.0" sentence-transformers = "^5.1.0" dspy = "^2.6.14" -bitsandbytes="^0.45.1" +bitsandbytes = { version = ">=0.45.1,<1.0.0", markers = "platform_system == 'Linux'" } mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] } protobuf = "<5" Levenshtein = "*" diff --git a/requirements.txt b/requirements.txt index 773523a..d981de3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,7 @@ transformers~=4.56.0 torch~=2.8.0 sentence-transformers~=5.1.0 scikit-learn~=1.6.1 -bitsandbytes~=0.45.1 +bitsandbytes>=0.45.1,<0.46.0; platform_system == "Linux" mistral-common[sentencepiece]~=1.8.5 protobuf<5 Levenshtein diff --git a/setup.py b/setup.py index ef57aa7..2472fca 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ "transformers>=4.56.0,<5.0.0", "sentence-transformers>=5.1.0,<6.0.0", "scikit-learn>=1.6.1,<2.0.0", - "bitsandbytes>=0.45.1,<1.0.0", + "bitsandbytes>=0.45.1,<1.0.0; platform_system == 'Linux'", "protobuf<5", "Levenshtein", "gensim" From 1bd76fe6c5abaa15fbe1a83a2cd818aa5c72cfbf Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Mon, 5 Jan 2026 11:57:36 +0100 Subject: [PATCH 17/19] :test_tube: --- tests/test_llm_retriever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_llm_retriever.py b/tests/test_llm_retriever.py index 3e39254..5078283 100644 --- a/tests/test_llm_retriever.py +++ b/tests/test_llm_retriever.py @@ -1,6 +1,6 @@ import pytest from unittest.mock import MagicMock, patch -from ontolearner.learner.retriever.llm_retriever import ( +from ontolearner.learner.retriever.augmented_retriever import ( LLMAugmenterGenerator, LLMAugmenter, LLMAugmentedRetriever, From 6838b821ce7f28464732c4d9530d4e882a87412e Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Mon, 5 Jan 2026 12:04:21 +0100 Subject: [PATCH 18/19] :test_tube: --- tests/test_llm_retriever.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_llm_retriever.py b/tests/test_llm_retriever.py index 5078283..641d89a 100644 --- a/tests/test_llm_retriever.py +++ b/tests/test_llm_retriever.py @@ -42,7 +42,7 @@ def __init__(self, triples): @pytest.fixture def mock_openai(): """Patch OpenAI client and return a controlled response for function calling.""" - with patch("ontolearner.learner.retriever.llm_retriever.OpenAI") as mock_client: + with patch("ontolearner.learner.retriever.augmented_retriever.OpenAI") as mock_client: instance = mock_client.return_value fake_response = MagicMock() @@ -92,7 +92,7 @@ def test_llm_augmenter_transform(): "taxonomy-discovery": {"Dog": ["Animal", "Mammal", "Pet"]}, } - with patch("ontolearner.learner.retriever.llm_retriever.load_json", return_value=fake_json): + with patch("ontolearner.learner.retriever.augmented_retriever.load_json", 
return_value=fake_json): augmenter = LLMAugmenter("dummy/path.json") assert augmenter.transform("Dog", "taxonomy-discovery") == ["Animal", "Mammal", "Pet"] From da8e5b0789a1d20ee55dcab8759f304299e6146e Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Mon, 5 Jan 2026 12:23:25 +0100 Subject: [PATCH 19/19] :test_tube: --- tests/test_llm_retriever.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_llm_retriever.py b/tests/test_llm_retriever.py index 641d89a..8917daf 100644 --- a/tests/test_llm_retriever.py +++ b/tests/test_llm_retriever.py @@ -102,15 +102,14 @@ def test_llm_augmenter_transform(): def test_llm_augmented_retriever_taxonomy(monkeypatch): retriever = LLMAugmentedRetriever() - def fake_retrieve(self, query, top_k=5, batch_size=32): + def fake_retrieve(self, query, top_k=5, batch_size=32, task='taxonomy-discovery'): return [[f"doc_{q}_{i}" for i in range(top_k)] for q in query] monkeypatch.setattr( - AutoRetriever, - "retrieve", + LLMAugmentedRetriever, + "augmented_retrieve", fake_retrieve ) - class FakeAug: top_n_candidate = 2 def transform(self, q, task): @@ -120,7 +119,7 @@ def transform(self, q, task): results = retriever.retrieve(["Dog"], top_k=2, task="taxonomy-discovery") assert len(results) == 1 - assert len(results[0]) == 4 + assert len(results[0]) == 2
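
A minimal usage sketch for the refactored SBUNLPFewShotLearner above, driving it directly with the in-memory split-bundle format its docstring documents ("documents", "terms2docs", optional "terms2types") instead of the old JSONL/JSON file paths. Assumptions are flagged inline: the import path simply mirrors the file location in the patch (any public re-export is not shown here), the model id is an arbitrary small placeholder, and the toy documents and term maps are invented for illustration only.

from ontolearner.learner.text2onto.sbunlp import SBUNLPFewShotLearner

# Toy split bundles in the format expected by fit()/predict():
#   "documents":   list of {"id", "title", "text"} dicts
#   "terms2docs":  term -> list of doc ids
#   "terms2types": term -> list of types (optional)
train_data = {
    "documents": [
        {"id": "d1", "title": "Graphene basics",
         "text": "Graphene is a two-dimensional sheet of carbon atoms arranged in a hexagonal lattice."},
        {"id": "d2", "title": "Perovskite cells",
         "text": "Perovskite solar cells convert sunlight into electricity using a perovskite-structured absorber."},
    ],
    "terms2docs": {"graphene": ["d1"], "perovskite solar cell": ["d2"]},
    "terms2types": {"graphene": ["material"], "perovskite solar cell": ["device"]},
}
test_data = {
    "documents": [
        {"id": "d3", "title": "Carbon nanotubes",
         "text": "A carbon nanotube is a rolled-up sheet of graphene with remarkable tensile strength."},
    ],
}

# Placeholder model id (assumption): any small HF causal LM should do for a smoke test.
learner = SBUNLPFewShotLearner(
    llm_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device="cpu",
    max_new_tokens=128,
)
learner.load()  # loads tokenizer + model once; fit() would also trigger this lazily

# Build and cache the two few-shot prompt blocks from the training bundle
learner.fit(train_data, task="text2onto", sample_size=2, seed=123)

# Run term and type extraction over the test documents
predictions = learner.predict(test_data, task="text2onto", max_docs=1)

# The payload mirrors what LearnerPipeline wraps:
# {"terms": [{"doc_id": ..., "term": ...}, ...], "types": [{"doc_id": ..., "type": ...}, ...]}
print(predictions["terms"])
print(predictions["types"])

The same dictionaries can come straight from the splitter's new train_test_val_split(), since the bundles built by split_fine_grained() carry the "documents", "terms2docs", and "terms2types" keys this learner reads.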