diff --git a/docs/source/index.rst b/docs/source/index.rst index df55f18..b659d23 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -197,13 +197,14 @@ or GitHub repository: learning_tasks/text2onto .. toctree:: - :maxdepth: 1 + :maxdepth: 4 :caption: Learner Models :hidden: learners/llm learners/retrieval learners/rag + learners/llms4ol .. toctree:: :maxdepth: 4 diff --git a/docs/source/learners/images/alexbek-learner.png b/docs/source/learners/images/alexbek-learner.png new file mode 100644 index 0000000..e386d09 Binary files /dev/null and b/docs/source/learners/images/alexbek-learner.png differ diff --git a/docs/source/learners/images/challenge-logo.png b/docs/source/learners/images/challenge-logo.png new file mode 100644 index 0000000..c97b19f Binary files /dev/null and b/docs/source/learners/images/challenge-logo.png differ diff --git a/docs/source/learners/images/rwth-dbis-learner.png b/docs/source/learners/images/rwth-dbis-learner.png new file mode 100644 index 0000000..33eaad2 Binary files /dev/null and b/docs/source/learners/images/rwth-dbis-learner.png differ diff --git a/docs/source/learners/llms4ol.rst b/docs/source/learners/llms4ol.rst new file mode 100644 index 0000000..58cd23e --- /dev/null +++ b/docs/source/learners/llms4ol.rst @@ -0,0 +1,81 @@ + +.. sidebar:: Challenge Series Websites + + * `1st LLMs4OL @ ISWC 2024 `_ + * `2nd LLMs4OL @ ISWC 2025 `_ + + +.. raw:: html + +
+ challenge-logo +
+ +LLMs4OL Challenge +================================================================================================================== + + + + +LLMs4OL is a community development initiative collocated with the International Semantic Web Conference (ISWC) to explore the potential of Large Language Models (LLMs) in Ontology Learning (OL), a vital process for enhancing the web with structured knowledge to improve interoperability. By leveraging LLMs, the challenge aims to advance understanding and innovation in OL, aligning with the goals of the Semantic Web to create a more intelligent and user-friendly web. + + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - **Edition** + - **Task** + - **Description** + * - ``LLMs4OL'25`` + - **Text2Onto** + - Extract ontological terms and types from unstructured text. + + **ID**: ``text-to-onto`` + + **Info**: This task focuses on extracting foundational elements (Terms and Types) from unstructured text documents to build the initial structure of an ontology. It involves recognizing domain-relevant vocabulary (Term Extraction, SubTask 1) and categorizing it appropriately (Type Extraction, SubTask 2). It bridges the gap between natural language and structured knowledge representation. + + **Example**: **COVID-19** is a term of the type **Disease**. + * - ``LLMs4OL'24``, ``LLMs4OL'25`` + - **Term Typing** + - Discover the generalized type for a lexical term. + + **ID**: ``term-typing`` + + **Info**: The process of assigning a generalized type to each lexical term involves mapping lexical items to their most appropriate semantic categories or ontological classes. For example, in the biomedical domain, the term ``aspirin`` should be classified under ``Pharmaceutical Drug``. This task is crucial for organizing extracted terms into structured ontologies and improving knowledge reuse. + + **Example**: Assign the type ``"disease"`` to the term ``"myocardial infarction"``. + * - ``LLMs4OL'24``, ``LLMs4OL'25`` + - **Taxonomy Discovery** + - Discover the taxonomic hierarchy between type pairs. + + **ID**: ``taxonomy-discovery`` + + **Info**: Taxonomy discovery focuses on identifying hierarchical relationships between types, enabling the construction of taxonomic structures (i.e., ``is-a`` relationships). Given a pair of terms or types, the task determines whether one is a subclass of the other. For example, discovering that ``Sedan is a subclass of Car`` contributes to structuring domain knowledge in a way that supports reasoning and inferencing in ontology-driven applications. + + **Example**: Recognize that ``"lung cancer"`` is a subclass of ``"cancer"``, which is a subclass of ``"disease"``. + * - ``LLMs4OL'24``, ``LLMs4OL'25`` + - **Non-Taxonomic Relation Extraction** + - Identify non-taxonomic, semantic relations between types. + + **ID**: ``non-taxonomic-re`` + + **Info**: This task aims to extract non-hierarchical (non-taxonomic) semantic relations between concepts in an ontology. Unlike taxonomy discovery, which deals with is-a relationships, this task focuses on other meaningful associations such as part-whole (part-of), causal (causes), functional (used-for), and associative (related-to) relationships. For example, in a medical ontology, discovering that ``Aspirin treats Headache`` adds valuable relational knowledge that enhances the utility of an ontology. + + **Example**: Identify that *"virus"* ``causes`` *"infection"* or *"aspirin"* ``treats`` *"headache"*. + + +.. 
note:: + + * Proceedings of 1st LLMs4OL Challenge @ ISWC 2024 available at `https://www.tib-op.org/ojs/index.php/ocp/issue/view/169 `_ + * Proceedings of 2nd LLMs4OL Challenge @ ISWC 2025 available at `https://www.tib-op.org/ojs/index.php/ocp/issue/view/185 `_ + +.. toctree:: + :maxdepth: 1 + :caption: LLMs4OL Challenge Series Participants Learners + :titlesonly: + + llms4ol_challenge/rwthdbis_learner + llms4ol_challenge/skhnlp_learner + llms4ol_challenge/alexbek_learner + llms4ol_challenge/sbunlp_learner diff --git a/docs/source/learners/llms4ol_challenge/alexbek_learner.rst b/docs/source/learners/llms4ol_challenge/alexbek_learner.rst new file mode 100644 index 0000000..321e280 --- /dev/null +++ b/docs/source/learners/llms4ol_challenge/alexbek_learner.rst @@ -0,0 +1,252 @@ +Alexbek Learner +================ + +.. sidebar:: Alexbek Learner Examples + + * Text2Onto: `llm_learner_alexbek_text2onto.py `_ + * Term Typing: `llm_learner_alexbek_rf_term_typing.py `_ + * Taxonomy Discovery: `llm_learner_alexbek_cross_attn_taxonomy_discovery.py `_ + +The team presented a comprehensive system for addressing Tasks A, B, and C of the LLMs4OL 2025 challenge, which together span the full ontology construction pipeline: term extraction, typing, and taxonomy discovery. Their approach combines retrieval-augmented prompting, zero-shot classification, and attention-based graph modeling — each tailored to the demands of the respective task. + +.. note:: + + Read more about the model at `Alexbek at LLMs4OL 2025 Tasks A, B, and C: Heterogeneous LLM Methods for Ontology Learning (Few-Shot Prompting, Ensemble Typing, and Attention-Based Taxonomies) `_. + +.. hint:: + + The original implementation is available at `https://github.com/BelyaevaAlex/LLMs4OL-Challenge-Alexbek `_ repository. + +Overview +--------------------------------- + +.. raw:: html + +
+ Alexbek Team +
+
+ +For **Task A (Text2Onto)**, they jointly extract domain-specific terms and their ontological types using a retrieval-augmented generation (RAG) pipeline. Training data is reformulated into a correspondence between documents, terms, and types, while test-time inference leverages semantically similar training examples. This single-pass method requires no model fine-tuning and leverages lexical augmentation. For **Task B (Term Typing)**, which involves assigning types to given terms, they adopt a dual strategy. In the few-shot setting (for domains with labeled training data), they reuse the RAG scheme with few-shot prompting. In the zero-shot or label-scarce setting, they use a classifier that combines cosine similarity scores from multiple embedding models using confidence-based weighting (e.g., via random forests or RAG-style retrieval). For **Task C (Taxonomy Discovery)**, they model taxonomy discovery as graph inference. Using embeddings of type labels, they train a lightweight cross-attention layer to predict *is-a* relations by approximating a soft adjacency matrix. + +Methodological Summary: + +1. **Retrieval-Augmented Text2Onto.** Training data is restructured into document–term–type correspondences. At inference time, the system retrieves semantically similar training examples and feeds them, together with the query document, into a small generative LLM to jointly predict candidate terms and their types. + +2. **Hybrid Term Typing.** + + * **Random-Forest Variant.** Uses dense text embeddings (and optionally graph-based features from the ontology) as input to a random-forest classifier, producing multi-label type assignments per term. + * **RAG-Based Variant.** Combines a bi-encoder retriever with a generative LLM: for each query term, top-*k* labeled examples are retrieved and concatenated into the prompt. The LLM then predicts types in a structured format (e.g., JSON), which are parsed and evaluated. + +3. **Cross-Attention Taxonomy Discovery.** Type labels (or term representations) are embedded using a sentence-transformer model and passed through a lightweight cross-attention layer. The resulting network approximates a soft adjacency matrix over types and is trained to distinguish positive (true parent–child) from negative (corrupted) edges. + + +Term Typing (Random-Forest) +--------------------------- + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +For term typing, we use GeoNames as an example ontology. Labeled term–type pairs are extracted and split into train and test sets. + +.. code-block:: python + + from ontolearner import GeoNames, train_test_split + + # Load the GeoNames ontology and extract labeled term-typing data + ontology = GeoNames() + ontology.load() + data = ontology.extract() + + # Split the labeled term-typing data into train and test sets + train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the ontology learning task to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. +The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "term-typing" + +We first configure the Alexbek random-forest learner. +This learner builds features from text embeddings (and optionally graph structure) and trains a random-forest classifier for term typing. + +.. 
code-block:: python + + from ontolearner.learner.term_typing import AlexbekRFLearner + + rf_learner = AlexbekRFLearner( + device="cpu", # switch to "cuda" if available + batch_size=16, + max_length=512, # max tokenizer length for embedding inputs + threshold=0.30, # probability cutoff for assigning each type + use_graph_features=True # set False for pure text-based features + ) + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from ontolearner import evaluation_report + # Fit the RF-based learner on the training split + rf_learner.fit(train_data, task=task) + + # Predict types for the held-out test terms + predicts = rf_learner.predict(test_data, task=task) + + # Build gold labels and evaluate + truth = rf_learner.tasks_ground_truth_former(data=test_data, task=task) + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) + +Term Typing (RAG-based) +----------------------- + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +The RAG-based term-typing setup also uses GeoNames. We again load the ontology and split labeled term–type instances into train and test sets. + +.. code-block:: python + + from ontolearner import GeoNames, train_test_split + + ontology = GeoNames() + ontology.load() + data = ontology.extract() + + # Extract labeled items and split into train/test sets for evaluation + train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the ontology learning task to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. +The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "term-typing" + +Next, we configure a Retrieval-Augmented Generation (RAG) term-typing classifier. +An encoder retrieves top-k similar training examples, and a generative LLM predicts types conditioned on the query term plus retrieved examples. + +.. code-block:: python + + from ontolearner.learner.term_typing import AlexbekRAGLearner + + rag_learner = AlexbekRAGLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id="sentence-transformers/all-MiniLM-L6-v2", + device="cuda", # or "cpu" + top_k=3, + max_new_tokens=256, + output_dir="./results/", + ) + + # Load the underlying LLM and retriever for RAG-based term typing + rag_learner.load(llm_id=rag_learner.llm_model_id) + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from ontolearner import evaluation_report + + # Index the training data for retrieval and prepare prompts + rag_learner.fit(train_data, task=task) + + # Predict types for the held-out test terms + predicts = rag_learner.predict(test_data, task=task) + + # Build gold labels and evaluate + truth = rag_learner.tasks_ground_truth_former(data=test_data, task=task) + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) + + +Taxonomy Discovery +------------------ + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +For taxonomy discovery, we again use the GeoNames ontology. It exposes parent–child relations that can be embedded and fed to a cross-attention model. + +.. 
code-block:: python + + from ontolearner import GeoNames, train_test_split + + ontology = GeoNames() + ontology.load() + data = ontology.extract() + + train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the ontology learning task to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. +The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "taxonomy-discovery" + +Next, we configure the Alexbek cross-attention learner. +It uses embeddings of type labels and a lightweight cross-attention layer to predict *is-a* relations. + +.. code-block:: python + + from ontolearner import AlexbekCrossAttnLearner + + cross_learner = AlexbekCrossAttnLearner( + embedding_model="sentence-transformers/all-MiniLM-L6-v2", + device="cpu", + num_heads=8, + lr=5e-5, + weight_decay=0.01, + num_epochs=1, + batch_size=256, + neg_ratio=1.0, + output_dir="./results/crossattn/", + seed=42, + ) + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from ontolearner import evaluation_report + + # Train the cross-attention model on taxonomic edges + cross_learner.fit(train_data, task=task) + + # Predict taxonomic relations on the test set + predicts = cross_learner.predict(test_data, task=task) + + # Build gold labels and evaluate + truth = cross_learner.tasks_ground_truth_former(data=test_data, task=task) + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) diff --git a/docs/source/learners/llms4ol_challenge/rwthdbis_learner.rst b/docs/source/learners/llms4ol_challenge/rwthdbis_learner.rst new file mode 100644 index 0000000..011b346 --- /dev/null +++ b/docs/source/learners/llms4ol_challenge/rwthdbis_learner.rst @@ -0,0 +1,194 @@ +RWTH-DBIS Learner +================== + +.. sidebar:: RWTH-DBIS Learner Examples + + * Term Typing: `llm_learner_rwthdbis_term_typing.py `_ + * Taxonomy Discovery: `llm_learner_rwthdbis_taxonomy_discovery.py `_ + + +The RWTH-DBIS team participated in the LLMs4OL Challenge at ISWC 2024, addressing two main tasks: **Term Typing** and **Taxonomy Discovery**. The team used LLaMA-3-8B (an open-source model) and GPT-3.5-Turbo (a commercial model) to rigorously compare the performance gaps and distinct capabilities between these two classes of Large Language Models (LLMs). This comparison was crucial for establishing baselines for future Ontology Learning research, particularly focusing on how well models can generalize and incorporate external knowledge for structured knowledge extraction. The evaluation was conducted across established benchmark datasets, including GeoNames, UMLS, Schema.org, and the Gene Ontology (GO). + +.. note:: + + Read more about the model at `RWTH-DBIS at LLMs4OL 2024 Tasks A and B Knowledge-Enhanced Domain-Specific Continual Learning and Prompt-Tuning of Large Language Models for Ontology Learning `_. + +.. hint:: + + The original implementation is available at `https://github.com/MouYongli/LLMs4OL `_ repository. + + + +Overview +--------------------------------- + +.. raw:: html + +
+ RWTH-DBIS Team +
+
+ +The methodology is involved three sequential stages: **data augmentation**, **model training**, and **inference**. A key part of the data augmentation phase involved gathering rich contextual descriptions for terms and types from public web sources like Wikipedia and specialized ontology APIs. Furthermore, the team leveraged advanced commercial LLMs—specifically GPT-4o, Claude-3, and Copilot—using zero-shot prompts to access their web search capabilities and generate additional, high-quality contextual information. This enriched data was vital for overcoming the limitations of base models and enhancing their semantic understanding of domain-specific concepts prior to training. For the open-source LLaMA-3-8B model, the training stage incorporated several advanced techniques to maximize performance. These included **domain-specific continual training** to adapt the LLM's vocabulary and knowledge base to the target ontology domain (e.g., biomedical for GO, geographical for GeoNames). Furthermore, **fine-tuning** was used to specialize the model for the direct objectives of Term Typing and Taxonomy Discovery. Crucially, **knowledge-enhanced prompt-tuning** was implemented, which integrated the collected external context (from Wikipedia and commercial LLM searches) directly into the model's prompts during inference. + +Methodological Summary: + +1. **Data Collection & Context Enrichment.** Term and type descriptions were gathered from public sources like Wikipedia via its API, followed by cleaning and structuring. Commercial LLMs with web search capabilities—GPT-4o, Claude-3, and Copilot—were accessed through APIs using zero-shot prompts to gather additional contextual information. Ontology datasets were accessed directly via APIs or downloads to obtain relevant context. + +2. **Training (Domain-Specific Continual Training).** Context information for terms and types was integrated into the training data. For GeoNames, context was collected for all terms and types, while for other datasets, only type-level context was used. + +3. **Task-Specific Modeling.** + + * **Task A (Term Typing).** Models were trained on terms and corresponding types from the competition data. Each type was assigned a unique label, forming a dataset of term–label pairs for supervised fine-tuning of encoder models such as DeBERTa or LLaMA adapters. + * **Task B (Taxonomy Discovery).** Hierarchical relationships were transformed into a binary classification format. Positive samples (parent–child) were labeled as 1, and negative samples (reversed or corrupted pairs) as 0, creating the final dataset used to train taxonomic relation classifiers. + + +Term Typing +----------- + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +For term typing, we use the AgrO ontology as a running example. Terms and their semantic types are split into train and test sets for supervised encoder fine-tuning and evaluation. + +.. code-block:: python + + from ontolearner import train_test_split, AgrO + + # Load the AgrO ontology and extract labeled term-typing data + ontology = AgrO() + ontology.load() + ontological_data = ontology.extract() + + # Split data into train and test sets + train_data, test_data = train_test_split( + ontological_data, + test_size=0.2, + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the task you want the learner to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. 
+The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "term-typing" + +Next, we initialize :class:`RWTHDBISSFTLearner`. This learner is based on a small, pre-trained encoder model that is fine-tuned for the specific task. The hyperparameters control the supervised training process (e.g., epochs, batch size, learning rate). + +.. code-block:: python + + from ontolearner.learner.term_typing import RWTHDBISSFTLearner + + learner = RWTHDBISSFTLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/deberta-v3", + device="cpu", + num_train_epochs=30, + per_device_train_batch_size=16, + gradient_accumulation_steps=2, + learning_rate=2e-5, + max_length=64, + seed=42, + ) + + # Load the base encoder and prepare it for supervised term typing + learner.load(llm_id=learner.model_name) + + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ + + +.. code-block:: python + + from ontolearner import evaluation_report + + # Indexing (fitting) the model on the training data for the LLMs4OL task + learner.fit(train_data, task=task) + + # Perform prediction and evaluation directly + predicts = learner.predict(test_data, task=task) + truth = learner.tasks_ground_truth_former(data=test_data, task=task) + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) + + +Taxonomy Discovery +------------------ + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +For taxonomy discovery, we use the Chord ontology as a running example. It exposes hierarchical (parent, child) relations that can be transformed into labeled edges for taxonomic relation classification. + +.. code-block:: python + + from ontolearner import train_test_split, ChordOntology + + # Load the Chord ontology (taxonomy discovery benchmark) + ontology = ChordOntology() + ontology.load() + + # Extract hierarchical (parent, child) edges and split into train/test + train_data, test_data = train_test_split( + ontology.extract(), + test_size=0.2, + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the ontology learning task to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. +The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "taxonomy-discovery" + +Next, we initialize :class:`RWTHDBISTaxonomyLearner`. This learner fine-tunes a transformer model to classify pairs of terms as positive or negative taxonomic relations (e.g., parent–child vs. non-parent–child). + +.. code-block:: python + + from ontolearner import RWTHDBISTaxonomyLearner + + learner = RWTHDBISTaxonomyLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + gradient_accumulation_steps=4, + learning_rate=2e-5, + max_length=256, + seed=42, + negative_ratio=5, + bidirectional_templates=True, + context_json_path=None, + ontology_name=ontology.ontology_full_name, + ) + + # Load the base model and prepare it for supervised taxonomy learning + learner.load(llm_id=learner.model_name) + + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python
+
+    from ontolearner import evaluation_report
+
+    # Fine-tune the model on the taxonomic training data
+    learner.fit(train_data, task=task)
+
+    # Perform prediction and evaluation directly
+    predicts = learner.predict(test_data, task=task)
+    truth = learner.tasks_ground_truth_former(data=test_data, task=task)
+    metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task)
+    print(metrics)
diff --git a/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst b/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst
new file mode 100644
index 0000000..c9b7cc0
--- /dev/null
+++ b/docs/source/learners/llms4ol_challenge/sbunlp_learner.rst
@@ -0,0 +1,181 @@
+SBU-NLP Learner
+================
+
+
+.. sidebar:: SBU-NLP Learner Examples
+
+    * Text2Onto: `llm_learner_sbunlp_text2onto.py `_
+    * Term Typing (Zero-Shot): `llm_learner_sbunlp_zs_term_typing.py `_
+    * Taxonomy Discovery (Few-shot): `llm_learner_sbunlp_fs_taxonomy_discovery.py `_
+
+The team participated in the LLMs4OL 2025 Shared Task, which comprised four tasks: (A) ontological term and type extraction (Text2Onto), (B) term typing, (C) taxonomy discovery, and (D) non-taxonomic relation extraction.
+
+
+.. note::
+
+    Read more about the model in the SBU-NLP system description published in the proceedings of the `2nd LLMs4OL Challenge @ ISWC 2025 <https://www.tib-op.org/ojs/index.php/ocp/issue/view/185>`_.
+
+.. hint::
+
+    End-to-end runnable scripts for the SBU-NLP learners are linked in the sidebar above.
+
+
+
+Overview
+---------------------------------
+
+The team focused on Tasks A, B, and C, adopting a unified prompting-based methodology that required no supervised training or fine-tuning. Instead, they applied prompt engineering combined with stratified and simple random sampling, as well as chunking-based strategies, to incorporate representative examples within the context window.
+
+Methodological Summary:
+
+- For **Term Typing**, the problem was framed as zero-shot label selection over an ontology-specific type inventory. The type inventory was derived from the training split, and Qwen-based LLMs were prompted to assign one or more valid type labels to each term without any parameter fine-tuning. Stratified sampling and prompt chunking were used to expose representative type examples within the context window while keeping prompts compact and model-agnostic.
+
+- For **Taxonomy Discovery**, the focus was on detecting parent–child relationships between ontology terms. Due to the relational nature of this task, batch prompting was employed to efficiently handle multiple type pairs per inference, enabling the model to consider several candidate relations jointly.
+
+Term Typing
+-----------------------
+
+Loading Ontological Data
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+For term typing (Task B), we use the AgrO ontology. Terms and their labels are split into train and test sets; the train split is used to derive the type inventory, while the test split is used for evaluation.
+
+.. code-block:: python
+
+    from ontolearner import AgrO, train_test_split
+
+    ontology = AgrO()
+    ontology.load()
+    data = ontology.extract()
+
+    # Split the data into train (for type inventory) and test (terms to type)
+    train_data, test_data = train_test_split(
+        data,
+        test_size=0.6,  # 60% of data used for testing
+        random_state=42,
+    )
+
+Initialize Learner
+~~~~~~~~~~~~~~~~~~
+
+Before defining the learner, choose the ontology learning task to perform.
+Available tasks have been described in `LLMs4OL Paradigms `_.
+The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "term-typing" + +Next, we configure the SBU-NLP Zero-Shot learner for term typing, using a Qwen model. +The learner's ``fit`` phase infers the inventory of allowed type labels; the model itself is never fine-tuned. + +.. code-block:: python + + from ontolearner import SBUNLPZSLearner + + llm_learner = SBUNLPZSLearner( + # Model / decoding + model_id="Qwen/Qwen2.5-0.5B-Instruct", + max_new_tokens=64, # sufficient length for JSON list of types + temperature=0.0, # deterministic (greedy) output + # token=None, # assuming public model access + ) + + # Load the underlying LLM and prepare it for zero-shot term typing + llm_learner.load(llm_id=llm_learner.model_id) + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from ontolearner import evaluation_report + + # Learn the type inventory from the training split + llm_learner.fit(train_data, task=task) + + # Predict types for the held-out test terms + predicts = llm_learner.predict(test_data, task=task) + truth = llm_learner.tasks_ground_truth_former(data=test_data, task=task) + + # Evaluate zero-shot term typing performance + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) + +Taxonomy Discovery +----------------------------- + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +For taxonomy discovery (Task C), we use the GeoNames ontology as an example. It provides geographic parent–child relations (an *is-a* hierarchy) that can be split into train and test sets. + +.. code-block:: python + + from ontolearner import GeoNames, train_test_split + + ontology = GeoNames() + ontology.load() + data = ontology.extract() # list of taxonomic relationships + + # Split the taxonomic relationships into train and test sets + train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the ontology learning task to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. +The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "taxonomy-discovery" + +Next, we configure the SBU-NLP Few-Shot learner using a Qwen model. +This learner performs in-context learning via an N × M batch prompting scheme over (parent, child) pairs. + +.. code-block:: python + + from ontolearner import SBUNLPFewShotLearner + + llm_learner = SBUNLPFewShotLearner( + # Model / decoding + model_name="Qwen/Qwen2.5-0.5B-Instruct", + try_4bit=True, # use 4-bit if bitsandbytes + CUDA are available + max_new_tokens=140, # limit the length of the model's response (JSON output) + max_input_tokens=1500, # limit the total prompt length (context window) + temperature=0.0, # set to 0.0 for deterministic output + top_p=1.0, + + # Grid settings (N × M prompts) + n_train_chunks=7, # N: split training examples into 7 chunks + m_test_chunks=7, # M: split test terms into 7 chunks (total 49 prompts) + + # Run controls + limit_prompts=None, # None runs all N × M prompts; set an int for a restricted run + output_dir="./outputs/taskC_batches", # dump per-prompt JSON results for debugging + ) + + # Load the underlying LLM and prepare it for few-shot taxonomy discovery + llm_learner.load(llm_id=llm_learner.model_name) + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python
+
+    from ontolearner import evaluation_report
+
+    # Run the few-shot prompting over (parent, child) pairs
+    predicts = llm_learner.predict(test_data, task=task)
+
+    # Build gold-standard labels for evaluation
+    truth = llm_learner.tasks_ground_truth_former(data=test_data, task=task)
+
+    # Evaluate taxonomy discovery performance
+    metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task)
+    print(metrics)
diff --git a/docs/source/learners/llms4ol_challenge/skhnlp_learner.rst b/docs/source/learners/llms4ol_challenge/skhnlp_learner.rst
new file mode 100644
index 0000000..72c3eb3
--- /dev/null
+++ b/docs/source/learners/llms4ol_challenge/skhnlp_learner.rst
@@ -0,0 +1,185 @@
+SKH-NLP Learner
+================
+
+
+.. sidebar:: SKH-NLP Learner Examples
+
+    * Taxonomy Discovery (Supervised Fine-Tuning): `llm_learner_skhnlp_sft_taxonomoy_discovery.py `_
+    * Taxonomy Discovery (Zero-Shot): `llm_learner_skhnlp_zs_taxonomoy_discovery.py `_
+
+The Taxonomy Discovery task was modeled as a classification problem and addressed using two approaches. Initial experimentation was done on BERT, fine-tuned with various prompts in a classification setup. Further experimentation was conducted on LLaMA3-70B, with different prompt formulations for optimal results. Evaluation employed substring and Levenshtein distance functions to assess answer correctness.
+
+
+.. note::
+
+    Read more about the model at `SKH-NLP at LLMs4OL 2024 Task B: Taxonomy Discovery in Ontologies Using BERT and LLaMA 3 `_.
+
+.. hint::
+
+    The original implementation is available at `https://github.com/s-m-hashemi/llms4ol-2024-challenge `_ repository.
+
+
+
+Overview
+---------------------------------
+
+The original implementation used the GeoNames dataset, containing 476 (child, parent) pairs with 9 distinct parent classes, making the task a 9-class classification problem. To train the BERT classifier, a negative dataset was generated by (1) reversing records (swapping parent and child) and (2) manipulating records (randomly replacing the parent with one of the other 8).
+
+
+Methodological Summary:
+
+- **BERT-Based Approach.** The task was modeled as a multi-class problem with 9 classes by applying a single binary classifier iteratively to each class. The model was fine-tuned to determine whether an *is-a* relationship exists between a given (parent, child) pair.
+
+- **LLaMA-Based Approach.** Evaluation focused on prompt engineering using two concepts: classification (instance–class) and hierarchy (is-a, parent–child). In some cases, the model partially used class names (e.g., only part of “mountain, hill, rock”). To handle this, substring matching and Levenshtein distance were applied during evaluation to map outputs to the closest class titles.
+
+
+Taxonomy Discovery (Zero-Shot)
+------------------------------
+
+Loading Ontological Data
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+We first load the GeoNames ontology and split the taxonomic edges into train and test sets.
+
+.. code-block:: python
+
+    from ontolearner import GeoNames, train_test_split
+
+    ontology = GeoNames()
+    ontology.load()
+    data = ontology.extract()
+
+    train_data, test_data = train_test_split(
+        data,
+        test_size=0.2,
+        random_state=42,
+    )
+
+Initialize Learner
+~~~~~~~~~~~~~~~~~~
+
+Before defining the learner, choose the ontology learning task to perform.
+Available tasks have been described in `LLMs4OL Paradigms `_.
+The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``.
+
+.. 
code-block:: python + + task = "taxonomy-discovery" + +Next, we configure the zero-shot taxonomy learner. +This learner uses a generative LLM together with string-normalization strategies +(e.g., Levenshtein distance) to map model outputs to ontology classes. + +.. code-block:: python + + from ontolearner import SKHNLPZSLearner + + llm_learner = SKHNLPZSLearner( + model_name="Qwen/Qwen2.5-0.5B-Instruct", + device="cpu", # use "cuda" if you have a GPU + max_new_tokens=16, + save_path="./outputs/", # directory or full file path for CSV + verbose=True, + normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" + ) + + # Load the underlying LLM and prepare it for zero-shot taxonomy discovery + llm_learner.load(llm_id=llm_learner.model_name) + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from ontolearner import evaluation_report + + # Zero-shot setting: training data may be used only for label inventory or analysis; + # the main predictions are produced directly on the test split. + predicts = llm_learner.predict(test_data, task=task) + + # Build gold-standard labels for evaluation + truth = llm_learner.tasks_ground_truth_former(data=test_data, task=task) + + # Evaluate zero-shot taxonomy discovery performance + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) + + +Taxonomy Discovery (Supervised Fine-Tuning) +------------------------------------------- + +Loading Ontological Data +~~~~~~~~~~~~~~~~~~~~~~~~ + +For supervised fine-tuning, we again use GeoNames and split the taxonomic relationships into train and test sets. + +.. code-block:: python + + from ontolearner import GeoNames, train_test_split + + ontology = GeoNames() + ontology.load() + data = ontology.extract() + + train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42, + ) + +Initialize Learner +~~~~~~~~~~~~~~~~~~ + +Before defining the learner, choose the ontology learning task to perform. +Available tasks have been described in `LLMs4OL Paradigms `_. +The task IDs are: ``term-typing``, ``taxonomy-discovery``, ``non-taxonomic-re``. + +.. code-block:: python + + task = "taxonomy-discovery" + +We then configure the supervised BERT-based learner. +This learner fine-tunes a BERT-Large model using sequential prompts over (parent, child) pairs. + +.. code-block:: python + + from ontolearner import SKHNLPSequentialFTLearner + + bert_learner = SKHNLPSequentialFTLearner( + model_name="bert-large-uncased", + n_prompts=2, + random_state=1403, + device="cpu", # Note: CPU training for BERT-Large is slow. + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + warmup_steps=500, + weight_decay=0.01, + logging_dir="./logs/", + logging_steps=50, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + ) + + # Load the base BERT model and prepare it for supervised taxonomy discovery + bert_learner.load(llm_id=bert_learner.model_name) + + +Learn and Predict +~~~~~~~~~~~~~~~~~~~~~~ +.. 
code-block:: python + + from ontolearner import evaluation_report + + # Fine-tune BERT on the taxonomic training data + bert_learner.fit(train_data, task=task) + + # Predict taxonomic relations on the held-out test set + predicts = bert_learner.predict(test_data, task=task) + + # Build gold-standard labels and evaluate + truth = bert_learner.tasks_ground_truth_former(data=test_data, task=task) + metrics = evaluation_report(y_true=truth, y_pred=predicts, task=task) + print(metrics) diff --git a/examples/llm_learner_alexbek_cross_attn_taxonomy_discovery.py b/examples/llm_learner_alexbek_cross_attn_taxonomy_discovery.py new file mode 100644 index 0000000..c3bc73f --- /dev/null +++ b/examples/llm_learner_alexbek_cross_attn_taxonomy_discovery.py @@ -0,0 +1,42 @@ +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.taxonomy_discovery import AlexbekCrossAttnLearner + +# 1) Load & split +ontology = GeoNames() +ontology.load() +data = ontology.extract() +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# 2) Configure the cross-attention learner +cross_learner = AlexbekCrossAttnLearner( + embedding_model="sentence-transformers/all-MiniLM-L6-v2", # or "Qwen/Qwen2.5-1.5B-... (if wrapped as ST)" + device="cpu", + num_heads=8, + lr=5e-5, + weight_decay=0.01, + num_epochs=1, + batch_size=256, + neg_ratio=1.0, + output_dir="./results/crossattn/", + seed=42, +) + +# 3) Build pipeline +pipeline = LearnerPipeline( + llm=cross_learner, # <- our learner + llm_id="cross-attn", # label for bookkeeping + ontologizer_data=False, # pass raw ontology objects as in your example +) + +# 4) Train + predict + evaluate +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +print("Metrics:", outputs.get("metrics")) +print("Elapsed time:", outputs["elapsed_time"]) +print(outputs) diff --git a/examples/llm_learner_alexbek_rag_term_typing.py b/examples/llm_learner_alexbek_rag_term_typing.py new file mode 100644 index 0000000..17becc2 --- /dev/null +++ b/examples/llm_learner_alexbek_rag_term_typing.py @@ -0,0 +1,54 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.term_typing import AlexbekRAGLearner + +# Load the GeoNames ontology. +ontology = GeoNames() +ontology.load() + +# Extract labeled items and split into train/test sets for evaluation +train_data, test_data = train_test_split( + ontology.extract(), test_size=0.2, random_state=42 +) + +# Configure a Retrieval-Augmented Generation (RAG) term-typing classifier. +# - llm_model_id: generator used to predict types from the prompt + retrieved examples +# - retriever_model_id: encoder used to embed items and fetch top-k similar (RAG) examples +# - device: "cuda" for GPU or "cpu" +# - top_k: number of nearest examples to retrieve per query term +# - max_new_tokens: decoding budget of the LLM during prediction +# - output_dir: where intermediate artifacts / logs can be stored +rag_learner = AlexbekRAGLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id="sentence-transformers/all-MiniLM-L6-v2", + device="cuda", + top_k=3, + max_new_tokens=256, + output_dir="./results/", +) + +# Build the pipeline and pass raw structured objects end-to-end. +# We place the RAG learner in the llm slot and set llm_id accordingly. 
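+# Alternative (shown on the Alexbek documentation page): drive the learner directly, e.g.
+#   rag_learner.fit(train_data, task="term-typing")
+#   predicts = rag_learner.predict(test_data, task="term-typing")
+# The pipeline below wraps the same calls and additionally reports metrics and elapsed time.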
+pipe = LearnerPipeline( + llm=rag_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=True, +) + +# Run the full learning pipeline on the term-typing task +# - task="term-typing" (Task B) +# - evaluate=True computes precision/recall/F1 on the held-out test split +# - ontologizer_data=True must match the pipeline flag above +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=True, +) + +# Display the evaluation results and runtime +print( + "Metrics:", outputs.get("metrics") +) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} +print("Elapsed time (s):", outputs.get("elapsed_time")) diff --git a/examples/llm_learner_alexbek_rf_term_typing.py b/examples/llm_learner_alexbek_rf_term_typing.py new file mode 100644 index 0000000..75e7ea2 --- /dev/null +++ b/examples/llm_learner_alexbek_rf_term_typing.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.term_typing import AlexbekRFLearner # A random-forest term-typing learner over text+graph features + +# Load the GeoNames ontology and extract labeled term-typing data + +ontology = GeoNames() +ontology.load() + +data = ontology.extract() + +# Split the labeled term-typing data into train and test sets +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# Configure the RF-based learner (embeddings + optional graph features) +# - device: "cpu" or "cuda" +# - threshold: decision threshold for multi-label assignment +# - use_graph_features: include ontology-graph-derived features if available +rf_learner = AlexbekRFLearner( + device="cpu", # switch to "cuda" if you have a GPU + batch_size=16, + max_length=512, # max tokenizer length for embedding model inputs + threshold=0.30, # probability cutoff for assigning each type + use_graph_features=True, # set False for pure RF on text embeddings only +) + +# Build the pipeline and pass raw structured objects end-to-end. 
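+# Note: unlike the RAG example, the RF learner goes into the `retriever` slot (not `llm`),
+# and `retriever_id` names the embedding model used to build term features; pick a larger
+# embedding model only if you have the GPU memory for it.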
+pipe = LearnerPipeline( + retriever=rf_learner, + retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory + ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} + device="cpu", + batch_size=16, +) + +# Run the full learning pipeline on the term-typing task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=True, +) + +# Display evaluation summary and runtime +print("Metrics:", outputs.get("metrics")) + +print("Elapsed time:", outputs["elapsed_time"]) + +print(ontology) diff --git a/examples/llm_learner_alexbek_text2onto.py b/examples/llm_learner_alexbek_text2onto.py new file mode 100644 index 0000000..69282a9 --- /dev/null +++ b/examples/llm_learner_alexbek_text2onto.py @@ -0,0 +1,84 @@ +import os +import json +import torch + +# LocalAutoLLM handles model loading/generation; AlexbekFewShotLearner provides fit/predict APIs +from ontolearner.learner.text2onto.alexbek import LocalAutoLLM, AlexbekFewShotLearner + +# Local folder where the dataset is stored (relative to this script) +DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" + +# Input paths (already saved) +TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") +TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") +TEST_DOCS_FULL_PATH = os.path.join( + DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl" +) + +# Output paths +DOC_TERMS_OUT_PATH = os.path.join( + DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl" +) +TERMS2TYPES_OUT_PATH = os.path.join( + DATA_DIR, "test", "terms2types_pred_ecology.fast.json" +) +TYPES2DOCS_OUT_PATH = os.path.join( + DATA_DIR, "test", "types2docs_pred_ecology.fast.json" +) + +# Device selection +DEVICE = ( + "cuda" + if torch.cuda.is_available() + else ("mps" if torch.backends.mps.is_available() else "cpu") +) + +# Model config +MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" +LOAD_IN_4BIT = DEVICE == "cuda" # 4-bit helps on GPU + +# 1) Load LLM +llm = LocalAutoLLM(device=DEVICE) +llm.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) + +# 2) Build few-shot exemplars from training split +learner = AlexbekFewShotLearner(model=llm, device=DEVICE) +learner.fit( + train_docs_jsonl=TRAIN_DOCS_PATH, + terms2doc_json=TRAIN_TERMS2DOCS_PATH, + # use defaults for sample size/seed +) + +# 3) Predict terms per test document +os.makedirs(os.path.dirname(DOC_TERMS_OUT_PATH), exist_ok=True) +num_written_doc_terms = learner.predict_terms( + docs_test_jsonl=TEST_DOCS_FULL_PATH, + out_jsonl=DOC_TERMS_OUT_PATH, + # use defaults for max_new_tokens and few_shot_k +) +print(f"[terms] wrote {num_written_doc_terms} lines → {DOC_TERMS_OUT_PATH}") + +# 4) Predict types for extracted terms, using the JSONL we just wrote +typing_summary = learner.predict_types_from_terms( + doc_terms_jsonl=DOC_TERMS_OUT_PATH, # read the predictions directly + doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) + model_id=MODEL_ID, # reuse the same small model + out_terms2types=TERMS2TYPES_OUT_PATH, + out_types2docs=TYPES2DOCS_OUT_PATH, + # use defaults for everything else +) + +print( + f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types" +) +print(f"[saved] {TERMS2TYPES_OUT_PATH}") +print(f"[saved] {TYPES2DOCS_OUT_PATH}") + +# 5) Small preview of term→types +try: + with open(TERMS2TYPES_OUT_PATH, "r", encoding="utf-8") as fin: + preview = json.load(fin)[:3] + print("[preview] first 3:") + 
print(json.dumps(preview, ensure_ascii=False, indent=2)) +except Exception as e: + print(f"[preview] skipped: {e}") diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py new file mode 100644 index 0000000..9efdb6d --- /dev/null +++ b/examples/llm_learner_rwthdbis_taxonomy_discovery.py @@ -0,0 +1,58 @@ +# Import core modules from the OntoLearner library +from ontolearner import LearnerPipeline, train_test_split, ChordOntology +from ontolearner.learner.taxonomy_discovery import RWTHDBISSFTLearner + +# Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery +ontology = ChordOntology() +ontology.load() # Read entities, type system, and taxonomic edges into memory + +# Extract typed taxonomic edges and split into train/test while preserving the structured shape +train_data, test_data = train_test_split( + ontology.extract(), test_size=0.2, random_state=42 +) + +# Initialize a supervised taxonomy classifier (encoder-based fine-tuning) +# Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views +# Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions +learner = RWTHDBISSFTLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/", + device="cpu", + num_train_epochs=1, + per_device_train_batch_size=8, + gradient_accumulation_steps=4, + learning_rate=2e-5, + max_length=256, + seed=42, + negative_ratio=5, + bidirectional_templates=True, + context_json_path=None, + ontology_name=ontology.ontology_full_name, +) + +# Build the pipeline +pipeline = LearnerPipeline( + llm=learner, + llm_id=learner.model_name, + ontologizer_data=False, +) + +# # Run the full learning pipeline on the taxonomy-discovery task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print( + "Metrics:", outputs["metrics"] +) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py new file mode 100644 index 0000000..90d2b55 --- /dev/null +++ b/examples/llm_learner_rwthdbis_term_typing.py @@ -0,0 +1,53 @@ +# Import core modules from the OntoLearner library +from ontolearner import LearnerPipeline, train_test_split, AgrO +from ontolearner.learner.term_typing import RWTHDBISSFTLearner + +# load the AgrO ontology. +# AgrO provides term-typing supervision where each term can be annotated with one or more types. +ontology = AgrO() +ontology.load() +data = ontology.extract() + +# Split the labeled term-typing data into train and test sets +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# Configure a supervised encoder-based classifier for term typing. +# This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results. 
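+# The settings below mirror the documentation example; 30 epochs of DeBERTa fine-tuning
+# on CPU can be slow, so lower num_train_epochs for a quick test run (or use a GPU if available).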
+learner = RWTHDBISSFTLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/deberta-v3", + device="cpu", + num_train_epochs=30, + per_device_train_batch_size=16, + gradient_accumulation_steps=2, + learning_rate=2e-5, + max_length=64, + seed=42, +) + +# Build the pipeline and pass raw structured objects end-to-end. +pipeline = LearnerPipeline( + llm=learner, + llm_id=learner.model_name, + ontologizer_data=False, +) + +# Run the full learning pipeline on the term-typing task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print( + "Metrics:", outputs["metrics"] +) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py new file mode 100644 index 0000000..4c9c779 --- /dev/null +++ b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py @@ -0,0 +1,67 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline + +# Import the specific Few-Shot Learner implementation +from ontolearner.learner.taxonomy_discovery import SBUNLPFewShotLearner + +# Load ontology and split +# Load the GeoNames ontology for taxonomy discovery. +# GeoNames provides geographic parent-child relationships (is-a hierarchy). +ontology = GeoNames() +ontology.load() +data = ( + ontology.extract() +) # Extract the list of taxonomic relationships from the ontology object + +# Split the taxonomic relationships into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing (terms to find relations for) + random_state=42, +) + +# Configure the learner with user-defined inference args + device +# Configure the SBUNLP Few-Shot Learner using the Qwen model. +# This performs in-context learning via N x M batch prompting. +llm_learner = SBUNLPFewShotLearner( + # Model / decoding + model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load + try_4bit=True, # uses 4-bit if bitsandbytes + CUDA available for memory efficiency + max_new_tokens=140, # limit the length of the model's response (for JSON output) + max_input_tokens=1500, # limit the total prompt length (context window) + temperature=0.0, # set to 0.0 for deterministic output (best for structured JSON) + top_p=1.0, # top-p sampling disabled with temperature=0.0 + # Grid settings (N x M prompts) + n_train_chunks=7, # N: split training examples (few-shot context) into 7 chunks + m_test_chunks=7, # M: split test terms (vocabulary) into 7 chunks (total 49 prompts) + # Run controls + limit_prompts=None, # None runs all N x M prompts; set to an integer for a dry-run + output_dir="./outputs/taskC_batches", # Optional: dump per-prompt JSON results for debugging +) + +# Build pipeline and run +# Build the pipeline, passing the Few-Shot Learner. 
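+# With n_train_chunks=7 and m_test_chunks=7 configured above, up to 7 x 7 = 49 prompts are
+# issued; set limit_prompts to a small integer first if you only want a quick dry run.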
+pipe = LearnerPipeline( + llm=llm_learner, + llm_id=llm_learner.model_name, + ontologizer_data=True, # Let the learner flatten structured ontology objects via its tasks_* helpers + device="auto", # automatically select CUDA or CPU +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=True, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py new file mode 100644 index 0000000..cff543c --- /dev/null +++ b/examples/llm_learner_sbunlp_text2onto.py @@ -0,0 +1,88 @@ +import os +import torch + +# Import all the required classes +from ontolearner import SBUNLPText2OntoLearner +from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM + +# Local folder where the dataset is stored +# This path is relative to the directory where the script is executed +# (e.g., E:\OntoLearner\examples) +LOCAL_DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" + +# Ensure the base directories exist +# Creates the train and test subdirectories if they don't already exist. +os.makedirs(os.path.join(LOCAL_DATA_DIR, "train"), exist_ok=True) +os.makedirs(os.path.join(LOCAL_DATA_DIR, "test"), exist_ok=True) + +# Define local file paths: POINTING TO ALREADY SAVED FILES +# These files are used as input for the Fit and Predict phases. +DOCS_ALL_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/documents.jsonl" +TERMS2DOC_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/terms2docs.json" +DOCS_TEST_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/text2onto_ecology_test_documents.jsonl" + +# Output files for predictions (saved directly under LOCAL_DATA_DIR/test) +# These files will be created by the predict_terms/types methods. +TERMS_PRED_OUT = ( + "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" +) +TYPES_PRED_OUT = ( + "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" +) + +# Initialize and Load Learner --- +MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +# Determine the device for inference (GPU or CPU) +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +# Instantiate the underlying LLM helper +# (LocalAutoLLM handles model loading and generation) +llm_model_helper = LocalAutoLLM(device=DEVICE) + +# Instantiate the main learner class, passing the LLM helper to its constructor +learner = SBUNLPText2OntoLearner(model=llm_model_helper, device=DEVICE) + +# Load the model (This calls llm_model_helper.load) +LOAD_IN_4BIT = torch.cuda.is_available() +learner.model.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) + +# Build Few-Shot Exemplars (Fit Phase) +# The fit method uses the local data paths to build the in-context learning prompts. +learner.fit( + train_docs_jsonl=DOCS_ALL_PATH, + terms2doc_json=TERMS2DOC_PATH, + sample_size=28, + seed=123, # Seed for stratified random sampling stability +) + +MAX_NEW_TOKENS = 100 + +terms_written = learner.predict_terms( + docs_test_jsonl=DOCS_TEST_PATH, + out_jsonl=TERMS_PRED_OUT, + max_new_tokens=MAX_NEW_TOKENS, +) +print(f"✅ Term Extraction Complete. 
Wrote {terms_written} prediction lines.") + +# Type Extraction subtask +types_written = learner.predict_types( + docs_test_jsonl=DOCS_TEST_PATH, + out_jsonl=TYPES_PRED_OUT, + max_new_tokens=MAX_NEW_TOKENS, +) +print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.") + +try: + # Evaluate Term Extraction using the custom F1 function and gold data + f1_term = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TERMS_PRED_OUT, key="term") + print(f"Final Term Extraction F1: {f1_term:.4f}") + + # Evaluate Type Extraction + f1_type = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TYPES_PRED_OUT, key="type") + print(f"Final Type Extraction F1: {f1_type:.4f}") + +except Exception as e: + # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) + print( + f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created." + ) diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py new file mode 100644 index 0000000..24e4de2 --- /dev/null +++ b/examples/llm_learner_sbunlp_zs_term_typing.py @@ -0,0 +1,55 @@ +# Import core modules from the OntoLearner library +from ontolearner import AgrO, train_test_split, LearnerPipeline + +# Import the specific Zero-Shot Learner implementation for Term Typing +from ontolearner.learner.term_typing import SBUNLPZSLearner + +# Load ontology and split +# Load the AgrO ontology for type inventory and test data. +ontology = AgrO() +ontology.load() +data = ontology.extract() # Extract the full set of relationships/terms + +# Split the data into train (to learn type inventory) and test (terms to predict) +train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing + random_state=42, +) + +# Configure the Qwen Zero-Shot learner (inference-only) +# This learner's 'fit' phase learns the vocabulary of allowed type labels. +llm_learner = SBUNLPZSLearner( + device="cpu", + max_new_tokens=64, + temperature=0.0, + model_id="Qwen/Qwen2.5-0.5B-Instruct", + token=None, +) + +# Build pipeline and run +# Build the pipeline, passing the Zero-Shot Learner. +pipe = LearnerPipeline( + llm=llm_learner, + llm_id=llm_learner.model_id, + ontologizer_data=False, + device="cpu", # select CUDA or CPU +) + +# Run the full learning pipeline on the Term-Typing task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for learning (type inventory) + prediction + evaluation +print("Elapsed time:", outputs.get("elapsed_time")) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py new file mode 100644 index 0000000..5431d6f --- /dev/null +++ b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py @@ -0,0 +1,60 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.taxonomy_discovery import SKHNLPSequentialFTLearner + +# Load ontology and split +# Load the GeoNames ontology for taxonomy discovery. +# GeoNames provides geographic parent-child relationships (is-a hierarchy). 
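+# The original SKH-NLP system framed GeoNames taxonomy discovery as a 9-class problem over
+# (child, parent) pairs; here the extracted taxonomic edges are simply split 80/20 below.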
+ontology = GeoNames() +ontology.load() +data = ontology.extract() + +# Split the taxonomic relationships into train and test sets +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# Configure the learner with user-defined training args + device +# Configure the supervised BERT SFT Learner for taxonomy discovery. +# This fine-tunes BERT-Large using Sequential Prompts on (Parent, Child) pairs. +bert_learner = SKHNLPSequentialFTLearner( + model_name="bert-large-uncased", + n_prompts=2, + random_state=1403, + device="cpu", # Note: CPU training for BERT-Large is very slow. + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + warmup_steps=500, + weight_decay=0.01, + logging_dir="./logs/", + logging_steps=50, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) + +# Build pipeline and run +# Build the pipeline, passing the BERT Learner. +pipeline = LearnerPipeline( + llm=bert_learner, + llm_id="bert-large-uncased", + ontologizer_data=False, +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py new file mode 100644 index 0000000..f2bca1e --- /dev/null +++ b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py @@ -0,0 +1,51 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.taxonomy_discovery import SKHNLPZSLearner + +# Load ontology and split data +# The GeoNames ontology provides geographic term types and relationships. +ontology = GeoNames() +ontology.load() +train_data, test_data = train_test_split( + ontology.extract(), + test_size=0.2, + random_state=42, +) + +# Configure the learner with user-defined generation and normalization settings +# Configure the Zero-Shot Qwen Learner for taxonomy discovery. +# This model uses a fixed prompt and string normalization (Levenshtein) to classify terms. 
+llm_learner = SKHNLPZSLearner( + model_name="Qwen/Qwen2.5-0.5B-Instruct", + device="cpu", # use "cuda" if you have a GPU + max_new_tokens=16, + save_path="./outputs/", # directory or full file path for CSV + verbose=True, + normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" +) + +# Build pipeline and run +pipe = LearnerPipeline( + llm=llm_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=False, + device="cpu", +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipe( + train_data=train_data, # zero-shot; ignored by the LLM learner + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py new file mode 100644 index 0000000..ec6f2f4 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .alexbek import AlexbekCrossAttnLearner +from .rwthdbis import RWTHDBISSFTLearner +from .sbunlp import SBUNLPFewShotLearner +from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner diff --git a/ontolearner/learner/taxonomy_discovery/alexbek.py b/ontolearner/learner/taxonomy_discovery/alexbek.py new file mode 100644 index 0000000..3623f16 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/alexbek.py @@ -0,0 +1,500 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple + +import math +import os +import random +import torch +import torch.nn as nn +import torch.nn.functional as F +from sentence_transformers import SentenceTransformer + +from ...base import AutoLearner + + +class RMSNorm(nn.Module): + """Root Mean Square normalization with learnable scale. + + Computes per-position normalization: + y = weight * x / sqrt(mean(x^2) + eps) + + This variant normalizes over the last dimension and keeps scale as a + learnable parameter, similar to RMSNorm used in modern transformer stacks. + """ + + def __init__(self, dim: int, eps: float = 1e-6): + """Initialize the RMSNorm layer. + + Args: + dim: Size of the last (feature) dimension to normalize over. 
+ eps: Small constant added inside the square root for numerical + stability. + """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply RMS normalization. + + Args: + x: Input tensor of shape (..., dim). + + Returns: + Tensor of the same shape as `x`, RMS-normalized over the last axis. + """ + rms_inv = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + return self.weight * (x * rms_inv) + + +class CrossAttentionHead(nn.Module): + """Minimal multi-head *pair* scorer using cross-attention-style projections. + + Given child vector `c` and parent vector `p`: + q = W_q * c, k = W_k * p + score_head = (q_h · k_h) / sqrt(d_head) + + We average the per-head scores and apply a sigmoid to produce a probability. + This is not a full attention block—just a learnable similarity function. + """ + + def __init__( + self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6 + ): + """Initialize projections and per-stream normalizers. + + Args: + hidden_size: Dimensionality of input embeddings (child/parent). + num_heads: Number of subspaces to split the projection into. + rms_norm_eps: Epsilon for RMSNorm stability. + + Raises: + AssertionError: If `hidden_size` is not divisible by `num_heads`. + """ + super().__init__() + assert hidden_size % num_heads == 0, ( + "hidden_size must be divisible by num_heads" + ) + self.hidden_size = hidden_size + self.num_heads = num_heads + self.dim_per_head = hidden_size // num_heads + + # Linear projections for queries (child) and keys (parent) + self.query_projection = nn.Linear(hidden_size, hidden_size, bias=False) + self.key_projection = nn.Linear(hidden_size, hidden_size, bias=False) + + # Pre-projection normalization for stability + self.query_norm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.key_norm = RMSNorm(hidden_size, eps=rms_norm_eps) + + # Xavier init helps stabilize training + nn.init.xavier_uniform_(self.query_projection.weight) + nn.init.xavier_uniform_(self.key_projection.weight) + + def forward( + self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor + ) -> torch.Tensor: + """Score (child, parent) pairs. + + Args: + child_embeddings: Tensor of shape (batch, hidden_size). + parent_embeddings: Tensor of shape (batch, hidden_size). + + Returns: + Tensor of probabilities with shape (batch,), each in [0, 1]. + """ + batch_size, _ = child_embeddings.shape + + # Project and normalize + queries = self.query_norm(self.query_projection(child_embeddings)) + keys = self.key_norm(self.key_projection(parent_embeddings)) + + # Reshape into heads: (batch, heads, dim_per_head) + queries = queries.view(batch_size, self.num_heads, self.dim_per_head) + keys = keys.view(batch_size, self.num_heads, self.dim_per_head) + + # Scaled dot-product similarity per head -> (batch, heads) + per_head_scores = (queries * keys).sum(-1) / math.sqrt(self.dim_per_head) + + # Aggregate across heads -> (batch,) + mean_score = per_head_scores.mean(-1) + + # Map to probability + return torch.sigmoid(mean_score) + + +class AlexbekCrossAttnLearner(AutoLearner): + """Cross-Attention Taxonomy Learner (inherits AutoLearner). + + Workflow + - Encode terms with a SentenceTransformer. + - Train a compact cross-attention head on (parent, child) pairs + (positives + sampled negatives) using BCE loss. + - Inference returns probabilities per pair; edges with prob >= 0.5 are + labeled as positive. 
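+
+    Example (illustrative sketch only, mirroring the `LearnerPipeline` usage in the
+    example scripts of this PR; argument values here are chosen for demonstration):
+
+        learner = AlexbekCrossAttnLearner(device="cpu", num_epochs=1)
+        pipe = LearnerPipeline(llm=learner,
+                               llm_id="sentence-transformers/all-MiniLM-L6-v2",
+                               ontologizer_data=False)
+        outputs = pipe(train_data=train_data, test_data=test_data,
+                       task="taxonomy-discovery", evaluate=True,
+                       ontologizer_data=False)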
+ + """ + + def __init__( + self, + *, + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "cpu", + num_heads: int = 8, + lr: float = 5e-5, + weight_decay: float = 0.01, + num_epochs: int = 1, + batch_size: int = 256, + neg_ratio: float = 1.0, # negatives per positive + output_dir: str = "./results/", + seed: int = 42, + **kwargs: Any, + ): + """Configure the learner. + + Args: + embedding_model: SentenceTransformer model id/path for term encoding. + device: 'cuda' or 'cpu'. If 'cuda' is requested but unavailable, CPU + is used. + num_heads: Number of heads in the cross-attention scorer. + lr: Learning rate for AdamW. + weight_decay: Weight decay for AdamW. + num_epochs: Number of epochs to train the head. + batch_size: Minibatch size for training and scoring loops. + neg_ratio: Number of sampled negatives per positive during training. + output_dir: Directory to store artifacts (reserved for future use). + seed: Random seed for reproducibility. + **kwargs: Passed through to `AutoLearner` base init. + + Side Effects: + Creates `output_dir` if missing and seeds Python/Torch RNGs. + """ + super().__init__(**kwargs) + + # hyperparameters / settings + self.embedding_model_id = embedding_model + self.requested_device = device + self.num_heads = num_heads + self.learning_rate = lr + self.weight_decay = weight_decay + self.num_epochs = num_epochs + self.batch_size = batch_size + self.negative_ratio = neg_ratio + self.output_dir = output_dir + self.seed = seed + + # Prefer requested device but gracefully fall back to CPU + if torch.cuda.is_available() or self.requested_device == "cpu": + self.device = torch.device(self.requested_device) + else: + self.device = torch.device("cpu") + + # Will be set in load() + self.embedder: Optional[SentenceTransformer] = None + self.cross_attn_head: Optional[CrossAttentionHead] = None + self.embedding_dim: Optional[int] = None + + # Cache of term -> embedding tensor (on device) + self.term_to_vector: Dict[str, torch.Tensor] = {} + + os.makedirs(self.output_dir, exist_ok=True) + random.seed(self.seed) + torch.manual_seed(self.seed) + + def load(self, **kwargs: Any): + """Load the sentence embedding model and initialize the cross-attention head. + + Args: + **kwargs: Optional override, supports `embedding_model`. + + Side Effects: + - Initializes `self.embedder` on the configured device. + - Probes and stores `self.embedding_dim`. + - Constructs `self.cross_attn_head` with the probed dimensionality. + """ + model_id = kwargs.get("embedding_model", self.embedding_model_id) + self.embedder = SentenceTransformer( + model_id, trust_remote_code=True, device=str(self.device) + ) + + # Probe output dimensionality using a dummy encode + probe_embedding = self.embedder.encode( + ["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False + ) + self.embedding_dim = int(probe_embedding.shape[-1]) + + # Initialize the cross-attention head + self.cross_attn_head = CrossAttentionHead( + hidden_size=self.embedding_dim, num_heads=self.num_heads + ).to(self.device) + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """Train or infer taxonomy edges according to the AutoLearner contract. + + Training (`test=False`) + - Extract positives (parent, child) and the unique term set from `data`. + - Build/extend the term embedding cache. + - Sample negatives at ratio `self.negative_ratio`. + - Train the cross-attention head with BCE loss. + + Inference (`test=True`) + - Ensure embeddings exist for all terms. 
+ - Score candidate pairs and return per-pair probabilities and labels. + + Args: + data: Ontology-like object exposing `type_taxonomies.taxonomies`, + where each item has `.parent` and `.child` string-like fields. + test: If True, perform inference instead of training. + + Returns: + - `None` on training. + - On inference: List of dicts + `{"parent": str, "child": str, "score": float, "label": int}`. + """ + if self.embedder is None or self.cross_attn_head is None: + self.load() + + if not test: + positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms( + data + ) + self._ensure_term_embeddings(unique_terms) + negative_pairs = self._sample_negative_pairs( + positive_pairs, unique_terms, ratio=self.negative_ratio, seed=self.seed + ) + self._train_cross_attn_head(positive_pairs, negative_pairs) + return None + else: + candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms( + data + ) + self._ensure_term_embeddings(unique_terms, append_only=True) + probabilities = self._score_parent_child_pairs(candidate_pairs) + + predictions = [ + { + "parent": parent, + "child": child, + "score": float(prob), + "label": int(prob >= 0.5), + } + for (parent, child), prob in zip(candidate_pairs, probabilities) + ] + return predictions + + def _ensure_term_embeddings( + self, terms: List[str], append_only: bool = False + ) -> None: + """Encode terms with the sentence embedder and store in cache. + + Args: + terms: List of unique term strings to embed. + append_only: If True, only embed terms missing from the cache; + otherwise (re)encode all provided terms. + + Raises: + RuntimeError: If called before `load()`. + """ + if self.embedder is None: + raise RuntimeError("Call load() before building term embeddings") + + terms_to_encode = ( + [t for t in terms if t not in self.term_to_vector] if append_only else terms + ) + if not terms_to_encode: + return + + embeddings = self.embedder.encode( + terms_to_encode, + convert_to_tensor=True, + normalize_embeddings=False, + batch_size=256, + show_progress_bar=False, + ) + for term, embedding in zip(terms_to_encode, embeddings): + self.term_to_vector[term] = embedding.detach().to(self.device) + + def _pairs_as_tensors( + self, pairs: List[Tuple[str, str]] + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Convert string pairs into aligned embedding tensors on the correct device. + + Args: + pairs: List of (parent, child) term strings. + + Returns: + Tuple `(child_tensor, parent_tensor)` where each tensor has shape + `(batch, embedding_dim)` and is located on `self.device`. + + Notes: + This function assumes that all terms in `pairs` are present in + `self.term_to_vector`. Use `_ensure_term_embeddings` beforehand. + """ + # child embeddings tensor of shape (batch, dim) + child_tensor = torch.stack( + [self.term_to_vector[child] for (_, child) in pairs], dim=0 + ).to(self.device) + # parent embeddings tensor of shape (batch, dim) + parent_tensor = torch.stack( + [self.term_to_vector[parent] for (parent, _) in pairs], dim=0 + ).to(self.device) + return child_tensor, parent_tensor + + def _train_cross_attn_head( + self, + positive_pairs: List[Tuple[str, str]], + negative_pairs: List[Tuple[str, str]], + ) -> None: + """Train the cross-attention head with BCE loss on labeled pairs. + + The dataset is a concatenation of positives (label 1) and sampled + negatives (label 0). The head is optimized with AdamW. + + Args: + positive_pairs: List of ground-truth (parent, child) edges. + negative_pairs: List of sampled non-edges. 
+ + Raises: + RuntimeError: If the head has not been initialized (call `load()`). + """ + if self.cross_attn_head is None: + raise RuntimeError("Head not initialized. Call load().") + + self.cross_attn_head.train() + optimizer = torch.optim.AdamW( + self.cross_attn_head.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, + ) + + # Build a simple supervised dataset: 1 for positive, 0 for negative + labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [ + (1, pc) for pc in positive_pairs + ] + [(0, nc) for nc in negative_pairs] + random.shuffle(labeled_pairs) + + def iterate_minibatches( + items: List[Tuple[int, Tuple[str, str]]], batch_size: int + ): + """Yield contiguous minibatches of size `batch_size` from `items`.""" + for start in range(0, len(items), batch_size): + yield items[start : start + batch_size] + + for epoch in range(self.num_epochs): + epoch_loss_sum = 0.0 + for minibatch in iterate_minibatches(labeled_pairs, self.batch_size): + labels = torch.tensor( + [y for y, _ in minibatch], dtype=torch.float32, device=self.device + ) + string_pairs = [pc for _, pc in minibatch] + child_tensor, parent_tensor = self._pairs_as_tensors(string_pairs) + + probs = self.cross_attn_head(child_tensor, parent_tensor) + loss = F.binary_cross_entropy(probs, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + epoch_loss_sum += float(loss.item()) * len(minibatch) + + def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float]: + """Compute probability scores for (parent, child) pairs. + + Args: + pairs: List of candidate (parent, child) edges to score. + + Returns: + List of floats in [0, 1] corresponding to the input order. + + Raises: + RuntimeError: If the head has not been initialized (call `load()`). + """ + if self.cross_attn_head is None: + raise RuntimeError("Head not initialized. Call load().") + + self.cross_attn_head.eval() + scores: List[float] = [] + with torch.no_grad(): + for start in range(0, len(pairs), self.batch_size): + chunk = pairs[start : start + self.batch_size] + child_tensor, parent_tensor = self._pairs_as_tensors(chunk) + prob = self.cross_attn_head(child_tensor, parent_tensor) + scores.extend(prob.detach().cpu().tolist()) + return scores + + def _extract_parent_child_pairs_and_terms( + self, data: Any + ) -> Tuple[List[Tuple[str, str]], List[str]]: + """Extract (parent, child) edges and the set of unique terms from an ontology-like object. + + The function expects `data.type_taxonomies.taxonomies` to be an iterable + of objects with `.parent` and `.child` string-like attributes. + + Args: + data: Ontology-like container. + + Returns: + A tuple `(pairs, terms)` where: + - `pairs` is a list of (parent, child) strings, + - `terms` is a sorted list of unique term strings (parents ∪ children). + """ + parent_child_pairs: List[Tuple[str, str]] = [] + unique_terms = set() + for edge in getattr(data, "type_taxonomies").taxonomies: + parent, child = str(edge.parent), str(edge.child) + parent_child_pairs.append((parent, child)) + unique_terms.add(parent) + unique_terms.add(child) + return parent_child_pairs, sorted(unique_terms) + + def _sample_negative_pairs( + self, + positive_pairs: List[Tuple[str, str]], + terms: List[str], + ratio: float = 1.0, + seed: int = 42, + ) -> List[Tuple[str, str]]: + """Sample random negative (parent, child) pairs not present in positives. + + Sampling is uniform over the Cartesian product of `terms` excluding + (x, x) self-pairs and any pair found in `positive_pairs`. 
+ + Args: + positive_pairs: Known positive edges to exclude. + terms: Candidate vocabulary (parents ∪ children). + ratio: Number of negatives per positive to draw. + seed: RNG seed used for reproducible sampling. + + Returns: + A list of sampled negative pairs of approximate length + `int(len(positive_pairs) * ratio)`. + """ + random.seed(seed) + term_list = list(terms) + positive_set = set(positive_pairs) + negatives: List[Tuple[str, str]] = [] + target_negative_count = int(len(positive_pairs) * ratio) + while len(negatives) < target_negative_count: + parent = random.choice(term_list) + child = random.choice(term_list) + if parent == child: + continue + candidate = (parent, child) + if candidate in positive_set: + continue + negatives.append(candidate) + return negatives diff --git a/ontolearner/learner/taxonomy_discovery/rwthdbis.py b/ontolearner/learner/taxonomy_discovery/rwthdbis.py new file mode 100644 index 0000000..c535016 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/rwthdbis.py @@ -0,0 +1,1082 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import random +import re +import platform +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Callable +from functools import partial +from tqdm.auto import tqdm +import g4f +from g4f.client import Client as _G4FClient +import torch +from datasets import Dataset, DatasetDict +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + set_seed, +) + +from ...base import AutoLearner + + +class RWTHDBISSFTLearner(AutoLearner): + """ + Supervised classifier for (parent, child) taxonomy edges. + + Model input format: + " ## " + + Context building: + If no `context_json_path` is provided, the learner precomputes a fixed-name + context file `rwthdbis_onto_processed.json` under `output_dir/context/` + from the ontology terms and stores the path in `self.context_json_path`. + + Attributes: + model_name: Hugging Face model identifier. + output_dir: Directory where checkpoints and tokenizer are saved/loaded. + min_predictions: If no candidate is predicted positive, return the top-k + by positive probability (k = min_predictions). + max_length: Maximum tokenized length for inputs. + per_device_train_batch_size: Micro-batch size per device. + gradient_accumulation_steps: Gradient accumulation steps. + num_train_epochs: Number of training epochs. + learning_rate: Optimizer LR. + weight_decay: Weight decay for AdamW. + logging_steps: Logging interval for Trainer. + save_strategy: HF saving strategy (e.g., 'epoch'). + save_total_limit: Max checkpoints to keep. + fp16: Enable FP16 mixed precision. + bf16: Enable BF16 mixed precision (on supported hardware). + seed: Random seed for reproducibility. + negative_ratio: Number of negatives per positive during training. + bidirectional_templates: If True, also add reversed template examples. 
+ context_json_path: Path to the preprocessed term-context JSON. If None, + the file is generated with the fixed prefix `rwthdbis_onto_*`. + ontology_name: Logical dataset/domain label used in prompts and filtering + (filenames still use the fixed `rwthdbis_onto_*` prefix). + device: user-defined argument as 'cuda' or 'cpu'. + model: Loaded/initialized `AutoModelForSequenceClassification`. + tokenizer: Loaded/initialized `AutoTokenizer`. + """ + + # Sentences containing any of these phrases are pruned from term_info. + _CONTEXT_REMOVALS = [ + "couldn't find any", + "does not require", + "assist you further", + "feel free to", + "I'm currently unable", + "the search results", + "I'm unable to", + "recommend referring directly", + "bear with me", + "searching for the most relevant information", + "I'm currently checking the most relevant", + "already in English", + "require further", + "any additional information", + "already an English", + "don't have information", + "I'm sorry,", + "For further exploration", + "For more detailed information", + ] + + def __init__( + self, + min_predictions: int = 1, + model_name: str = "distilroberta-base", + output_dir: str = "./results/taxonomy-discovery", + device: str = "cpu", + max_length: int = 256, + per_device_train_batch_size: int = 8, + gradient_accumulation_steps: int = 4, + num_train_epochs: int = 1, + learning_rate: float = 2e-5, + weight_decay: float = 0.01, + logging_steps: int = 25, + save_strategy: str = "epoch", + save_total_limit: int = 1, + fp16: bool = True, + bf16: bool = False, + seed: int = 42, + negative_ratio: int = 5, + bidirectional_templates: bool = True, + context_json_path: Optional[str] = None, + ontology_name: str = "Geonames", + ) -> None: + """ + Initialize the taxonomy-edge learner and set training/inference knobs. + + Notes: + - Output artifacts are written under `output_dir`, including + the model weights and tokenizer (for later `from_pretrained` loads). + - If `context_json_path` is not provided, a new context file named + `rwthdbis_onto_processed.json` is generated under `output_dir/context/`. + """ + super().__init__() + + self.model_name = model_name + safe_model_name = model_name.replace("/", "__") + + resolved_output = output_dir.format(model_name=safe_model_name) + self.output_dir = str(Path(resolved_output)) + Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + # Store provided argument values as-is (types are enforced by callers). + self.min_predictions = min_predictions + self.max_length = max_length + self.per_device_train_batch_size = per_device_train_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.num_train_epochs = num_train_epochs + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.logging_steps = logging_steps + self.save_strategy = save_strategy + self.save_total_limit = save_total_limit + self.fp16 = fp16 + self.bf16 = bf16 + self.seed = seed + + self.negative_ratio = negative_ratio + self.bidirectional_templates = bidirectional_templates + self.context_json_path = context_json_path + + self.ontology_name = ontology_name + self.device = device + self.model: Optional[AutoModelForSequenceClassification] = None + self.tokenizer: Optional[AutoTokenizer] = None + + # Context caches built from the context JSON. + self._context_exact: Dict[str, str] = {} # lower(term) -> info + self._context_rows: List[ + Dict[str, str] + ] = [] # [{'term': str, 'term_info': str}, ...] 
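+
+    # Illustrative shape of the classifier input assembled by `_format_input` below
+    # (the terms and the <term_info> context snippets are hypothetical):
+    #   "continent is the superclass / parent / supertype / ancestor class of Europe
+    #    ## Context. 'continent': <term_info> 'Europe': <term_info>"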
+ + def _is_windows(self) -> bool: + """Return True if the current OS is Windows (NT).""" + return (os.name == "nt") or (platform.system().lower() == "windows") + + def _normalize_text(self, raw_text: str, *, drop_questions: bool = False) -> str: + """ + Normalize plain text consistently across the pipeline. + + Operations: + - Remove markdown-like link patterns (e.g., '[[1]](http://...)'). + - Replace newlines with spaces; collapse repeated spaces. + - Optionally drop sentences containing '?' (useful for model generations). + + Args: + raw_text: Input text to normalize. + drop_questions: If True, filter out sentences with '?'. + + Returns: + str: Cleaned single-line string. + """ + if raw_text is None: + return "" + text = str(raw_text) + + # Remove simple markdown link artifacts like [[1]](http://...) + text = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", text) + + # Replace newlines with spaces and collapse multiple spaces + text = text.replace("\n", " ") + text = re.sub(r"\s{2,}", " ", text) + + if drop_questions: + sentences = [s.strip() for s in text.split(".")] + sentences = [s for s in sentences if s and "?" not in s] + text = ". ".join(sentences) + + return text.strip() + + def _default_gpt_inference_with_dataset(self, term: str, dataset_name: str) -> str: + """ + Generate a plain-text description for `term`, conditioned on `dataset_name`, + via g4f (best-effort). Falls back to an empty string on failure. + + The raw output is then normalized with `_normalize_text(drop_questions=True)`. + + Args: + term: Term to describe. + dataset_name: Ontology/domain name used in the prompt. + + Returns: + str: Cleaned paragraph describing the term, or "" on failure. + """ + prompt = ( + f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, " + "Provide as detailed a definition of this term as possible in plain text.without any markdown format." + "No reference link in result. " + "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n" + "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n" + "Output: Plain text paragraphs only, neutral and factual." + f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result." + ) + + try: + client = _G4FClient() + response = client.chat.completions.create( + model=g4f.models.default, + messages=[{"role": "user", "content": prompt}], + ) + raw_text = ( + response.choices[0].message.content + if response and response.choices + else "" + ) + except Exception: + raw_text = "" # best-effort fallback + + return self._normalize_text(raw_text, drop_questions=True) + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """ + AutoLearner hook: route to training or prediction. + + Args: + data: Ontology-like object (has `.taxonomies` or `.type_taxonomies.taxonomies`). + test: If True, run inference; otherwise, train a model. + + Returns: + If test=True, a list of accepted edges as dicts with keys `parent` and `child`; + otherwise None. + """ + return self._predict_pairs(data) if test else self._train_from_pairs(data) + + def _train_from_pairs(self, train_data: Any) -> None: + """ + Train a binary classifier from ontology pairs. + + Steps: + 1) (Re)build the term-context JSON unless `context_json_path` is set. + 2) Extract positive (parent, child) edges from `train_data`. 
+ 3) Sample negatives at `negative_ratio`. + 4) Tokenize, instantiate HF Trainer, train, and save. + + Args: + train_data: Ontology-like object with `.type_taxonomies.taxonomies` + (preferred) or `.taxonomies`, each item providing `parent` and `child`. + + Raises: + ValueError: If no positive pairs are found. + + Side Effects: + - Writes a trained model to `self.output_dir` (via `trainer.save_model`). + - Writes the tokenizer to `self.output_dir` (via `save_pretrained`). + - Sets `self.context_json_path` if it was previously unset. + The generated context file is named `rwthdbis_onto_processed.json`. + """ + # Always (re)build context from ontology unless an explicit file is provided + if not self.context_json_path: + context_dir = Path(self.output_dir) / "context" + context_dir.mkdir(parents=True, exist_ok=True) + processed_context_file = context_dir / "rwthdbis_onto_processed.json" + + # Remove stale file then regenerate + if processed_context_file.exists(): + try: + processed_context_file.unlink() + except Exception: + pass + + self.preprocess_context_from_ontology( + ontology=train_data, + processed_dir=context_dir, + dataset_name=self.ontology_name, + num_workers=max(1, min(os.cpu_count() or 2, 4)), + provider=partial( + self._default_gpt_inference_with_dataset, + dataset_name=self.ontology_name, + ), + max_retries=5, + ) + self.context_json_path = str(processed_context_file) + + # Reproducibility + set_seed(self.seed) + random.seed(self.seed) + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + + # Build labeled pairs from ontology; context comes from preprocessed map + positive_pairs = self._extract_positive_pairs(train_data) + if not positive_pairs: + raise ValueError("No positive (parent, child) pairs found in train_data.") + + entity_names = sorted( + {parent for parent, _ in positive_pairs} + | {child for _, child in positive_pairs} + ) + negative_pairs = self._generate_negatives( + positives=positive_pairs, + entities=entity_names, + ratio=self.negative_ratio, + ) + + labels, input_texts = self._build_text_dataset(positive_pairs, negative_pairs) + dataset_dict = DatasetDict( + {"train": Dataset.from_dict({"label": labels, "text": input_texts})} + ) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + # Ensure a pad token exists for robust padding across models. + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = ( + getattr(self.tokenizer, "eos_token", None) + or getattr(self.tokenizer, "sep_token", None) + or getattr(self.tokenizer, "cls_token", None) + ) + + def tokenize_batch(batch: Dict[str, List[str]]): + """Tokenize a batch of input texts for HF Datasets mapping.""" + return self.tokenizer( + batch["text"], truncation=True, max_length=self.max_length + ) + + tokenized_dataset = dataset_dict.map( + tokenize_batch, batched=True, remove_columns=["text"] + ) + data_collator = DataCollatorWithPadding(self.tokenizer) + + self.model = AutoModelForSequenceClassification.from_pretrained( + self.model_name, + num_labels=2, + id2label={0: "incorrect", 1: "correct"}, + label2id={"incorrect": 0, "correct": 1}, + ) + # Ensure model has a pad_token_id if tokenizer provides one. 
+ if ( + getattr(self.model.config, "pad_token_id", None) is None + and self.tokenizer.pad_token_id is not None + ): + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + training_args = TrainingArguments( + output_dir=self.output_dir, + learning_rate=self.learning_rate, + per_device_train_batch_size=self.per_device_train_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + num_train_epochs=self.num_train_epochs, + weight_decay=self.weight_decay, + save_strategy=self.save_strategy, + save_total_limit=self.save_total_limit, + logging_steps=self.logging_steps, + dataloader_pin_memory=bool(torch.cuda.is_available()), + fp16=self.fp16, + bf16=self.bf16, + report_to="none", + save_safetensors=True, + ) + + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=tokenized_dataset["train"], + tokenizer=self.tokenizer, + data_collator=data_collator, + ) + trainer.train() + trainer.save_model() + # Persist tokenizer alongside the model for from_pretrained() loads. + self.tokenizer.save_pretrained(self.output_dir) + + def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: + """ + Score candidate pairs and return those predicted as positive. + + If no pair is predicted positive but `min_predictions` > 0, the top-k + pairs by positive probability are returned. + + Args: + eval_data: Ontology-like object with either `.pairs` (preferred) or + `.type_taxonomies.taxonomies` / `.taxonomies`. + + Returns: + list[dict]: Each dict has keys `parent` and `child`. + """ + import torch.nn.functional as F + + self._ensure_loaded_for_inference() + + candidate_pairs = self._extract_pairs_for_eval(eval_data) + if not candidate_pairs: + return [] + + accepted_pairs: List[Dict[str, str]] = [] + scored_candidates: List[Tuple[float, str, str, int]] = [] + + self.model.eval() + with torch.no_grad(): + for parent_term, child_term in candidate_pairs: + input_text = self._format_input(parent_term, child_term) + inputs = self.tokenizer( + input_text, + return_tensors="pt", + truncation=True, + max_length=self.max_length, + ) + inputs = {key: tensor.to(self.device) for key, tensor in inputs.items()} + logits = self.model(**inputs).logits + probabilities = F.softmax(logits, dim=-1).squeeze(0) + p_positive = float(probabilities[1].item()) + predicted_label = int(torch.argmax(logits, dim=-1).item()) + scored_candidates.append( + (p_positive, parent_term, child_term, predicted_label) + ) + if predicted_label == 1: + accepted_pairs.append({"parent": parent_term, "child": child_term}) + + if accepted_pairs: + return accepted_pairs + + top_k = max(0, int(self.min_predictions)) + if top_k == 0: + return [] + scored_candidates.sort(key=lambda item: item[0], reverse=True) + return [ + {"parent": parent_term, "child": child_term} + for (_prob, parent_term, child_term, _pred) in scored_candidates[:top_k] + ] + + def _ensure_loaded_for_inference(self) -> None: + """ + Load model and tokenizer from `self.output_dir` if not already loaded. + + Side Effects: + - Sets `self.model` and `self.tokenizer`. + - Moves the model to `self.device`. + - Ensures `tokenizer.pad_token_id` is set if model config provides one. 
+ """ + if self.model is not None and self.tokenizer is not None: + return + self.model = AutoModelForSequenceClassification.from_pretrained( + self.output_dir + ).to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir) + if ( + self.tokenizer.pad_token_id is None + and getattr(self.model.config, "pad_token_id", None) is not None + ): + self.tokenizer.pad_token_id = self.model.config.pad_token_id + + def _load_context_map(self) -> None: + """ + Populate in-memory maps from the context JSON (`self.context_json_path`). + + Builds: + - `_context_exact`: dict mapping lowercased term → term_info. + - `_context_rows`: list of dict rows with 'term' and 'term_info'. + + If `context_json_path` is falsy or loading fails, both structures become empty. + """ + if not self.context_json_path: + self._context_exact = {} + self._context_rows = [] + return + try: + rows = json.load(open(self.context_json_path, "r", encoding="utf-8")) + self._context_exact = { + str(row.get("term", "")).strip().lower(): str( + row.get("term_info", "") + ).strip() + for row in rows + } + self._context_rows = [ + { + "term": str(row.get("term", "")), + "term_info": str(row.get("term_info", "")), + } + for row in rows + ] + except Exception: + self._context_exact = {} + self._context_rows = [] + + def _lookup_context_info(self, raw_term: str) -> str: + """ + Retrieve textual context for a term using exact and simple fuzzy matching. + + - Exact: lowercased term lookup in `_context_exact`. + - Fuzzy: split `raw_term` by commas, strip whitespace; treat each piece + as a case-insensitive substring against row['term']. + + Args: + raw_term: Original term string (possibly comma-separated). + + Returns: + str: Concatenated matches' term_info ('.' joined). Empty string if none. + """ + if not raw_term: + return "" + term_key = raw_term.strip().lower() + if term_key in self._context_exact: + return self._context_exact[term_key] + + subterms = [re.sub(r"\s+", "", piece) for piece in raw_term.split(",")] + matched_infos: List[str] = [] + for subterm in subterms: + if not subterm: + continue + lower_subterm = subterm.lower() + for row in self._context_rows: + if lower_subterm in row["term"].lower(): + info = row.get("term_info", "") + if info: + matched_infos.append(info) + break # one hit per subterm + return ".".join(matched_infos) + + def _extract_positive_pairs(self, ontology_obj: Any) -> List[Tuple[str, str]]: + """ + Extract positive (parent, child) edges from an ontology-like object. + + Reads from `ontology_obj.type_taxonomies.taxonomies` (preferred) or + falls back to `ontology_obj.taxonomies`. Each item must expose `parent` + and `child` as attributes or dict keys. + + Returns: + list[tuple[str, str]]: (parent, child) pairs (may be empty). + """ + type_taxonomies = getattr(ontology_obj, "type_taxonomies", None) + items = ( + getattr(type_taxonomies, "taxonomies", None) + if type_taxonomies is not None + else getattr(ontology_obj, "taxonomies", None) + ) + pairs: List[Tuple[str, str]] = [] + if items: + for item in items: + parent_term = ( + getattr(item, "parent", None) + if not isinstance(item, dict) + else item.get("parent") + ) + child_term = ( + getattr(item, "child", None) + if not isinstance(item, dict) + else item.get("child") + ) + if parent_term and child_term: + pairs.append((str(parent_term), str(child_term))) + return pairs + + def _extract_pairs_for_eval(self, ontology_obj: Any) -> List[Tuple[str, str]]: + """ + Extract candidate pairs for evaluation. 
+ + Prefers `ontology_obj.pairs` if present; otherwise falls back to the + positive pairs from the ontology (see `_extract_positive_pairs`). + + Returns: + list[tuple[str, str]]: Candidate (parent, child) pairs. + """ + candidate_pairs = getattr(ontology_obj, "pairs", None) + if candidate_pairs: + pairs: List[Tuple[str, str]] = [] + for item in candidate_pairs: + parent_term = ( + getattr(item, "parent", None) + if not isinstance(item, dict) + else item.get("parent") + ) + child_term = ( + getattr(item, "child", None) + if not isinstance(item, dict) + else item.get("child") + ) + if parent_term and child_term: + pairs.append((str(parent_term), str(child_term))) + return pairs + return self._extract_positive_pairs(ontology_obj) + + def _generate_negatives( + self, + positives: List[Tuple[str, str]], + entities: List[str], + ratio: int, + ) -> List[Tuple[str, str]]: + """ + Sample negative edges by excluding known positives and self-pairs. + + Constructs the cartesian product of entities (excluding (x, x)), + removes all known positives, and samples up to `ratio * len(positives)` + negatives uniformly at random. + + Args: + positives: Known positive edges. + entities: Unique set/list of entity terms. + ratio: Target negatives per positive (lower-bounded by 1×). + + Returns: + list[tuple[str, str]]: Sampled negative pairs (may be smaller). + """ + positive_set = set(positives) + all_possible = { + (parent, child) + for parent in entities + for child in entities + if parent != child + } + negative_candidates = list(all_possible - positive_set) + + target_count = max(len(positive_set) * max(1, ratio), len(positive_set)) + sample_count = min(target_count, len(negative_candidates)) + return ( + random.sample(negative_candidates, k=sample_count) + if sample_count > 0 + else [] + ) + + def _build_text_dataset( + self, + positives: List[Tuple[str, str]], + negatives: List[Tuple[str, str]], + ) -> Tuple[List[int], List[str]]: + """ + Create parallel lists of labels and input texts for HF Datasets. + + Builds formatted inputs using `_format_input`, and duplicates examples in + the reverse direction if `bidirectional_templates` is True. + + Returns: + tuple[list[int], list[str]]: (labels, input_texts) where labels are + 1 for positive and 0 for negative. + """ + self._load_context_map() + + labels: List[int] = [] + input_texts: List[str] = [] + + def add_example(parent_term: str, child_term: str, label_value: int) -> None: + """Append one (and optionally reversed) example to the dataset.""" + input_texts.append(self._format_input(parent_term, child_term)) + labels.append(label_value) + if self.bidirectional_templates: + input_texts.append( + self._format_input(child_term, parent_term, reverse=True) + ) + labels.append(label_value) + + for parent_term, child_term in positives: + add_example(parent_term, child_term, 1) + for parent_term, child_term in negatives: + add_example(parent_term, child_term, 0) + + return labels, input_texts + + def _format_input( + self, parent_term: str, child_term: str, reverse: bool = False + ) -> str: + """ + Format a (parent, child) pair into relation text + optional context. + + Returns: + str: " [## Context. 'parent': ... 
'child': ...]" + """ + relation_text = ( + f"{child_term} is a subclass / child / subtype / descendant class of {parent_term}" + if reverse + else f"{parent_term} is the superclass / parent / supertype / ancestor class of {child_term}" + ) + + parent_info = self._lookup_context_info(parent_term) + child_info = self._lookup_context_info(child_term) + if not parent_info and not child_info: + return relation_text + + context_text = ( + f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}" + ) + return f"{relation_text} {context_text}" + + def _fill_bucket_threaded( + self, bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str] + ) -> None: + """ + Populate a shard with provider-generated `term_info` using threads. + + Resumes from `output_path` if it already exists, periodically writes + progress (every ~10 items), and finally dumps the full bucket to disk. + """ + start_index = 0 + try: + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + start_index = len(existing_rows) + except Exception: + pass + + for row_index in range(start_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider( + bucket_rows[row_index]["term"] + ) + except Exception: + bucket_rows[row_index]["term_info"] = "" + if row_index % 10 == 1: + json.dump( + bucket_rows[: row_index + 1], + open(output_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=2, + ) + + json.dump( + bucket_rows, + open(output_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=2, + ) + + def _merge_part_files( + self, dataset_name: str, merged_path: Path, shard_paths: List[Path] + ) -> None: + """ + Merge shard files into one JSON and filter boilerplate sentences. + + - Reads shard lists/dicts from `shard_paths`. + - Drops sentences that contain markers in `_CONTEXT_REMOVALS` or the + `dataset_name` string. + - Normalizes the remaining text via `_normalize_text`. + - Writes merged JSON to `merged_path`, then best-effort deletes shards. 
+ """ + merged_rows: List[dict] = [] + for shard_path in shard_paths: + try: + if not shard_path.is_file(): + continue + part_content = json.load(open(shard_path, "r", encoding="utf-8")) + if isinstance(part_content, list): + merged_rows.extend(part_content) + elif isinstance(part_content, dict): + merged_rows.append(part_content) + except Exception: + continue + + removal_markers = list(self._CONTEXT_REMOVALS) + [dataset_name] + for row in merged_rows: + term_info_raw = str(row.get("term_info", "")) + kept_sentences: List[str] = [] + for sentence in term_info_raw.split("."): + sentence_no_links = re.sub( + r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence + ) + if any(marker in sentence_no_links for marker in removal_markers): + continue + kept_sentences.append(sentence_no_links) + row["term_info"] = self._normalize_text( + ".".join(kept_sentences), drop_questions=False + ) + + merged_path.parent.mkdir(parents=True, exist_ok=True) + json.dump( + merged_rows, + open(merged_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=4, + ) + + # best-effort cleanup + for shard_path in shard_paths: + try: + os.remove(shard_path) + except Exception: + pass + + def _execute_for_terms( + self, + terms: List[str], + merged_path: Path, + shard_paths: List[Path], + provider: Callable[[str], str], + dataset_name: str, + num_workers: int = 2, + ) -> None: + """ + Generate context for `terms`, writing shards to `shard_paths`, then merge. + + Always uses threads (pickling-safe for instance methods). + Shows a tqdm progress bar and merges shards at the end. + """ + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + all_rows = [ + {"id": index, "term": term, "term_info": ""} + for index, term in enumerate(terms) + ] + + buckets: List[List[dict]] = [[] for _ in range(worker_count)] + for reversed_index, row in enumerate(reversed(all_rows)): + buckets[reversed_index % worker_count].append(row) + + total_rows = len(terms) + progress_bar = tqdm( + total=total_rows, desc=f"{dataset_name} generation (threads)" + ) + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + self._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [ + pool.submit( + run_bucket, buckets[bucket_index], shard_paths[bucket_index] + ) + for bucket_index in range(worker_count) + ] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + + self._merge_part_files(dataset_name, merged_path, shard_paths) + + def _re_infer_short_entries( + self, + merged_path: Path, + re_shard_paths: List[Path], + re_merged_path: Path, + provider: Callable[[str], str], + dataset_name: str, + num_workers: int, + ) -> int: + """ + Re-query terms whose `term_info` is too short (< 50 chars). + + Process: + - Read `merged_path`. + - Filter boilerplate using `_CONTEXT_REMOVALS` and `dataset_name`. + - Split into short/long groups by length 50. + - Regenerate short group with `provider` in parallel (threads). + - Merge regenerated + long back into `merged_path`. + + Returns: + int: Count of rows still < 50 chars after re-inference. 
+ """ + merged_rows = json.load(open(merged_path, "r", encoding="utf-8")) + + removal_markers = list(self._CONTEXT_REMOVALS) + [dataset_name] + short_rows: List[dict] = [] + long_rows: List[dict] = [] + + for row in merged_rows: + term_info_raw = str(row.get("term_info", "")) + sentences = term_info_raw.split(".") + for marker in removal_markers: + sentences = [ + sentence if marker not in sentence else "" for sentence in sentences + ] + filtered_info = self._normalize_text( + ".".join(sentences), drop_questions=False + ) + row["term_info"] = filtered_info + + (short_rows if len(filtered_info) < 50 else long_rows).append(row) + + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + buckets: List[List[dict]] = [[] for _ in range(worker_count)] + for row_index, row in enumerate(short_rows): + buckets[row_index % worker_count].append(row) + + # Clean old re-inference shards + for path in re_shard_paths: + try: + os.remove(path) + except Exception: + pass + + total_candidates = len(short_rows) + progress_bar = tqdm( + total=total_candidates, desc=f"{dataset_name} re-inference (threads)" + ) + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + self._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [ + pool.submit( + run_bucket, buckets[bucket_index], re_shard_paths[bucket_index] + ) + for bucket_index in range(worker_count) + ] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + + # Merge and write back + self._merge_part_files(dataset_name, re_merged_path, re_shard_paths) + new_rows = ( + json.load(open(re_merged_path, "r", encoding="utf-8")) + if re_merged_path.is_file() + else [] + ) + final_rows = long_rows + new_rows + json.dump( + final_rows, + open(merged_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=4, + ) + + remaining_short = sum( + 1 for row in final_rows if len(str(row.get("term_info", ""))) < 50 + ) + return remaining_short + + def _extract_terms_from_ontology(self, ontology: Any) -> List[str]: + """ + Collect unique term names from `ontology.type_taxonomies.taxonomies`, + falling back to `ontology.taxonomies` if needed. + + Returns: + list[str]: Sorted unique term list. + """ + type_taxonomies = getattr(ontology, "type_taxonomies", None) + taxonomies = ( + getattr(type_taxonomies, "taxonomies", None) + if type_taxonomies is not None + else getattr(ontology, "taxonomies", None) + ) + unique_terms: set[str] = set() + if taxonomies: + for row in taxonomies: + parent_term = ( + getattr(row, "parent", None) + if not isinstance(row, dict) + else row.get("parent") + ) + child_term = ( + getattr(row, "child", None) + if not isinstance(row, dict) + else row.get("child") + ) + if parent_term: + unique_terms.add(str(parent_term)) + if child_term: + unique_terms.add(str(child_term)) + return sorted(unique_terms) + + def preprocess_context_from_ontology( + self, + ontology: Any, + processed_dir: str | Path, + dataset_name: str = "GeoNames", + num_workers: int = 2, + provider: Optional[Callable[[str], str]] = None, + max_retries: int = 5, + ) -> Path: + """ + Build `{id, term, term_info}` rows from an ontology object. 
+ + Always regenerates the fixed-name file `rwthdbis_onto_processed.json`, + performing: + - Parallel generation of term_info in shards (`_execute_for_terms`), + - Re-inference rounds for short entries (`_re_infer_short_entries`), + - Final merge and cleanup, + - Updates `self.context_json_path`. + + Filenames under `processed_dir`: + - merged: `rwthdbis_onto_processed.json` + - shards: `rwthdbis_onto_type_part{idx}.json` + - re-infer shards: `rwthdbis_onto_re_inference{idx}.json` + - re-infer merged: `rwthdbis_onto_Types_re_inference.json` + + Returns: + Path: The merged context JSON path (`rwthdbis_onto_processed.json`). + """ + provider = provider or partial( + self._default_gpt_inference_with_dataset, dataset_name=dataset_name + ) + + processed_dir = Path(processed_dir) + processed_dir.mkdir(parents=True, exist_ok=True) + + merged_path = processed_dir / "rwthdbis_onto_processed.json" + if merged_path.exists(): + try: + merged_path.unlink() + except Exception: + pass + + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + shard_paths = [ + processed_dir / f"rwthdbis_onto_type_part{index}.json" + for index in range(worker_count) + ] + re_shard_paths = [ + processed_dir / f"rwthdbis_onto_re_inference{index}.json" + for index in range(worker_count) + ] + re_merged_path = processed_dir / "rwthdbis_onto_Types_re_inference.json" + + # Remove any leftover shards + for path in shard_paths + re_shard_paths + [re_merged_path]: + try: + if path.exists(): + path.unlink() + except Exception: + pass + + unique_terms = self._extract_terms_from_ontology(ontology) + print(f"[Preprocess] Unique terms from ontology: {len(unique_terms)}") + + self._execute_for_terms( + terms=unique_terms, + merged_path=merged_path, + shard_paths=shard_paths, + provider=provider, + dataset_name=dataset_name, + num_workers=worker_count, + ) + + retry_round = 0 + while retry_round < max_retries: + remaining_count = self._re_infer_short_entries( + merged_path=merged_path, + re_shard_paths=re_shard_paths, + re_merged_path=re_merged_path, + provider=provider, + dataset_name=dataset_name, + num_workers=worker_count, + ) + print( + f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}" + ) + retry_round += 1 + if remaining_count == 0: + break + + print(f"[Preprocess] Done. Merged context at: {merged_path}") + self.context_json_path = str(merged_path) + return merged_path diff --git a/ontolearner/learner/taxonomy_discovery/sbunlp.py b/ontolearner/learner/taxonomy_discovery/sbunlp.py new file mode 100644 index 0000000..660ec6e --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/sbunlp.py @@ -0,0 +1,402 @@ +# Copyright (c) 2025 SciKnowOrg +# License: MIT + +import os +import re +import json +from typing import Any, Dict, List, Optional + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +from ...base import AutoLearner + + +class SBUNLPFewShotLearner(AutoLearner): + """ + Few-shot taxonomy discovery via N×M batch prompting. + + This learner: + - Caches & cleans gold parent–child pairs during `fit`. + - Splits (train pairs × test terms) into a grid of chunks. + - Builds an instruction prompt per grid cell with few-shot JSON examples. + - Generates and parses model outputs as JSON relations. + - Merges & deduplicates all predicted edges. 
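+
+    Note:
+        With the grid batching above, up to `num_train_chunks * num_test_chunks`
+        prompts are issued (one per grid cell); `limit_num_prompts` can cap this
+        for debugging or cost control.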
+ """ + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", + try_4bit: bool = True, + device: str = "cpu", + num_train_chunks: int = 7, + num_test_chunks: int = 7, + max_new_tokens: int = 140, + max_input_tokens: int = 1500, + temperature: float = 0.0, + top_p: float = 1.0, + limit_num_prompts: Optional[int] = None, + output_dir: Optional[str] = None, + **kwargs: Any, + ) -> None: + """ + Initialize the learner and core generation / batching settings. + + Args: + model_name: HF id/path of the causal LLM (e.g., Qwen Instruct). + try_4bit: If True and on CUDA, load with 4-bit NF4 quantization. + device: "cpu" or "cuda" for model execution. + num_train_chunks: Number of chunks for the gold (parent, child) bank. + num_test_chunks: Number of chunks for the test term list. + max_new_tokens: Max new tokens to generate per prompt call. + max_input_tokens: Clip the *input* prompt to this many tokens (tail kept). + temperature: Sampling temperature; 0.0 uses greedy decoding. + top_p: Nucleus sampling parameter (used when temperature > 0). + limit_num_prompts: Optional hard cap on prompts issued (debug/cost). + output_dir: Optional directory to save per-batch JSON predictions. + **kwargs: Forwarded to the base class. + """ + super().__init__(**kwargs) + self.model_name = model_name + self.try_4bit = try_4bit + self.device = device + + self.num_train_chunks = num_train_chunks + self.num_test_chunks = num_test_chunks + self.max_new_tokens = max_new_tokens + self.max_input_tokens = max_input_tokens + self.temperature = temperature + self.top_p = top_p + self.limit_num_prompts = limit_num_prompts + self.output_dir = output_dir + + self.tokenizer: Optional[AutoTokenizer] = None + self.model: Optional[AutoModelForCausalLM] = None + self.train_pairs_clean: List[Dict[str, str]] = [] + + def _clean_pairs(self, pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: + """ + Normalize, filter, and deduplicate relation pairs. + + Operations: + - Cast 'parent'/'child' to strings and strip whitespace. + - Drop rows with empty values. + - Drop self-relations (case-insensitive parent == child). + - Deduplicate by lowercase (parent, child). + + Args: + pair_rows: Raw list of dicts with at least 'parent' and 'child'. + + Returns: + Cleaned list of {'parent','child'} dicts. + """ + cleaned, seen = [], set() + for rec in pair_rows or []: + if not isinstance(rec, dict): + continue + p = str(rec.get("parent", "")).strip() + c = str(rec.get("child", "")).strip() + if not p or not c: + continue + key = (p.lower(), c.lower()) + if key[0] == key[1] or key in seen: + continue + seen.add(key) + cleaned.append({"parent": p, "child": c}) + return cleaned + + def _chunk_list(self, items: List[Any], num_chunks: int) -> List[List[Any]]: + """ + Split a list into `num_chunks` near-equal contiguous parts. + + Args: + items: Sequence to split. + num_chunks: Number of chunks to produce; if <= 0, returns [items]. + + Returns: + List of chunks (some may be empty if len(items) < num_chunks). + """ + if num_chunks <= 0: + return [items] + n = len(items) + base, rem = divmod(n, num_chunks) + out, start = [], 0 + for i in range(num_chunks): + size = base + (1 if i < rem else 0) + out.append(items[start : start + size]) + start += size + return out + + def _ensure_dir(self, path: Optional[str]) -> None: + """ + Create a directory if `path` is a non-empty string. + + Args: + path: Directory to create (recursively). Ignored if falsy. 
+ """ + if path: + os.makedirs(path, exist_ok=True) + + def load(self, **_: Any) -> None: + """ + Load tokenizer and model; optionally enable 4-bit quantization. + + Assumes bitsandbytes is available if `try_4bit=True` on CUDA. + Sets tokenizer pad token if missing. Places model on GPU (device_map='auto') + when `device='cuda'`, otherwise on CPU. + + Args: + **_: Unused kwargs for interface compatibility. + """ + quant_config = None + if self.try_4bit and self.device == "cuda": + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if getattr(self.tokenizer, "pad_token_id", None) is None: + if getattr(self.tokenizer, "eos_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.eos_token + elif getattr(self.tokenizer, "unk_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.unk_token + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map=("auto" if self.device == "cuda" else None), + torch_dtype=(torch.float16 if self.device == "cuda" else torch.float32), + quantization_config=quant_config, + ) + if self.device == "cpu": + self.model.to("cpu") + + def _format_chat(self, user_text: str) -> str: + """ + Wrap plain text with the model's chat template, if provided. + + Many instruction-tuned models expose `tokenizer.chat_template`. + If available, use it to construct a proper chat prompt; otherwise, + return the text unchanged. + + Args: + user_text: Content of the user message. + + Returns: + A generation-ready prompt string. + """ + if hasattr(self.tokenizer, "apply_chat_template") and getattr( + self.tokenizer, "chat_template", None + ): + return self.tokenizer.apply_chat_template( + [{"role": "user", "content": user_text}], + tokenize=False, + add_generation_prompt=True, + ) + return user_text + + @torch.no_grad() + def _generate(self, prompt_text: str) -> str: + """ + Generate text for a single prompt, guarding input length. + + Steps: + 1) Format prompt via chat template (if present). + 2) Tokenize and clip the *input* to `max_input_tokens` (tail kept). + 3) Call `model.generate` with configured decoding params. + 4) Strip the echoed prompt from the decoded output (if present). + + Args: + prompt_text: Textual prompt to feed the model. + + Returns: + Model continuation string (prompt-echo stripped when applicable). 
+ """ + formatted = self._format_chat(prompt_text) + ids = self.tokenizer(formatted, add_special_tokens=False, return_tensors=None)[ + "input_ids" + ] + if len(ids) > self.max_input_tokens: + ids = ids[-self.max_input_tokens :] + device = next(self.model.parameters()).device + input_ids = torch.tensor([ids], device=device) + + out = self.model.generate( + input_ids=input_ids, + max_new_tokens=self.max_new_tokens, + do_sample=(self.temperature > 0.0), + temperature=self.temperature, + top_p=self.top_p, + pad_token_id=self.tokenizer.pad_token_id, + eos_token_id=getattr(self.tokenizer, "eos_token_id", None), + use_cache=True, + ) + + decoded_full = self.tokenizer.decode(out[0], skip_special_tokens=True) + decoded_prompt = self.tokenizer.decode(input_ids[0], skip_special_tokens=True) + return ( + decoded_full[len(decoded_prompt) :].strip() + if decoded_full.startswith(decoded_prompt) + else decoded_full.strip() + ) + + def _build_prompt( + self, + train_pairs_chunk: List[Dict[str, str]], + test_terms_chunk: List[str], + ) -> str: + """ + Construct a few-shot prompt with JSON examples and test terms. + + The prompt: + - Shows several gold (parent, child) examples in JSON. + - Lists the test terms (one per line) between [PAIR] tags. + - Instructs to return ONLY a JSON array of {'parent','child'}. + + Args: + train_pairs_chunk: Cleaned training relations for examples. + test_terms_chunk: The current chunk of test terms. + + Returns: + The fully formatted prompt string. + """ + examples_json = json.dumps(train_pairs_chunk, ensure_ascii=False, indent=2) + test_block = "\n".join(test_terms_chunk) + prompt = ( + "From this file, extract all parent–child relations like in the examples.\n" + "Return ONLY a JSON array of objects with keys 'parent' and 'child'.\n" + "Output format:\n" + "[\n" + ' {"parent": "parent1", "child": "child1"},\n' + ' {"parent": "parent2", "child": "child2"}\n' + "]\n\n" + "EXAMPLES (JSON):\n" + f"{examples_json}\n\n" + "TEST TYPES (between [PAIR] tags):\n" + "[PAIR]\n" + f"{test_block}\n" + "[PAIR]\n" + "Return only JSON." + ) + return prompt + + def _parse_pairs(self, text: str) -> List[Dict[str, str]]: + """ + Parse a generation string into a list of relation dicts. + + Parsing strategy: + 1) Try to parse the entire string as JSON; expect a list. + 2) Else, regex-extract the outermost JSON-like array and parse that. + 3) On failure, return an empty list. + + Args: + text: Raw model output. + + Returns: + Cleaned list of {'parent','child'} dicts (possibly empty). + """ + text = text.strip() + try: + obj = json.loads(text) + if isinstance(obj, list): + return self._clean_pairs(obj) + except Exception: + pass + m = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", text) + if m: + try: + obj = json.loads(m.group(0)) + if isinstance(obj, list): + return self._clean_pairs(obj) + except Exception: + pass + return [] + + def fit(self, train_data: Any, task: str, ontologizer: bool = True): + """ + Cache and clean gold relations for few-shot prompting. + + For `task == "taxonomy-discovery"`: + - If `ontologizer=True`, convert ontology-like input into + a list of {'parent','child'} via the base helper. + - Otherwise, accept a user-provided list directly. + - Store a cleaned, deduplicated bank in `self.train_pairs_clean`. + + Args: + train_data: Ontology-like object or list of relation dicts. + task: Task selector (expects "taxonomy-discovery"). + ontologizer: Whether to transform ontology inputs. + + Returns: + None. (State is stored on the instance.) 
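+
+        Example (minimal sketch):
+            self.fit([{"parent": "disease", "child": "cancer"}],
+                     task="taxonomy-discovery", ontologizer=False)
+            # self.train_pairs_clean == [{"parent": "disease", "child": "cancer"}]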
+ """ + if task != "taxonomy-discovery": + return super().fit(train_data, task, ontologizer) + if ontologizer: + gold = self.tasks_ground_truth_former(train_data, task="taxonomy-discovery") + self.train_pairs_clean = self._clean_pairs(gold) + else: + self.train_pairs_clean = self._clean_pairs(train_data) + + def _taxonomy_discovery( + self, data: Any, test: bool = False + ) -> Optional[List[Dict[str, str]]]: + """ + Run few-shot inference (test=True) or no-op during training. + + Inference steps: + - Ensure tokenizer/model are loaded. + - Normalize `data` to a list of test terms (via base helper if needed). + - Create the N×M grid across (train_pairs_chunk × test_terms_chunk). + - For each cell: build prompt → generate → parse → (optionally) save. + - Merge and deduplicate all predicted pairs before returning. + + Args: + data: Test input (ontology-like, list of strings, or mixed). + test: If True, perform prediction; otherwise return None. + + Returns: + On `test=True`: deduplicated list of {'parent','child'}. + On `test=False`: None. + """ + if not test: + return None + if self.model is None or self.tokenizer is None: + self.load() + + if isinstance(data, list) and (len(data) == 0 or isinstance(data[0], str)): + test_terms: List[str] = data + else: + test_terms = super().tasks_data_former( + data=data, task="taxonomy-discovery", test=True + ) + + train_chunks = self._chunk_list(self.train_pairs_clean, self.num_train_chunks) + test_chunks = self._chunk_list(test_terms, self.num_test_chunks) + + self._ensure_dir(self.output_dir) + + merged: List[Dict[str, str]] = [] + issued = 0 + + for ti, tr in enumerate(train_chunks, 1): + for si, ts in enumerate(test_chunks, 1): + issued += 1 + if self.limit_num_prompts and issued > self.limit_num_prompts: + break + prompt = self._build_prompt(tr, ts) + resp = self._generate(prompt) + pairs = self._parse_pairs(resp) + + if self.output_dir: + path = os.path.join(self.output_dir, f"pairs_T{ti}_S{si}.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(pairs, f, ensure_ascii=False, indent=2) + + merged.extend(pairs) + + if self.limit_num_prompts and issued >= (self.limit_num_prompts or 0): + break + + return self._clean_pairs(merged) diff --git a/ontolearner/learner/taxonomy_discovery/skhnlp.py b/ontolearner/learner/taxonomy_discovery/skhnlp.py new file mode 100644 index 0000000..c242aab --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/skhnlp.py @@ -0,0 +1,1138 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
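+
+# This module defines two taxonomy-discovery learners:
+#   * SKHNLPSequentialFTLearner - sequentially fine-tunes a BERT-style
+#     classifier over several True/False prompt variants; at inference it
+#     selects parents by prompt voting (ontology input) or performs pairwise
+#     binary classification (DataFrame/list input).
+#   * SKHNLPZSLearner - zero-shot classification of child terms into a fixed
+#     inventory of nine GeoNames parent classes using an instruction-tuned
+#     causal LLM and a strict '#[ ... ]#' answer format.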
+ +import os +import re +import random + +import pandas as pd +import torch +import Levenshtein +from datasets import Dataset +from typing import Any, Optional, List, Tuple, Dict +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + AutoModelForCausalLM, + BertTokenizer, + BertForSequenceClassification, + pipeline, + Trainer, + TrainingArguments, +) + +from ...base import AutoLearner, AutoPrompt +from ...utils import taxonomy_split, train_test_split as ontology_split +from ...data_structure import OntologyData, TaxonomicRelation + + +class SKHNLPTaxonomyPrompts(AutoPrompt): + """Builds the 7 taxonomy prompts used during fine-tuning / inference. + + The class stores a small inventory of prompt templates that verbalize the + (parent, child) relationship using different phrasings. Each template ends + with a masked token slot intended for True/False classification. + """ + + def __init__(self) -> None: + """Initialize prompt templates and the default prompt in the base class.""" + super().__init__( + prompt_template="{parent} is the superclass of {child}. This statement is [MASK]." + ) + self.templates: List[str] = [ + "{parent} is the superclass of {child}. This statement is [MASK].", + "{child} is a subclass of {parent}. This statement is [MASK].", + "{parent} is the parent class of {child}. This statement is [MASK].", + "{child} is a child class of {parent}. This statement is [MASK].", + "{parent} is a supertype of {child}. This statement is [MASK].", + "{child} is a subtype of {parent}. This statement is [MASK].", + "{parent} is an ancestor class of {child}. This statement is [MASK].", + ] + + def format(self, parent: str, child: str, template_idx: int) -> str: + """Render a prompt for a (parent, child) pair using a specific template. + + Args: + parent: The parent/superclass label. + child: The child/subclass label. + template_idx: Index into the internal `templates` list. + + Returns: + The fully formatted prompt string. + """ + return self.templates[template_idx].format(parent=parent, child=child) + + +class SKHNLPSequentialFTLearner(AutoLearner): + """ + BERT-based classifier for taxonomy discovery. + + With OntologyData: + * TRAIN: ontology-aware split; create balanced train/eval with negatives. + * PREDICT/TEST: notebook-style parent selection -> list[{'parent', 'child'}]. + + With DataFrame/list: + * TRAIN: taxonomy_split + negatives; build prompts and fine-tune. + * PREDICT/TEST: pairwise binary classification (returns label + score). + """ + + def __init__( + self, + # core + model_name: str = "bert-large-uncased", + n_prompts: int = 7, + random_state: int = 1403, + num_labels: int = 2, + device: str = "cpu", # "cuda" | "cpu" | None (auto) + # data split & negative sampling (now configurable) + eval_fraction: float = 0.16, + neg_ratio_reversed: float = 1 / 3, + neg_ratio_manipulated: float = 2 / 3, + # ---- expose TrainingArguments as individual user-defined args ---- + output_dir: str = "./results/", + num_train_epochs: int = 1, + per_device_train_batch_size: int = 4, + per_device_eval_batch_size: int = 4, + warmup_steps: int = 500, + weight_decay: float = 0.01, + logging_dir: str = "./logs/", + logging_steps: int = 50, + eval_strategy: str = "epoch", + save_strategy: str = "epoch", + load_best_model_at_end: bool = True, + use_fast_tokenizer: Optional[bool] = None, + trust_remote_code: bool = False, + ) -> None: + """Configure the sequential fine-tuning learner. + + Args: + model_name: HF model id or local path for the BERT backbone. 
+ n_prompts: Number of prompt variants to iterate over sequentially. + random_state: RNG seed for shuffling/sampling steps. + num_labels: Number of classes for the classifier head. + device: Force device ('cuda' or 'cpu'). If None, auto-detects CUDA. + eval_fraction: Fraction of positives to hold out for evaluation. + neg_ratio_reversed: Proportion of reversed-parent negatives vs positives. + neg_ratio_manipulated: Proportion of random-parent negatives vs positives. + output_dir: Directory where HF Trainer writes checkpoints/outputs. + num_train_epochs: Number of epochs per prompt. + per_device_train_batch_size: Training batch size per device. + per_device_eval_batch_size: Evaluation batch size per device. + warmup_steps: Linear warmup steps for LR scheduler. + weight_decay: Weight decay coefficient. + logging_dir: Directory for Trainer logs. + logging_steps: Interval for log events (in steps). + eval_strategy: Evaluation schedule ('no', 'steps', 'epoch'). + save_strategy: Checkpoint save schedule ('no', 'steps', 'epoch'). + load_best_model_at_end: Whether to restore the best checkpoint. + use_fast_tokenizer: Force fast/slow tokenizer. If None, try fast then fallback to slow. + Notes: + The model is fine-tuned *sequentially* across prompt columns. + You can control the eval split and negative sampling mix via + `eval_fraction`, `neg_ratio_reversed`, and `neg_ratio_manipulated`. + """ + super().__init__() + self.model_name = model_name + self.n_prompts = n_prompts + self.random_state = random_state + self.num_labels = num_labels + self.device = device + + # user-tunable ratios / split + self._eval_fraction = float(eval_fraction) + self._neg_ratio_reversed = float(neg_ratio_reversed) + self._neg_ratio_manipulated = float(neg_ratio_manipulated) + if not (0.0 < self._eval_fraction < 1.0): + raise ValueError("eval_fraction must be in (0, 1).") + if self._neg_ratio_reversed < 0 or self._neg_ratio_manipulated < 0: + raise ValueError("neg_ratio_* must be >= 0.") + + self.tokenizer: Optional[BertTokenizer] = None + self.model: Optional[BertForSequenceClassification] = None + self.prompter = SKHNLPTaxonomyPrompts() + + # Candidate parents (unique parent list) for multi-class parent selection. 
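+        # Seeded on the first fit/predict call and extended (set union) when
+        # new parents appear at prediction time; _select_parent_via_prompts
+        # votes over exactly this pool.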
+ self._candidate_parents: Optional[List[str]] = None + + # Keep last train/eval tables for inspection + self._last_train: Optional[pd.DataFrame] = None + self._last_eval: Optional[pd.DataFrame] = None + self.trust_remote_code = bool(trust_remote_code) + self.use_fast_tokenizer = use_fast_tokenizer + + random.seed(self.random_state) + + # Build TrainingArguments from the individual user-defined values + self.training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_train_epochs, + per_device_train_batch_size=per_device_train_batch_size, + per_device_eval_batch_size=per_device_eval_batch_size, + warmup_steps=warmup_steps, + weight_decay=weight_decay, + logging_dir=logging_dir, + logging_steps=logging_steps, + eval_strategy=eval_strategy, + save_strategy=save_strategy, + load_best_model_at_end=load_best_model_at_end, + ) + + def load(self, model_id: Optional[str] = None, **_: Any) -> None: + """Load tokenizer & model in a backbone-agnostic way; move model to self.device.""" + model_id = model_id or self.model_name + + # ---- Tokenizer (robust fast→slow fallback unless explicitly set) ---- + if self.use_fast_tokenizer is None: + try: + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=self.trust_remote_code + ) + except Exception as fast_err: + print( + f"[tokenizer] Fast tokenizer failed: {fast_err}. Falling back to slow tokenizer..." + ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=self.trust_remote_code + ) + else: + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, + use_fast=self.use_fast_tokenizer, + trust_remote_code=self.trust_remote_code, + ) + + # Ensure pad token exists (some models lack it) + if getattr(self.tokenizer, "pad_token", None) is None: + # Try sensible fallbacks + fallback = ( + getattr(self.tokenizer, "eos_token", None) + or getattr(self.tokenizer, "sep_token", None) + or getattr(self.tokenizer, "cls_token", None) + ) + if fallback is not None: + self.tokenizer.pad_token = fallback + + # ---- Model (classifier head sized to self.num_labels) ---- + self.model = AutoModelForSequenceClassification.from_pretrained( + model_id, + num_labels=self.num_labels, + trust_remote_code=self.trust_remote_code, + # Allows swapping in a new head size even if the checkpoint differs + ignore_mismatched_sizes=True, + ) + + # Make sure padding ids line up + if ( + getattr(self.model.config, "pad_token_id", None) is None + and getattr(self.tokenizer, "pad_token_id", None) is not None + ): + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + # Set problem type (single-label classification by default) + # If you plan multi-label, you'd switch to "multi_label_classification" + self.model.config.problem_type = "single_label_classification" + + # Move to target device + self.model.to(self.device) + + def tasks_ground_truth_former(self, data: Any, task: str) -> Any: + """Normalize ground-truth inputs for 'taxonomy-discovery'. + + Supports DataFrame with columns ['parent','child',('label')], + list of dicts, or falls back to the base class behavior. + + Args: + data: Input object to normalize. + task: Task name, passed from the outer pipeline. + + Returns: + A list of dictionaries with keys 'parent', 'child', and optionally + 'label' when present in the input. 
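+
+        Example (minimal sketch):
+            df = pd.DataFrame({"parent": ["disease"], "child": ["cancer"]})
+            self.tasks_ground_truth_former(df, task="taxonomy-discovery")
+            # -> [{"parent": "disease", "child": "cancer"}]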
+ """ + if task != "taxonomy-discovery": + return super().tasks_ground_truth_former(data, task) + + if isinstance(data, pd.DataFrame): + if "label" in data.columns: + return [ + {"parent": p, "child": c, "label": bool(lbl)} + for p, c, lbl in zip(data["parent"], data["child"], data["label"]) + ] + return [ + {"parent": p, "child": c} for p, c in zip(data["parent"], data["child"]) + ] + + if isinstance(data, list): + return data + + return super().tasks_ground_truth_former(data, task) + + def _make_negatives( + self, positives_df: pd.DataFrame + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create two types of negatives from a positives table. + + Returns: + A tuple `(reversed_df, manipulated_df)` where: + - `reversed_df`: pairs with parent/child columns swapped, label=False. + - `manipulated_df`: pairs with the parent replaced by a random + *different* parent from the same pool, label=False. + + Notes: + The input DataFrame must contain columns ['parent', 'child']. + """ + unique_parents = positives_df["parent"].unique().tolist() + + def as_reversed(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + out[["parent", "child"]] = out[["child", "parent"]].values + out["label"] = False + return out + + def with_random_parent(df: pd.DataFrame) -> pd.DataFrame: + def pick_other_parent(p: str) -> str: + pool = [x for x in unique_parents if x != p] + return random.choice(pool) if pool else p + + out = df.copy() + out["parent"] = out["parent"].apply(pick_other_parent) + out["label"] = False + return out + + return as_reversed(positives_df), with_random_parent(positives_df) + + def _balance_with_negatives( + self, + positives_df: pd.DataFrame, + reversed_df: pd.DataFrame, + manipulated_df: pd.DataFrame, + ) -> pd.DataFrame: + """Combine positives with negatives using configured ratios. + + Sampling ratios are defined by the instance settings + `self._neg_ratio_reversed` and `self._neg_ratio_manipulated`, + keeping the positives count unchanged. + + Args: + positives_df: Positive pairs with `label=True`. + reversed_df: Negative pairs produced by flipping parent/child. + manipulated_df: Negative pairs with randomly reassigned parents. + + Returns: + A deduplicated, shuffled DataFrame with a class-balanced mix. + """ + n_pos = len(positives_df) + n_rev = int(n_pos * self._neg_ratio_reversed) + n_man = int(n_pos * self._neg_ratio_manipulated) + + combined = pd.concat( + [ + positives_df.sample(n_pos, random_state=self.random_state), + reversed_df.sample(n_rev, random_state=self.random_state), + manipulated_df.sample(n_man, random_state=self.random_state), + ], + ignore_index=True, + ) + combined = combined.drop_duplicates( + subset=["parent", "child", "label"] + ).reset_index(drop=True) + return combined + + def _add_prompt_columns(self, df: pd.DataFrame) -> pd.DataFrame: + """Append one column per prompt variant to the given pairs table. + + For each row `(parent, child)`, creates columns `prompt_1 ... prompt_n`. + + Args: + df: Input DataFrame with columns ['parent', 'child', ...]. + + Returns: + A copy of `df` including the newly added prompt columns. + """ + out = df.copy() + for i in range(self.n_prompts): + out[f"prompt_{i + 1}"] = out.apply( + lambda r, k=i: self.prompter.format(r["parent"], r["child"], k), axis=1 + ) + return out + + def _df_from_relations( + self, relations: List[TaxonomicRelation], label: bool = True + ) -> pd.DataFrame: + """Convert a list of `TaxonomicRelation` to a DataFrame. + + Args: + relations: Iterable of `TaxonomicRelation(parent, child)`. 
+ label: Class label to assign to all resulting rows. + + Returns: + DataFrame with columns ['parent', 'child', 'label']. + """ + if not relations: + return pd.DataFrame(columns=["parent", "child", "label"]) + return pd.DataFrame( + [{"parent": r.parent, "child": r.child, "label": label} for r in relations] + ) + + def _relations_from_df(self, df: pd.DataFrame) -> List[TaxonomicRelation]: + """Convert a DataFrame to a list of `TaxonomicRelation`. + + Args: + df: DataFrame with columns ['parent', 'child']. + + Returns: + List of `TaxonomicRelation` objects in row order. + """ + return [ + TaxonomicRelation(parent=p, child=c) + for p, c in zip(df["parent"], df["child"]) + ] + + def _build_masked_prompt( + self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]" + ) -> str: + """Construct one of several True/False prompts with a mask token. + + Args: + parent: Parent label. + child: Child label. + index_1_based: 1-based index selecting a template. + mask_token: The token used to denote the masked label. + + Returns: + A formatted prompt string. + """ + prompts_1based = [ + f"{parent} is the superclass of {child}. This statement is {mask_token}.", + f"{child} is a subclass of {parent}. This statement is {mask_token}.", + f"{parent} is the parent class of {child}. This statement is {mask_token}.", + f"{child} is a child class of {parent}. This statement is {mask_token}.", + f"{parent} is a supertype of {child}. This statement is {mask_token}.", + f"{child} is a subtype of {parent}. This statement is {mask_token}.", + f"{parent} is an ancestor class of {child}. This statement is {mask_token}.", + f"{child} is a descendant classs of {child}. This statement is {mask_token}.", + f'"{parent}" is the superclass of "{child}". This statement is {mask_token}.', + ] + return prompts_1based[index_1_based - 1] + + @torch.no_grad() + def _predict_prompt_true_false(self, sentence: str) -> bool: + """Run a single True/False prediction on a prompt. + + Args: + sentence: Fully formatted prompt text. + + Returns: + True iff the predicted class index is 1 (positive). + """ + enc = self.tokenizer(sentence, return_tensors="pt").to(self.model.device) + logits = self.model(**enc).logits + predicted_label = torch.argmax(logits, dim=1).item() + return predicted_label == 1 + + def _select_parent_via_prompts(self, child: str) -> str: + """Select the most likely parent for a given child via prompt voting. + + The procedure: + 1) Generate prompts for each candidate parent at increasing "levels". + 2) Accumulate votes from the True/False classifier. + 3) Resolve ties by recursing to the next level; after 4 levels, break ties randomly. + + Args: + child: The child label whose parent should be predicted. + + Returns: + The chosen parent string. + + Raises: + AssertionError: If candidate parents were not initialized. + """ + assert self._candidate_parents, "Candidate parents not initialized." 
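+        # Voting schedule over the 1-based prompt templates: level 0 asks
+        # template 1 only; level k (k >= 1) asks templates 2k and 2k + 1,
+        # skipping indices above n_prompts. Candidates tied on votes are
+        # re-scored at the next level; a tie remaining at level 4 is broken
+        # at random.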
+ scores: dict[str, int] = {p: 0 for p in self._candidate_parents} + + def prompt_indices_for_level(level: int) -> List[int]: + if level == 0: + return [1] + return [2 * level, 2 * level + 1] + + def recurse(active_parents: List[str], level: int) -> str: + idxs = [ + i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts + ] + if idxs: + for parent in active_parents: + votes = sum( + 1 + for idx in idxs + if self._predict_prompt_true_false( + self._build_masked_prompt( + parent=parent, child=child, index_1_based=idx + ) + ) + ) + scores[parent] += votes + + max_score = max(scores[p] for p in active_parents) + tied = [p for p in active_parents if scores[p] == max_score] + if len(tied) == 1: + return tied[0] + if level < 4: + return recurse(tied, level + 1) + return random.choice(tied) + + return recurse(list(scores.keys()), level=0) + + def _taxonomy_discovery(self, data: Any, test: bool = False): + """ + TRAIN: + - OntologyData -> ontology-aware split; negatives per split; balanced sets. + - DataFrame/list -> taxonomy_split for positives; negatives proportional. + TEST: + - OntologyData -> parent selection: [{'parent': predicted, 'child': child}] + - DataFrame/list -> binary pair classification with 'label' + 'score' + + Args: + data: One of {OntologyData, pandas.DataFrame, list[dict], list[tuple]}. + test: If True, run inference; otherwise perform training. + + Returns: + - On training: None (model is fine-tuned in-place). + - On inference with OntologyData: list of {'parent','child'} predictions. + - On inference with pairs: list of dicts including 'label' and 'score'. + """ + is_ontology_object = isinstance(data, OntologyData) + + # Normalize input + if isinstance(data, pd.DataFrame): + pairs_df = data.copy() + elif isinstance(data, list): + pairs_df = pd.DataFrame(data) + else: + gt_pairs = super().tasks_ground_truth_former(data, "taxonomy-discovery") + pairs_df = pd.DataFrame(gt_pairs) + if "label" not in pairs_df.columns: + pairs_df["label"] = True + + # Maintain candidate parents across calls + if "parent" in pairs_df.columns: + parents_in_call = sorted(pd.unique(pairs_df["parent"]).tolist()) + if test: + if self._candidate_parents is None: + self._candidate_parents = parents_in_call + else: + self._candidate_parents = sorted( + set(self._candidate_parents).union(parents_in_call) + ) + else: + if self._candidate_parents is None: + self._candidate_parents = parents_in_call + + if test: + if is_ontology_object and self._candidate_parents: + predictions: List[dict[str, str]] = [] + for _, row in pairs_df.iterrows(): + child_term = row["child"] + chosen_parent = self._select_parent_via_prompts(child_term) + predictions.append({"parent": chosen_parent, "child": child_term}) + return predictions + + # pairwise binary classification + prompts_df = self._add_prompt_columns(pairs_df.copy()) + true_probs_by_prompt: List[torch.Tensor] = [] + + for i in range(self.n_prompts): + col = f"prompt_{i + 1}" + enc = self.tokenizer( + prompts_df[col].tolist(), + return_tensors="pt", + padding=True, + truncation=True, + ).to(self.model.device) + with torch.no_grad(): + logits = self.model(**enc).logits + true_probs_by_prompt.append(torch.softmax(logits, dim=1)[:, 1]) + + avg_true_prob = torch.stack(true_probs_by_prompt, dim=0).mean(0) + predicted_bool = (avg_true_prob >= 0.5).cpu().tolist() + + results: List[dict[str, Any]] = [] + for p, c, s, yhat in zip( + pairs_df["parent"], + pairs_df["child"], + avg_true_prob.tolist(), + predicted_bool, + ): + results.append( + { + "parent": p, + 
"child": c, + "label": int(bool(yhat)), + "score": float(s), + } + ) + return results + + if isinstance(data, OntologyData): + train_onto, eval_onto = ontology_split( + data, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, + ) + + train_pos_rel: List[TaxonomicRelation] = ( + getattr(train_onto.type_taxonomies, "taxonomies", []) or [] + ) + eval_pos_rel: List[TaxonomicRelation] = ( + getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + ) + + train_pos_df = self._df_from_relations(train_pos_rel, label=True) + eval_pos_df = self._df_from_relations(eval_pos_rel, label=True) + + tr_rev_df, tr_man_df = self._make_negatives(train_pos_df) + ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) + + train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df) + eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + else: + if "label" not in pairs_df.columns or pairs_df["label"].nunique() == 1: + positives_df = pairs_df[pairs_df.get("label", True)][ + ["parent", "child"] + ].copy() + pos_rel = self._relations_from_df(positives_df) + + tr_rel, ev_rel = taxonomy_split( + pos_rel, + train_terms=None, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, + ) + train_pos_df = self._df_from_relations(tr_rel, label=True) + eval_pos_df = self._df_from_relations(ev_rel, label=True) + + tr_rev_df, tr_man_df = self._make_negatives(train_pos_df) + ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) + + train_df = self._balance_with_negatives( + train_pos_df, tr_rev_df, tr_man_df + ) + eval_df = self._balance_with_negatives( + eval_pos_df, ev_rev_df, ev_man_df + ) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + else: + positives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy() + pos_rel = self._relations_from_df(positives_df) + + tr_rel, ev_rel = taxonomy_split( + pos_rel, + train_terms=None, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, + ) + train_pos_df = self._df_from_relations(tr_rel, label=True) + eval_pos_df = self._df_from_relations(ev_rel, label=True) + + negatives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy() + negatives_df = negatives_df.sample( + frac=1.0, random_state=self.random_state + ).reset_index(drop=True) + + n_eval_neg = ( + max(1, int(len(negatives_df) * self._eval_fraction)) + if len(negatives_df) > 0 + else 0 + ) + eval_neg_df = ( + negatives_df.iloc[:n_eval_neg].copy() + if n_eval_neg > 0 + else negatives_df.iloc[:0].copy() + ) + train_neg_df = negatives_df.iloc[n_eval_neg:].copy() + + train_neg_df["label"] = False + eval_neg_df["label"] = False + + train_df = pd.concat([train_pos_df, train_neg_df], ignore_index=True) + eval_df = pd.concat([eval_pos_df, eval_neg_df], ignore_index=True) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + # Ensure labels are int64 + train_df["label"] = train_df["label"].astype("int64") + eval_df["label"] = eval_df["label"].astype("int64") + + # Sequential fine-tuning across prompts + for i in range(self.n_prompts): + prompt_col = f"prompt_{i + 1}" + train_ds = Dataset.from_pandas( + train_df[[prompt_col, "label"]].reset_index(drop=True) + ) + eval_ds = Dataset.from_pandas( + eval_df[[prompt_col, "label"]].reset_index(drop=True) + ) + + train_ds = train_ds.rename_column("label", 
"labels") + eval_ds = eval_ds.rename_column("label", "labels") + + def tokenize_batch(batch): + """Tokenize a batch for the current prompt column with truncation/padding.""" + return self.tokenizer( + batch[prompt_col], padding="max_length", truncation=True + ) + + train_ds = train_ds.map( + tokenize_batch, batched=True, remove_columns=[prompt_col] + ) + eval_ds = eval_ds.map( + tokenize_batch, batched=True, remove_columns=[prompt_col] + ) + + train_ds.set_format( + type="torch", columns=["input_ids", "attention_mask", "labels"] + ) + eval_ds.set_format( + type="torch", columns=["input_ids", "attention_mask", "labels"] + ) + + trainer = Trainer( + model=self.model, + args=self.training_args, + train_dataset=train_ds, + eval_dataset=eval_ds, + ) + trainer.train() + + self._last_train = train_df + self._last_eval = eval_df + return None + + +class SKHNLPZSLearner(AutoLearner): + """ + Zero-shot taxonomy learner using an instruction-tuned causal LLM. + + Behavior + -------- + - Builds a fixed classification prompt listing 9 GeoNames parent classes. + - For each input row (child term), generates a short completion and parses + the predicted class from a strict '#[ ... ]#' format. + - Optionally normalizes the raw prediction to one of the valid 9 labels via: + * "none" : keep the parsed text as-is + * "substring" : snap to a label if either is a substring of the other + * "levenshtein" : snap to the closest label by edit distance + * "auto" : substring, then Levenshtein if needed + - Saves raw and normalized predictions to CSV if `save_path` is provided. + + Inputs the learner accepts (via `_to_dataframe`): + - pandas.DataFrame with columns: ['child', 'parent'] or ['child', 'parent', 'label'] + - list[dict] with keys: 'child', 'parent' (and optionally 'label') + - list of tuples/lists: (child, parent) or (child, parent, label) + - OntoLearner-style object exposing .type_taxonomies.taxonomies iterable with (child, parent) + """ + + # Fixed class inventory (GeoNames parents) + CLASS_LIST = [ + "city, village", + "country, state, region", + "forest, heath", + "mountain, hill, rock", + "parks, area", + "road, railroad", + "spot, building, farm", + "stream, lake", + "undersea", + ] + + # Strict format: #[ ... ]# + _PREDICTION_PATTERN = re.compile(r"#\[\s*([^\]]+?)\s*\]#") + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + max_new_tokens: int = 16, + save_path: Optional[str] = None, # directory or full path + verbose: bool = True, + normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" + random_state: int = 1403, + ) -> None: + """Configure the zero-shot learner. + + Args: + model_name: HF model id/path for the instruction-tuned causal LLM. + device: Force device ('cuda' or 'cpu'), else auto-detect. + max_new_tokens: Generation length budget for each completion. + save_path: Optional CSV path or directory for saving predictions. + verbose: If True, print progress messages. + normalize_mode: Post-processing for class names + ('none' | 'substring' | 'levenshtein' | 'auto'). + random_state: RNG seed for any sampling steps. 
+ """ + super().__init__() + self.model_name = model_name + self.verbose = verbose + self.max_new_tokens = max_new_tokens + self.save_path = save_path + self.normalize_mode = (normalize_mode or "none").lower().strip() + self.random_state = random_state + + random.seed(self.random_state) + + # Device: auto-detect CUDA if not specified + if device is None: + self._has_cuda = torch.cuda.is_available() + else: + self._has_cuda = device == "cuda" + self._pipe_device = 0 if self._has_cuda else -1 + self._model_device_map = {"": "cuda"} if self._has_cuda else None + + self._tokenizer = None + self._model = None + self._pipeline = None + + # Prompt template used for every example + self._classification_prompt = ( + "My task is classification. My classes are as follows: " + "(city, village), (country, state, region), (forest, heath), " + "(mountain, hill, rock), (parks, area), (road, railroad), " + "(spot, building, farm), (stream, lake), (undersea). " + 'I will provide you with a phrase like "wadi mouth". ' + "The name of each class is placed within a pair of parentheses. " + "I want you to choose the most appropriate class from those mentioned above " + "based on the given phrase and present it in a format like #[parks, area]#. " + "So, the general format for each response will be #[class name]#. " + "Pay attention to the format of the response. Start with a '#' character, " + "include the class name inside it, and end with another '#' character. " + "Additionally, make sure to include a '#' character at the end to indicate " + "that the answer is complete. I don't need any additional explanations." + ) + + def load(self, model_id: str = "") -> None: + """ + Load tokenizer, model, and text-generation pipeline. + + Args: + model_id: Optional HF id/path override; defaults to `self.model_name`. + + Side Effects: + Initializes the tokenizer and model, configures the generation + pipeline on CPU/GPU, and sets a pad token if absent. + """ + model_id = model_id or self.model_name + if self.verbose: + print(f"[ZeroShotTaxonomyLearner] Loading {model_id}") + + self._tokenizer = AutoTokenizer.from_pretrained(model_id) + + # Ensure a pad token is set for generation + if ( + self._tokenizer.pad_token_id is None + and self._tokenizer.eos_token_id is not None + ): + self._tokenizer.pad_token = self._tokenizer.eos_token + + self._model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=self._model_device_map, + torch_dtype="auto", + ) + + self._pipeline = pipeline( + task="text-generation", + model=self._model, + tokenizer=self._tokenizer, + device=self._pipe_device, # 0 for GPU, -1 for CPU + ) + + if self.verbose: + print("Device set to use", "cuda" if self._has_cuda else "cpu") + print("[ZeroShotTaxonomyLearner] Model loaded.") + + def _taxonomy_discovery( + self, data: Any, test: bool = False + ) -> Optional[List[Dict[str, str]]]: + """ + Zero-shot prediction over all incoming rows (no filtering/augmentation). + + Args: + data: One of {DataFrame, list[dict], list[tuple], Ontology-like}. + test: If False, training is skipped (zero-shot learner), and None is returned. + + Returns: + On `test=True`, a list of dicts [{'parent': predicted_label, 'child': child}, ...]. + On `test=False`, returns None. 
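+
+        Example (illustrative sketch; `test_rows` stands for any input
+        accepted by `_to_dataframe`, e.g. a DataFrame with a 'child' column):
+            zs = SKHNLPZSLearner(normalize_mode="auto", device="cpu")
+            zs.load()
+            predictions = zs._taxonomy_discovery(test_rows, test=True)
+            # -> [{"parent": <CLASS_LIST label or "unknown">, "child": ...}, ...]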
+ """ + if not test: + if self.verbose: + print("[ZeroShot] Training skipped (zero-shot).") + return None + + df = self._to_dataframe(data) + + if self.verbose: + print(f"[ZeroShot] Incoming rows: {len(df)}; columns: {list(df.columns)}") + + eval_df = pd.DataFrame(df).reset_index(drop=True) + if eval_df.empty: + return [] + + # Prepare columns for inspection and saving + eval_df["prediction_raw"] = "" + eval_df["prediction_sub"] = "" + eval_df["prediction_lvn"] = "" + eval_df["prediction_auto"] = "" + eval_df["prediction"] = "" # final (per normalize_mode) + + # Generate predictions row by row + for idx, row in eval_df.iterrows(): + child_term = str(row["child"]) + raw_text, parsed_raw = self._generate_and_parse(child_term) + + # Choose a string to normalize (parsed token if matched, otherwise whole output) + basis = parsed_raw if parsed_raw != "unknown" else raw_text + + # Compute all normalization variants + sub_norm = self._normalize_substring_only(basis) + lvn_norm = self._normalize_levenshtein_only(basis) + auto_norm = self._normalize_auto(basis) + + # Final selection by mode + if self.normalize_mode == "none": + final_label = parsed_raw + elif self.normalize_mode == "substring": + final_label = sub_norm + elif self.normalize_mode == "levenshtein": + final_label = lvn_norm + elif self.normalize_mode == "auto": + final_label = auto_norm + else: + final_label = parsed_raw # fallback + + # Persist to DataFrame for inspection/export + eval_df.at[idx, "prediction_raw"] = parsed_raw + eval_df.at[idx, "prediction_sub"] = sub_norm + eval_df.at[idx, "prediction_lvn"] = lvn_norm + eval_df.at[idx, "prediction_auto"] = auto_norm + eval_df.at[idx, "prediction"] = final_label + + # Return in the format expected by the pipeline + return [ + {"parent": p, "child": c} + for p, c in zip(eval_df["prediction"], eval_df["child"]) + ] + + def _generate_and_parse(self, child_term: str) -> (str, str): + """ + Generate a completion for the given child term and extract the raw predicted class + using the strict '#[ ... ]#' pattern. + + Args: + child_term: The child label to classify into one of the fixed classes. + + Returns: + Tuple `(raw_generation_text, parsed_prediction_or_unknown)`, where the second + element is either the text inside '#[ ... ]#' or the string 'unknown'. + """ + messages = [ + {"role": "system", "content": "You are a helpful classifier."}, + {"role": "user", "content": f"{self._classification_prompt} {child_term}"}, + ] + + prompt = self._tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + generation = self._pipeline( + prompt, + max_new_tokens=self.max_new_tokens, + do_sample=False, + temperature=0.0, + top_p=1.0, + eos_token_id=self._tokenizer.eos_token_id, + pad_token_id=self._tokenizer.eos_token_id, + return_full_text=False, + )[0]["generated_text"] + + match = self._PREDICTION_PATTERN.search(generation) + parsed = match.group(1).strip() if match else "unknown" + return generation, parsed + + def _normalize_substring_only(self, text: str) -> str: + """ + Snap to a label if the string is equal to / contained in / contains a valid label (case-insensitive). + + Args: + text: Raw class text to normalize. + + Returns: + One of `CLASS_LIST` on a match; otherwise 'unknown'. 
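+
+        Example (minimal sketch):
+            self._normalize_substring_only("Stream, Lake")  # -> "stream, lake"
+            self._normalize_substring_only("lake")          # -> "stream, lake"
+            self._normalize_substring_only("galaxy")        # -> "unknown"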
+ """ + if not isinstance(text, str): + return "unknown" + lowered = text.strip().lower() + if not lowered: + return "unknown" + + for label in self.CLASS_LIST: + label_lower = label.lower() + if ( + lowered == label_lower + or lowered in label_lower + or label_lower in lowered + ): + return label + return "unknown" + + def _normalize_levenshtein_only(self, text: str) -> str: + """ + Snap to the nearest label by Levenshtein (edit) distance. + + Args: + text: Raw class text to normalize. + + Returns: + The nearest label in `CLASS_LIST`, or 'unknown' if input is empty/invalid. + """ + if not isinstance(text, str): + return "unknown" + lowered = text.strip().lower() + if not lowered: + return "unknown" + + best_label = None + best_distance = 10**9 + for label in self.CLASS_LIST: + label_lower = label.lower() + distance = Levenshtein.distance(lowered, label_lower) + if distance < best_distance: + best_distance = distance + best_label = label + return best_label or "unknown" + + def _normalize_auto(self, text: str) -> str: + """ + Cascade: try substring-first; if no match, fall back to Levenshtein snapping. + + Args: + text: Raw class text to normalize. + + Returns: + Normalized label string or 'unknown'. + """ + snapped = self._normalize_substring_only(text) + return ( + snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + ) + + def _to_dataframe(self, data: Any) -> pd.DataFrame: + """ + Normalize various input formats into a DataFrame. + + Supported inputs: + * pandas.DataFrame with columns ['child','parent',('label')] + * list[dict] with keys 'child','parent',('label') + * list of tuples/lists: (child, parent) or (child, parent, label) + * Ontology-like object with `.type_taxonomies.taxonomies` + + Args: + data: The source object to normalize. + + Returns: + A pandas DataFrame with standardized columns. + + Raises: + ValueError: If the input type/shape is not recognized. + """ + if isinstance(data, pd.DataFrame): + df = data.copy() + df.columns = [str(c).lower() for c in df.columns] + return df.reset_index(drop=True) + + if isinstance(data, list) and data and isinstance(data[0], dict): + rows = [{str(k).lower(): v for k, v in d.items()} for d in data] + return pd.DataFrame(rows).reset_index(drop=True) + + if isinstance(data, (list, tuple)) and data: + first = data[0] + if isinstance(first, (list, tuple)) and not isinstance(first, dict): + n = len(first) + if n >= 3: + return pd.DataFrame( + data, columns=["child", "parent", "label"] + ).reset_index(drop=True) + if n == 2: + return pd.DataFrame(data, columns=["child", "parent"]).reset_index( + drop=True + ) + + try: + type_taxonomies = getattr(data, "type_taxonomies", None) + if type_taxonomies is not None: + taxonomies = getattr(type_taxonomies, "taxonomies", None) + if taxonomies is not None: + rows = [] + for rel in taxonomies: + parent = getattr(rel, "parent", None) + child = getattr(rel, "child", None) + label = ( + getattr(rel, "label", None) + if hasattr(rel, "label") + else None + ) + if parent is not None and child is not None: + rows.append( + {"child": child, "parent": parent, "label": label} + ) + if rows: + return pd.DataFrame(rows).reset_index(drop=True) + except Exception: + pass + + raise ValueError( + "Unsupported data format. Provide a DataFrame, a list of dicts, " + "a list of (child, parent[, label]) tuples/lists, or an object with " + ".type_taxonomies.taxonomies." 
+ ) + + def _resolve_save_path(self, save_path: str, default_filename: str) -> str: + """ + Resolve a target file path from a directory or path-like input. + + If `save_path` points to a directory, joins it with `default_filename`. + If it already looks like a file path (has an extension), returns as-is. + + Args: + save_path: Directory or file path supplied by the caller. + default_filename: Basename to use when `save_path` is a directory. + + Returns: + A concrete file path where outputs can be written. + """ + base = os.path.basename(save_path) + has_ext = os.path.splitext(base)[1] != "" + return save_path if has_ext else os.path.join(save_path, default_filename) diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py new file mode 100644 index 0000000..dec8b9f --- /dev/null +++ b/ontolearner/learner/term_typing/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .alexbek import AlexbekRAGLearner, AlexbekRFLearner +from .rwthdbis import RWTHDBISSFTLearner +from .sbunlp import SBUNLPZSLearner diff --git a/ontolearner/learner/term_typing/alexbek.py b/ontolearner/learner/term_typing/alexbek.py new file mode 100644 index 0000000..0db694b --- /dev/null +++ b/ontolearner/learner/term_typing/alexbek.py @@ -0,0 +1,1262 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Learners for supervised and retrieval-augmented *term typing*. + +This module implements two learners: + +- **AlexbekRFLearner** (retriever/classifier): + Encodes terms with a Hugging Face encoder, optionally augments with simple + graph features, and trains a One-vs-Rest RandomForest for multi-label typing. + +- **AlexbekRAGLearner** (retrieval-augmented generation): + Builds an in-memory example index with sentence embeddings, retrieves + nearest examples for each query term, then prompts an instruction-tuned + causal LLM to produce types, parsing the JSON response. + +Both learners conform to the `AutoLearner` / `AutoRetriever` APIs used in +the outer pipeline. 
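+
+Typical usage (a minimal sketch; the encoder id is only an example)::
+
+    learner = AlexbekRFLearner(device="cpu", use_graph_features=False)
+    learner.load("sentence-transformers/all-MiniLM-L6-v2")
+    learner.fit(train_rows, task="term-typing")    # rows with 'term' and 'types'
+    predictions = learner.predict(test_rows, task="term-typing")
+    # -> [{"id": ..., "term": ..., "types": [...]}, ...]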
+""" + +import gc +import json +import re +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +import networkx as nx +from tqdm import tqdm +from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.ensemble import RandomForestClassifier +from sklearn.multiclass import OneVsRestClassifier + +from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM +from sentence_transformers import SentenceTransformer + +from ...base import AutoLearner, AutoRetriever + + +class AlexbekRFLearner(AutoRetriever): + """ + Embedding-based multi-label classifier for *term typing*. + + Pipeline + 1) Load a Hugging Face encoder (tokenizer + model). + 2) Encode input terms into sentence embeddings. + 3) Optionally augment with simple graph (co-occurrence) features. + 4) Train a One-vs-Rest RandomForest on the concatenated features. + 5) Predict multi-label types with a probability threshold (fallback to top-1). + + Implements the `AutoRetriever` interface used by the outer pipeline. + """ + + def __init__( + self, + device: str = "cpu", + batch_size: int = 16, + max_length: int = 256, + threshold: float = 0.30, + use_graph_features: bool = True, + rf_kwargs: Optional[Dict[str, Any]] = None, + ): + """Configure the RF-based multi-label learner. + + Parameters + device: + Torch device spec ('cpu' or 'cuda'). + batch_size: + Encoding mini-batch size for the transformer. + max_length: + Maximum input token length for the encoder tokenizer. + threshold: + Per-label probability threshold at prediction time. + use_graph_features: + If True, add simple graph features to embeddings. + rf_kwargs: + Optional RandomForest hyperparameters dictionary. + + """ + # Runtime / inference settings + self.device = torch.device(device) + self.batch_size = batch_size + self.max_length = max_length + self.threshold = threshold # probability cutoff for selecting labels + self.use_graph_features = use_graph_features + + # RandomForest hyperparameters (with sensible defaults) + self.rf_kwargs = rf_kwargs or dict( + n_estimators=200, max_depth=20, class_weight="balanced", random_state=42 + ) + + # Filled during load/fit + self.model_name: Optional[str] = None + self.tokenizer: Optional[AutoTokenizer] = None + self.embedding_model: Optional[AutoModel] = None + + # Label processing / classifier / optional graph + self.label_binarizer = MultiLabelBinarizer() + self.ovr_random_forest: Optional[OneVsRestClassifier] = None + self.term_graph: Optional[nx.Graph] = None + + def load(self, model_id: str, **_: Any) -> None: + """Load a Hugging Face encoder by model id (tokenizer + base model). + + Parameters + model_id: + HF model identifier or local path for an encoder backbone. + + Side Effects + - Sets `self.model_name`, `self.tokenizer`, `self.embedding_model`. + - Puts the model in eval mode and moves it to `self.device`. + """ + self.model_name = model_id + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.embedding_model = AutoModel.from_pretrained(model_id) + self.embedding_model.eval().to(self.device) + + def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: + """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features). + + Parameters + data: + Training payload; supported formats are routed via `_as_term_types_dicts`. + Each example must contain at least `{"term": str, "types": List[str]}`. + task: + Must be `'term-typing'`. + ontologizer: + Unused here; accepted for API compatibility. 
+ **_: + Ignored extra arguments. + + Raises + ValueError + If `task` is not `'term-typing'` or if no valid examples are found. + """ + if task != "term-typing": + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) + + # Normalize incoming training data into a list of dicts: {term, types, RAG} + training_rows = self._as_term_types_dicts(data) + if not training_rows: + raise ValueError( + "No valid training examples found (need 'term' and 'types')." + ) + + # Split out terms and raw labels + training_terms: List[str] = [row["term"] for row in training_rows] + raw_label_lists: List[List[str]] = [row["types"] for row in training_rows] + + # Fit label binarizer to learn label space/order + self.label_binarizer.fit(raw_label_lists) + + # Encode terms to sentence embeddings + term_embeddings_train = self._encode(training_terms) + + # Optionally build a light-weight co-occurrence graph and extract features + if self.use_graph_features: + self.term_graph = self._create_term_graph(training_rows) + graph_features_train = self._extract_graph_features( + self.term_graph, training_terms + ) + X_train = np.hstack([term_embeddings_train, graph_features_train]) + else: + self.term_graph = None + X_train = term_embeddings_train + + # Multi-label targets (multi-hot) + Y_train = self.label_binarizer.transform(raw_label_lists) + + # One-vs-Rest RandomForest (one binary RF per label) + self.ovr_random_forest = OneVsRestClassifier( + RandomForestClassifier(**self.rf_kwargs) + ) + self.ovr_random_forest.fit(X_train, Y_train) + + def predict( + self, data: Any, task: str, ontologizer: bool = True, **_: Any + ) -> List[Dict[str, Any]]: + """Predict multi-label types for input terms. + + Parameters + data: + Evaluation payload; formats normalized by `_as_predict_terms_ids`. + task: + Must be `'term-typing'`. + ontologizer: + Unused here; accepted for API compatibility. + **_: + Ignored extra arguments. + + Returns + List[Dict[str, Any]] + A list of dictionaries with keys: + - `id`: Original example id (if provided). + - `term`: Input term string. + - `types`: List of predicted label strings (selected by threshold or top-1). + + Raises + ValueError + If `task` is not `'term-typing'`. + RuntimeError + If `load()` and `fit()` have not been called. + """ + if task != "term-typing": + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." 
+ ) + if ( + self.ovr_random_forest is None + or self.tokenizer is None + or self.embedding_model is None + ): + raise RuntimeError("Call load() and fit() before predict().") + + # Normalize prediction input into parallel lists of terms and example ids + test_terms, example_ids = self._as_predict_terms_ids(data) + + # Encode terms + term_embeddings_test = self._encode(test_terms) + + # Match feature layout used during training + if self.use_graph_features and self.term_graph is not None: + graph_features_test = self._extract_graph_features( + self.term_graph, test_terms + ) + X_test = np.hstack([term_embeddings_test, graph_features_test]) + else: + X_test = term_embeddings_test + + # Probabilities per label (shape: [n_samples, n_labels]) + probability_matrix = self.ovr_random_forest.predict_proba(X_test) + + predictions: List[Dict[str, Any]] = [] + label_names = self.label_binarizer.classes_ + threshold = float(self.threshold) + + # Select labels above threshold; fallback to argmax if none exceed it + for row_index, label_probabilities in enumerate(probability_matrix): + selected_label_indices = np.where(label_probabilities > threshold)[0] + if len(selected_label_indices) == 0: + selected_label_indices = [int(np.argmax(label_probabilities))] + + predicted_types = [ + label_names[label_idx] for label_idx in selected_label_indices + ] + + predictions.append( + { + "id": example_ids[row_index], + "term": test_terms[row_index], + "types": predicted_types, + } + ) + return predictions + + def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, Any]]: + """Normalize ground-truth into a list of {id, term, types} dicts for evaluation. + + Parameters + data: + Ground-truth payload; supported formats include objects exposing + `.term_typings`, a list of dicts, or a list of tuples/lists. + task: + Must be `'term-typing'`. + + Returns + List[Dict[str, Any]] + A list of dictionaries with keys `id`, `term`, `types` (list of str). + + Raises + ValueError + If `task` is not `'term-typing'`. + """ + if task != "term-typing": + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) + return self._as_gold_id_term_types(data) + + def _encode(self, texts: List[str]) -> np.ndarray: + """Encode a list of strings into L2-normalized sentence embeddings. + + Parameters + texts: + List of input texts/terms. + + Returns + np.ndarray + Array of shape `(len(texts), hidden_size)` with L2-normalized + embeddings. If `texts` is empty, returns a `(0, hidden_size)` array. + """ + assert self.tokenizer is not None and self.embedding_model is not None, ( + "Call load(model_id) first." 
+ ) + + if not texts: + hidden_size = getattr( + getattr(self.embedding_model, "config", None), "hidden_size", 768 + ) + return np.zeros((0, hidden_size), dtype=np.float32) + + batch_embeddings: List[torch.Tensor] = [] + + for start_idx in tqdm(range(0, len(texts), self.batch_size), desc="Embedding"): + end_idx = start_idx + self.batch_size + batch_texts = texts[start_idx:end_idx] + + # Tokenize and move to device + tokenized_batch = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ).to(self.device) + + # Forward pass without gradients + with torch.no_grad(): + model_output = self.embedding_model(**tokenized_batch) + + # Prefer dedicated pooler if provided; otherwise pool by last valid token + if ( + hasattr(model_output, "pooler_output") + and model_output.pooler_output is not None + ): + sentence_embeddings = model_output.pooler_output + else: + sentence_embeddings = self._last_token_pool( + model_output.last_hidden_state, + tokenized_batch["attention_mask"], + ) + + # L2-normalize embeddings for stability + sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) + + # Detach, move to CPU, collect + batch_embeddings.append(sentence_embeddings.detach().cpu()) + + # Best-effort memory cleanup (especially useful on CUDA) + del tokenized_batch, model_output, sentence_embeddings + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + # Concatenate all batches and convert to NumPy + return torch.cat(batch_embeddings, dim=0).numpy() + + def _last_token_pool( + self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Select the last *non-padding* token embedding for each sequence. + + Parameters + last_hidden_states: + Tensor of shape `(batch, seq_len, hidden)`. + attention_mask: + Tensor of shape `(batch, seq_len)` with 1 for real tokens. + + Returns + torch.Tensor + Tensor of shape `(batch, hidden)` with per-sequence pooled embeddings. + """ + last_valid_token_idx = attention_mask.sum(dim=1) - 1 # (batch,) + batch_row_idx = torch.arange( + last_hidden_states.size(0), device=last_hidden_states.device + ) + return last_hidden_states[batch_row_idx, last_valid_token_idx] + + def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: + """Create a simple undirected co-occurrence graph from training rows. + + Graph Structure + Nodes + Terms (node attribute `'types'` is stored per term). + Edges + Between a term and each neighbor from its optional RAG list. + Edge weight = number of shared types (or 0.1 if none shared). + + Parameters + training_rows: + Normalized rows with keys: `'term'`, `'types'`, optional `'RAG'`. + + Returns + networkx.Graph + The constructed undirected graph. 
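+
+        Example (minimal sketch):
+            rows = [{"term": "aspirin", "types": ["Drug"],
+                     "RAG": [{"term": "ibuprofen", "types": ["Drug"]}]}]
+            graph = self._create_term_graph(rows)
+            graph["aspirin"]["ibuprofen"]["weight"]  # -> 1.0 (one shared type)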
+ """ + graph = nx.Graph() + + for row in training_rows: + term = row["term"] + term_types = row.get("types", []) + graph.add_node(term, types=term_types) + + # RAG may be a list of neighbor dicts like {"term": ..., "types": [...]} + for neighbor in row.get("RAG", []) or []: + neighbor_term = neighbor.get("term") + neighbor_types = neighbor.get("types", []) + + # Shared-type-based edge weight (weak edge if no overlap) + shared_types = set(term_types).intersection(set(neighbor_types)) + edge_weight = float(len(shared_types)) if shared_types else 0.1 + + graph.add_edge(term, neighbor_term, weight=edge_weight) + + return graph + + def _extract_graph_features( + self, term_graph: nx.Graph, terms: List[str] + ) -> np.ndarray: + """Compute simple per-term graph features. + + Feature Vector + For each term we compute a 4-dim vector: + `[degree, clustering_coefficient, degree_centrality, pagerank_score]` + + Parameters + term_graph: + Graph built over training terms. + terms: + List of term strings to extract features for. + + Returns + np.ndarray + Array of shape `(len(terms), 4)` (dtype float32). + """ + if len(term_graph): + degree_centrality = nx.degree_centrality(term_graph) + pagerank_scores = nx.pagerank(term_graph) + else: + degree_centrality, pagerank_scores = {}, {} + + feature_rows: List[List[float]] = [] + for term in terms: + if term in term_graph: + feature_rows.append( + [ + float(term_graph.degree(term)), + float(nx.clustering(term_graph, term)), + float(degree_centrality.get(term, 0.0)), + float(pagerank_scores.get(term, 0.0)), + ] + ) + else: + feature_rows.append([0.0, 0.0, 0.0, 0.0]) + + return np.asarray(feature_rows, dtype=np.float32) + + def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: + """Normalize diverse training data formats to a list of dicts: {term, types, RAG}. + + Supported Inputs + - Object with attribute `.term_typings` (iterable of items exposing + `.term`, `.types`, optional `.RAG`). + - List of dicts with keys `term`, `types`, optional `RAG`. + - List/tuple of `(term, types[, RAG])`. + + Parameters + data: + Training payload. + + Returns + List[Dict[str, Any]] + Normalized dictionaries ready for training. + + Raises + ValueError + If `data` is neither a list/tuple nor exposes `.term_typings`. 
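+
+        Example
+            A small sketch on an already-constructed `learner`
+            (tuple-shaped input):
+
+            >>> learner._as_term_types_dicts([("granite", "rock")])
+            [{'term': 'granite', 'types': ['rock'], 'RAG': None}]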
+ """ + normalized_rows: List[Dict[str, Any]] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for item in term_typings_attr: + term_text = getattr(item, "term", None) + type_list = getattr(item, "types", None) + rag_neighbors = getattr(item, "RAG", None) + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } + ) + return normalized_rows + + # Otherwise: must be a list/tuple-like container + if not isinstance(data, (list, tuple)): + raise ValueError( + "Training data must be a list/tuple or expose .term_typings" + ) + + if not data: + return normalized_rows + + # Case 2: list of dicts + if isinstance(data[0], dict): + for row in data: + term_text = row.get("term") + type_list = row.get("types") + rag_neighbors = row.get("RAG") + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } + ) + return normalized_rows + + # Case 3: list of tuples/lists: (term, types[, RAG]) + for item in data: + if not isinstance(item, (list, tuple)) or len(item) < 2: + continue + term_text, type_list = item[0], item[1] + rag_neighbors = item[2] if len(item) > 2 else None + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } + ) + + return normalized_rows + + def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: + """Normalize prediction input into parallel lists: (terms, ids). + + Supported Inputs + - Object with `.term_typings`. + - List of dicts with `term` and optional `id`. + - List of tuples/lists `(term, id[, ...])`. + - List of plain term strings. + + Parameters + data: + Evaluation payload. + + Returns + Tuple[List[str], List[Any]] + `(terms, example_ids)` lists aligned by index. + + Raises + ValueError + If the input format is unsupported. 
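+
+        Example
+            Sketch with plain term strings on an already-constructed
+            `learner` (ids are auto-generated positions):
+
+            >>> learner._as_predict_terms_ids(["granite", "basalt"])
+            (['granite', 'basalt'], [0, 1])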
+ """ + terms: List[str] = [] + example_ids: List[Any] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for idx, item in enumerate(term_typings_attr): + terms.append(str(getattr(item, "term", ""))) + example_ids.append(getattr(item, "id", getattr(item, "ID", idx))) + return terms, example_ids + + # Case 2: list/tuple container + if isinstance(data, (list, tuple)) and data: + first_element = data[0] + + # 2a) list of dicts + if isinstance(first_element, dict): + for i, row in enumerate(data): + terms.append(str(row.get("term", ""))) + example_ids.append(row.get("id", row.get("ID", i))) + return terms, example_ids + + # 2b) list of tuples/lists: (term, id[, ...]) + if isinstance(first_element, (list, tuple)): + for i, tuple_row in enumerate(data): + if not tuple_row: + continue + terms.append(str(tuple_row[0])) + example_ids.append(tuple_row[1] if len(tuple_row) > 1 else i) + return terms, example_ids + + # 2c) list of strings (terms only) + if isinstance(first_element, str): + terms = [str(x) for x in data] # type: ignore[arg-type] + example_ids = list(range(len(terms))) + return terms, example_ids + + raise ValueError("Unsupported predict() input format.") + + def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: + """Normalize gold labels into a list of dicts: {id, term, types}. + + Supported Inputs + Mirrors `_as_term_types_dicts`, but ensures an `id` is set. + + Parameters + data: + Ground-truth payload. + + Returns + List[Dict[str, Any]] + `{'id': Any, 'term': str, 'types': List[str]}` entries. + + """ + gold_rows: List[Dict[str, Any]] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for idx, item in enumerate(term_typings_attr): + gold_id = getattr(item, "id", getattr(item, "ID", idx)) + term_text = str(getattr(item, "term", "")) + type_list = getattr(item, "types", []) + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) + return gold_rows + + # Case 2: list/tuple container + if isinstance(data, (list, tuple)) and data: + first_element = data[0] + + # 2a) list of dicts + if isinstance(first_element, dict): + for i, row in enumerate(data): + gold_id = row.get("id", row.get("ID", i)) + term_text = str(row.get("term", "")) + type_list = row.get("types", []) + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) + return gold_rows + + # 2b) list of tuples/lists: (term, types[, id]) + if isinstance(first_element, (list, tuple)): + for i, tuple_row in enumerate(data): + if not tuple_row or len(tuple_row) < 2: + continue + term_text = str(tuple_row[0]) + type_list = tuple_row[1] + gold_id = tuple_row[2] if len(tuple_row) > 2 else i + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) + return gold_rows + + raise ValueError( + "Unsupported ground-truth input format for tasks_ground_truth_former()." + ) + + +class AlexbekRAGLearner(AutoLearner): + """Retrieval-Augmented Term Typing learner (single task: term-typing). 
+ + Flow + 1) `fit`: collect (term -> [types]) examples, build an in-memory index + using a sentence-embedding model. + 2) `predict`: for each new term, retrieve top-k similar examples, compose a + structured prompt, query an instruction-tuned causal LLM, and parse types. + + Returns + List[Dict[str, Any]] + `{"term": str, "types": List[str], "id": Optional[str]}` rows. + """ + + def __init__( + self, + llm_model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "auto", # "auto" | "cuda" | "cpu" + token: str = "", # HF token if needed + top_k: int = 3, + max_new_tokens: int = 256, + gen_batch_size: int = 4, # generation batch size + enc_batch_size: int = 64, # embedding batch size + **kwargs: Any, # absorb extra pipeline-style args + ) -> None: + """Configure the RAG learner. + + Parameters + llm_model_id: + HF model id/path for the instruction-tuned causal LLM. + retriever_model_id: + Sentence-embedding model id for retrieval. + device: + Device policy ('auto'|'cuda'|'cpu') for the LLM. + token: + Optional HF token for gated models. + top_k: + Number of nearest examples to retrieve per query term. + max_new_tokens: + Decoding budget for the LLM. + gen_batch_size: + Number of prompts per generation batch. + enc_batch_size: + Number of texts per embedding batch. + **kwargs: + Extra configuration captured for downstream use. + """ + super().__init__() + + # Consolidated configuration for simple serialization + self.cfg: Dict[str, Any] = { + "llm_model_id": llm_model_id, + "retriever_model_id": retriever_model_id, + "device": device, + "token": token, + "top_k": int(top_k), + "max_new_tokens": int(max_new_tokens), + "gen_batch_size": int(gen_batch_size), + "enc_batch_size": int(enc_batch_size), + } + self.extra_cfg: Dict[str, Any] = dict(kwargs) + + # LLM components + self.tokenizer: Optional[AutoTokenizer] = None + self.generation_model: Optional[AutoModelForCausalLM] = None + + # Retriever components + self.embedder: Optional[SentenceTransformer] = None + self.indexed_corpus: List[str] = [] # items: " || [...]" + self.corpus_embeddings: Optional[torch.Tensor] = None + + # Training cache of (term, [types]) tuples + self.train_term_types: List[Tuple[str, List[str]]] = [] + + # Prompt templates + self._system_prompt: str = ( + "You are an expert in ontologies and semantic term classification.\n" + "Task: determine semantic types for the TERM using the EXAMPLES provided.\n" + "Rules:\n" + "1) Types must be generalizing categories from the domain ontology.\n" + "2) Be concise. Respond ONLY in JSON using double quotes.\n" + 'Format: {"term":"...", "reasoning":"<<=100 words>>", "types":["...", "..."]}\n' + ) + self._user_prompt_template: str = """{examples} + + TERM: {term} + + TASK: Determine semantic types for the given term based on the domain ontology. + Remember: types are generalizing categories, not the term itself. Respond in JSON. + """ + + def load( + self, + model_id: Optional[str] = None, + retriever_id: Optional[str] = None, + device: Optional[str] = None, + token: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Load the LLM and the embedding retriever. Overrides constructor values if provided. + + Parameters + model_id: + Optional override for the LLM model id. + retriever_id: + Optional override for the embedding model id. + device: + Optional override for device selection policy. + token: + Optional override for HF token. + **kwargs: + Extra values to store in `extra_cfg`. 
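+
+        Example
+            A hedged usage sketch; the ids shown are simply the constructor
+            defaults, and both models are downloaded on first use:
+
+            >>> learner = AlexbekRAGLearner(top_k=3)
+            >>> learner.load(model_id="Qwen/Qwen2.5-0.5B-Instruct",
+            ...              retriever_id="sentence-transformers/all-MiniLM-L6-v2")  # doctest: +SKIP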
+ + """ + if model_id is not None: + self.cfg["llm_model_id"] = model_id + if retriever_id is not None: + self.cfg["retriever_model_id"] = retriever_id + if device is not None: + self.cfg["device"] = device + if token is not None: + self.cfg["token"] = token + self.extra_cfg.update(kwargs) + + # Choose device & dtype for the LLM + cuda_available: bool = torch.cuda.is_available() + use_cuda: bool = cuda_available and (self.cfg["device"] != "cpu") + device_map: str = "auto" if use_cuda else "cpu" + torch_dtype = torch.bfloat16 if use_cuda else torch.float32 + + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + self.cfg["llm_model_id"], padding_side="left", token=self.cfg["token"] + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # LLM + self.generation_model = AutoModelForCausalLM.from_pretrained( + self.cfg["llm_model_id"], + device_map=device_map, + torch_dtype=torch_dtype, + token=self.cfg["token"], + ) + + # Deterministic decoding defaults + generation_cfg = self.generation_model.generation_config + generation_cfg.do_sample = False + generation_cfg.temperature = None + generation_cfg.top_p = None + generation_cfg.top_k = None + generation_cfg.num_beams = 1 + + # Retriever + self.embedder = SentenceTransformer( + self.cfg["retriever_model_id"], trust_remote_code=True + ) + + def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: + """Prepare the retrieval index from training examples. + + Parameters + train_data: + Training payload containing terms and their types. + task: + Must be `'term-typing'`; other tasks are forwarded to base. + ontologizer: + Unused flag for API compatibility. + + Side Effects + - Normalizes to a list of `(term, [types])`. + - Builds an indexable text corpus and (if embedder is loaded) + computes embeddings for retrieval. + """ + if task != "term-typing": + return super().fit(train_data, task, ontologizer) + + # Normalize incoming training data -> list[(term, [types])] + self.train_term_types = self._unpack_train(train_data) + + # Build the textual corpus to index + self.indexed_corpus = [ + f"{term} || {json.dumps(types, ensure_ascii=False)}" + for term, types in self.train_term_types + ] + + # Embed the corpus if available; else fall back to zero-shot prompting + if self.indexed_corpus and self.embedder is not None: + self.corpus_embeddings = self._encode_texts(self.indexed_corpus) + else: + self.corpus_embeddings = None + + def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: + """Predict types for evaluation items; returns a list of {term, types, id?}. + + Parameters + eval_data: + Evaluation payload to type (terms + optional ids). + task: + Must be `'term-typing'`; other tasks are forwarded to base. + ontologizer: + Unused flag for API compatibility. + + Returns + List[Dict[str, Any]] + For each input term, a dictionary with keys: + - `term`: The input term. + - `types`: A (unique, sorted) list of predicted types. + - `id`: Optional example id (if provided in input). 
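+
+        Example
+            Illustrative only; assumes `load()` and `fit()` were called on
+            `learner`, and the predicted types depend on the loaded LLM:
+
+            >>> learner.predict(["granite"], task="term-typing")  # doctest: +SKIP
+            [{'term': 'granite', 'types': ['rock']}]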
+ """ + if task != "term-typing": + return super().predict(eval_data, task, ontologizer) + + eval_terms, eval_ids = self._unpack_eval(eval_data) + if not eval_terms: + return [] + + # Use RAG if we have an indexed corpus & embeddings; otherwise zero-shot + rag_available = ( + self.corpus_embeddings is not None + and self.embedder is not None + and len(self.indexed_corpus) > 0 + ) + + if rag_available: + neighbor_docs_per_query = self._retrieve_batch( + eval_terms, top_k=int(self.cfg["top_k"]) + ) + else: + neighbor_docs_per_query = [[] for _ in eval_terms] + + # Compose prompts + prompts: List[str] = [] + for term, neighbor_docs in zip(eval_terms, neighbor_docs_per_query): + example_pairs = self._decode_examples(neighbor_docs) + examples_block = self._format_examples(example_pairs) + prompt_text = self._compose_prompt(examples_block, term) + prompts.append(prompt_text) + + predicted_types_lists = self._generate_and_parse(prompts) + + # Build standardized results + results: List[Dict[str, Any]] = [] + for term, example_id, predicted_types in zip( + eval_terms, eval_ids, predicted_types_lists + ): + result_row: Dict[str, Any] = { + "term": term, + "types": sorted({t for t in predicted_types}), # unique + sorted + } + if example_id is not None: + result_row["id"] = example_id + results.append(result_row) + + assert all(("term" in row and "types" in row) for row in results), ( + "predict() must return term + types" + ) + return results + + def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: + """Extract `(term, [types])` tuples from supported training payloads. + + Supported Inputs + - `data.term_typings` (objects exposing `.term` & `.types`) + - `list[dict]` with keys `'term'` and `'types'` + - `list[str]` → returns empty (nothing to index) + - other formats → empty + + Parameters + data: + Training payload. + + Returns + List[Tuple[str, List[str]]] + (term, types) tuples (types kept as strings). + """ + term_typings = getattr(data, "term_typings", None) + if term_typings is not None: + parsed_pairs: List[Tuple[str, List[str]]] = [] + for item in term_typings: + term = getattr(item, "term", None) + types = list(getattr(item, "types", []) or []) + if term and types: + parsed_pairs.append( + (term, [t for t in types if isinstance(t, str)]) + ) + return parsed_pairs + + if isinstance(data, list) and data and isinstance(data[0], dict): + parsed_pairs = [] + for row in data: + term = row.get("term") + types = row.get("types") or [] + if term and isinstance(types, list) and types: + parsed_pairs.append( + (term, [t for t in types if isinstance(t, str)]) + ) + return parsed_pairs + + # If only a list of strings is provided, there's nothing to index for RAG + if isinstance(data, (list, set, tuple)) and all( + isinstance(x, str) for x in data + ): + return [] + + return [] + + def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: + """Extract `(terms, ids)` from supported evaluation payloads. + + Supported Inputs + - `data.term_typings` (objects exposing `.term` & optional `.id`) + - `list[str]` + - `list[dict]` with `term` and optional `id` + + Parameters + data: + Evaluation payload. + + Returns + Tuple[List[str], List[Optional[str]]] + Two lists aligned by index: terms and ids (ids may contain `None`). 
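+
+        Example
+            A runnable sketch (constructing the learner only sets
+            configuration; no models are loaded):
+
+            >>> learner = AlexbekRAGLearner()
+            >>> learner._unpack_eval([{"term": "granite", "id": "X1"}, {"term": "basalt"}])
+            (['granite', 'basalt'], ['X1', None])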
+ """ + term_typings = getattr(data, "term_typings", None) + if term_typings is not None: + terms: List[str] = [] + ids: List[Optional[str]] = [] + for item in term_typings: + terms.append(getattr(item, "term", "")) + ids.append(getattr(item, "id", None)) + return terms, ids + + if isinstance(data, list) and data and isinstance(data[0], str): + return list(data), [None] * len(data) + + if isinstance(data, list) and data and isinstance(data[0], dict): + terms: List[str] = [] + ids: List[Optional[str]] = [] + for row in data: + terms.append(row.get("term", "")) + ids.append(row.get("id")) + return terms, ids + + return [], [] + + def _encode_texts(self, texts: List[str]) -> torch.Tensor: + """Encode a batch of texts with the sentence-embedding model. + + Parameters + texts: + List of strings to embed. + + Returns + torch.Tensor + Tensor of shape `(len(texts), hidden_dim)`. If `texts` is empty, + returns an empty tensor with 0 rows. + """ + batch_size = int(self.cfg["enc_batch_size"]) + batch_embeddings: List[torch.Tensor] = [] + + for batch_start in range(0, len(texts), batch_size): + batch_texts = texts[batch_start : batch_start + batch_size] + embeddings = self.embedder.encode( + batch_texts, convert_to_tensor=True, show_progress_bar=False + ) + batch_embeddings.append(embeddings) + + return ( + torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + ) + + def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: + """Return for each query the top-k most similar corpus entries. + + Parameters + queries: + List of query terms. + top_k: + Number of neighbors to retrieve for each query. + + Returns + List[List[str]] + For each query, a list of raw corpus strings formatted as + `" || [\\"type1\\", ...]"`. + """ + if self.corpus_embeddings is None or not self.indexed_corpus: + return [[] for _ in queries] + + query_embeddings = self._encode_texts(queries) # [Q, D] + doc_embeddings = self.corpus_embeddings # [N, D] + if query_embeddings.shape[-1] != doc_embeddings.shape[-1]: + raise ValueError( + f"Embedding dim mismatch: {query_embeddings.shape[-1]} vs {doc_embeddings.shape[-1]}" + ) + + # Cosine similarity via L2-normalized dot product + q_norm = F.normalize(query_embeddings, p=2, dim=1) + d_norm = F.normalize(doc_embeddings, p=2, dim=1) + cos_sim = torch.matmul(q_norm, d_norm.T) # [Q, N] + + k = min(max(1, top_k), len(self.indexed_corpus)) + _, top_indices = torch.topk(cos_sim, k=k, dim=1) + return [[self.indexed_corpus[j] for j in row.tolist()] for row in top_indices] + + def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: + """Parse raw corpus rows ('term || [types]') into `(term, [types])` pairs. + + Parameters + docs: + Raw strings from the index/corpus. + + Returns + List[Tuple[str, List[str]]] + Parsed (term, types) pairs; malformed rows are skipped. + """ + example_pairs: List[Tuple[str, List[str]]] = [] + for raw_row in docs: + try: + term_raw, types_json = raw_row.split("||", 1) + term = term_raw.strip() + types_list = json.loads(types_json.strip()) + if isinstance(types_list, list): + example_pairs.append( + (term, [t for t in types_list if isinstance(t, str)]) + ) + except Exception: + continue + return example_pairs + + def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: + """Format retrieved example pairs into a compact block for the prompt. + + Parameters + pairs: + Retrieved `(term, [types])` examples. + + Returns + str + Human-readable lines to provide *light* guidance to the LLM. 
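+
+        Example
+            A plain sketch; no models are required for this helper:
+
+            >>> learner = AlexbekRAGLearner()
+            >>> print(learner._format_examples([("granite", ["rock", "igneous rock"])]))
+            CLASSIFICATION EXAMPLES:
+            1. Term: 'granite' → Types: ['rock', 'igneous rock']
+            END OF EXAMPLES.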
+ """ + if not pairs: + return "EXAMPLES: (none provided)" + lines: List[str] = ["CLASSIFICATION EXAMPLES:"] + for idx, (term, types) in enumerate(pairs, 1): + preview_types = types[:3] # keep context small + lines.append(f"{idx}. Term: '{term}' → Types: {list(preview_types)}") + lines.append("END OF EXAMPLES.") + return "\n".join(lines) + + def _compose_prompt(self, examples_block: str, term: str) -> str: + """Compose the final prompt from system + user blocks. + + Parameters + examples_block: + Text block with retrieved examples. + term: + The query term to classify. + + Returns + str + Full prompt string passed to the LLM. + """ + user_block = self._user_prompt_template.format( + examples=examples_block, term=term + ) + return f"{self._system_prompt}\n\n{user_block}\n" + + def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: + """Run generation for a batch of prompts and parse the JSON `'types'` from outputs. + + Parameters + prompts: + Finalized prompts for the LLM. + + Returns + List[List[str]] + For each prompt, a list of predicted type strings. + """ + batch_size = int(self.cfg["gen_batch_size"]) + all_predicted_types: List[List[str]] = [] + + for batch_start in range(0, len(prompts), batch_size): + prompt_batch = prompts[batch_start : batch_start + batch_size] + + # Tokenize and move to the LLM's device + model_device = getattr(self.generation_model, "device", None) + encodings = self.tokenizer( + prompt_batch, return_tensors="pt", padding=True + ).to(model_device) + input_token_length = encodings["input_ids"].shape[1] + + # Deterministic decoding (greedy) + with torch.no_grad(): + generated_tokens = self.generation_model.generate( + **encodings, + do_sample=False, + num_beams=1, + temperature=None, + top_p=None, + top_k=None, + max_new_tokens=int(self.cfg["max_new_tokens"]), + pad_token_id=self.tokenizer.eos_token_id, + ) + + # Slice off the prompt tokens and decode only newly generated tokens + new_token_span = generated_tokens[:, input_token_length:] + decoded_texts = [ + self.tokenizer.decode(seq, skip_special_tokens=True) + for seq in new_token_span + ] + + parsed_types_per_prompt = [ + self._parse_types(text) for text in decoded_texts + ] + all_predicted_types.extend(parsed_types_per_prompt) + + return all_predicted_types + + def _parse_types(self, text: str) -> List[str]: + """Extract a list of type strings from LLM output. + + Parsing Strategy (in order) + 1) Strict JSON object with `"types"`. + 2) Regex-extract JSON object containing `"types"`. + 3) Regex-extract first bracketed list. + 4) Comma-split fallback. + + Parameters + text: + Raw LLM output to parse. + + Returns + List[str] + Parsed list of type strings (possibly empty if parsing fails). 
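+
+        Example
+            Two of the parsing paths, shown on a freshly constructed
+            learner (no models required):
+
+            >>> learner = AlexbekRAGLearner()
+            >>> learner._parse_types('{"term": "granite", "types": ["rock", "stone"]}')
+            ['rock', 'stone']
+            >>> learner._parse_types('rock, stone')
+            ['rock', 'stone']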
+ """ + try: + obj = json.loads(text) + if isinstance(obj, dict) and isinstance(obj.get("types"), list): + return [t for t in obj["types"] if isinstance(t, str)] + except Exception: + pass + + try: + obj_match = re.search( + r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S + ) + if obj_match: + obj = json.loads(obj_match.group(0)) + types = obj.get("types", []) + return [t for t in types if isinstance(t, str)] + except Exception: + pass + + try: + list_match = re.search(r"\[([^\]]+)\]", text) + if list_match: + items = [ + x.strip().strip('"').strip("'") + for x in list_match.group(1).split(",") + ] + return [t for t in items if t] + except Exception: + pass + + if "," in text: + items = [x.strip().strip('"').strip("'") for x in text.split(",")] + return [t for t in items if t] + + return [] diff --git a/ontolearner/learner/term_typing/rwthdbis.py b/ontolearner/learner/term_typing/rwthdbis.py new file mode 100644 index 0000000..c8df797 --- /dev/null +++ b/ontolearner/learner/term_typing/rwthdbis.py @@ -0,0 +1,379 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +from typing import Any, Dict, List, Optional, Tuple + +import torch +from datasets import Dataset, DatasetDict +from tqdm.auto import tqdm +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + set_seed, +) + +from ...base import AutoLearner + + +class RWTHDBISSFTLearner(AutoLearner): + """ + Supervised term-typing + + Training expands multi-label examples into multiple single-label rows. + Inference returns: [{"term": "", "types": [""]}, ...] + """ + + def __init__( + self, + model_name: str = "microsoft/deberta-v3-small", + trained_model_path: Optional[str] = None, + output_dir: Optional[str] = None, + device: str = "cpu", + max_length: int = 64, + per_device_train_batch_size: int = 16, + gradient_accumulation_steps: int = 2, + num_train_epochs: int = 3, + learning_rate: float = 2e-5, + weight_decay: float = 0.01, + logging_steps: int = 50, + save_strategy: str = "epoch", + save_total_limit: int = 1, + fp16: bool = False, + bf16: bool = False, + seed: int = 42, + ) -> None: + """Initialize the term-typing learner and configure training defaults. + + Args: + model_name: Backbone HF model identifier (used if `trained_model_path` is None). + trained_model_path: Optional path to a fine-tuned checkpoint for loading. + output_dir: Directory to write checkpoints and tokenizer; defaults to './term_typing'. + device: user-defined argument as 'cuda' or 'cpu'. + max_length: Maximum tokenized sequence length. + per_device_train_batch_size: Per-device batch size during training. + gradient_accumulation_steps: Number of update accumulation steps. + num_train_epochs: Training epochs. + learning_rate: Optimizer learning rate. + weight_decay: Weight decay coefficient. + logging_steps: Logging interval (steps) for the Trainer. + save_strategy: Checkpoint save strategy (e.g., 'epoch', 'steps', 'no'). 
+ save_total_limit: Maximum number of checkpoints to keep. + fp16: Enable mixed precision (FP16) if supported. + bf16: Enable mixed precision (BF16) if supported. + seed: Random seed for reproducibility. + + Side Effects: + Creates `output_dir` if it does not exist. + + Notes: + The learner predicts exactly one label per term at inference time + (argmax over logits). + """ + super().__init__() + self.model_name = model_name + self.trained_model_path = trained_model_path + self.output_dir = output_dir or "./term_typing" + os.makedirs(self.output_dir, exist_ok=True) + + self.max_length = max_length + self.per_device_train_batch_size = per_device_train_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.num_train_epochs = num_train_epochs + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.logging_steps = logging_steps + self.save_strategy = save_strategy + self.save_total_limit = save_total_limit + self.fp16 = fp16 + self.bf16 = bf16 + self.seed = seed + + self.device = device + self.model: Optional[AutoModelForSequenceClassification] = None + self.tokenizer: Optional[AutoTokenizer] = None + self.id2label: Dict[int, str] = {} + self.label2id: Dict[str, int] = {} + + def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: + """ + Train or run inference for term typing, depending on `test`. + + When `test=False`, trains on `data.term_typings`. + When `test=True`, predicts labels for provided terms. + + Args: + data: If training, an object with `.term_typings` where each item has + `term` and `types` (list[str]). If testing, either a `List[str]` + of raw term texts or an object with `.term_typings`. + test: If True, runs inference; otherwise trains. + + Returns: + If `test=True`: a list of dicts like + `[{"term": "", "types": [""]}, ...]`. + If `test=False`: None. + + Raises: + ValueError: If required fields are missing from `data`. + """ + if test: + terms = self._collect_eval_terms(data) + return self._predict_structured_output(terms) + else: + self._train_from_term_typings(train_data=data) + return None + + def _expand_multilabel_training_rows( + self, term_typings: List[Any] + ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]: + """ + Expand multi-label instances into single-label rows and derive label maps. + + Each training instance with fields: + - `term`: str-like + - `types`: list of label strings + is expanded into len(types) rows with the same `term` and individual labels. + + Args: + term_typings: Sequence of objects (e.g., dataclasses) exposing + `.term` and `.types`. + + Returns: + A tuple `(texts, label_ids, id2label, label2id)`: + - texts: Flattened list of term strings (one per label). + - label_ids: Parallel list of integer label ids. + - id2label: Mapping from id -> label string. + - label2id: Mapping from label string -> id. 
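+
+        Example:
+            A doctest-style sketch using `SimpleNamespace` stand-ins for the
+            training items (any object exposing `.term` and `.types` works;
+            constructing the learner only sets configuration and creates the
+            default output directory):
+
+            >>> from types import SimpleNamespace
+            >>> learner = RWTHDBISSFTLearner()
+            >>> rows = [SimpleNamespace(term="granite", types=["rock", "material"])]
+            >>> texts, ids, id2label, label2id = learner._expand_multilabel_training_rows(rows)
+            >>> texts, ids
+            (['granite', 'granite'], [1, 0])
+            >>> id2label
+            {0: 'material', 1: 'rock'}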
+ """ + label_strings: List[str] = [] + for instance in term_typings: + label_strings.extend([str(label) for label in instance.types]) + + unique_labels = sorted(set(label_strings)) + id2label = {i: label for i, label in enumerate(unique_labels)} + label2id = {label: i for i, label in enumerate(unique_labels)} + + texts: List[str] = [] + label_ids: List[int] = [] + for instance in term_typings: + term_text = str(instance.term) + for label in instance.types: + texts.append(term_text) + label_ids.append(label2id[str(label)]) + + return texts, label_ids, id2label, label2id + + def _collect_eval_terms(self, eval_data: Any) -> List[str]: + """ + Collect the list of term texts to predict for evaluation. + + Accepts either: + - A `List[str]` of raw term texts, or + - An object with `.term_typings`, from which `.term` is extracted. + + Args: + eval_data: Input carrier for terms. + + Returns: + List of term strings. + + Raises: + ValueError: If `eval_data` lacks the expected structure. + """ + if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): + terms = eval_data + else: + term_typings = getattr(eval_data, "term_typings", None) + if term_typings is None: + raise ValueError( + "Provide a List[str] OR an object with .term_typings for test=True." + ) + terms = [str(instance.term) for instance in term_typings] + return terms + + def _train_from_term_typings(self, train_data: Any) -> None: + """Train the term-typing classifier from `.term_typings`. + + Steps: + 1) Seed RNGs for reproducibility. + 2) Expand multi-label examples into single-label rows. + 3) Build HF `DatasetDict`, tokenizer, and data collator. + 4) Initialize `AutoModelForSequenceClassification`. + 5) Train with `Trainer` and save model/tokenizer to `output_dir`. + + Args: + train_data: Object with `.term_typings`; each item exposes + `.term` (text) and `.types` (list[str]). + + Raises: + ValueError: If `train_data` does not provide `.term_typings`. + + Side Effects: + Writes a trained model to `self.output_dir` and updates + `self.id2label` / `self.label2id`. 
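+
+        Example:
+            A hedged end-to-end sketch; `ontology_split` is a hypothetical
+            object exposing `.term_typings`, training downloads the backbone
+            model, and the predicted type is illustrative:
+
+            >>> learner = RWTHDBISSFTLearner(num_train_epochs=1)   # doctest: +SKIP
+            >>> learner._train_from_term_typings(ontology_split)   # doctest: +SKIP
+            >>> learner._predict_structured_output(["granite"])    # doctest: +SKIP
+            [{'term': 'granite', 'types': ['rock']}]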
+ """ + set_seed(self.seed) + random.seed(self.seed) + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + + term_typings: List[Any] = getattr(train_data, "term_typings", None) + if term_typings is None: + raise ValueError("train_data must provide .term_typings for term-typing.") + + texts, label_ids, self.id2label, self.label2id = ( + self._expand_multilabel_training_rows(term_typings) + ) + + dataset = DatasetDict( + {"train": Dataset.from_dict({"labels": label_ids, "text": texts})} + ) + + backbone = self.trained_model_path or self.model_name + try: + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) + except Exception: + # fallback if fast tokenizer isn't available + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=False) + + def tokenize_batch(batch: Dict[str, List[str]]): + """Tokenize a batch of texts with truncation and max length.""" + return self.tokenizer( + batch["text"], truncation=True, max_length=self.max_length + ) + + tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"]) + data_collator = DataCollatorWithPadding(self.tokenizer) + + self.model = AutoModelForSequenceClassification.from_pretrained( + backbone, + num_labels=len(self.id2label), + id2label=self.id2label, + label2id=self.label2id, + ) + + if ( + getattr(self.model.config, "pad_token_id", None) is None + and self.tokenizer.pad_token_id is not None + ): + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + training_args = TrainingArguments( + output_dir=self.output_dir, + learning_rate=self.learning_rate, + per_device_train_batch_size=self.per_device_train_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + num_train_epochs=self.num_train_epochs, + weight_decay=self.weight_decay, + save_strategy=self.save_strategy, + save_total_limit=self.save_total_limit, + logging_steps=self.logging_steps, + fp16=self.fp16, + bf16=self.bf16, + report_to=[], + ) + + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=tokenized["train"], + tokenizer=self.tokenizer, + data_collator=data_collator, + ) + + trainer.train() + trainer.save_model(self.output_dir) + self.tokenizer.save_pretrained(self.output_dir) + + def _ensure_loaded_for_inference(self) -> None: + """Load model/tokenizer for inference if not already loaded. + + Loads from `trained_model_path` if set, otherwise from `output_dir`. + Also restores `id2label`/`label2id` from the model config when present, + moves the model to the configured device, and sets eval mode. + """ + if self.model is not None and self.tokenizer is not None: + return + model_path = self.trained_model_path or self.output_dir + self.model = AutoModelForSequenceClassification.from_pretrained(model_path) + try: + self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + except Exception: + self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + + cfg = self.model.config + if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"): + self.id2label = dict(cfg.id2label) + self.label2id = dict(cfg.label2id) + + self.model.to(self.device).eval() + + def _predict_label_ids(self, terms: List[str]) -> List[int]: + """Predict label ids (argmax) for a list of term strings. + + Ensures model/tokenizer are loaded, then performs forward passes + term-by-term and collects the argmax label id. + + Args: + terms: List of raw term texts. + + Returns: + List of integer label ids corresponding to `terms`. 
+ """ + self._ensure_loaded_for_inference() + predictions: List[int] = [] + for term_text in tqdm( + terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}" + ): + inputs = self.tokenizer( + term_text, + return_tensors="pt", + truncation=True, + max_length=self.max_length, + ) + inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()} + with torch.no_grad(): + logits = self.model(**inputs).logits + predictions.append(int(torch.argmax(logits, dim=-1).item())) + return predictions + + def _predict_structured_output( + self, terms: List[str] + ) -> List[Dict[str, List[str]]]: + """ + Convert predicted label IDs into evaluator-friendly structured outputs. + + The output format is: + [{"term": "", "types": [""]}, ...] + + Args: + terms: Raw term texts to classify. + + Returns: + List of dicts mapping each input term to a list with its predicted + label string. Falls back to stringified id if label mapping is absent. + """ + label_ids = self._predict_label_ids(terms) + id2label_map = self.id2label or {} # fallback handled below + + results: List[Dict[str, List[str]]] = [] + for term_text, label_id in zip(terms, label_ids): + label_str = id2label_map.get(int(label_id), str(int(label_id))) + results.append({"term": term_text, "types": [label_str]}) + return results diff --git a/ontolearner/learner/term_typing/sbunlp.py b/ontolearner/learner/term_typing/sbunlp.py new file mode 100644 index 0000000..d5c0114 --- /dev/null +++ b/ontolearner/learner/term_typing/sbunlp.py @@ -0,0 +1,478 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional +import re + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from ...base import AutoLearner + + +class SBUNLPZSLearner(AutoLearner): + """ + Qwen-based blind term typing learner (Task B), implemented as an AutoLearner. + + Lifecycle: + • `fit(...)` learns/records the allowed type inventory from the training payload. + • `load(...)` explicitly loads the tokenizer/model (pass `model_id`/`token` here). + • `predict(...)` prompts the model per term and returns normalized types limited + to the learned inventory. + """ + + def __init__( + self, + device: str = "cpu", + max_new_tokens: int = 64, + temperature: float = 0.0, + model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", + token: Optional[str] = None, + ) -> None: + """ + Configure runtime knobs. Model identity and auth are provided to `load(...)`. + + Args: + device: Torch device policy ("cuda", "mps", or "cpu"). + max_new_tokens: Max tokens to generate per prompt (greedy decoding). + temperature: Reserved for future sampling; generation is greedy here. + model_id: Fallback model id/path used if `load()` is called without args. + token: Fallback HF token used if `load()` is called without args. + + Side Effects: + Initializes runtime configuration, instance defaults for `load()`, + and placeholders for `tokenizer`, `model`, and `allowed_types`. 
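+
+        Example:
+            Construction alone is lightweight; no model is loaded until
+            `load()` is called:
+
+            >>> learner = SBUNLPZSLearner(device="cpu", max_new_tokens=64)
+            >>> learner.allowed_types
+            []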
+ """ + super().__init__() + self.device = device + self.max_new_tokens = max_new_tokens + self.temperature = temperature + + # Defaults that load() may use when its args are None + self.model_id = model_id + self.token = token + + # Placeholders populated by load() + self.tokenizer: Optional[AutoTokenizer] = None + self.model: Optional[AutoModelForCausalLM] = None + + # Learned inventory + self.allowed_types: List[str] = [] + + # Regex used to extract quoted strings from model output (e.g., "type") + self._quoted_re = re.compile(r'"([^"]+)"') + + def load( + self, + model_id: Optional[str] = None, + token: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + ): + """ + Load tokenizer and model weights explicitly. + + Argument precedence: + 1) Use `model_id` / `token` passed to this method (if provided). + 2) Else fall back to `self.model_id` / `self.token`. + + Device & dtype: + • If `dtype` is None, the default is float16 on CUDA/MPS and float32 on CPU. + • `device_map` is `"auto"` for non-CPU devices, `"cpu"` otherwise. + + Args: + model_id: HF model id/path to load. If None, uses `self.model_id`. + token: HF token if the model is gated. If None, uses `self.token`. + dtype: Optional torch dtype override (e.g., `torch.float16`). + + Returns: + self + """ + resolved_model_id = model_id or self.model_id + resolved_token = token if token is not None else self.token + + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + resolved_model_id, token=resolved_token + ) + if self.tokenizer.pad_token is None: + # Prefer EOS as pad if available + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Device & dtype + if dtype is None: + if self.device == "cpu": + resolved_dtype = torch.float32 + else: + # Works for CUDA and Apple MPS + resolved_dtype = torch.float16 + else: + resolved_dtype = dtype + + device_map = "auto" if self.device != "cpu" else "cpu" + + self.model = AutoModelForCausalLM.from_pretrained( + resolved_model_id, + device_map=device_map, + torch_dtype=resolved_dtype, # keep torch_dtype for broad Transformers compatibility + token=resolved_token, + ) + return self + + def fit(self, train_data: Any, task: str, ontologizer: bool = True): + """ + Learn the allowed type inventory from the training data. + + Normalization rules: + • If `ontologizer=True`, the framework's `tasks_data_former(..., test=False)` + is used to normalize `train_data`. + • If a container exposes `.term_typings`, types are collected from there. + • If the normalized data is a list of dicts with `"parent"`, unique parents + become the allowed types. + • If it's a list of strings, that unique set becomes the allowed types. + + Args: + train_data: Training payload provided by the pipeline. + task: Must be `"term-typing"`. + ontologizer: If True, normalize via `tasks_data_former()` first. + + Returns: + self + + Raises: + ValueError: If `task` is not `"term-typing"`. + TypeError: If the training data cannot be normalized to a list of + strings or relationship dicts. 
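+
+        Example:
+            A runnable sketch with a plain list of type labels
+            (`ontologizer=False` bypasses the pipeline's data former):
+
+            >>> learner = SBUNLPZSLearner()
+            >>> _ = learner.fit(["rock", "mineral", "rock"], task="term-typing",
+            ...                 ontologizer=False)
+            >>> learner.allowed_types
+            ['mineral', 'rock']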
+ """ + train_fmt = ( + self.tasks_data_former(data=train_data, task=task, test=False) + if ontologizer + else train_data + ) + if task != "term-typing": + raise ValueError("SBUNLPZSLearner only implements 'term-typing'.") + + # If framework passed a container with `.term_typings`, extract types from there + if not isinstance(train_fmt, list): + if hasattr(train_fmt, "term_typings"): + try: + collected = set() + for tt in getattr(train_fmt, "term_typings") or []: + # tt.types could be list[str] or a single str + if hasattr(tt, "types"): + tvals = tt.types + elif isinstance(tt, dict) and "types" in tt: + tvals = tt["types"] + else: + tvals = None + + if isinstance(tvals, (list, tuple, set)): + for x in tvals: + if isinstance(x, str): + collected.add(x) + elif isinstance(tvals, str): + collected.add(tvals) + + if collected: + self.allowed_types = sorted(collected) + return self + except Exception: + # Fall through to error below if unexpected issues occur. + pass + + raise TypeError("For term-typing, expected a list of type labels at fit().") + + # At this point train_fmt is a list (original logic preserved) + if train_fmt and isinstance(train_fmt[0], dict) and "parent" in train_fmt[0]: + # Case A: Received raw relationships/pairs (e.g., from train_test_split). + unique_types = set(r.get("parent") for r in train_fmt if r.get("parent")) + self.allowed_types = sorted(unique_types) + elif all(isinstance(x, str) for x in train_fmt): + # Case B: Received a clean list of type labels (List[str]). + self.allowed_types = sorted(set(train_fmt)) + else: + raise TypeError( + "For term-typing, input data format for fit() is invalid. " + "Expected list of strings (types) or list of relationships (dicts)." + ) + + return self + + def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: + """ + Predict types for each term and return standardized rows. + + Expected inputs: + • With `ontologizer=True`: a `list[str]` of terms (IDs are auto-generated), + or a container exposing `.term_typings` from which `{'id','term'}` pairs + can be extracted. + • With `ontologizer=False`: a `list[dict]` of `{'id','term'}` to preserve IDs. + + Args: + eval_data: Evaluation payload as described above. + task: Must be `"term-typing"`. + ontologizer: If True, normalize through the pipeline’s data former. + + Returns: + A list of dictionaries: + `{"id": str, "term": str, "types": List[str]}`. 
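+
+        Example:
+            Illustrative only; `load()` and `fit()` must have been called, and
+            the generated types depend on the model. Note how IDs are
+            auto-generated for `list[str]` input:
+
+            >>> learner.predict(["basalt"], task="term-typing")   # doctest: +SKIP
+            [{'id': 'TT_000000', 'term': 'basalt', 'types': ['rock']}]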
+ """ + if task != "term-typing": + # Delegate to base for other tasks (not implemented here) + return super().predict(eval_data, task, ontologizer=ontologizer) + + def _extract_list_of_dicts_from_term_typings( + obj, + ) -> Optional[List[Dict[str, str]]]: + """Try to derive `[{id, term}, ...]` from an object with `.term_typings`.""" + tts = getattr(obj, "term_typings", None) + if tts is None: + return None + out = [] + for tt in tts: + if isinstance(tt, dict): + tid = tt.get("ID") or tt.get("id") or tt.get("Id") or tt.get("ID_") + tterm = tt.get("term") or tt.get("label") or tt.get("name") + else: + tid = ( + getattr(tt, "ID", None) + or getattr(tt, "id", None) + or getattr(tt, "Id", None) + ) + tterm = ( + getattr(tt, "term", None) + or getattr(tt, "label", None) + or getattr(tt, "name", None) + ) + if tid is None or tterm is None: + continue + out.append({"id": str(tid), "term": str(tterm)}) + return out if out else None + + # Case A: ontologizer=True -> framework often provides list[str] + if ontologizer: + if isinstance(eval_data, list) and all( + isinstance(x, str) for x in eval_data + ): + eval_pack = [ + {"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data) + ] + else: + maybe = _extract_list_of_dicts_from_term_typings(eval_data) + if maybe is not None: + eval_pack = maybe + else: + # Last resort: attempt to coerce iterables of str + if hasattr(eval_data, "__iter__") and not isinstance( + eval_data, (str, bytes) + ): + lst = list(eval_data) + if all(isinstance(x, str) for x in lst): + eval_pack = [ + {"id": f"TT_{i:06d}", "term": t} + for i, t in enumerate(lst) + ] + else: + raise TypeError( + "With ontologizer=True, eval_data must be list[str] of terms." + ) + else: + raise TypeError( + "With ontologizer=True, eval_data must be list[str] of terms." + ) + return self._term_typing(eval_pack, test=True) + + # Case B: ontologizer=False -> expect list[dict], but tolerate containers + else: + if isinstance(eval_data, list) and all( + isinstance(x, dict) for x in eval_data + ): + eval_pack = eval_data + else: + maybe = _extract_list_of_dicts_from_term_typings(eval_data) + if maybe is not None: + eval_pack = maybe + else: + if isinstance(eval_data, dict): + for key in ("term_typings", "terms", "items"): + if key in eval_data and isinstance( + eval_data[key], (list, tuple) + ): + converted = [] + for x in eval_data[key]: + if ( + isinstance(x, dict) + and ("id" in x or "ID" in x) + and ("term" in x or "name" in x) + ): + tid = x.get("ID") or x.get("id") + tterm = x.get("term") or x.get("name") + converted.append( + {"id": str(tid), "term": str(tterm)} + ) + if converted: + eval_pack = converted + break + else: + raise TypeError( + "With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}." + ) + else: + raise TypeError( + "With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}." + ) + return self._term_typing(eval_pack, test=True) + + def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: + """ + Internal implementation of the *term-typing* task. + + Training mode (`test=False`): + • Expects a `list[str]` of allowed types. Stores a sorted unique copy. + + Inference mode (`test=True`): + • Expects a `list[dict]` of `{"id","term"}` items. + • Requires `load()` to have been called (model/tokenizer available). + • Builds a blind prompt per item, generates text, parses quoted + candidates, and filters them to `self.allowed_types`. + + Args: + data: See the mode-specific expectations above. 
+ test: Set `True` to run inference; `False` to store the type inventory. + + Returns: + • `None` in training mode. + • `list[dict]` with `{"id","term","types":[...]}` in inference mode. + + Raises: + TypeError: If `data` is not in the expected shape for the mode. + RuntimeError: If model/tokenizer are not loaded at inference time. + """ + if not test: + # training: expect a list of strings (type labels) + if not isinstance(data, list): + raise TypeError("Expected a list of type labels at training time.") + self.allowed_types = sorted(set(data)) + return None + + # Inference path + if not isinstance(data, list) or not all(isinstance(x, dict) for x in data): + raise TypeError( + "At prediction time, expected a list of {'id','term'} dicts." + ) + + if self.model is None or self.tokenizer is None: + raise RuntimeError( + "Model/tokenizer not loaded. Call .load() before predict()." + ) + + results = [] + for item in data: + term_id = item["id"] + term_text = item["term"] + prompt = self._build_blind_prompt(term_id, term_text, self.allowed_types) + types = self._generate_and_parse_types(prompt) + results.append({"id": term_id, "term": term_text, "types": types}) + + return results + + def _format_types_inline(self, allowed: List[str]) -> str: + """ + Format the allowed types for inline inclusion in prompts. + + Args: + allowed: List of allowed type labels. + + Returns: + A comma-separated string of quoted types, e.g.: + `"type1", "type2", "type3"`. Returns an empty string for an empty list. + """ + if not allowed: + return "" + return ", ".join(f'"{t}"' for t in allowed if isinstance(t, str) and t.strip()) + + def _build_blind_prompt( + self, term_id: str, term: str, allowed_types: List[str] + ) -> str: + """ + Construct the blind JSON prompt for a single term. + + The prompt: + • Instructs the model to produce ONLY a JSON array of `{id, types}` objects. + • Provides the allowed types list so the model should only use those. + • Includes the single input item for which the model must decide types. + + Args: + term_id: Identifier to carry through to the output JSON. + term: The input term string to classify. + allowed_types: Inventory used to constrain outputs. + + Returns: + The full prompt string to feed to the LLM. + """ + allowed_str = self._format_types_inline(allowed_types) + return ( + "Identify the type(s) of the term in a second JSON file.\n" + "A term can have more than one type.\n" + "Output file must be in this format:\n" + "[\n" + '{ "id": "TT_465e8904", "types": [ "type1" ] },\n' + '{ "id": "TT_01c7707e", "types": [ "type2", "type3" ] },\n' + '{ "id": "TT_b20cb478", "types": [ "type4" ] }\n' + "]\n" + "The id must be taken from the input JSON file.\n" + "You must find the type(s) for each term in the JSON file.\n" + "Types must be selected only from the types list.\n\n" + f"Types list: {allowed_str}\n\n" + f'{{ "id": "{term_id}", "term": "{term}" }}' + ) + + def _generate_and_parse_types(self, prompt: str) -> List[str]: + """ + Greedy-generate text, extract candidate types, and filter to the inventory. + + Workflow: + 1) Tokenize the prompt and generate deterministically (greedy). + 2) Decode and extract quoted substrings via regex (e.g., `"type"`). + 3) Keep only those candidates that exist in `self.allowed_types`. + 4) Return a unique, sorted list (stable across runs). + + Args: + prompt: Fully formatted prompt string. + + Returns: + List of predicted type labels (possibly empty if none found). + + Raises: + AssertionError: If `model` or `tokenizer` are unexpectedly `None`. 
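+
+        Example:
+            The parse-and-filter step in isolation; the raw text below is a
+            stand-in for decoded model output, so no model call is needed:
+
+            >>> learner = SBUNLPZSLearner()
+            >>> learner.allowed_types = ["rock"]
+            >>> text = '[ { "id": "TT_000001", "types": [ "rock", "pebble" ] } ]'
+            >>> sorted({t for t in learner._quoted_re.findall(text) if t in learner.allowed_types})
+            ['rock']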
+ """ + assert self.model is not None and self.tokenizer is not None + + # Tokenize prompt and move tensors to model device to avoid device mismatch + inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device) + + with torch.no_grad(): + outputs = self.model.generate( + **inputs, + max_new_tokens=self.max_new_tokens, + do_sample=False, # deterministic (greedy) decoding + pad_token_id=self.tokenizer.eos_token_id, + ) + + # Decode full generated sequence (prompt + generation). Then extract quoted strings. + text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) + candidates = self._quoted_re.findall(text) + + # Filter candidates to the allowed inventory and stabilize order. + filtered = [c for c in candidates if c in self.allowed_types] + return sorted(set(filtered)) diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py new file mode 100644 index 0000000..489853b --- /dev/null +++ b/ontolearner/learner/text2onto/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .alexbek import AlexbekFewShotLearner +from .sbunlp import SBUNLPFewShotLearner diff --git a/ontolearner/learner/text2onto/alexbek.py b/ontolearner/learner/text2onto/alexbek.py new file mode 100644 index 0000000..f1692f7 --- /dev/null +++ b/ontolearner/learner/text2onto/alexbek.py @@ -0,0 +1,1219 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple, Iterable +import json +from json.decoder import JSONDecodeError +import os +import random +import re + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +from ...base import AutoLearner, AutoLLM + +try: + from outlines.models import Transformers as OutlinesTFModel + from outlines.generate import json as outlines_generate_json + from pydantic import BaseModel + + class _PredictedTypesSchema(BaseModel): + """Schema used when generating structured JSON { "types": [...] }.""" + + types: List[str] + + OUTLINES_AVAILABLE: bool = True +except Exception: + # If outlines is unavailable, we will fall back to greedy decoding + regex parsing. + OUTLINES_AVAILABLE = False + _PredictedTypesSchema = None + OutlinesTFModel = None + outlines_generate_json = None + + +class LocalAutoLLM(AutoLLM): + """ + Minimal local LLM helper. + + - Inherits AutoLLM but overrides load/generate to avoid label_mapper. + - Optional 4-bit loading with `load_in_4bit=True` in .load(). 
+ - Greedy decoding by default (deterministic). + """ + + def __init__(self, device: str = "cpu", token: str = "") -> None: + """ + Initialize the local LLM holder. + + Parameters + ---------- + device : str + Execution device: "cpu" or "cuda". + token : str + Optional auth token for private model hubs. + """ + super().__init__(label_mapper=None, device=device, token=token) + self.model: Optional[AutoModelForCausalLM] = None + self.tokenizer: Optional[AutoTokenizer] = None + + def load(self, model_id: str, *, load_in_4bit: bool = False) -> None: + """ + Load a Hugging Face causal model + tokenizer and set deterministic + generation defaults. + + Parameters + ---------- + model_id : str + Model identifier resolvable by HF `from_pretrained`. + load_in_4bit : bool + If True and bitsandbytes is available, load using 4-bit quantization. + """ + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, padding_side="left", token=self.token + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Model (optionally quantized) + if load_in_4bit: + from transformers import BitsAndBytesConfig + + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + quantization_config=quantization_config, + token=self.token, + ) + else: + device_map = ( + "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + ) + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch.bfloat16 + if torch.cuda.is_available() + else torch.float32, + token=self.token, + ) + + # Deterministic generation defaults + generation_cfg = self.model.generation_config + generation_cfg.do_sample = False + generation_cfg.temperature = None + generation_cfg.top_k = None + generation_cfg.top_p = None + generation_cfg.num_beams = 1 + + def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: + """ + Greedy-generate continuations for a list of prompts. + + Parameters + ---------- + prompts : List[str] + Prompts to generate for (batched). + max_new_tokens : int + Maximum number of new tokens per continuation. + + Returns + ------- + List[str] + Decoded new-token texts (no special tokens, stripped). + """ + if self.model is None or self.tokenizer is None: + raise RuntimeError( + "Call .load(model_id) on LocalAutoLLM before generate()." + ) + + tokenized_batch = self.tokenizer( + prompts, return_tensors="pt", padding=True, truncation=True + ) + input_seq_len = tokenized_batch["input_ids"].shape[1] + tokenized_batch = { + k: v.to(self.model.device) for k, v in tokenized_batch.items() + } + + with torch.no_grad(): + outputs = self.model.generate( + **tokenized_batch, + max_new_tokens=max_new_tokens, + pad_token_id=self.tokenizer.eos_token_id, + do_sample=False, + num_beams=1, + ) + + # Only return the newly generated part for each row in the batch + continuation_token_ids = outputs[:, input_seq_len:] + return [ + self.tokenizer.decode(row, skip_special_tokens=True).strip() + for row in continuation_token_ids + ] + + +class AlexbekFewShotLearner(AutoLearner): + """ + Text2Onto learner for LLMS4OL Task A (term & type extraction). 
+ + Public API (A1 + convenience): + - fit(train_docs_jsonl, terms2doc_json, sample_size=24, seed=42) + - predict_terms(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int + - predict_types(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int + - evaluate_extraction_f1(gold_item2docs_json, preds_jsonl, key="term"|"type") -> float + + Option A (A2, term→types) bridge: + - predict_types_from_terms_option_a(...) + Reads your A1 results (docs→terms), predicts types for each term, and + writes two files: terms2types_pred.json + types2docs_pred.json + """ + + def __init__(self, model: LocalAutoLLM, device: str = "cpu", **_: Any) -> None: + """ + Initialize learner state and canned prompts. + + Parameters + ---------- + model : LocalAutoLLM + Loaded local LLM helper instance. + device : str + Device name ("cpu" or "cuda"). + """ + super().__init__(**_) + self.model = model + self.device = device + + # Few-shot exemplars for A1 (Docs→Terms) and for Docs→Types: + # Each exemplar is a tuple: (title, text, gold_list) + self._fewshot_terms_docs: List[Tuple[str, str, List[str]]] = [] + self._fewshot_types_docs: List[Tuple[str, str, List[str]]] = [] + + # System prompts + self._system_prompt_terms = ( + "You are an expert in ontology term extraction.\n" + "Extract only terms that explicitly appear in the document.\n" + 'Answer strictly as JSON: {"terms": ["..."]}\n' + ) + self._system_prompt_types = ( + "You are an expert in ontology type classification.\n" + "List ontology *types* that characterize the document’s terminology.\n" + 'Answer strictly as JSON: {"types": ["..."]}\n' + ) + + # Compiled regex for robust JSON extraction from LLM outputs + self._json_object_regex = re.compile(r"\{[^{}]*\}", re.S) + self._json_array_regex = re.compile(r"\[[^\]]*\]", re.S) + + # Term→Types (Option A) specific prompt + self._system_prompt_term_to_types = ( + "You are an expert in ontology and semantic type classification.\n" + "Given a term, predict its semantic types from the domain-specific ontology.\n" + 'Answer strictly as JSON:\n{"types": ["type1", "type2", "..."]}' + ) + + def fit( + self, + *, + train_docs_jsonl: str, + terms2doc_json: str, + sample_size: int = 24, + seed: int = 42, + ) -> None: + """ + Build internal few-shot exemplars from a labeled training split. + + Parameters + ---------- + train_docs_jsonl : str + Path to JSONL (or tolerant JSON/JSONL) with train documents. + terms2doc_json : str + JSON mapping item -> [doc_id,...]; "item" can be a term or type. + sample_size : int + Number of exemplar documents to keep for few-shot prompting. + seed : int + RNG seed for reproducible sampling. 
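+
+        Example
+        -------
+        A hypothetical call; the paths below are placeholders rather than
+        files shipped with the library::
+
+            learner.fit(
+                train_docs_jsonl="train/documents.jsonl",
+                terms2doc_json="train/terms2docs.json",
+                sample_size=24,
+                seed=42,
+            )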
+ """ + rng = random.Random(seed) + + # Load documents and map doc_id -> row + document_map = self._load_documents_jsonl(train_docs_jsonl) + if not document_map: + raise FileNotFoundError(f"No documents found in: {train_docs_jsonl}") + + # Load item -> [doc_ids] + item_to_docs_map = self._load_json(terms2doc_json) + if not isinstance(item_to_docs_map, dict): + raise ValueError( + f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]" + ) + + # Reverse mapping: doc_id -> [items] + doc_id_to_items_map: Dict[str, List[str]] = {} + for item_label, doc_id_list in item_to_docs_map.items(): + for doc_id in doc_id_list: + doc_id_to_items_map.setdefault(doc_id, []).append(item_label) + + # Build candidate exemplars (title, text, gold_list) + exemplar_candidates: List[Tuple[str, str, List[str]]] = [] + for doc_id, labeled_items in doc_id_to_items_map.items(): + doc_row = document_map.get(doc_id) + if not doc_row: + continue + doc_title = str(doc_row.get("title", "")) # be defensive (may be None) + doc_text = self._to_text( + doc_row.get("text", "") + ) # string-ify list if needed + if not doc_text: + continue + gold_items = self._unique_preserve( + [s for s in labeled_items if isinstance(s, str)] + ) + if gold_items: + exemplar_candidates.append((doc_title, doc_text, gold_items)) + + if not exemplar_candidates: + raise RuntimeError( + "No candidate docs with items found to build few-shot exemplars." + ) + + chosen_exemplars = rng.sample( + exemplar_candidates, k=min(sample_size, len(exemplar_candidates)) + ) + # Reuse exemplars for both docs→terms and docs→types prompting + self._fewshot_terms_docs = chosen_exemplars + self._fewshot_types_docs = chosen_exemplars + + def predict_terms( + self, + *, + docs_test_jsonl: str, + out_jsonl: str, + max_new_tokens: int = 128, + few_shot_k: int = 6, + ) -> int: + """ + Extract terms that explicitly appear in each document. + + Writes one JSON object per line: + {"id": "", "terms": ["...", "...", ...]} + + Parameters + ---------- + docs_test_jsonl : str + Path to test/dev documents in JSONL or tolerant JSON/JSONL. + out_jsonl : str + Output JSONL path where predictions are written (one line per doc). + max_new_tokens : int + Max generation length. + few_shot_k : int + Number of few-shot exemplars to prepend per prompt. + + Returns + ------- + int + Number of lines written (i.e., number of processed documents). 
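+
+        Example
+        -------
+        An illustrative sketch; the paths are placeholders::
+
+            n_docs = learner.predict_terms(
+                docs_test_jsonl="test/documents.jsonl",
+                out_jsonl="predictions/terms.jsonl",
+                few_shot_k=6,
+            )
+
+        Each output line has the shape
+        ``{"id": "<doc_id>", "terms": ["...", "..."]}``.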
+ """ + if self.model is None or self.model.model is None: + raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") + + test_documents = self._load_documents_jsonl(docs_test_jsonl) + prompts: List[str] = [] + document_order: List[str] = [] + + for document_id, document_row in test_documents.items(): + title = str(document_row.get("title", "")) + text = self._to_text(document_row.get("text", "")) + + fewshot_block = self._format_fewshot_block( + self._system_prompt_terms, + self._fewshot_terms_docs, + key="terms", + k=few_shot_k, + ) + user_block = self._format_user_block(title, text) + + prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") + document_order.append(document_id) + + generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) + parsed_term_lists = [ + self._parse_json_list(generated, key="terms") for generated in generations + ] + + os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) + lines_written = 0 + with open(out_jsonl, "w", encoding="utf-8") as fp_out: + for document_id, term_list in zip(document_order, parsed_term_lists): + payload = {"id": document_id, "terms": self._unique_preserve(term_list)} + fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") + lines_written += 1 + return lines_written + + def predict_types( + self, + *, + docs_test_jsonl: str, + out_jsonl: str, + max_new_tokens: int = 128, + few_shot_k: int = 6, + ) -> int: + """ + Predict ontology types that characterize each document’s terminology. + + Writes one JSON object per line: + {"id": "", "types": ["...", "...", ...]} + + Parameters + ---------- + docs_test_jsonl : str + Path to test/dev documents in JSONL or tolerant JSON/JSONL. + out_jsonl : str + Output JSONL path where predictions are written (one line per doc). + max_new_tokens : int + Max generation length. + few_shot_k : int + Number of few-shot exemplars to prepend per prompt. + + Returns + ------- + int + Number of lines written (i.e., number of processed documents). + """ + if self.model is None or self.model.model is None: + raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") + + test_documents = self._load_documents_jsonl(docs_test_jsonl) + prompts: List[str] = [] + document_order: List[str] = [] + + for document_id, document_row in test_documents.items(): + title = str(document_row.get("title", "")) + text = self._to_text(document_row.get("text", "")) + + fewshot_block = self._format_fewshot_block( + self._system_prompt_types, + self._fewshot_types_docs, + key="types", + k=few_shot_k, + ) + user_block = self._format_user_block(title, text) + + prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") + document_order.append(document_id) + + generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) + parsed_type_lists = [ + self._parse_json_list(generated, key="types") for generated in generations + ] + + os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) + lines_written = 0 + with open(out_jsonl, "w", encoding="utf-8") as fp_out: + for document_id, type_list in zip(document_order, parsed_type_lists): + payload = {"id": document_id, "types": self._unique_preserve(type_list)} + fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") + lines_written += 1 + return lines_written + + def evaluate_extraction_f1( + self, + gold_item2docs_json: str, + preds_jsonl: str, + *, + key: str = "term", + ) -> float: + """ + Compute micro-F1 over (doc_id, item) pairs. 
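+        Items are normalized (lowercased, single-spaced) before comparison,
+        and the score is computed from counts pooled over all documents.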
+ + Parameters + ---------- + gold_item2docs_json : str + JSON mapping item -> [doc_ids]. + preds_jsonl : str + JSONL lines like {"id": "...", "terms":[...]} or {"id":"...","types":[...]}. + key : str + "term" or "type" depending on what you are evaluating. + + Returns + ------- + float + Micro-averaged F1 score. + """ + item_to_doc_ids: Dict[str, List[str]] = self._load_json(gold_item2docs_json) + + # Build gold: doc_id -> set(items) + gold_doc_to_items: Dict[str, set] = {} + for item_label, doc_id_list in item_to_doc_ids.items(): + for document_id in doc_id_list: + gold_doc_to_items.setdefault(document_id, set()).add( + self._norm(item_label) + ) + + # Build predictions: doc_id -> set(items) + pred_doc_to_items: Dict[str, set] = {} + with open(preds_jsonl, "r", encoding="utf-8") as fp_in: + for line in fp_in: + row = json.loads(line.strip()) + document_id = str(row.get("id", "")) + items_list = row.get("terms" if key == "term" else "types", []) + pred_doc_to_items[document_id] = { + self._norm(x) for x in items_list if isinstance(x, str) + } + + # Micro counts + true_positive = false_positive = false_negative = 0 + all_document_ids = set(gold_doc_to_items.keys()) | set(pred_doc_to_items.keys()) + for document_id in all_document_ids: + gold_set = gold_doc_to_items.get(document_id, set()) + pred_set = pred_doc_to_items.get(document_id, set()) + true_positive += len(gold_set & pred_set) + false_positive += len(pred_set - gold_set) + false_negative += len(gold_set - pred_set) + + precision = ( + true_positive / (true_positive + false_positive) + if (true_positive + false_positive) + else 0.0 + ) + recall = ( + true_positive / (true_positive + false_negative) + if (true_positive + false_negative) + else 0.0 + ) + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) + else 0.0 + ) + return f1 + + def predict_types_from_terms( + self, + *, + doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl + doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list + few_shot_jsonl: Optional[ + str + ] = None, # JSONL lines: {"term":"...", "types":[...]} + rag_terms_json: Optional[ + str + ] = None, # JSON list; items may contain "term" and "RAG":[...] + random_few_shot: Optional[int] = 3, + model_id: str = "Qwen/Qwen2.5-1.5B-Instruct", + use_structured_output: bool = True, + seed: int = 42, + out_terms2types: str = "terms2types_pred.json", + out_types2docs: str = "types2docs_pred.json", + ) -> Dict[str, Any]: + """ + Predict types for each unique term extracted per document and derive a types→docs map. + + Parameters + ---------- + doc_terms_jsonl : Optional[str] + Path to JSONL with lines like {"id": "...", "terms": [...]} or a JSON with {"results":[...]}. + doc_terms_list : Optional[List[Dict]] + In-memory results like [{"id":"...","extracted_terms":[...]}] or {"id":"...","terms":[...]}. + few_shot_jsonl : Optional[str] + Global few-shot exemplars: one JSON object per line with {"term": "...", "types":[...]}. + rag_terms_json : Optional[str] + Optional per-term RAG exemplars: a JSON list of {"term": "...", "RAG":[{"term": "...", "types":[...]}]}. + random_few_shot : Optional[int] + If provided, randomly select up to this many few-shot examples for each prediction. + model_id : str + HF model id used specifically for term→types predictions. + use_structured_output : bool + If True and outlines is available, enforce structured {"types":[...]} output. + seed : int + Random seed for reproducibility. 
+ out_terms2types : str + Output JSON path for list of {"term": "...", "predicted_types":[...]}. + out_types2docs : str + Output JSON path for dict {"TYPE":[doc_ids,...], ...}. + + Returns + ------- + Dict[str, Any] + Summary with predictions and counts. + """ + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + # Load normalized document→terms results + doc_term_extractions = self._load_doc_term_extractions( + results_json_path=doc_terms_jsonl, + in_memory_results=doc_terms_list, + ) + if not doc_term_extractions: + raise ValueError( + "No document→terms results provided (doc_terms_jsonl/doc_terms_list)." + ) + + # Prepare unique term list and term→doc occurrences + unique_terms = self._collect_unique_terms_from_extractions(doc_term_extractions) + term_to_doc_ids_map = self._build_term_to_doc_ids(doc_term_extractions) + + # Load optional global few-shot examples + global_few_shot_examples: List[Dict] = [] + if few_shot_jsonl and os.path.exists(few_shot_jsonl): + with open(few_shot_jsonl, "r", encoding="utf-8") as few_shot_file: + for raw_line in few_shot_file: + raw_line = raw_line.strip() + if not raw_line: + continue + try: + json_obj = json.loads(raw_line) + except Exception: + continue + if ( + isinstance(json_obj, dict) + and "term" in json_obj + and "types" in json_obj + ): + global_few_shot_examples.append(json_obj) + + # Optional per-term RAG examples: {normalized_term -> [examples]} + rag_examples_lookup: Dict[str, List[Dict]] = {} + if rag_terms_json and os.path.exists(rag_terms_json): + try: + rag_payload = self._load_json(rag_terms_json) + if isinstance(rag_payload, list): + for rag_item in rag_payload: + if isinstance(rag_item, dict): + normalized_term = self._normalize_term( + rag_item.get("term", "") + ) + rag_examples_lookup[normalized_term] = rag_item.get( + "RAG", [] + ) + except Exception: + pass + + # Load a small chat LLM dedicated to Term→Types + typing_model, typing_tokenizer = self._load_llm_for_types(model_id) + + # Predict types per term + term_to_predicted_types_list: List[Dict] = [] + for term_text in unique_terms: + normalized_term = self._normalize_term(term_text) + + # Prefer per-term RAG for this term, else use global few-shot + few_shot_examples_for_term = ( + rag_examples_lookup.get(normalized_term, None) + or global_few_shot_examples + ) + + # Build conversation and prompt + conversation_messages = self._build_conv_for_type_infer( + term=term_text, + few_shot_examples=few_shot_examples_for_term, + random_k=random_few_shot, + ) + typing_prompt_string = self._apply_chat_template_safe_types( + typing_tokenizer, conversation_messages + ) + + predicted_types: List[str] = [] + raw_generation_text: str = "" + + # Structured JSON path (if requested and available) + if ( + use_structured_output + and OUTLINES_AVAILABLE + and _PredictedTypesSchema is not None + ): + try: + outlines_model = OutlinesTFModel(typing_model, typing_tokenizer) # type: ignore + generator = outlines_generate_json( + outlines_model, _PredictedTypesSchema + ) # type: ignore + structured = generator(typing_prompt_string, max_tokens=512) + predicted_types = [ + label for label in structured.types if isinstance(label, str) + ] + raw_generation_text = json.dumps( + {"types": predicted_types}, ensure_ascii=False + ) + except Exception: + # Fall back to greedy decoding + use_structured_output = False + + # Greedy decode fallback + if ( + not use_structured_output + or not OUTLINES_AVAILABLE + or _PredictedTypesSchema is None + ): + tokenized_prompt = 
typing_tokenizer( + typing_prompt_string, + return_tensors="pt", + truncation=True, + max_length=2048, + ) + if torch.cuda.is_available(): + tokenized_prompt = { + name: tensor.cuda() for name, tensor in tokenized_prompt.items() + } + with torch.no_grad(): + output_ids = typing_model.generate( + **tokenized_prompt, + max_new_tokens=256, + do_sample=False, + num_beams=1, + pad_token_id=typing_tokenizer.eos_token_id, + ) + new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1] :] + raw_generation_text = typing_tokenizer.decode( + new_token_span, skip_special_tokens=True + ) + predicted_types = self._extract_types_from_text(raw_generation_text) + + term_to_predicted_types_list.append( + { + "term": term_text, + "predicted_types": sorted(set(predicted_types)), + } + ) + + # 7) Build types→docs from (term→types) and (term→docs) + types_to_doc_id_set: Dict[str, set] = {} + for term_prediction in term_to_predicted_types_list: + normalized_term = self._normalize_term(term_prediction["term"]) + doc_ids_for_term = term_to_doc_ids_map.get(normalized_term, []) + for type_label in term_prediction.get("predicted_types", []): + types_to_doc_id_set.setdefault(type_label, set()).update( + doc_ids_for_term + ) + + types_to_doc_ids: Dict[str, List[str]] = { + type_label: sorted(doc_id_set) + for type_label, doc_id_set in types_to_doc_id_set.items() + } + + # 8) Save outputs + os.makedirs(os.path.dirname(out_terms2types) or ".", exist_ok=True) + with open(out_terms2types, "w", encoding="utf-8") as fp_terms2types: + json.dump( + term_to_predicted_types_list, + fp_terms2types, + ensure_ascii=False, + indent=2, + ) + + os.makedirs(os.path.dirname(out_types2docs) or ".", exist_ok=True) + with open(out_types2docs, "w", encoding="utf-8") as fp_types2docs: + json.dump(types_to_doc_ids, fp_types2docs, ensure_ascii=False, indent=2) + + # Cleanup VRAM if any + del typing_model, typing_tokenizer + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return { + "terms2types_pred": term_to_predicted_types_list, + "types2docs_pred": types_to_doc_ids, + "unique_terms": len(unique_terms), + "types_count": len(types_to_doc_ids), + } + + def _load_json(self, path: str) -> Dict[str, Any]: + """Load a JSON file from disk and return its parsed object.""" + with open(path, "r", encoding="utf-8") as file_obj: + return json.load(file_obj) + + def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: + """ + Iterate over *all* JSON objects found inside a string. + + Supports cases where multiple JSON objects are concatenated back-to-back + in a single line. It skips stray commas/whitespace between objects. + + Parameters + ---------- + blob : str + A string that may contain one or more JSON objects. + + Yields + ------ + Dict[str, Any] + Each parsed JSON object. 
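+
+        Examples
+        --------
+        Illustrative only (two concatenated objects in one string)::
+
+            list(self._iter_json_objects('{"id": "a"}{"id": "b"}'))
+            # -> [{'id': 'a'}, {'id': 'b'}]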
+ """ + json_decoder = json.JSONDecoder() + cursor_index, text_length = 0, len(blob) + while cursor_index < text_length: + # Skip whitespace/commas between objects + while cursor_index < text_length and blob[cursor_index] in " \t\r\n,": + cursor_index += 1 + if cursor_index >= text_length: + break + try: + json_obj, end_index = json_decoder.raw_decode(blob, idx=cursor_index) + except JSONDecodeError: + # Can't decode from this position; stop scanning this chunk + break + yield json_obj + cursor_index = end_index + + def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: + """ + Robust reader that supports: + • True JSONL (one object per line) + • Lines with multiple concatenated JSON objects + • Whole file as a JSON array + + Returns + ------- + Dict[str, Dict[str, Any]] + Mapping doc_id -> full document row. + """ + documents_by_id: Dict[str, Dict[str, Any]] = {} + + with open(path, "r", encoding="utf-8") as file_obj: + content = file_obj.read().strip() + + # Case A: whole-file JSON array + if content.startswith("["): + try: + json_array = json.loads(content) + if isinstance(json_array, list): + for record in json_array: + if not isinstance(record, dict): + continue + document_id = str( + record.get("id") + or record.get("doc_id") + or (record.get("doc") or {}).get("id") + or "" + ) + if document_id: + documents_by_id[document_id] = record + return documents_by_id + except Exception: + # Fall back to line-wise handling if array parsing fails + pass + + # Case B: treat as JSONL-ish; parse *all* objects per line + for raw_line in content.splitlines(): + line = raw_line.strip() + if not line: + continue + for record in self._iter_json_objects(line): + if not isinstance(record, dict): + continue + document_id = str( + record.get("id") + or record.get("doc_id") + or (record.get("doc") or {}).get("id") + or "" + ) + if document_id: + documents_by_id[document_id] = record + + return documents_by_id + + def _to_text(self, text_field: Any) -> str: + """ + Convert a 'text' field into a single string (handles list-of-strings). + + Parameters + ---------- + text_field : Any + The value found under "text" in the dataset row. + + Returns + ------- + str + A single-string representation of the text. + """ + if isinstance(text_field, str): + return text_field + if isinstance(text_field, list): + return " ".join(str(part) for part in text_field) + return str(text_field) if text_field is not None else "" + + def _unique_preserve(self, values: List[str]) -> List[str]: + """ + Deduplicate values while preserving the original order. + + Parameters + ---------- + values : List[str] + Sequence possibly containing duplicates. + + Returns + ------- + List[str] + Sequence without duplicates, order preserved. + """ + seen_values: set = set() + ordered_values: List[str] = [] + for candidate in values: + if candidate not in seen_values: + seen_values.add(candidate) + ordered_values.append(candidate) + return ordered_values + + def _norm(self, text: str) -> str: + """ + Lowercased, single-spaced normalization (for comparisons). + + Parameters + ---------- + text : str + Input string. + + Returns + ------- + str + Normalized string. + """ + return " ".join(text.lower().split()) + + def _normalize_term(self, term: str) -> str: + """ + Normalization tailored for term keys / lookups. + + Parameters + ---------- + term : str + Term to normalize. + + Returns + ------- + str + Lowercased, trimmed and single-spaced term. 
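+
+        Examples
+        --------
+        Illustrative only::
+
+            self._normalize_term("  Myocardial   Infarction ")
+            # -> 'myocardial infarction'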
+ """ + return " ".join(str(term).strip().split()).lower() + + def _format_fewshot_block( + self, + system_prompt: str, + fewshot_examples: List[Tuple[str, str, List[str]]], + *, + key: str, + k: int = 6, + ) -> str: + """ + Render a few-shot block like: + + + + ### Example + User: + Title: ... + + Assistant: + {"terms": [...]} or {"types": [...]} + + Parameters + ---------- + system_prompt : str + Instructional system text to prepend. + fewshot_examples : List[Tuple[str, str, List[str]]] + Examples as (title, text, labels_list). + key : str + Either "terms" or "types" depending on the task. + k : int + Number of examples to include. + + Returns + ------- + str + Formatted few-shot block text. + """ + lines: List[str] = [system_prompt.strip(), ""] + for example_title, example_text, gold_list in fewshot_examples[:k]: + lines.append("### Example") + lines.append(f"User:\nTitle: {example_title}\n{example_text}") + lines.append( + f'Assistant:\n{{"{key}": ' + + json.dumps(gold_list, ensure_ascii=False) + + "}" + ) + return "\n".join(lines) + + def _format_user_block(self, title: str, text: str) -> str: + """ + Format the 'Task' block for the current document. + + Parameters + ---------- + title : str + Document title. + text : str + Document text (single string). + + Returns + ------- + str + Formatted user block. + """ + return f"### Task\nUser:\nTitle: {title}\n{text}" + + def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: + """ + Extract a list from model output, trying: + 1) JSON object with the key ({"terms":[...]} or {"types":[...]}). + 2) Any top-level JSON array. + 3) Fallback: comma-split. + + Parameters + ---------- + generated_text : str + Raw generation text to parse. + key : str + "terms" or "types". + + Returns + ------- + List[str] + Parsed strings (best-effort). + """ + # 1) Try a JSON object and read key + try: + object_match = self._json_object_regex.search(generated_text) + if object_match: + json_obj = json.loads(object_match.group(0)) + json_array = json_obj.get(key) + if isinstance(json_array, list): + return [value for value in json_array if isinstance(value, str)] + except Exception: + pass + + # 2) Any JSON array + try: + array_match = self._json_array_regex.search(generated_text) + if array_match: + json_array = json.loads(array_match.group(0)) + if isinstance(json_array, list): + return [value for value in json_array if isinstance(value, str)] + except Exception: + pass + + # 3) Fallback: comma-split (last resort) + if "," in generated_text: + return [ + part.strip().strip('"').strip("'") + for part in generated_text.split(",") + if part.strip() + ] + return [] + + def _apply_chat_template_safe_types( + self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]] + ) -> str: + """ + Safely build a prompt string for chat models. Uses the model's chat template + when available; otherwise falls back to a simple concatenation. 
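+
+        In the fallback case only the system message and the most recent user
+        message are kept, so any few-shot exemplar turns are dropped.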
+ """ + try: + return tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + except Exception: + system_text = next( + (m["content"] for m in messages if m.get("role") == "system"), "" + ) + last_user_text = next( + (m["content"] for m in reversed(messages) if m.get("role") == "user"), + "", + ) + return f"{system_text}\n\nUser:\n{last_user_text}\n\nAssistant:" + + def _build_conv_for_type_infer( + self, + term: str, + few_shot_examples: Optional[List[Dict]] = None, + random_k: Optional[int] = None, + ) -> List[Dict[str, str]]: + """ + Create a chat-style conversation for a single term→types query, + optionally prepending few-shot examples. + """ + messages: List[Dict[str, str]] = [ + {"role": "system", "content": self._system_prompt_term_to_types} + ] + examples = list(few_shot_examples or []) + if random_k and len(examples) > random_k: + import random as _rnd + + examples = _rnd.sample(examples, random_k) + for exemplar in examples: + example_term = exemplar.get("term", "") + example_types = exemplar.get("types", []) + messages.append({"role": "user", "content": f"Term: {example_term}"}) + messages.append( + { + "role": "assistant", + "content": json.dumps({"types": example_types}, ensure_ascii=False), + } + ) + messages.append({"role": "user", "content": f"Term: {term}"}) + return messages + + def _extract_types_from_text(self, generated_text: str) -> List[str]: + """ + Parse {"types":[...]} from a free-form generation. + """ + try: + object_match = re.search(r'\{[^}]*"types"[^}]*\}', generated_text) + if object_match: + json_obj = json.loads(object_match.group(0)) + types_array = json_obj.get("types", []) + return [ + type_label + for type_label in types_array + if isinstance(type_label, str) + ] + except Exception: + pass + return [] + + def _load_llm_for_types( + self, model_id: str + ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + """ + Load a *separate* small chat model for Term→Types (keeps LocalAutoLLM untouched). 
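+
+        The tokenizer's pad token falls back to EOS when undefined; the model is
+        loaded with ``device_map="auto"`` and bfloat16 on CUDA, or float32 on CPU.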
+ """ + tokenizer = AutoTokenizer.from_pretrained(model_id) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + device_map="auto" if torch.cuda.is_available() else None, + ) + return model, tokenizer + + def _load_doc_term_extractions( + self, + *, + results_json_path: Optional[str] = None, + in_memory_results: Optional[List[Dict]] = None, + ) -> List[Dict]: + """ + Normalize document→terms outputs to a list of: + {"id": "", "extracted_terms": ["...", ...]} + + Accepts either: + - in_memory_results (list of dicts) + - results_json_path pointing to: + • a JSONL file with lines: {"id": "...", "terms": [...]} + • OR a JSON file with {"results":[{"id":..., "extracted_terms": [...]}, ...]} + • OR a JSON list of dicts + """ + normalized_records: List[Dict] = [] + + def _coerce_to_record(source_row: Dict) -> Optional[Dict]: + document_id = str(source_row.get("id", "")) or str( + source_row.get("doc_id", "") + ) + if not document_id: + return None + terms = source_row.get("extracted_terms") + if terms is None: + terms = source_row.get("terms") + if ( + terms is None + and "payload" in source_row + and isinstance(source_row["payload"], dict) + ): + terms = source_row["payload"].get("terms") + if not isinstance(terms, list): + terms = [] + return { + "id": document_id, + "extracted_terms": [t for t in terms if isinstance(t, str)], + } + + if in_memory_results is not None: + for source_row in in_memory_results: + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + return normalized_records + + if not results_json_path: + raise ValueError("Provide results_json_path or in_memory_results") + + # Detect JSON vs JSONL by extension (best-effort) + if results_json_path.endswith(".jsonl"): + with open(results_json_path, "r", encoding="utf-8") as file_in: + for raw_line in file_in: + raw_line = raw_line.strip() + if not raw_line: + continue + # Multiple concatenated objects per line? Iterate them all. + for json_obj in self._iter_json_objects(raw_line): + if isinstance(json_obj, dict): + coerced_record = _coerce_to_record(json_obj) + if coerced_record: + normalized_records.append(coerced_record) + else: + payload_obj = self._load_json(results_json_path) + if isinstance(payload_obj, dict) and "results" in payload_obj: + for source_row in payload_obj["results"]: + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + elif isinstance(payload_obj, list): + for source_row in payload_obj: + if isinstance(source_row, dict): + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + + return normalized_records + + def _collect_unique_terms_from_extractions( + self, doc_term_extractions: List[Dict] + ) -> List[str]: + """ + Collect unique terms (original casing) from normalized document→terms results. 
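+
+        Examples
+        --------
+        Illustrative only (deduplication is case-insensitive, the first
+        spelling wins)::
+
+            self._collect_unique_terms_from_extractions(
+                [{"id": "d1", "extracted_terms": ["Aspirin", "aspirin", "Headache"]}]
+            )
+            # -> ['Aspirin', 'Headache']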
+ """ + seen_normalized_terms: set = set() + ordered_unique_terms: List[str] = [] + for record in doc_term_extractions: + for term_text in record.get("extracted_terms", []): + normalized = self._normalize_term(term_text) + if normalized and normalized not in seen_normalized_terms: + seen_normalized_terms.add(normalized) + ordered_unique_terms.append(term_text.strip()) + return ordered_unique_terms + + def _build_term_to_doc_ids( + self, doc_term_extractions: List[Dict] + ) -> Dict[str, List[str]]: + """ + Build lookup: normalized_term -> sorted unique list of doc_ids. + """ + term_to_doc_set: Dict[str, set] = {} + for record in doc_term_extractions: + document_id = str(record.get("id", "")) + for term_text in record.get("extracted_terms", []): + normalized = self._normalize_term(term_text) + if not normalized or not document_id: + continue + term_to_doc_set.setdefault(normalized, set()).add(document_id) + return { + normalized_term: sorted(doc_ids) + for normalized_term, doc_ids in term_to_doc_set.items() + } diff --git a/ontolearner/learner/text2onto/sbunlp.py b/ontolearner/learner/text2onto/sbunlp.py new file mode 100644 index 0000000..49067e2 --- /dev/null +++ b/ontolearner/learner/text2onto/sbunlp.py @@ -0,0 +1,598 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#      https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import random +import re +import ast +import gc +from typing import Any, Dict, List, Optional, Set, Tuple +from collections import defaultdict + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig + +from ...base import AutoLearner, AutoLLM + + +# ----------------------------------------------------------------------------- +# Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface +# ----------------------------------------------------------------------------- +class LocalAutoLLM(AutoLLM): + """ + Handles loading and generation for a Hugging Face Causal Language Model (Qwen/TinyLlama). + Uses 4-bit quantization for efficiency and greedy decoding by default. 
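+
+    Hypothetical usage sketch (the model id is only an example)::
+
+        llm = LocalAutoLLM(device="cuda")
+        llm.load("Qwen/Qwen2.5-1.5B-Instruct", load_in_4bit=True)
+        tails = llm.generate(["Extract the key terms from ..."], max_new_tokens=64)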
+ """ + + def __init__( + self, label_mapper: Any = None, device: str = "cpu", token: str = "" + ) -> None: + super().__init__(label_mapper=label_mapper, device=device, token=token) + self.model = None + self.tokenizer = None + + def load( + self, + model_id: str, + load_in_4bit: bool = False, + dtype: str = "auto", + trust_remote_code: bool = True, + ): + """Load tokenizer + model, applying 4-bit quantization if specified and possible.""" + + # Determine the target data type (default to float32 for CPU, float16 for GPU) + torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32 + + # Load the tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=trust_remote_code + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + quant_config = None + if load_in_4bit: + # Configure BitsAndBytes for 4-bit loading + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + if torch_dtype_val is None: + torch_dtype_val = torch.float16 + + # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise) + device_map = "auto" if (self.device != "cpu") else {"": "cpu"} + + # Load the Causal Language Model + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch_dtype_val, + quantization_config=quant_config, + trust_remote_code=trust_remote_code, + ) + + # Ensure model is on the correct device (redundant if device_map="auto" but safe) + if self.device == "cpu": + self.model.to("cpu") + + def generate( + self, + inputs: List[str], + max_new_tokens: int = 64, + temperature: float = 0.0, + top_p: float = 1.0, + ) -> List[str]: + """Generate continuations for a list of prompts, returning only the generated part.""" + if self.model is None or self.tokenizer is None: + raise RuntimeError("Model/tokenizer not loaded. 
Call .load() first.") + + # --- Generation Setup --- + # Tokenize batch (padding is essential for batch inference) + enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True) + input_ids = enc["input_ids"] + attention_mask = enc["attention_mask"] + + # Move tensors to the model's device (e.g., cuda:0) + model_device = next(self.model.parameters()).device + input_ids = input_ids.to(model_device) + attention_mask = attention_mask.to(model_device) + + # --- Generate --- + with torch.no_grad(): + outputs = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=max_new_tokens, + do_sample=( + temperature > 0.0 + ), # Use greedy decoding if temperature is 0.0 + temperature=temperature, + top_p=top_p, + pad_token_id=self.tokenizer.eos_token_id, + ) + + # --- Post-processing: Extract only the generated tail --- + decoded_outputs: List[str] = [] + for i, output_ids in enumerate(outputs): + full_decoded_text = self.tokenizer.decode( + output_ids, skip_special_tokens=True + ) + prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True) + + # Safely strip the prompt text from the full output + if full_decoded_text.startswith(prompt_text): + generated_tail = full_decoded_text[len(prompt_text) :].strip() + else: + # Fallback extraction (less robust if padding affects token indices) + prompt_len = input_ids.shape[1] + generated_tail = self.tokenizer.decode( + output_ids[prompt_len:], skip_special_tokens=True + ).strip() + decoded_outputs.append(generated_tail) + + return decoded_outputs + + +# ----------------------------------------------------------------------------- +# Main Learner: SBUNLPFewShotLearner (Task A Text2Onto) +# ----------------------------------------------------------------------------- +class SBUNLPFewShotLearner(AutoLearner): + """ + Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction). + It uses Few-Shot prompts generated from training data for inference. + """ + + def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"): + super().__init__() + # self.model is an instance of LocalAutoLLM + self.model = model or LocalAutoLLM(device=device) + self.device = device + # Cached in-memory prompt blocks built during the fit phase + self.fewshot_terms_block: str = "" + self.fewshot_types_block: str = "" + + # --- Few-shot construction (terms) --- + def build_stratified_fewshot_prompt( + self, + documents_path: str, + terms_path: str, + sample_size: int = 28, + seed: int = 123, + max_chars_per_text: int = 1200, + ) -> str: + """ + Builds the few-shot exemplar block for Term Extraction using stratified sampling. 
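+
+        Documents are grouped into strata keyed by their associated terms, each
+        stratum contributes documents in proportion to its share of the corpus,
+        duplicates are removed by document id, and the sample is trimmed or
+        topped up with random unused documents to reach ``sample_size``.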
+ """ + random.seed(seed) + + # Read documents (JSONL) into a list + corpus_documents: List[Dict[str, Any]] = [] + with open(documents_path, "r", encoding="utf-8") as file_handle: + for line in file_handle: + if line.strip(): + corpus_documents.append(json.loads(line)) + + num_total_docs = len(corpus_documents) + num_sample_docs = min(sample_size, num_total_docs) + + # Load the map of term -> [list of document IDs] + with open(terms_path, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + # Invert map: document ID -> [list of terms] + doc_id_to_terms_map = defaultdict(list) + for term, doc_ids in term_to_doc_map.items(): + for doc_id in doc_ids: + doc_id_to_terms_map[doc_id].append(term) + + # Define strata (groups of documents associated with specific terms) + strata_map = defaultdict(list) + for doc in corpus_documents: + doc_id = doc.get("id", "") + associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"]) + for term in associated_terms: + strata_map[term].append(doc) + + # Perform proportional sampling across strata + sampled_documents: List[Dict[str, Any]] = [] + for term_str, stratum_docs in strata_map.items(): + num_stratum_docs = len(stratum_docs) + if num_stratum_docs == 0: + continue + + # Calculate proportional sample size + proportion = num_stratum_docs / num_total_docs + num_to_sample_from_stratum = int(num_sample_docs * proportion) + + if num_to_sample_from_stratum > 0: + sampled_documents.extend( + random.sample( + stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs) + ) + ) + + # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size' + unique_docs_by_id = {} + for doc in sampled_documents: + unique_docs_by_id[doc.get("id", "")] = doc + + final_sample_docs = list(unique_docs_by_id.values()) + + if len(final_sample_docs) > num_sample_docs: + final_sample_docs = random.sample(final_sample_docs, num_sample_docs) + elif len(final_sample_docs) < num_sample_docs: + remaining_docs = [ + d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id + ] + needed_count = min( + num_sample_docs - len(final_sample_docs), len(remaining_docs) + ) + final_sample_docs.extend(random.sample(remaining_docs, needed_count)) + + # Format the few-shot exemplar text block + prompt_lines: List[str] = [] + for doc in final_sample_docs: + doc_id = doc.get("id", "") + title = doc.get("title", "") + text = doc.get("text", "") + + # Truncate text if it exceeds the maximum character limit + if max_chars_per_text and len(text) > max_chars_per_text: + text = text[:max_chars_per_text] + "…" + + associated_terms = doc_id_to_terms_map.get(doc_id, []) + prompt_lines.append( + f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------" + ) + + prompt_block = "\n".join(prompt_lines) + self.fewshot_terms_block = prompt_block + return prompt_block + + # --- Few-shot construction (types) --- + def build_types_fewshot_block( + self, + docs_jsonl: str, + terms2doc_json: str, + sample_per_term: int = 1, + full_word: bool = True, + case_sensitive: bool = True, + max_chars_per_text: int = 800, + ) -> str: + """ + Builds the few-shot block for Type Extraction. + This method samples documents based on finding an associated term/type within the text. 
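+
+        At most ``sample_per_term`` documents are kept per term, and a document
+        is used only when the term actually matches in its title or text
+        (whole-word, case-sensitive by default).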
+ """ + # Load documents into dict by ID + docs_by_id = {} + with open(docs_jsonl, "r", encoding="utf-8") as file_handle: + for line in file_handle: + line_stripped = line.strip() + if line_stripped: + try: + doc = json.loads(line_stripped) + doc_id = doc.get("id", "") + if doc_id: + docs_by_id[doc_id] = doc + except json.JSONDecodeError: + continue + + # Load term -> [doc_id,...] map + with open(terms2doc_json, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + flags = 0 if case_sensitive else re.IGNORECASE + prompt_lines: List[str] = [] + + # Iterate over terms (which act as types in this context) + for term, doc_ids in term_to_doc_map.items(): + escaped_term = re.escape(term) + # Create regex pattern for matching the term in the text + pattern = rf"\b{escaped_term}\b" if full_word else escaped_term + term_regex = re.compile(pattern, flags=flags) + + picked_count = 0 + for doc_id in doc_ids: + doc = docs_by_id.get(doc_id) + if not doc: + continue + + title = doc.get("title", "") + text = doc.get("text", "") + + # Check if the term/type is actually present in the document text/title + if term_regex.search(f"{title} {text}"): + text_content = text + + # Truncate text if necessary + if max_chars_per_text and len(text_content) > max_chars_per_text: + text_content = text_content[:max_chars_per_text] + "…" + + # Escape single quotes in the term for Python list formatting in the prompt + term_for_prompt = term.replace("'", "\\'") + + prompt_lines.append( + f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------" + ) + picked_count += 1 + + if picked_count >= sample_per_term: + break # Move to the next term + + prompt_block = "\n".join(prompt_lines) + self.fewshot_types_block = prompt_block + return prompt_block + + def fit( + self, + train_docs_jsonl: str, + terms2doc_json: str, + sample_size: int = 28, + seed: int = 123, + ) -> None: + """ + Fit phase: Builds and caches the few-shot prompt blocks from the training files. + No model training occurs (Few-Shot/In-Context Learning). + """ + # Build prompt block for Term extraction + _ = self.build_stratified_fewshot_prompt( + train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed + ) + # Build prompt block for Type extraction + _ = self.build_types_fewshot_block( + train_docs_jsonl, terms2doc_json, sample_per_term=1 + ) + + # ------------------------- + # Inference helpers (prompt construction and output parsing) + # ------------------------- + def _build_term_prompt(self, example_block: str, title: str, text: str) -> str: + """Constructs the full prompt for Term Extraction.""" + return f"""{example_block} + [var] + Title: {title} + Text: {text} + [var] + Extract all relevant terms that could form the basis of an ontology from the above document. + Return ONLY a Python list like ['term1', 'term2', ...] and nothing else. + If no terms are found, return []. + """ + + def _build_type_prompt(self, example_block: str, title: str, text: str) -> str: + """Constructs the full prompt for Type Extraction.""" + return f"""{example_block} + [var] + Title: {title} + Text: {text} + [var] + Extract all relevant TYPES mentioned in the above document that could serve as ontology classes. + Only consider content inside the [var] ... [var] block. + Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return []. 
+ """ + + def _parse_list_like(self, raw_string: str) -> List[str]: + """Try to extract a Python list of strings from model output robustly.""" + processed_string = raw_string.strip() + if processed_string in ("[]", ""): + return [] + + # 1. Try direct evaluation + try: + parsed_value = ast.literal_eval(processed_string) + if isinstance(parsed_value, list): + # Filter to ensure only strings are returned + return [item for item in parsed_value if isinstance(item, str)] + except Exception: + pass + + # 2. Try finding and evaluating text within outermost brackets [ ... ] + bracket_match = re.search(r"\[[\s\S]*?\]", processed_string) + if bracket_match: + try: + parsed_value = ast.literal_eval(bracket_match.group(0)) + if isinstance(parsed_value, list): + return [item for item in parsed_value if isinstance(item, str)] + except Exception: + pass + + # 3. Fallback: Find comma-separated quoted substrings (less robust, but catches errors) + # Finds content inside either single quotes ('...') or double quotes ("...") + quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string) + flattened_list = [a_match or b_match for a_match, b_match in quoted_matches] + return flattened_list + + def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str: + """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output.""" + # self.model is an instance of LocalAutoLLM + model_output = self.model.generate( + [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0 + ) + return model_output[0] if model_output else "" + + def predict_terms( + self, + docs_test_jsonl: str, + out_jsonl: str, + max_lines: int = -1, + max_new_tokens: int = 120, + ) -> int: + """ + Runs Term Extraction on the test documents and saves results to a JSONL file. + Returns: The count of individual terms written. + """ + if not self.fewshot_terms_block: + raise RuntimeError("Few-shot block for terms is empty. Call fit() first.") + + num_written_terms = 0 + with ( + open(docs_test_jsonl, "r", encoding="utf-8") as file_in, + open(out_jsonl, "w", encoding="utf-8") as file_out, + ): + for line_index, line in enumerate(file_in, start=1): + if 0 < max_lines < line_index: + break + + try: + document = json.loads(line.strip()) + except Exception: + continue # Skip malformed JSON lines + + doc_id = document.get("id", "unknown") + title = document.get("title", "") + text = document.get("text", "") + + # Construct and call model + prompt = self._build_term_prompt(self.fewshot_terms_block, title, text) + raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) + predicted_terms = self._parse_list_like(raw_output) + + # Write extracted terms + for term_or_type in predicted_terms: + if isinstance(term_or_type, str) and term_or_type.strip(): + file_out.write( + json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + + "\n" + ) + num_written_terms += 1 + + # Lightweight memory management for long runs + if line_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return num_written_terms + + def predict_types( + self, + docs_test_jsonl: str, + out_jsonl: str, + max_lines: int = -1, + max_new_tokens: int = 120, + ) -> int: + """ + Runs Type Extraction on the test documents and saves results to a JSONL file. + Returns: The count of individual types written. + """ + if not self.fewshot_types_block: + raise RuntimeError("Few-shot block for types is empty. 
Call fit() first.") + + num_written_types = 0 + with ( + open(docs_test_jsonl, "r", encoding="utf-8") as file_in, + open(out_jsonl, "w", encoding="utf-8") as file_out, + ): + for line_index, line in enumerate(file_in, start=1): + if 0 < max_lines < line_index: + break + + try: + document = json.loads(line.strip()) + except Exception: + continue # Skip malformed JSON lines + + doc_id = document.get("id", "unknown") + title = document.get("title", "") + text = document.get("text", "") + + # Construct and call model using the dedicated type prompt block + prompt = self._build_type_prompt(self.fewshot_types_block, title, text) + raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) + predicted_types = self._parse_list_like(raw_output) + + # Write extracted types + for term_or_type in predicted_types: + if isinstance(term_or_type, str) and term_or_type.strip(): + file_out.write( + json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + + "\n" + ) + num_written_types += 1 + + if line_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return num_written_types + + # --- Evaluation utilities (unchanged from prior definition, added docstrings) --- + def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]: + """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased.""" + gold_pairs = set() + with open(terms2doc_path, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + for term, doc_ids in term_to_doc_map.items(): + clean_term = term.strip().lower() + for doc_id in doc_ids: + gold_pairs.add((doc_id, clean_term)) + return gold_pairs + + def load_predicted_pairs( + self, predicted_jsonl_path: str, key: str = "term" + ) -> Set[Tuple[str, str]]: + """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased.""" + predicted_pairs = set() + with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle: + for line in file_handle: + try: + entry = json.loads(line.strip()) + except Exception: + continue + doc_id = entry.get("doc_id") + value = entry.get(key) + if doc_id and value: + predicted_pairs.add((doc_id, value.strip().lower())) + return predicted_pairs + + def evaluate_extraction_f1( + self, terms2doc_path: str, predicted_jsonl: str, key: str = "term" + ) -> float: + """ + Computes set-based binary Precision, Recall, and F1 score against the gold pairs. 
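+
+        Pairs are (doc_id, lowercased term/type). The union of gold and
+        predicted pairs forms the label universe, and scikit-learn's binary
+        precision/recall/F1 is computed over membership indicators.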
+ """ + # Load the ground truth and predictions + gold_set = self.load_gold_pairs(terms2doc_path) + predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key) + + # Build combined universe of all pairs for score calculation + all_pairs = sorted(gold_set | predicted_set) + + # Create binary labels (1=present, 0=absent) + y_true = [1 if pair in gold_set else 0 for pair in all_pairs] + y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs] + + # Use scikit-learn for metric calculation + from sklearn.metrics import precision_recall_fscore_support + + precision, recall, f1, _ = precision_recall_fscore_support( + y_true, y_pred, average="binary", zero_division=0 + ) + + # Display results + num_true_positives = len(gold_set & predicted_set) + + print("\n📊 Evaluation Results:") + print(f" ✅ Precision: {precision:.4f}") + print(f" ✅ Recall: {recall:.4f}") + print(f" ✅ F1 Score: {f1:.4f}") + print(f" 📌 Gold pairs: {len(gold_set)}") + print(f" 📌 Predicted pairs:{len(predicted_set)}") + print(f" 🎯 True Positives: {num_true_positives}") + + return float(f1) diff --git a/pyproject.toml b/pyproject.toml index 4422243..72d4ac1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ numpy = "*" pandas = "*" openpyxl = "*" tqdm = "*" +g4f = "*" pydantic = "2.11.3" pathlib = "1.0.1" python-dotenv = "*" @@ -30,6 +31,8 @@ sentence-transformers = "^5.1.0" dspy = "^2.6.14" bitsandbytes="^0.45.1" mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] } +protobuf = "<5" +Levenshtein = "*" [tool.poetry.dev-dependencies] ruff = "*" diff --git a/requirements.txt b/requirements.txt index 3ce19f7..494f7d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ pandas openpyxl matplotlib tqdm +g4f python-dotenv rdflib~=7.1.4 networkx~=3.4.2 @@ -20,3 +21,5 @@ sentence-transformers~=5.1.0 scikit-learn~=1.6.1 bitsandbytes~=0.45.1 mistral-common[sentencepiece]~=1.8.5 +protobuf<5 +Levenshtein diff --git a/setup.py b/setup.py index 6ae94bb..1dd046b 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ "pandas", "matplotlib", "tqdm", + "g4f", "python-dotenv", "rdflib==7.1.1", "networkx==3.2.1", @@ -32,7 +33,9 @@ "transformers>=4.56.0,<5.0.0", "sentence-transformers>=5.1.0,<6.0.0", "scikit-learn>=1.6.1,<2.0.0", - "bitsandbytes>=0.45.1,<1.0.0" + "bitsandbytes>=0.45.1,<1.0.0", + "protobuf<5", + "Levenshtein" ], classifiers=[ "Development Status :: 5 - Production/Stable",