|
1 | | -# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. |
| 1 | +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. |
2 | 2 | # |
3 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
4 | 4 | # you may not use this file except in compliance with the License. |
|
23 | 23 | """ |
24 | 24 | from typing import Any, Dict |
25 | 25 |
|
26 | | -from .encoders import LightweightEncoder |
| 26 | +from ..base import BaseEncoder |
| 27 | + |
class LightweightEncoder(BaseEncoder):
    """
    A lightweight encoder for parsing ontology data and preprocessing it.

    This class walks the source and target ontologies, extracts a record for
    every item through ``get_owl_items`` and normalises the record's ``"text"``
    field with ``self.preprocess``. Concrete subclasses supply
    ``get_owl_items``.

    NOTE(review): ``self.preprocess`` and ``self.items_in_owl`` are not defined
    in this class; they are presumably provided by ``BaseEncoder`` and by the
    concrete subclasses respectively -- confirm against those definitions.
    """

    def parse(self, **kwargs) -> Any:
        """
        Parses the source and target ontologies, applying preprocessing.

        Parameters:
            **kwargs: Must contain ``source`` and ``target``, each an iterable
                of ontology items accepted by ``get_owl_items``.

        Returns:
            list: A two-element list ``[source_items, target_items]`` holding
                the encoded items of each ontology, in input order.

        Raises:
            KeyError: If ``source`` or ``target`` is missing from ``kwargs``.
        """
        # Look both keys up before doing any work so that a missing argument
        # fails immediately instead of after the source side was processed.
        source_onto, target_onto = kwargs["source"], kwargs["target"]
        return [
            self._encode_items(source_onto),
            self._encode_items(target_onto),
        ]

    def _encode_items(self, ontology) -> list:
        """Encode every item of one ontology and preprocess its text field."""
        encoded_items = []
        for item in ontology:
            encoded = self.get_owl_items(owl=item)
            # The record returned by get_owl_items is updated in place.
            encoded["text"] = self.preprocess(encoded["text"])
            encoded_items.append(encoded)
        return encoded_items

    def __str__(self):
        """
        Returns a string representation of the encoder.

        Python requires ``__str__`` to return a ``str``; returning the mapping
        itself makes ``str(encoder)`` and ``print(encoder)`` raise
        ``TypeError``, so the mapping is rendered as text here.

        Returns:
            str: The textual form of ``{"LightweightEncoder": items_in_owl}``.
        """
        return str({"LightweightEncoder": self.items_in_owl})

    def get_owl_items(self, owl: Dict) -> Any:
        """
        Extracts the data of a single ontology item.

        This is an extension point: subclasses must override it. ``parse``
        expects the returned object to be a mutable mapping with a ``"text"``
        entry.

        Parameters:
            owl (Dict): A dictionary representing an ontology item.

        Returns:
            Any: The extracted ontology data.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # Fail loudly instead of returning None, which would only surface
        # later in parse() as an unrelated-looking subscript error.
        raise NotImplementedError(
            f"{type(self).__name__} must implement get_owl_items()"
        )

    def get_encoder_info(self):
        """
        Provides information about the encoder.

        Returns:
            str: A fixed description of the kind of input this encoder
                produces for the matching step.
        """
        return "INPUT CONSIST OF COMBINED INFORMATION TO FUZZY STRING MATCHING"
| 93 | + |
27 | 94 |
|
28 | 95 | class ConceptLightweightEncoder(LightweightEncoder): |
29 | 96 | """ |
|
0 commit comments