import json
from ontoaligner.ontology import MaterialInformationMatOntoOMDataset
from ontoaligner.utils import metrics, xmlify
from ontoaligner.ontology_matchers import FalconLLMBERTRetrieverICVRAG
from ontoaligner.encoder import ConceptRAGEncoder
from ontoaligner.postprocess import rag_hybrid_postprocessor

# Step 1: Initialize the Ontology Matching Task
# The MaterialInformationMatOntoOMDataset object is created to start the ontology matching task
task = MaterialInformationMatOntoOMDataset()
print("Test Task:", task)

# Step 2: Collect the dataset for ontology matching
# This collects the source ontology, target ontology, and reference alignments required for the matching process
dataset = task.collect(
    source_ontology_path="../assets/MI-MatOnto/mi_ontology.xml",  # Path to the source ontology file
    target_ontology_path="../assets/MI-MatOnto/matonto_ontology.xml",  # Path to the target ontology file
    reference_matching_path="../assets/MI-MatOnto/matchings.xml"  # Path to the reference alignments
)
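
# Optional sanity check on the collected data (a sketch; it assumes the
# 'source', 'target', and 'reference' entries are list-like):
print("Source concepts:", len(dataset['source']))
print("Target concepts:", len(dataset['target']))
print("Reference alignments:", len(dataset['reference']))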

# Step 3: Initialize the ConceptRAGEncoder model
# This encoder will process the source and target ontologies for the matching process
encoder_model = ConceptRAGEncoder()

# Step 4: Encode the ontologies
# This encodes the source and target ontologies (together with the reference alignments) into the format the model expects
encoded_ontology = encoder_model(source=dataset['source'], target=dataset['target'], reference=dataset['reference'])

# Step 5: Define the model configuration for FalconLLMBERTRetrieverICVRAG
# This covers the device settings, retrieval configuration, and LLM configuration for the matching model
config = {
    "retriever_config": {
        "device": 'cuda',  # Device used for retrieval computation
        "top_k": 5,  # Number of top candidates to retrieve per concept
        "threshold": 0.1,  # Score threshold for retrieved candidates
        # "openai_key": ""  # Uncomment to use OpenAI models
    },
    "llm_config": {
        "device": "cuda",  # Device used for LLM inference
        "max_length": 300,  # Maximum input length for the LLM
        "max_new_tokens": 10,  # Maximum number of new tokens to generate
        "huggingface_access_token": "",  # Hugging Face access token for gated models
        "device_map": 'balanced',  # Device mapping strategy across available GPUs
        "batch_size": 32,  # Batch size for inference
        "answer_set": {
            "yes": ["yes", "correct", "true", "positive", "valid"],
            "no": ["no", "incorrect", "false", "negative", "invalid"]
        }
        # "openai_key": ""  # Uncomment to use OpenAI models
    }
}
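
# The "answer_set" above maps the LLM's free-form generation onto a binary
# match decision. A minimal sketch of that idea (hypothetical helper, not part
# of the OntoAligner API):
import re

def classify_answer(generated_text: str, answer_set: dict) -> str:
    tokens = re.findall(r"[a-z]+", generated_text.lower())  # Normalize to word tokens
    for label, keywords in answer_set.items():
        if any(keyword in tokens for keyword in keywords):
            return label
    return "no"  # Treat unrecognized generations as a negative answer

print(classify_answer("Yes, the concepts match.", config["llm_config"]["answer_set"]))  # -> yes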

# Step 6: Initialize the Matching Model (FalconLLMBERTRetrieverICVRAG)
# This creates an instance of the FalconLLMBERTRetrieverICVRAG model with the configuration defined above
model = FalconLLMBERTRetrieverICVRAG(**config)

# Step 7: Load the pre-trained models (LLM and retriever)
# Loads the pre-trained Falcon LLM and the retriever used for candidate selection
model.load(
    llm_path="tiiuae/falcon-7b",  # Hugging Face identifier of the pre-trained LLM
    ir_path="all-MiniLM-L6-v2"  # Sentence-transformers model used for retrieval
)

# Step 8: Generate predictions using the model
# This generates predictions for the ontology matching task using the encoded data
predicts = model.generate(input_data=encoded_ontology)
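
# Optional: inspect the raw predictions before post-processing (assumes
# `predicts` behaves like a sequence of per-pair prediction records):
print("Number of raw predictions:", len(predicts))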

# Step 9: Post-process the predictions using a hybrid approach
# This applies a post-processing step to refine the predictions based on IR and LLM confidence thresholds
hybrid_matchings, hybrid_configs = rag_hybrid_postprocessor(
    predicts=predicts,
    ir_score_threshold=0.4,  # IR score threshold for filtering
    llm_confidence_th=0.5  # LLM confidence threshold for filtering
)
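
# Optional: peek at a few refined matchings (a sketch; the exact record format
# is whatever rag_hybrid_postprocessor returns):
for matching in hybrid_matchings[:3]:
    print(matching)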

# Step 10: Evaluate the hybrid matchings
# This generates an evaluation report comparing the predicted matchings against the reference matchings
evaluation = metrics.evaluation_report(predicts=hybrid_matchings, references=dataset['reference'])
print("Hybrid Matching Evaluation Report:", json.dumps(evaluation, indent=4))

# Step 11: Print the hybrid matching configuration
# This outputs the configuration used for hybrid matching
print("Hybrid Matching Obtained Configuration:", hybrid_configs)

# Step 12: Convert the hybrid matchings to XML format
# This generates an XML representation of the final ontology matchings
xml_str = xmlify.xml_alignment_generator(matchings=hybrid_matchings)

# Step 13: Save the XML output to a file
# The resulting XML string is saved to an XML file for further use
output_file_path = "matchings.xml"
with open(output_file_path, "w", encoding="utf-8") as xml_file:
    xml_file.write(xml_str)
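
# Optional sanity check: re-parse the saved file with the standard library to
# confirm the alignment XML is well-formed (not required by OntoAligner):
import xml.etree.ElementTree as ET
root = ET.parse(output_file_path).getroot()
print("Saved alignment XML root tag:", root.tag)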