Skip to content

Commit f3c76ea

Browse files
committed
✨ add ICV RAG example
1 parent f9979d5 commit f3c76ea

File tree

1 file changed

+93
-0
lines changed

1 file changed

+93
-0
lines changed

examples/icv_rag_matching.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
"""Example: ontology matching with an ICV-RAG pipeline.

Aligns the MaterialInformation ontology with MatOnto using a BERT-based
retriever combined with a Falcon LLM (in-context-vector RAG), then applies
hybrid post-processing, evaluates the result against the reference
alignments, and exports the final matchings as XML.
"""

import json

from ontoaligner.encoder import ConceptRAGEncoder
from ontoaligner.ontology import MaterialInformationMatOntoOMDataset
from ontoaligner.ontology_matchers import FalconLLMBERTRetrieverICVRAG
from ontoaligner.postprocess import rag_hybrid_postprocessor
from ontoaligner.utils import metrics, xmlify

# Step 1: Initialize the ontology matching task.
task = MaterialInformationMatOntoOMDataset()
print("Test Task:", task)

# Step 2: Collect the source ontology, target ontology, and reference
# alignments required for the matching process.
dataset = task.collect(
    source_ontology_path="../assets/MI-MatOnto/mi_ontology.xml",  # Source ontology file
    target_ontology_path="../assets/MI-MatOnto/matonto_ontology.xml",  # Target ontology file
    reference_matching_path="../assets/MI-MatOnto/matchings.xml",  # Reference alignments
)

# Step 3: Initialize the encoder that prepares concepts for the RAG pipeline.
encoder_model = ConceptRAGEncoder()

# Step 4: Encode the source, target, and reference ontologies into the
# format consumed by the matching model.
encoded_ontology = encoder_model(
    source=dataset["source"],
    target=dataset["target"],
    reference=dataset["reference"],
)

# Step 5: Model configuration — retriever settings plus LLM generation
# settings for FalconLLMBERTRetrieverICVRAG.
config = {
    "retriever_config": {
        "device": "cuda",  # Device for retrieval computation
        "top_k": 5,  # Number of top candidates to retrieve
        "threshold": 0.1,  # Threshold on IR scores
        # "openai_key": ""  # Uncomment to use OpenAI models
    },
    "llm_config": {
        "device": "cuda",  # Device for LLM computation
        "max_length": 300,  # Max length for LLM input
        "max_new_tokens": 10,  # Max new tokens for generation
        "huggingface_access_token": "",  # Token for gated/restricted models
        "device_map": "balanced",  # Device mapping strategy
        "batch_size": 32,  # Batch size for inference
        # Token sets the LLM's yes/no answers are normalized against.
        "answer_set": {
            "yes": ["yes", "correct", "true", "positive", "valid"],
            "no": ["no", "incorrect", "false", "negative", "invalid"],
        },
        # "openai_key": ""  # Uncomment to use OpenAI models
    },
}

# Step 6: Instantiate the matching model with the configuration above.
model = FalconLLMBERTRetrieverICVRAG(**config)

# Step 7: Load the pre-trained Falcon LLM and the IR (retriever) model.
model.load(
    llm_path="tiiuae/falcon-7b",  # Pre-trained LLM
    ir_path="all-MiniLM-L6-v2",  # IR model
)

# Step 8: Generate matching predictions from the encoded ontologies.
predicts = model.generate(input_data=encoded_ontology)

# Step 9: Hybrid post-processing — filter predictions by IR score and
# LLM confidence thresholds.
hybrid_matchings, hybrid_configs = rag_hybrid_postprocessor(
    predicts=predicts,
    ir_score_threshold=0.4,  # IR score threshold for filtering
    llm_confidence_th=0.5,  # LLM confidence threshold for filtering
)

# Step 10: Evaluate the filtered matchings against the reference alignments.
evaluation = metrics.evaluation_report(predicts=hybrid_matchings, references=dataset["reference"])
print("Hybrid Matching Evaluation Report:", json.dumps(evaluation, indent=4))

# Step 11: Show the configuration the hybrid post-processor settled on.
print("Hybrid Matching Obtained Configuration:", hybrid_configs)

# Step 12: Serialize the final matchings to the alignment XML format.
xml_str = xmlify.xml_alignment_generator(matchings=hybrid_matchings)

# Step 13: Persist the XML alignment to disk.
output_file_path = "matchings.xml"
with open(output_file_path, "w", encoding="utf-8") as xml_file:
    xml_file.write(xml_str)

0 commit comments

Comments
 (0)