Commit f7c4a6c

Refactor the model and matching algorithm
Currently, the `SemanticMatch` and `EquivalenceTable` classes can only be used effectively together with the server, which makes them hard to test. We refactor away from this model and instead add a new `algorithm` module. Semantic matches are now stored in a `networkx.DiGraph`, a directed graph, and the matching algorithm runs directly on that graph using efficient graph operations, which also speeds up the whole process. Furthermore, we clean up the service so it can be used in a more Pythonic way, eliminating the need for the `service_model` module. Note that this is a major refactor that solves multiple problems: Fixes #1 Fixes #5
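As an illustration (not part of the commit itself), here is a minimal sketch of the new storage model: each semantic match becomes a weighted edge in a directed graph, and transitive matches fall out of ordinary graph traversal. The node names and scores mirror `resources/example_graph.json` from this commit.

import networkx as nx

# Each semantic match is a directed edge; the match score is the edge weight.
graph = nx.DiGraph()
graph.add_edge("A", "B", weight=0.8)
graph.add_edge("B", "C", weight=0.7)

# A transitive match A -> B -> C multiplies the scores along the path,
# so A matches C with 0.8 * 0.7 = 0.56.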
1 parent ce50050 commit f7c4a6c

File tree

17 files changed: +771 −463 lines changed


.github/workflows/ci.yml

Lines changed: 47 additions & 15 deletions
@@ -1,31 +1,63 @@
-name: test
+name: ci
 
-on:
-  push:
-    branches:
-      - '**'
+on: [push, pull_request]
+
+
+env:
+  X_PYTHON_VERSION: "3.11"
 
 jobs:
   build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os:
-          - ubuntu-latest
-          - windows-latest
+    # This job checks if the build succeeds
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ env.X_PYTHON_VERSION }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.X_PYTHON_VERSION }}
 
+      - name: Build the package
+        run: pip install .
+
+  test:
+    # This job runs the unittests
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Set up Python
+      - name: Set up Python ${{ env.X_PYTHON_VERSION }}
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
-          architecture: x64
+          python-version: ${{ env.X_PYTHON_VERSION }}
 
       - name: Install Python dependencies
-        run: pip install -r requirements.txt
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[dev]
 
       - name: Run Python Tests
         run: python -m unittest discover
+
+  static-analysis:
+    # This job runs static code analysis, namely pycodestyle and mypy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.X_PYTHON_VERSION }}
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[dev]
+      - name: Check typing with MyPy
+        run: |
+          mypy semantic_matcher test
+      - name: Check code style with PyCodestyle
+        run: |
+          pycodestyle --count --max-line-length 120 semantic_matcher test

config.ini.default

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 endpoint=http://127.0.0.1
 LISTEN_ADDRESS=127.0.0.1
 port=8000
-equivalence_table_file=./resources/equivalence_table.json
+match_graph_file=./resources/example_graph.json
 
 [RESOLVER]
 endpoint=http://semantic_id_resolver
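For illustration, a hypothetical sketch of how the new `match_graph_file` option could be read with Python's standard `configparser`; the section name `SERVICE` is an assumption, since the section header for these options lies outside the hunk shown above.

import configparser

from semantic_matcher.algorithm import SemanticMatchGraph

config = configparser.ConfigParser()
config.read("config.ini.default")
# The "SERVICE" section name is assumed; only the options themselves appear in the diff.
graph_file = config["SERVICE"]["match_graph_file"]  # "./resources/example_graph.json"
graph = SemanticMatchGraph.from_file(graph_file)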

pyproject.toml

Lines changed: 8 additions & 0 deletions
@@ -16,6 +16,14 @@ dependencies = [
     "pydantic>=1.10",
     "uvicorn>=0.21.1",
     "requests>=2.31.0",
+    "networkx>=3.4.2",
+]
+
+[project.optional-dependencies]
+dev = [
+    "mypy",
+    "pycodestyle",
+    "coverage",
 ]
 
 [tool.setuptools]

resources/equivalence_table.json

Lines changed: 0 additions & 32 deletions
This file was deleted.

resources/example_graph.json

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+[
+    {
+        "base_semantic_id": "A",
+        "match_semantic_id": "B",
+        "score": 0.8,
+        "path": []
+    },
+    {
+        "base_semantic_id": "B",
+        "match_semantic_id": "C",
+        "score": 0.7,
+        "path": []
+    },
+    {
+        "base_semantic_id": "B",
+        "match_semantic_id": "D",
+        "score": 0.6,
+        "path": []
+    },
+    {
+        "base_semantic_id": "C",
+        "match_semantic_id": "D",
+        "score": 0.9,
+        "path": []
+    }
+]

semantic_matcher/algorithm.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
+import json
+from typing import List, Tuple
+import heapq
+
+import networkx as nx
+from pydantic import BaseModel
+
+
+class SemanticMatchGraph(nx.DiGraph):
+    def __init__(self):
+        super().__init__()
+
+    def add_semantic_match(self,
+                           base_semantic_id: str,
+                           match_semantic_id: str,
+                           score: float):
+        self.add_edge(
+            u_of_edge=base_semantic_id,
+            v_of_edge=match_semantic_id,
+            weight=score,
+        )
+
+    def get_all_matches(self) -> List["SemanticMatch"]:
+        matches: List["SemanticMatch"] = []
+
+        # Iterate over all edges in the graph
+        for base, match, data in self.edges(data=True):
+            score = data.get("weight", 0.0)  # Get the weight, defaulting to 0.0 if missing
+            matches.append(SemanticMatch(
+                base_semantic_id=base,
+                match_semantic_id=match,
+                score=score,
+                path=[]  # Direct match, no intermediate nodes
+            ))
+
+        return matches
+
+    def to_file(self, filename: str):
+        with open(filename, "w") as file:
+            matches = [match.model_dump() for match in self.get_all_matches()]
+            json.dump(matches, file, indent=4)
+
+    @classmethod
+    def from_file(cls, filename: str) -> "SemanticMatchGraph":
+        with open(filename, "r") as file:
+            matches_data = json.load(file)
+        graph = SemanticMatchGraph()
+        for match_data in matches_data:
+            graph.add_semantic_match(
+                base_semantic_id=match_data["base_semantic_id"],
+                match_semantic_id=match_data["match_semantic_id"],
+                score=match_data["score"]
+            )
+        return graph
+
+
+class SemanticMatch(BaseModel):
+    base_semantic_id: str
+    match_semantic_id: str
+    score: float
+    path: List[str]  # The path of `semantic_id`s that the algorithm took
+
+    def __str__(self) -> str:
+        return f"{' -> '.join(self.path + [self.match_semantic_id])} = {self.score}"
+
+    def __hash__(self):
+        return hash((
+            self.base_semantic_id,
+            self.match_semantic_id,
+            self.score,
+            tuple(self.path),
+        ))
+
+
+def find_semantic_matches(
+        graph: SemanticMatchGraph,
+        semantic_id: str,
+        min_score: float = 0.5
+) -> List[SemanticMatch]:
+    """
+    Find semantic matches for a given node with a minimum score threshold.
+
+    Args:
+        graph (SemanticMatchGraph): The directed graph with weighted edges.
+        semantic_id (str): The starting semantic_id.
+        min_score (float): The minimum similarity score to consider.
+            This value is necessary to ensure that the search terminates, even on large graphs.
+
+    Returns:
+        List[SemanticMatch]:
+            A list of SemanticMatches, sorted by their score with the highest score first.
+    """
+    if semantic_id not in graph:
+        return []
+
+    # We need to make sure that all possible paths starting from the given semantic_id are explored.
+    # To achieve this, we use a priority queue. While a simple FIFO list of matches to explore would
+    # also work, a priority queue ensures that elements with the highest priority are processed first,
+    # regardless of when they were added, so we end up with an already sorted result with the highest
+    # match at the beginning of the list. We implement this abstract data structure as a max-heap.
+    # Since Python's built-in heapq module only provides a min-heap, we negate the score values.
+    # We initialize the priority queue:
+    pq: List[Tuple[float, str, List[str]]] = [(-1.0, semantic_id, [])]  # (neg_score, node, path)
+    # The queue is structured as follows:
+    # - `neg_score`: The negated score of the match
+    # - `node`: The `match_semantic_id` of the match
+    # - `path`: The path between the `semantic_id` and the `match_semantic_id`
+
+    # Prepare the result list
+    results: List[SemanticMatch] = []
+
+    # Run the priority queue until all possible paths have been explored.
+    # This means in each iteration:
+    # - We pop the top element of the queue, as it is the next highest semantic match we want to explore
+    # - If the match has a score higher than or equal to the given `min_score`, we add it to the results
+    # - We add all connected `semantic_id`s to the priority queue to be treated next
+    # - We continue with the next element of the queue
+    while pq:
+        # Get the highest-score match from the queue
+        neg_score, node, path = heapq.heappop(pq)
+        score = -neg_score  # Convert back to positive
+
+        # Store the result if it is above the threshold (except for the start node)
+        if node != semantic_id and score >= min_score:
+            results.append(SemanticMatch(
+                base_semantic_id=semantic_id,
+                match_semantic_id=node,
+                score=score,
+                path=path
+            ))
+
+        # Traverse to the neighboring and therefore connected `semantic_id`s
+        for neighbor, edge_data in graph[node].items():
+            new_score: float = score * edge_data["weight"]  # Multiplicative score propagation
+
+            # Prevent loops by ensuring we do not revisit the start node after the first iteration
+            if neighbor == semantic_id:
+                continue  # Avoid re-exploring the start node
+
+            # We add the newly found `semantic_id`s to the queue, to be explored next in order of their score
+            if new_score >= min_score:
+                heapq.heappush(pq, (-new_score, neighbor, path + [node]))  # Push the updated path
+
+    return results
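To make the algorithm concrete, here is a short usage sketch against the `resources/example_graph.json` shipped in this commit; the scores in the comments follow from multiplying the edge weights along each path (printed values may carry floating-point noise).

from semantic_matcher.algorithm import SemanticMatchGraph, find_semantic_matches

graph = SemanticMatchGraph.from_file("resources/example_graph.json")
for match in find_semantic_matches(graph, "A"):
    print(match)

# With the default min_score of 0.5 this yields, highest score first:
#   A -> B = 0.8
#   A -> B -> C = 0.56         (0.8 * 0.7)
#   A -> B -> C -> D = 0.504   (0.8 * 0.7 * 0.9)
# The direct B -> D edge would only give A -> D a score of 0.8 * 0.6 = 0.48,
# which falls below the threshold and is therefore pruned.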

semantic_matcher/examples/__init__.py

Whitespace-only changes.

semantic_matcher/examples/simple_example_equivalence_table.py

Lines changed: 0 additions & 38 deletions
This file was deleted.
