Skip to content

Commit be609ee

Browse files
authored
Merge pull request #1 from qedsoftware/add-initial-osa-code
Add initial OSA code
2 parents 673d039 + 3a1b364 commit be609ee

File tree

13 files changed

+716
-2
lines changed

13 files changed

+716
-2
lines changed

.flake8

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[flake8]
2+
# see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8
3+
extend-ignore = E203, E501, W503, E701, E704
4+
exclude = \
5+
.venv,
6+
*/.ipynb_checkpoints
7+
8+
max-line-length=100
9+
10+
per-file-ignores =
11+
# imported but unused
12+
__init__.py: F401
13+

.github/workflows/ci.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [ main ]
6+
pull_request:
7+
branches: [ main ]
8+
9+
jobs:
10+
build:
11+
runs-on: ubuntu-latest
12+
13+
strategy:
14+
matrix:
15+
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
16+
17+
steps:
18+
- name: Checkout code
19+
uses: actions/checkout@v3
20+
21+
- name: Set up Python ${{ matrix.python-version }}
22+
uses: actions/setup-python@v4
23+
with:
24+
python-version: ${{ matrix.python-version }}
25+
26+
- name: Upgrade pip
27+
run: python -m pip install --upgrade pip
28+
29+
- name: Install development dependencies
30+
run: pip install .[dev]
31+
32+
- name: Ensure linters.sh is executable
33+
run: chmod +x ./linters.sh
34+
35+
- name: Run linters script
36+
run: ./linters.sh
37+
38+
- name: Run tests
39+
run: python -m unittest discover -s tests -p "*.py"

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,4 +191,7 @@ cython_debug/
191191
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192192
# refer to https://docs.cursor.com/context/ignore-files
193193
.cursorignore
194-
.cursorindexingignore
194+
.cursorindexingignore
195+
196+
# Edit Distance OSA file built by Cython
197+
editdistance/edit_distance_osa.cpp

README.md

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,57 @@
1-
# edit-distance
1+
# edit-distance
2+
3+
edit-distance is a Python package that provides an implementation of the Optimal String Alignment (OSA) algorithm for calculating edit distances. The package leverages C++ extensions via Cython for improved performance.
4+
5+
## Features
6+
7+
- Calculate edit distances using the OSA algorithm.
8+
- Support custom weights for each edit operation.
9+
- Find all edit paths resulting in the minimal OSA distance between strings.
10+
- High-performance implementation using C++ and Cython.
11+
- Easy integration into Python projects.
12+
13+
## Installation
14+
15+
Ensure you have a C++ compiler installed. Then, clone the repository and install the package using:
16+
17+
```sh
18+
pip install .
19+
```
20+
21+
Alternatively, call `setup.py` directly:
22+
23+
```sh
24+
python setup.py build_ext --inplace
25+
python setup.py install
26+
```
27+
28+
For more details on the setup, see [setup.py](setup.py).
29+
30+
## Usage
31+
32+
After installation, you can import and use the module in your Python code:
33+
34+
```python
35+
import editdistance.osa
36+
37+
# Example usage:
38+
str1 = "kitten"
39+
str2 = "sitting"
40+
distance = editdistance.osa.calculate_distance(str1, str2)
41+
print(f"The edit distance between '{str1}' and '{str2}' is {distance}")
42+
```
43+
44+
See the examples in the [examples](examples/osa_example.py) directory.
45+
46+
## Running Tests
47+
48+
The test suite is located in the [tests](tests/tests_osa.py) directory. To run the tests, execute:
49+
50+
```sh
51+
python -m unittest discover -v
52+
```
53+
54+
## License
55+
56+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
57+
```
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#include "_edit_distance_osa.hpp"
2+
#include <algorithm>
3+
#include <cmath>
4+
5+
std::vector<std::vector<double>> compute_dp_table(
6+
const std::string& a,
7+
const std::string& b,
8+
const std::map<EditopName, double>& cost_map
9+
) {
10+
int len_a = a.length();
11+
int len_b = b.length();
12+
std::vector<std::vector<double>> dp(len_a + 1, std::vector<double>(len_b + 1, 0.0));
13+
14+
for (int i = 0; i <= len_a; ++i) {
15+
dp[i][0] = i * cost_map.at(DELETE);
16+
}
17+
for (int j = 0; j <= len_b; ++j) {
18+
dp[0][j] = j * cost_map.at(INSERT);
19+
}
20+
21+
for (int i = 1; i <= len_a; ++i) {
22+
for (int j = 1; j <= len_b; ++j) {
23+
double deletion = dp[i-1][j] + cost_map.at(DELETE);
24+
double insertion = dp[i][j-1] + cost_map.at(INSERT);
25+
double substitution_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
26+
double substitution = dp[i-1][j-1] + substitution_cost;
27+
28+
dp[i][j] = std::min({deletion, insertion, substitution});
29+
30+
if (i > 1 && j > 1 &&
31+
a[i-1] == b[j-2] && a[i-2] == b[j-1]) {
32+
dp[i][j] = std::min(dp[i][j],
33+
dp[i-2][j-2] + cost_map.at(TRANSPOSE));
34+
}
35+
}
36+
}
37+
38+
return dp;
39+
}
40+
41+
42+
double cpp_compute_distance(
43+
const std::string& a,
44+
const std::string& b,
45+
const std::map<EditopName, double>& cost_map
46+
) {
47+
auto dp = compute_dp_table(a, b, cost_map);
48+
return dp[a.length()][b.length()];
49+
}
50+
51+
std::vector<std::vector<Editop>> backtrack_all_paths(
52+
const std::string& a,
53+
const std::string& b,
54+
const std::map<EditopName, double>& cost_map,
55+
const std::vector<std::vector<double>>& dp,
56+
int i,
57+
int j,
58+
std::vector<Editop>& current_path
59+
) {
60+
if (i == 0 && j == 0) {
61+
std::vector<Editop> reversed_path = current_path;
62+
std::reverse(reversed_path.begin(), reversed_path.end());
63+
return {reversed_path};
64+
}
65+
66+
std::vector<std::vector<Editop>> all_paths;
67+
double current_cost = dp[i][j];
68+
const double tol = 1e-6;
69+
70+
71+
if (i > 0 && std::abs((dp[i-1][j] + cost_map.at(DELETE)) - current_cost) < tol) {
72+
Editop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1]));
73+
current_path.push_back(op);
74+
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j, current_path);
75+
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
76+
current_path.pop_back();
77+
}
78+
79+
if (j > 0 && std::abs((dp[i][j-1] + cost_map.at(INSERT)) - current_cost) < tol) {
80+
Editop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1]));
81+
current_path.push_back(op);
82+
auto paths = backtrack_all_paths(a, b, cost_map, dp, i, j-1, current_path);
83+
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
84+
current_path.pop_back();
85+
}
86+
87+
88+
if (i > 0 && j > 0) {
89+
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
90+
if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) {
91+
std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]);
92+
Editop op(REPLACE, i-1, j-1, sub_cost, out_char);
93+
current_path.push_back(op);
94+
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j-1, current_path);
95+
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
96+
current_path.pop_back();
97+
}
98+
}
99+
100+
101+
if (i > 1 && j > 1 &&
102+
a[i-1] == b[j-2] && a[i-2] == b[j-1] &&
103+
std::abs((dp[i-2][j-2] + cost_map.at(TRANSPOSE)) - current_cost) < tol) {
104+
std::string transpose_str = std::string(1, b[j-2]) + std::string(1, b[j-1]);
105+
Editop op(TRANSPOSE, i-2, j-2, cost_map.at(TRANSPOSE), transpose_str);
106+
current_path.push_back(op);
107+
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-2, j-2, current_path);
108+
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
109+
current_path.pop_back();
110+
}
111+
112+
return all_paths;
113+
}
114+
115+
116+
std::vector<std::vector<Editop>> cpp_compute_all_paths(
117+
const std::string& a,
118+
const std::string& b,
119+
const std::map<EditopName, double>& cost_map
120+
) {
121+
auto dp = compute_dp_table(a, b, cost_map);
122+
std::vector<Editop> current_path;
123+
return backtrack_all_paths(a, b, cost_map, dp, a.length(), b.length(), current_path);
124+
}
125+
126+
127+
void cpp_print_all_paths(
128+
const std::string& a,
129+
const std::string& b,
130+
const std::map<EditopName, double>& cost_map
131+
) {
132+
auto paths = cpp_compute_all_paths(a, b, cost_map);
133+
double distance = cpp_compute_distance(a, b, cost_map);
134+
135+
std::cout << "OSA Distance from '" << a << "' to '" << b << "': " << distance << std::endl;
136+
std::cout << "Number of optimal edit sequences: " << paths.size() << std::endl;
137+
std::cout << std::endl;
138+
139+
for (size_t i = 0; i < paths.size(); ++i) {
140+
std::cout << "Path " << (i + 1) << ":" << std::endl;
141+
for (const auto& op : paths[i]) {
142+
std::cout << " " << op << std::endl;
143+
}
144+
std::cout << std::endl;
145+
}
146+
}
147+
148+
/**
 * Upper-case label for an edit operation kind.
 *
 * @return "INSERT", "DELETE", "REPLACE" or "TRANSPOSE"; "UNKNOWN" for any
 *         value that is not one of the four enumerators
 */
std::string editop_name_to_string(EditopName name) {
    const char* label = "UNKNOWN";
    switch (name) {
        case INSERT:
            label = "INSERT";
            break;
        case DELETE:
            label = "DELETE";
            break;
        case REPLACE:
            label = "REPLACE";
            break;
        case TRANSPOSE:
            label = "TRANSPOSE";
            break;
        default:
            break;
    }
    return label;
}
157+
158+
std::ostream& operator<<(std::ostream& os, const Editop& op) {
159+
os << "Editop(name=" << editop_name_to_string(op.name)
160+
<< ", src_idx=" << op.src_idx
161+
<< ", dst_idx=" << op.dst_idx
162+
<< ", cost=" << op.cost
163+
<< ", output_string='" << op.output_string << "')";
164+
return os;
165+
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#ifndef EDIT_DISTANCE_OSA_HPP
2+
#define EDIT_DISTANCE_OSA_HPP
3+
4+
#include <string>
5+
#include <vector>
6+
#include <map>
7+
#include <iostream>
8+
9+
10+
// Kinds of edit operation. Used as the key type of the per-operation cost map
// and as the `name` field of Editop. Values are spelled out; they are the same
// ones the compiler assigned implicitly.
enum EditopName {
    INSERT = 0,
    DELETE = 1,
    REPLACE = 2,
    TRANSPOSE = 3
};
16+
17+
struct Editop {
18+
EditopName name;
19+
int src_idx;
20+
int dst_idx;
21+
double cost;
22+
std::string output_string;
23+
24+
Editop() : name(INSERT), src_idx(0), dst_idx(0), cost(0.0), output_string("") {}
25+
Editop(EditopName n, int si, int di, double c, const std::string& os)
26+
: name(n), src_idx(si), dst_idx(di), cost(c), output_string(os) {}
27+
};
28+
29+
30+
/// Builds the OSA dynamic-programming table for `a` -> `b`.
/// The result has a.length() + 1 rows and b.length() + 1 columns; costs are
/// read from `cost_map` with `.at()`.
std::vector<std::vector<double>> compute_dp_table(
31+
const std::string& a,
32+
const std::string& b,
33+
const std::map<EditopName, double>& cost_map
34+
);
35+
36+
37+
/// Returns the OSA distance between `a` and `b`: the bottom-right cell of the
/// table produced by compute_dp_table().
double cpp_compute_distance(
38+
const std::string& a,
39+
const std::string& b,
40+
const std::map<EditopName, double>& cost_map
41+
);
42+
43+
44+
/// Recursively enumerates the edit scripts that explain dp[i][j], walking back
/// to cell (0, 0). Costs are compared with an absolute tolerance of 1e-6.
/// `current_path` is used as working storage and is left as it was passed in.
std::vector<std::vector<Editop>> backtrack_all_paths(
45+
const std::string& a,
46+
const std::string& b,
47+
const std::map<EditopName, double>& cost_map,
48+
const std::vector<std::vector<double>>& dp,
49+
int i,
50+
int j,
51+
std::vector<Editop>& current_path
52+
);
53+
54+
55+
/// Computes the table for `a` and `b`, then returns every edit script found by
/// backtracking from cell (a.length(), b.length()).
std::vector<std::vector<Editop>> cpp_compute_all_paths(
56+
const std::string& a,
57+
const std::string& b,
58+
const std::map<EditopName, double>& cost_map
59+
);
60+
61+
62+
/// Writes the distance, the number of optimal edit sequences and each sequence
/// to std::cout.
void cpp_print_all_paths(
63+
const std::string& a,
64+
const std::string& b,
65+
const std::map<EditopName, double>& cost_map
66+
);
67+
68+
69+
/// Returns the enumerator's name as text, or "UNKNOWN" for any other value.
std::string editop_name_to_string(EditopName name);
70+
/// Streams `op` as "Editop(name=..., src_idx=..., dst_idx=..., cost=..., output_string='...')".
std::ostream& operator<<(std::ostream& os, const Editop& op);
71+
72+
#endif // EDIT_DISTANCE_OSA_HPP

0 commit comments

Comments
 (0)