Skip to content

Commit f7089f0

Browse files
authored
Merge pull request #35 from statisticsnorway/thefuzz
migrate from fuzzywuzzy to thefuzz
2 parents 6044549 + 501b9d4 commit f7089f0

File tree

8 files changed

+2312
-1044
lines changed

8 files changed

+2312
-1044
lines changed

docs/conf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import os
1616
import sys
1717

18-
1918
sys.path.insert(0, os.path.abspath("../src"))
2019

2120
# -- Project information -----------------------------------------------------

example_notebook/notebook_example.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": null,
6-
"id": "34950912-618c-4ed3-ac5d-8adcef20275b",
6+
"id": "0",
77
"metadata": {
88
"tags": []
99
},
@@ -18,7 +18,7 @@
1818
{
1919
"cell_type": "code",
2020
"execution_count": null,
21-
"id": "c17476a1-bad8-4e21-b9fe-9ba709cbeb36",
21+
"id": "1",
2222
"metadata": {
2323
"tags": []
2424
},
@@ -31,7 +31,7 @@
3131
{
3232
"cell_type": "code",
3333
"execution_count": null,
34-
"id": "a07bd2aa-de4a-46cc-a028-bd1a44c49537",
34+
"id": "2",
3535
"metadata": {},
3636
"outputs": [],
3737
"source": [
@@ -42,7 +42,7 @@
4242
{
4343
"cell_type": "code",
4444
"execution_count": null,
45-
"id": "f89ebfb5-3dee-4a71-a876-9fd16de2a431",
45+
"id": "3",
4646
"metadata": {
4747
"tags": []
4848
},

noxfile.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
import nox
1111

12-
1312
try:
1413
from nox_poetry import Session
1514
from nox_poetry import session

poetry.lock

Lines changed: 2301 additions & 1031 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ Changelog = "https://github.com/statisticsnorway/ssb-hermes/releases"
1616
[tool.poetry.dependencies]
1717
python = "^3.10"
1818
click = ">=8.0.1"
19-
fuzzywuzzy = "^0.18.0"
2019
pandas = "^2.1.4"
2120
numpy = "^1.26.2"
22-
python-levenshtein = "^0.23.0"
21+
thefuzz = "^0.22.1"
22+
ipykernel = "^7.2.0"
2323

2424
[tool.poetry.group.dev.dependencies]
2525
pygments = ">=2.10.0"

src/ssb_hermes/_find_match_rules.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
33
This Python file contains the three functions for address matching based on the three rules:
44
* Rule1: If there is only one unit at a location (postnr), then we use this unit.
5-
* Rule2: If there are multiple units we use fuzzywuzzy with 75% match.
5+
* Rule2: If there are multiple units we use thefuzz with 75% match.
66
* Rule3: If rule 1 and 2 did not work, we iterate up geographically.
77
"""
88

99
"""Importing packages"""
1010
from typing import Any
1111

1212
import pandas as pd # type: ignore
13-
from fuzzywuzzy import process # type: ignore
13+
from thefuzz import process # type: ignore
1414

1515
"""Importing other internal functions"""
1616
from ._functions import _check_for_value
@@ -46,7 +46,7 @@ def _find_match_rule2(
4646
tuple: item, rule
4747
"""
4848
try:
49-
item, match = process.extractOne(
49+
item, _match = process.extractOne(
5050
query=query, choices=choices, score_cutoff=score_cutoff
5151
)
5252
except (ValueError, AttributeError, TypeError):

src/ssb_hermes/_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Any
44

55
import pandas as pd # type: ignore
6-
from fuzzywuzzy import process # type: ignore
6+
from thefuzz import process # type: ignore
77

88

99
def _add_row(

src/ssb_hermes/functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def find_match(
102102
# Sjekker om det kun finnes en enhet på postnummeret i vof. Isåfall bruker jeg denne enheten
103103
if len(liste_registry) == 1:
104104
item, rule = _find_match_rule1(liste_registry)
105-
# Om det fins flere enheter på samme postnr bruker jeg fuzzywuzzy for å matche dem.
105+
# Om det fins flere enheter på samme postnr bruker jeg thefuzz for å matche dem.
106106
else:
107107
if _check_all_values_equal(
108108
df_registry_subset, registry_type_columns

0 commit comments

Comments
 (0)