Skip to content

Commit 9169444

Browse files
committed
update return types of processors
1 parent 426fbb2 commit 9169444

File tree

4 files changed

+28
-80
lines changed

4 files changed

+28
-80
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ pip install .
129129
```console
130130
> choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
131131
> process.extract("new york jets", choices, limit=2)
132-
[('new york jets', 100), ('new york giants', 78.57142639160156)]
132+
[('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
133133
> process.extractOne("cowboys", choices)
134-
("dallas cowboys", 90)
134+
("Dallas Cowboys", 90, 3)
135135
```
136136

137137
## License

docs/usage/process.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,17 @@ Find the best matches in a list of choices.
4343

4444
Returns:
4545

46-
- **matches**: *List[Tuple[str, float]] or List[Tuple[str, float, str]])*
46+
- **matches**: *List[Tuple[str, float, Any]]*
4747

4848
Returns a list of all matches that have a `score >= score_cutoff`. The list will
49-
be of either `(<choice>, <ratio>)` when `choices` is a list of strings
49+
contain tuples of the form `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings
5050
or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
5151

5252

5353
```console
5454
> choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
5555
> process.extract("new york jets", choices, limit=2)
56-
[('new york jets', 100), ('new york giants', 78.57142639160156)]
56+
[('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
5757
```
5858

5959
=== "C++"
@@ -62,7 +62,7 @@ Find the best matches in a list of choices.
6262
using rapidfuzz::process::extract;
6363

6464
// matches is a vector of std::pairs
65-
// [('new york jets', 100), ('new york giants', 78.57142639160156)]
65+
// [('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
6666
auto matches = extract(
6767
"new york jets",
6868
std::vector<std::string>{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"},
@@ -80,15 +80,15 @@ Finds the best match in a list of choices by comparing them using the provided s
8080

8181
Returns:
8282

83-
- **matches**: *Union[None, Tuple[str, float], Tuple[str, float, str]]*
83+
- **matches**: *Union[None, Tuple[str, float, Any]]*
8484

85-
Returns the best match the best match in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will be in the form`(<choice>, <ratio>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
85+
Returns the best match in the form of a tuple, or None when there is no match with a score >= score_cutoff. The tuple will be in the form `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
8686

8787

8888
```console
8989
> choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
9090
> process.extractOne("cowboys", choices)
91-
("dallas cowboys", 90)
91+
("Dallas Cowboys", 90, 3)
9292
```
9393

9494
=== "C++"
@@ -97,7 +97,7 @@ Finds the best match in a list of choices by comparing them using the provided s
9797
using rapidfuzz::process::extractOne;
9898

9999
// matches is a boost::optional<std::pair>
100-
// ("dallas cowboys", 90)
100+
// ("Dallas Cowboys", 90, 3)
101101
auto matches = extractOne(
102102
"cowboys",
103103
std::vector<std::string>{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"});

src/py_abstraction.cpp

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
/* SPDX-License-Identifier: MIT */
22
/* Copyright © 2020 Max Bachmann */
3-
/* Copyright © 2011 Adam Cohen */
43

54
#include "fuzz.hpp"
65
#include "py_utils.hpp"
@@ -639,9 +638,9 @@ std::unique_ptr<CachedFuzz> get_matching_instance(PyObject* scorer)
639638
static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
640639
PyObject* scorer, PyObject* processor, double score_cutoff)
641640
{
642-
bool match_found = false;
643641
PyObject* result_choice = NULL;
644642
PyObject* choice_key = NULL;
643+
Py_ssize_t result_index = -1;
645644
std::vector<PyObject*> outer_owner_list;
646645

647646
bool is_dict = false;
@@ -687,10 +686,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
687686
}
688687
outer_owner_list.push_back(choices);
689688

690-
std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
689+
Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);
691690

692-
693-
for (std::size_t i = 0; i < choice_count; ++i) {
691+
for (Py_ssize_t i = 0; i < choice_count; ++i) {
694692
PyObject* py_choice = NULL;
695693
PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);
696694

@@ -741,9 +739,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
741739
if (comp == 1) {
742740
Py_DecRef(py_score_cutoff);
743741
py_score_cutoff = score;
744-
match_found = true;
745742
result_choice = py_match_choice;
746743
choice_key = py_choice;
744+
result_index = i;
747745
} else if (comp == 0) {
748746
Py_DecRef(score);
749747
} else if (comp == -1) {
@@ -758,7 +756,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
758756

759757
free_owner_list(outer_owner_list);
760758

761-
if (!match_found) {
759+
if (result_index != -1) {
762760
Py_DecRef(py_score_cutoff);
763761
Py_RETURN_NONE;
764762
}
@@ -769,7 +767,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
769767

770768
PyObject* result = is_dict
771769
? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key)
772-
: Py_BuildValue("(OO)", result_choice, py_score_cutoff);
770+
: Py_BuildValue("(OOn)", result_choice, py_score_cutoff, result_index);
773771

774772
Py_DecRef(py_score_cutoff);
775773
return result;
@@ -793,17 +791,17 @@ constexpr const char* extractOne_docstring =
793791
"Returns:\n"
794792
" Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is\n"
795793
" no match with a score >= score_cutoff\n"
796-
" Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match\n"
794+
" Union[None, Tuple[str, float, Any]]: Returns the best match the best match\n"
797795
" in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n"
798-
" be in the form`(<choice>, <ratio>)` when `choices` is a list of strings\n"
796+
" be in the form`(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings\n"
799797
" or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.";
800798

801799
static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
802800
{
803-
bool match_found = false;
804801
PyObject* result_choice = NULL;
805802
PyObject* choice_key = NULL;
806803
double result_score;
804+
Py_ssize_t result_index = -1;
807805
std::vector<PyObject*> outer_owner_list;
808806
python_string query;
809807
bool is_dict = false;
@@ -856,9 +854,9 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds
856854
}
857855
outer_owner_list.push_back(choices);
858856

859-
std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
857+
Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);
860858

861-
for (std::size_t i = 0; i < choice_count; ++i) {
859+
for (Py_ssize_t i = 0; i < choice_count; ++i) {
862860
PyObject* py_choice = NULL;
863861
PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);
864862

@@ -889,23 +887,23 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds
889887
// increase the value by a small step so it might be able to exit early
890888
score_cutoff = score + (float)0.00001;
891889
result_score = score;
892-
match_found = true;
893890
result_choice = py_match_choice;
894891
choice_key = py_choice;
892+
result_index = i;
895893
}
896894
free_owner_list(inner_owner_list);
897895
}
898896

899897
free_owner_list(outer_owner_list);
900898

901-
if (!match_found) {
899+
if (result_index == -1) {
902900
Py_RETURN_NONE;
903901
}
904902

905903
if (is_dict) {
906904
return Py_BuildValue("(OdO)", result_choice, result_score, choice_key);
907905
} else {
908-
return Py_BuildValue("(Od)", result_choice, result_score);
906+
return Py_BuildValue("(Odn)", result_choice, result_score, result_index);
909907
}
910908
}
911909

src/rapidfuzz/process.py

Lines changed: 4 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_
2727
if score >= score_cutoff:
2828
yield (match_choice, score, choice)
2929
else:
30-
for choice in choices:
30+
for i, choice in enumerate(choices):
3131
if choice is None:
3232
continue
3333
b = processor(choice) if processor else choice
@@ -38,26 +38,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_
3838
score_cutoff=score_cutoff)
3939

4040
if score >= score_cutoff:
41-
yield (choice, score)
42-
43-
def iterExtractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0):
44-
if query is None:
45-
return
46-
47-
a = processor(query) if processor else query
48-
49-
for (i, choice) in enumerate(choices):
50-
if choice is None:
51-
continue
52-
b = processor(choice) if processor else choice
53-
score = scorer(
54-
a, b,
55-
processor=None,
56-
score_cutoff=score_cutoff)
57-
58-
if score >= score_cutoff:
59-
yield (i, score)
60-
41+
yield (choice, score, i)
6142

6243
def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
6344
"""
@@ -76,9 +57,9 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc
7657
a lower score than this number will not be returned. Defaults to 0
7758
7859
Returns:
79-
Union[List[Tuple[str, float]], List[Tuple[str, float, str]]]: Returns a
60+
Union[List[Tuple[str, float, Any]]]: Returns a
8061
list of all matches that have a `score >= score_cutoff`. The list will
81-
be of either `(<choice>, <ratio>)` when `choices` is a list of strings
62+
be of either `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings
8263
or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
8364
"""
8465
results = iterExtract(query, choices, scorer, processor, score_cutoff)
@@ -87,34 +68,3 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc
8768
return sorted(results, key=lambda x: x[1], reverse=True)
8869

8970
return heapq.nlargest(limit, results, key=lambda x: x[1])
90-
91-
92-
def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
93-
"""
94-
Find the best matches in a list of choices
95-
96-
Args:
97-
query (str): string we want to find
98-
choices (Iterable): list of all strings the query should be compared with
99-
scorer (Callable): optional callable that is used to calculate the matching score between
100-
the query and each choice. WRatio is used by default
101-
processor (Callable): optional callable that reformats the strings. utils.default_process
102-
is used by default, which lowercases the strings and trims whitespace
103-
limit (int): maximum amount of results to return
104-
score_cutoff (float): Optional argument for a score threshold. Matches with
105-
a lower score than this number will not be returned. Defaults to 0
106-
107-
Returns:
108-
List[Tuple[int, float]]: returns a list of all incides in the list that have a score >= score_cutoff
109-
110-
"""
111-
results = iterExtractIndices(query, choices, scorer, processor, score_cutoff)
112-
113-
if limit is None:
114-
return sorted(results, key=lambda x: x[1], reverse=True)
115-
116-
return heapq.nlargest(limit, results, key=lambda x: x[1])
117-
118-
119-
def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
120-
return extract(query, choices, scorer, processor, limit, score_cutoff)

0 commit comments

Comments
 (0)