fix incorrect ref counting

maxbachmann · maxbachmann · commit c6eebb70a55b · 2021-03-03T16:08:42.000+01:00
diff --git a/.github/workflows/pythonbuild.yml b/.github/workflows/pythonbuild.yml
@@ -43,7 +43,7 @@ jobs:
       - name: Run Unit Tests
         run: |
           pip install .
-          pip install pytest hypothesis
+          pip install pytest hypothesis pandas
           pytest
 
 
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.1.1
+1.1.2
diff --git a/setup.py b/setup.py
@@ -38,8 +38,8 @@ def build_extensions(self):
         elif ct == 'msvc':
             opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
         for ext in self.extensions:
-            ext.extra_compile_args = opts
-            ext.extra_link_args = link_opts
+            ext.extra_compile_args += opts
+            ext.extra_link_args += link_opts
         build_ext.build_extensions(self)
 
 setup(
diff --git a/src/cpp_process.cpp b/src/cpp_process.cpp
diff --git a/src/cpp_process.pyx b/src/cpp_process.pyx
@@ -11,6 +11,7 @@ from cpython.list cimport PyList_New
 from cpython.list cimport PyList_SET_ITEM
 from cpython.object cimport PyObject
 from cpython.ref cimport Py_INCREF
+from cpython.ref cimport Py_DECREF
 
 import heapq
 
@@ -358,6 +359,10 @@ cdef inline extract_dict(scorer_context context, choices, processor, size_t limi
             score = context.scorer(context.context, choice, score_cutoff)
 
             if score >= score_cutoff:
+                # the key object in particular may be created on the fly (e.g. by a pandas.DataFrame),
+                # so hold a reference to each object to keep Python from deallocating it
+                Py_INCREF(choice)
+                Py_INCREF(choice_key)
                 results.push_back(DictMatchElem(score, i, <PyObject*>choice, <PyObject*>choice_key))
             index += 1
 
@@ -379,10 +384,15 @@ cdef inline extract_dict(scorer_context context, choices, processor, size_t limi
         # https://stackoverflow.com/questions/43553763/cythonize-list-of-all-splits-of-a-string/43557675#43557675
         PyList_SET_ITEM(result_list, i,
             <object>Py_BuildValue("OdO",
-                <PyObject*>choices[<object>results[i].key],
+                <PyObject*>results[i].choice,
                 results[i].score,
                 <PyObject*>results[i].key))
 
+    # release the references that were taken when the matches were collected
+    for i in range(results.size()):
+        Py_DECREF(<object>results[i].choice)
+        Py_DECREF(<object>results[i].key)
+
     return result_list
 
 
@@ -393,7 +403,7 @@ cdef inline extract_list(scorer_context context, choices, processor, size_t limi
     # todo possibly a smaller vector would be good to reduce memory usage
     cdef vector[ListMatchElem] results
     results.reserve(<size_t>len(choices))
-    cdef object result_list
+    cdef list result_list
 
     if processor is not None:
         for choice in choices:
@@ -751,4 +761,3 @@ def extract_iter(query, choices, scorer=fuzz.WRatio, processor=utils.default_pro
                     if py_score >= score_cutoff:
                         yield(choice, py_score, index)
                     index += 1
-        
diff --git a/src/rapidfuzz-cpp b/src/rapidfuzz-cpp
@@ -1 +1 @@
-Subproject commit ea6f17dd4d3af1f15f46ff608da7cfa28625ed5a
+Subproject commit 91f20cd9930e620c7c250381bcca640570480dbd
diff --git a/src/rapidfuzz/__init__.py b/src/rapidfuzz/__init__.py
@@ -3,6 +3,6 @@
 """
 __author__ = "Max Bachmann"
 __license__ = "MIT"
-__version__ = "1.1.1"
+__version__ = "1.1.2"
 
 from rapidfuzz import process, fuzz, utils, levenshtein, string_metric
diff --git a/tests/test_process.py b/tests/test_process.py
@@ -5,6 +5,7 @@
 import pytest
 
 from rapidfuzz import process, fuzz, utils
+import pandas as pd
 
 class ProcessTest(unittest.TestCase):
     def setUp(self):
@@ -187,6 +188,12 @@ def testNullStrings(self):
         best = process.extractOne(query, choices)
         self.assertEqual(best[0], choices[1])
 
+    def testIssue81(self):
+        # mostly checks that extract does not segfault due to incorrect ref counting
+        choices = pd.Series(['test color brightness', 'test lemon', 'test lavender'], index=[67478, 67479, 67480])
+        matches = process.extract("test", choices)
+        assert matches == [('test color brightness', 90.0, 67478), ('test lemon', 90.0, 67479), ('test lavender', 90.0, 67480)]
+
 
 def custom_scorer(s1, s2, processor=None, score_cutoff=0):
     return fuzz.ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)