From 6fab426f7d6b09775dfb998c62295d2850c9f0bb Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 1 Jun 2022 23:35:16 -0400 Subject: [PATCH 1/2] random forest --- dedupe/api.py | 6 +++--- dedupe/labeler.py | 10 +++++----- tests/test_labeler.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 398d05200..1c6bf322a 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -15,7 +15,7 @@ import tempfile import numpy -import sklearn.linear_model +import sklearn.ensemble import sklearn.model_selection import dedupe.core as core @@ -1049,8 +1049,8 @@ class ActiveMatching(Matching): """ classifier = sklearn.model_selection.GridSearchCV( - estimator=sklearn.linear_model.LogisticRegression(), - param_grid={"C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}, + estimator=sklearn.ensemble.RandomForestClassifier(), + param_grid={"n_estimators": [100, 200, 400, 800, 1600]}, scoring="f1", n_jobs=-1, ) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index c95e08aca..d99b85e11 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -5,7 +5,7 @@ import numpy from typing import List from typing_extensions import Protocol -import sklearn.linear_model +import sklearn.ensemble import dedupe.core as core import dedupe.training as training @@ -38,7 +38,7 @@ class HasDataModel(Protocol): data_model: datamodel.DataModel -class RLRLearner(sklearn.linear_model.LogisticRegression, ActiveLearner): +class RFLearner(sklearn.ensemble.RandomForestClassifier, ActiveLearner): def __init__(self, data_model): super().__init__() self.data_model = data_model @@ -304,7 +304,7 @@ def _sample(self, data_1, data_2, sample_size): class DisagreementLearner(ActiveLearner): - classifier: RLRLearner + classifier: RFLearner blocker: BlockLearner candidates: List[TrainingExample] @@ -409,7 +409,7 @@ def __init__( self.candidates = self.blocker.candidates - self.classifier = RLRLearner(self.data_model) + self.classifier = RFLearner(self.data_model) self.classifier.candidates = self.candidates self._common_init() @@ -441,7 +441,7 @@ def __init__( self.blocker = RecordLinkBlockLearner(data_model, data_1, data_2, index_include) self.candidates = self.blocker.candidates - self.classifier = RLRLearner(self.data_model) + self.classifier = RFLearner(self.data_model) self.classifier.candidates = self.candidates self._common_init() diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 48293ab3a..1bd4e8e4b 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -20,7 +20,7 @@ def setUp(self): def test_AL(self): random.seed(1111111111110) original_N = len(SAMPLE) - active_learner = dedupe.labeler.RLRLearner(self.data_model) + active_learner = dedupe.labeler.RFLearner(self.data_model) active_learner.candidates = SAMPLE assert len(active_learner) == original_N From 574aa84cf4b7f12e50dfe9099b579cbc54059a23 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 2 Jun 2022 00:42:38 -0400 Subject: [PATCH 2/2] fewer elements in grid search --- dedupe/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dedupe/api.py b/dedupe/api.py index 28199b9f8..cca694f97 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -1092,7 +1092,7 @@ def __init__( ] self.classifier = sklearn.model_selection.GridSearchCV( estimator=sklearn.ensemble.RandomForestClassifier(), - param_grid={"n_estimators": [100, 200, 400, 800, 1600]}, + param_grid={"n_estimators": [100, 200, 400, 800]}, scoring="f1", n_jobs=-1, )