Duplicate history cleanup (#483)

fopina · Ross Mechanic · commit 223a7bb9c0b3 · 2018-12-18T10:45:46.000-05:00
* added duplicate history cleanup command

* tests for clean_duplicate_history command

* make format

* coverage increase

* review suggestions implemented

* review suggestions implemented

* review suggestions implemented

* codeclimate

* merged common code
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -28,6 +28,7 @@ Authors
 - David Grochowski (`ThePumpingLemma <https://github.com/ThePumpingLemma>`_)
 - David Hite
 - Eduardo Cuducos
+- Filipe Pina (@fopina)
 - Florian Eßer
 - Frank Sachsenheim
 - George Vilches
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -12,6 +12,7 @@ Changes
 - Add german translations (gh-484)
 - Add `extra_context` parameter to history_form_view (gh-467)
 - Fixed bug that prevented `next_record` and `prev_record` to work with custom manager names (gh-501)
+- Added management command `clean_duplicate_history` to remove duplicate history entries (gh-483)
 
 2.5.1 (2018-10-19)
 ------------------
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -328,3 +328,25 @@ And to revert to that ``HistoricalPoll`` instance, we can do:
 This will change the ``poll`` instance to have the data from the
 ``HistoricalPoll`` object and it will create a new row in the
 ``HistoricalPoll`` table indicating that a new change has been made.
+
+Duplicate history entries
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For performance reasons, ``django-simple-history`` always creates an ``HistoricalRecord``
+when ``Model.save()`` is called regardless of data having actually changed.
+If you find yourself with a lot of history duplicates you can schedule the
+``clean_duplicate_history`` command
+
+.. code-block:: bash
+
+    $ python manage.py clean_duplicate_history --auto
+
+You can use ``--auto`` to clean up duplicates for every model
+with ``HistoricalRecords`` or enumerate specific models as args.
+There is also ``-m/--minutes`` to specify how many minutes to go
+back in history while searching (default checks whole history),
+so you can schedule, for instance, an hourly cronjob such as
+
+.. code-block:: bash
+
+    $ python manage.py clean_duplicate_history -m 60 --auto
diff --git a/simple_history/management/commands/clean_duplicate_history.py b/simple_history/management/commands/clean_duplicate_history.py
@@ -0,0 +1,110 @@
+from django.utils import timezone
+from django.db import transaction
+
+from . import populate_history
+from ... import models, utils
+from ...exceptions import NotHistoricalModelError
+
+
+class Command(populate_history.Command):
+    args = "<app.model app.model ...>"
+    help = (
+        "Scans HistoricalRecords for identical sequencial entries "
+        "(duplicates) in a model and deletes them."
+    )
+
+    DONE_CLEANING_FOR_MODEL = "Removed {count} historical records for {model}\n"
+
+    def add_arguments(self, parser):
+        parser.add_argument("models", nargs="*", type=str)
+        parser.add_argument(
+            "--auto",
+            action="store_true",
+            dest="auto",
+            default=False,
+            help="Automatically search for models with the HistoricalRecords field "
+            "type",
+        )
+        parser.add_argument(
+            "-d", "--dry", action="store_true", help="Dry (test) run only, no changes"
+        )
+        parser.add_argument(
+            "-m", "--minutes", type=int, help="Only search the last MINUTES of history"
+        )
+
+    def handle(self, *args, **options):
+        self.verbosity = options["verbosity"]
+
+        to_process = set()
+        model_strings = options.get("models", []) or args
+
+        if model_strings:
+            for model_pair in self._handle_model_list(*model_strings):
+                to_process.add(model_pair)
+
+        elif options["auto"]:
+            to_process = self._auto_models()
+
+        else:
+            self.log(self.COMMAND_HINT)
+
+        self._process(to_process, date_back=options["minutes"], dry_run=options["dry"])
+
+    def _process(self, to_process, date_back=None, dry_run=True):
+        if date_back:
+            stop_date = timezone.now() - timezone.timedelta(minutes=date_back)
+        else:
+            stop_date = None
+
+        for model, history_model in to_process:
+            m_qs = history_model.objects
+            if stop_date:
+                m_qs = m_qs.filter(history_date__gte=stop_date)
+            found = m_qs.count()
+            self.log("{0} has {1} historical entries".format(model, found), 2)
+            if not found:
+                continue
+
+            # it would be great if we could just iterate over the instances that
+            # have changes (in the given period) but
+            # `m_qs.values(model._meta.pk.name).distinct()`
+            # is actually slower than looping all and filtering in the code...
+            for o in model.objects.all():
+                self._process_instance(o, model, stop_date=stop_date, dry_run=dry_run)
+
+    def _process_instance(self, instance, model, stop_date=None, dry_run=True):
+        entries_deleted = 0
+        o_qs = instance.history.all()
+        if stop_date:
+            # to compare last history match
+            extra_one = o_qs.filter(history_date__lte=stop_date).first()
+            o_qs = o_qs.filter(history_date__gte=stop_date)
+        else:
+            extra_one = None
+        with transaction.atomic():
+            # ordering is ('-history_date', '-history_id') so this is ok
+            f1 = o_qs.first()
+            if not f1:
+                return
+
+            for f2 in o_qs[1:]:
+                entries_deleted += self._check_and_delete(f1, f2, dry_run)
+                f1 = f2
+            if extra_one:
+                entries_deleted += self._check_and_delete(f1, extra_one, dry_run)
+
+        self.log(
+            self.DONE_CLEANING_FOR_MODEL.format(model=model, count=entries_deleted)
+        )
+
+    def log(self, message, verbosity_level=1):
+        if self.verbosity >= verbosity_level:
+            self.stdout.write(message)
+
+    def _check_and_delete(self, entry1, entry2, dry_run=True):
+        delta = entry1.diff_against(entry2)
+        if not delta.changed_fields:
+            if not dry_run:
+                entry1.delete()
+            return 1
+        return 0
diff --git a/simple_history/management/commands/populate_history.py b/simple_history/management/commands/populate_history.py
@@ -55,22 +55,27 @@ def handle(self, *args, **options):
                 to_process.add(model_pair)
 
         elif options["auto"]:
-            for model in models.registered_models.values():
-                try:  # avoid issues with mutli-table inheritance
-                    history_model = utils.get_history_model_for_model(model)
-                except NotHistoricalModelError:
-                    continue
-                to_process.add((model, history_model))
-            if not to_process:
-                if self.verbosity >= 1:
-                    self.stdout.write(self.NO_REGISTERED_MODELS)
+            to_process = self._auto_models()
 
         else:
             if self.verbosity >= 1:
                 self.stdout.write(self.COMMAND_HINT)
 
         self._process(to_process, batch_size=options["batchsize"])
 
+    def _auto_models(self):
+        to_process = set()
+        for model in models.registered_models.values():
+            try:  # avoid issues with multi-table inheritance
+                history_model = utils.get_history_model_for_model(model)
+            except NotHistoricalModelError:
+                continue
+            to_process.add((model, history_model))
+        if not to_process:
+            if self.verbosity >= 1:
+                self.stdout.write(self.NO_REGISTERED_MODELS)
+        return to_process
+
     def _handle_model_list(self, *args):
         failing = False
         for natural_key in args:
diff --git a/simple_history/tests/tests/test_commands.py b/simple_history/tests/tests/test_commands.py
@@ -1,13 +1,13 @@
 from contextlib import contextmanager
-from datetime import datetime
+from datetime import datetime, timedelta
 
 from django.core import management
 from django.test import TestCase
 from six.moves import cStringIO as StringIO
 
 from simple_history import models as sh_models
-from simple_history.management.commands import populate_history
-from ..models import Book, Poll, PollWithExcludeFields, Restaurant
+from simple_history.management.commands import populate_history, clean_duplicate_history
+from ..models import Book, Poll, PollWithExcludeFields, Restaurant, Place
 
 
 @contextmanager
@@ -118,7 +118,7 @@ def test_existing_objects(self):
 
     def test_no_historical(self):
         out = StringIO()
-        with replace_registry():
+        with replace_registry({"test_place": Place}):
             management.call_command(self.command_name, auto=True, stdout=out)
         self.assertIn(populate_history.Command.NO_REGISTERED_MODELS, out.getvalue())
 
@@ -180,3 +180,178 @@ def test_excluded_fields(self):
         )
         initial_history_record = PollWithExcludeFields.history.all()[0]
         self.assertEqual(initial_history_record.question, poll.question)
+
+
+class TestCleanDuplicateHistory(TestCase):
+    command_name = "clean_duplicate_history"
+    command_error = (management.CommandError, SystemExit)
+
+    def test_no_args(self):
+        out = StringIO()
+        management.call_command(self.command_name, stdout=out, stderr=StringIO())
+        self.assertIn(clean_duplicate_history.Command.COMMAND_HINT, out.getvalue())
+
+    def test_bad_args(self):
+        test_data = (
+            (clean_duplicate_history.Command.MODEL_NOT_HISTORICAL, ("tests.place",)),
+            (clean_duplicate_history.Command.MODEL_NOT_FOUND, ("invalid.model",)),
+            (clean_duplicate_history.Command.MODEL_NOT_FOUND, ("bad_key",)),
+        )
+        for msg, args in test_data:
+            out = StringIO()
+            self.assertRaises(
+                self.command_error,
+                management.call_command,
+                self.command_name,
+                *args,
+                stdout=StringIO(),
+                stderr=out
+            )
+            self.assertIn(msg, out.getvalue())
+
+    def test_no_historical(self):
+        out = StringIO()
+        with replace_registry({"test_place": Place}):
+            management.call_command(self.command_name, auto=True, stdout=out)
+        self.assertIn(
+            clean_duplicate_history.Command.NO_REGISTERED_MODELS, out.getvalue()
+        )
+
+    def test_auto_dry_run(self):
+        p = Poll.objects.create(
+            question="Will this be deleted?", pub_date=datetime.now()
+        )
+        p.save()
+
+        # not related to dry_run test, just for increasing coverage :)
+        # create instance with single-entry history older than "minutes"
+        # so it is skipped
+        p = Poll.objects.create(
+            question="Will this be deleted?", pub_date=datetime.now()
+        )
+        h = p.history.first()
+        h.history_date -= timedelta(hours=1)
+        h.save()
+
+        self.assertEqual(Poll.history.all().count(), 3)
+        out = StringIO()
+        management.call_command(
+            self.command_name,
+            auto=True,
+            minutes=50,
+            dry=True,
+            stdout=out,
+            stderr=StringIO(),
+        )
+        self.assertEqual(
+            out.getvalue(),
+            "Removed 1 historical records for "
+            "<class 'simple_history.tests.models.Poll'>\n",
+        )
+        self.assertEqual(Poll.history.all().count(), 3)
+
+    def test_auto_cleanup(self):
+        p = Poll.objects.create(
+            question="Will this be deleted?", pub_date=datetime.now()
+        )
+        self.assertEqual(Poll.history.all().count(), 1)
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 2)
+        p.question = "Maybe this one won't...?"
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 3)
+        out = StringIO()
+        management.call_command(
+            self.command_name, auto=True, stdout=out, stderr=StringIO()
+        )
+        self.assertEqual(
+            out.getvalue(),
+            "Removed 1 historical records for "
+            "<class 'simple_history.tests.models.Poll'>\n",
+        )
+        self.assertEqual(Poll.history.all().count(), 2)
+
+    def test_auto_cleanup_verbose(self):
+        p = Poll.objects.create(
+            question="Will this be deleted?", pub_date=datetime.now()
+        )
+        self.assertEqual(Poll.history.all().count(), 1)
+        p.save()
+        p.question = "Maybe this one won't...?"
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 3)
+        out = StringIO()
+        management.call_command(
+            self.command_name,
+            "tests.poll",
+            auto=True,
+            verbosity=2,
+            stdout=out,
+            stderr=StringIO(),
+        )
+        self.assertEqual(
+            out.getvalue(),
+            "<class 'simple_history.tests.models.Poll'> has 3 historical entries\n"
+            "Removed 1 historical records for "
+            "<class 'simple_history.tests.models.Poll'>\n",
+        )
+        self.assertEqual(Poll.history.all().count(), 2)
+
+    def test_auto_cleanup_dated(self):
+        the_time_is_now = datetime.now()
+        p = Poll.objects.create(
+            question="Will this be deleted?", pub_date=the_time_is_now
+        )
+        self.assertEqual(Poll.history.all().count(), 1)
+        p.save()
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 3)
+        p.question = "Or this one...?"
+        p.save()
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 5)
+
+        for h in Poll.history.all()[2:]:
+            h.history_date -= timedelta(hours=1)
+            h.save()
+
+        management.call_command(
+            self.command_name,
+            auto=True,
+            minutes=50,
+            stdout=StringIO(),
+            stderr=StringIO(),
+        )
+        self.assertEqual(Poll.history.all().count(), 4)
+
+    def test_auto_cleanup_dated_extra_one(self):
+        the_time_is_now = datetime.now()
+        p = Poll.objects.create(
+            question="Will this be deleted?", pub_date=the_time_is_now
+        )
+        self.assertEqual(Poll.history.all().count(), 1)
+        p.save()
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 3)
+        p.question = "Or this one...?"
+        p.save()
+        p.save()
+        p.save()
+        p.save()
+        self.assertEqual(Poll.history.all().count(), 7)
+
+        for h in Poll.history.all()[2:]:
+            h.history_date -= timedelta(hours=1)
+            h.save()
+
+        management.call_command(
+            self.command_name,
+            auto=True,
+            minutes=50,
+            stdout=StringIO(),
+            stderr=StringIO(),
+        )
+        # even though only the last 2 entries match the date range
+        # the "extra_one" (the record before the oldest match)
+        # is identical to the oldest match, so oldest match is deleted
+        self.assertEqual(Poll.history.all().count(), 5)