Skip to content

Commit 223a7bb

Browse files
fopinaRoss Mechanic
authored andcommitted
Duplicate history cleanup (#483)
* added duplicate history cleanup command * tests for clean_duplicate_history command * make format * coverage increase * review suggestions implemented * review suggestions implemented * review suggestions implemented * codeclimate * merged common code
1 parent cbb0ab4 commit 223a7bb

File tree

6 files changed

+327
-13
lines changed

6 files changed

+327
-13
lines changed

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Authors
2828
- David Grochowski (`ThePumpingLemma <https://github.com/ThePumpingLemma>`_)
2929
- David Hite
3030
- Eduardo Cuducos
31+
- Filipe Pina (@fopina)
3132
- Florian Eßer
3233
- Frank Sachsenheim
3334
- George Vilches

CHANGES.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Changes
1212
- Add german translations (gh-484)
1313
- Add `extra_context` parameter to history_form_view (gh-467)
1414
- Fixed bug that prevented `next_record` and `prev_record` to work with custom manager names (gh-501)
15+
- Added management command `clean_duplicate_history` to remove duplicate history entries (gh-483)
1516

1617
2.5.1 (2018-10-19)
1718
------------------

docs/usage.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,3 +328,25 @@ And to revert to that ``HistoricalPoll`` instance, we can do:
328328
This will change the ``poll`` instance to have the data from the
329329
``HistoricalPoll`` object and it will create a new row in the
330330
``HistoricalPoll`` table indicating that a new change has been made.
331+
332+
Duplicate history entries
333+
~~~~~~~~~~~~~~~~~~~~~~~~~
334+
335+
For performance reasons, ``django-simple-history`` always creates an ``HistoricalRecord``
336+
when ``Model.save()`` is called regardless of data having actually changed.
337+
If you find yourself with a lot of history duplicates you can schedule the
338+
``clean_duplicate_history`` command
339+
340+
.. code-block:: bash
341+
342+
$ python manage.py clean_duplicate_history --auto
343+
344+
You can use ``--auto`` to clean up duplicates for every model
345+
with ``HistoricalRecords`` or enumerate specific models as args.
346+
There is also ``-m/--minutes`` to specify how many minutes to go
347+
back in history while searching (default checks whole history),
348+
so you can schedule, for instance, an hourly cronjob such as
349+
350+
.. code-block:: bash
351+
352+
$ python manage.py clean_duplicate_history -m 60 --auto
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
from django.utils import timezone
2+
from django.db import transaction
3+
4+
from . import populate_history
5+
from ... import models, utils
6+
from ...exceptions import NotHistoricalModelError
7+
8+
9+
class Command(populate_history.Command):
10+
args = "<app.model app.model ...>"
11+
help = (
12+
"Scans HistoricalRecords for identical sequencial entries "
13+
"(duplicates) in a model and deletes them."
14+
)
15+
16+
DONE_CLEANING_FOR_MODEL = "Removed {count} historical records for {model}\n"
17+
18+
def add_arguments(self, parser):
19+
parser.add_argument("models", nargs="*", type=str)
20+
parser.add_argument(
21+
"--auto",
22+
action="store_true",
23+
dest="auto",
24+
default=False,
25+
help="Automatically search for models with the HistoricalRecords field "
26+
"type",
27+
)
28+
parser.add_argument(
29+
"-d", "--dry", action="store_true", help="Dry (test) run only, no changes"
30+
)
31+
parser.add_argument(
32+
"-m", "--minutes", type=int, help="Only search the last MINUTES of history"
33+
)
34+
35+
def handle(self, *args, **options):
36+
self.verbosity = options["verbosity"]
37+
38+
to_process = set()
39+
model_strings = options.get("models", []) or args
40+
41+
if model_strings:
42+
for model_pair in self._handle_model_list(*model_strings):
43+
to_process.add(model_pair)
44+
45+
elif options["auto"]:
46+
to_process = self._auto_models()
47+
48+
else:
49+
self.log(self.COMMAND_HINT)
50+
51+
self._process(to_process, date_back=options["minutes"], dry_run=options["dry"])
52+
53+
def _process(self, to_process, date_back=None, dry_run=True):
54+
if date_back:
55+
stop_date = timezone.now() - timezone.timedelta(minutes=date_back)
56+
else:
57+
stop_date = None
58+
59+
for model, history_model in to_process:
60+
m_qs = history_model.objects
61+
if stop_date:
62+
m_qs = m_qs.filter(history_date__gte=stop_date)
63+
found = m_qs.count()
64+
self.log("{0} has {1} historical entries".format(model, found), 2)
65+
if not found:
66+
continue
67+
68+
# it would be great if we could just iterate over the instances that
69+
# have changes (in the given period) but
70+
# `m_qs.values(model._meta.pk.name).distinct()`
71+
# is actually slower than looping all and filtering in the code...
72+
for o in model.objects.all():
73+
self._process_instance(o, model, stop_date=stop_date, dry_run=dry_run)
74+
75+
def _process_instance(self, instance, model, stop_date=None, dry_run=True):
76+
entries_deleted = 0
77+
o_qs = instance.history.all()
78+
if stop_date:
79+
# to compare last history match
80+
extra_one = o_qs.filter(history_date__lte=stop_date).first()
81+
o_qs = o_qs.filter(history_date__gte=stop_date)
82+
else:
83+
extra_one = None
84+
with transaction.atomic():
85+
# ordering is ('-history_date', '-history_id') so this is ok
86+
f1 = o_qs.first()
87+
if not f1:
88+
return
89+
90+
for f2 in o_qs[1:]:
91+
entries_deleted += self._check_and_delete(f1, f2, dry_run)
92+
f1 = f2
93+
if extra_one:
94+
entries_deleted += self._check_and_delete(f1, extra_one, dry_run)
95+
96+
self.log(
97+
self.DONE_CLEANING_FOR_MODEL.format(model=model, count=entries_deleted)
98+
)
99+
100+
def log(self, message, verbosity_level=1):
101+
if self.verbosity >= verbosity_level:
102+
self.stdout.write(message)
103+
104+
def _check_and_delete(self, entry1, entry2, dry_run=True):
105+
delta = entry1.diff_against(entry2)
106+
if not delta.changed_fields:
107+
if not dry_run:
108+
entry1.delete()
109+
return 1
110+
return 0

simple_history/management/commands/populate_history.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,22 +55,27 @@ def handle(self, *args, **options):
5555
to_process.add(model_pair)
5656

5757
elif options["auto"]:
58-
for model in models.registered_models.values():
59-
try: # avoid issues with mutli-table inheritance
60-
history_model = utils.get_history_model_for_model(model)
61-
except NotHistoricalModelError:
62-
continue
63-
to_process.add((model, history_model))
64-
if not to_process:
65-
if self.verbosity >= 1:
66-
self.stdout.write(self.NO_REGISTERED_MODELS)
58+
to_process = self._auto_models()
6759

6860
else:
6961
if self.verbosity >= 1:
7062
self.stdout.write(self.COMMAND_HINT)
7163

7264
self._process(to_process, batch_size=options["batchsize"])
7365

66+
def _auto_models(self):
67+
to_process = set()
68+
for model in models.registered_models.values():
69+
try: # avoid issues with multi-table inheritance
70+
history_model = utils.get_history_model_for_model(model)
71+
except NotHistoricalModelError:
72+
continue
73+
to_process.add((model, history_model))
74+
if not to_process:
75+
if self.verbosity >= 1:
76+
self.stdout.write(self.NO_REGISTERED_MODELS)
77+
return to_process
78+
7479
def _handle_model_list(self, *args):
7580
failing = False
7681
for natural_key in args:

simple_history/tests/tests/test_commands.py

Lines changed: 179 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from contextlib import contextmanager
2-
from datetime import datetime
2+
from datetime import datetime, timedelta
33

44
from django.core import management
55
from django.test import TestCase
66
from six.moves import cStringIO as StringIO
77

88
from simple_history import models as sh_models
9-
from simple_history.management.commands import populate_history
10-
from ..models import Book, Poll, PollWithExcludeFields, Restaurant
9+
from simple_history.management.commands import populate_history, clean_duplicate_history
10+
from ..models import Book, Poll, PollWithExcludeFields, Restaurant, Place
1111

1212

1313
@contextmanager
@@ -118,7 +118,7 @@ def test_existing_objects(self):
118118

119119
def test_no_historical(self):
120120
out = StringIO()
121-
with replace_registry():
121+
with replace_registry({"test_place": Place}):
122122
management.call_command(self.command_name, auto=True, stdout=out)
123123
self.assertIn(populate_history.Command.NO_REGISTERED_MODELS, out.getvalue())
124124

@@ -180,3 +180,178 @@ def test_excluded_fields(self):
180180
)
181181
initial_history_record = PollWithExcludeFields.history.all()[0]
182182
self.assertEqual(initial_history_record.question, poll.question)
183+
184+
185+
class TestCleanDuplicateHistory(TestCase):
186+
command_name = "clean_duplicate_history"
187+
command_error = (management.CommandError, SystemExit)
188+
189+
def test_no_args(self):
190+
out = StringIO()
191+
management.call_command(self.command_name, stdout=out, stderr=StringIO())
192+
self.assertIn(clean_duplicate_history.Command.COMMAND_HINT, out.getvalue())
193+
194+
def test_bad_args(self):
195+
test_data = (
196+
(clean_duplicate_history.Command.MODEL_NOT_HISTORICAL, ("tests.place",)),
197+
(clean_duplicate_history.Command.MODEL_NOT_FOUND, ("invalid.model",)),
198+
(clean_duplicate_history.Command.MODEL_NOT_FOUND, ("bad_key",)),
199+
)
200+
for msg, args in test_data:
201+
out = StringIO()
202+
self.assertRaises(
203+
self.command_error,
204+
management.call_command,
205+
self.command_name,
206+
*args,
207+
stdout=StringIO(),
208+
stderr=out
209+
)
210+
self.assertIn(msg, out.getvalue())
211+
212+
def test_no_historical(self):
213+
out = StringIO()
214+
with replace_registry({"test_place": Place}):
215+
management.call_command(self.command_name, auto=True, stdout=out)
216+
self.assertIn(
217+
clean_duplicate_history.Command.NO_REGISTERED_MODELS, out.getvalue()
218+
)
219+
220+
def test_auto_dry_run(self):
221+
p = Poll.objects.create(
222+
question="Will this be deleted?", pub_date=datetime.now()
223+
)
224+
p.save()
225+
226+
# not related to dry_run test, just for increasing coverage :)
227+
# create instance with single-entry history older than "minutes"
228+
# so it is skipped
229+
p = Poll.objects.create(
230+
question="Will this be deleted?", pub_date=datetime.now()
231+
)
232+
h = p.history.first()
233+
h.history_date -= timedelta(hours=1)
234+
h.save()
235+
236+
self.assertEqual(Poll.history.all().count(), 3)
237+
out = StringIO()
238+
management.call_command(
239+
self.command_name,
240+
auto=True,
241+
minutes=50,
242+
dry=True,
243+
stdout=out,
244+
stderr=StringIO(),
245+
)
246+
self.assertEqual(
247+
out.getvalue(),
248+
"Removed 1 historical records for "
249+
"<class 'simple_history.tests.models.Poll'>\n",
250+
)
251+
self.assertEqual(Poll.history.all().count(), 3)
252+
253+
def test_auto_cleanup(self):
254+
p = Poll.objects.create(
255+
question="Will this be deleted?", pub_date=datetime.now()
256+
)
257+
self.assertEqual(Poll.history.all().count(), 1)
258+
p.save()
259+
self.assertEqual(Poll.history.all().count(), 2)
260+
p.question = "Maybe this one won't...?"
261+
p.save()
262+
self.assertEqual(Poll.history.all().count(), 3)
263+
out = StringIO()
264+
management.call_command(
265+
self.command_name, auto=True, stdout=out, stderr=StringIO()
266+
)
267+
self.assertEqual(
268+
out.getvalue(),
269+
"Removed 1 historical records for "
270+
"<class 'simple_history.tests.models.Poll'>\n",
271+
)
272+
self.assertEqual(Poll.history.all().count(), 2)
273+
274+
def test_auto_cleanup_verbose(self):
275+
p = Poll.objects.create(
276+
question="Will this be deleted?", pub_date=datetime.now()
277+
)
278+
self.assertEqual(Poll.history.all().count(), 1)
279+
p.save()
280+
p.question = "Maybe this one won't...?"
281+
p.save()
282+
self.assertEqual(Poll.history.all().count(), 3)
283+
out = StringIO()
284+
management.call_command(
285+
self.command_name,
286+
"tests.poll",
287+
auto=True,
288+
verbosity=2,
289+
stdout=out,
290+
stderr=StringIO(),
291+
)
292+
self.assertEqual(
293+
out.getvalue(),
294+
"<class 'simple_history.tests.models.Poll'> has 3 historical entries\n"
295+
"Removed 1 historical records for "
296+
"<class 'simple_history.tests.models.Poll'>\n",
297+
)
298+
self.assertEqual(Poll.history.all().count(), 2)
299+
300+
def test_auto_cleanup_dated(self):
301+
the_time_is_now = datetime.now()
302+
p = Poll.objects.create(
303+
question="Will this be deleted?", pub_date=the_time_is_now
304+
)
305+
self.assertEqual(Poll.history.all().count(), 1)
306+
p.save()
307+
p.save()
308+
self.assertEqual(Poll.history.all().count(), 3)
309+
p.question = "Or this one...?"
310+
p.save()
311+
p.save()
312+
self.assertEqual(Poll.history.all().count(), 5)
313+
314+
for h in Poll.history.all()[2:]:
315+
h.history_date -= timedelta(hours=1)
316+
h.save()
317+
318+
management.call_command(
319+
self.command_name,
320+
auto=True,
321+
minutes=50,
322+
stdout=StringIO(),
323+
stderr=StringIO(),
324+
)
325+
self.assertEqual(Poll.history.all().count(), 4)
326+
327+
def test_auto_cleanup_dated_extra_one(self):
328+
the_time_is_now = datetime.now()
329+
p = Poll.objects.create(
330+
question="Will this be deleted?", pub_date=the_time_is_now
331+
)
332+
self.assertEqual(Poll.history.all().count(), 1)
333+
p.save()
334+
p.save()
335+
self.assertEqual(Poll.history.all().count(), 3)
336+
p.question = "Or this one...?"
337+
p.save()
338+
p.save()
339+
p.save()
340+
p.save()
341+
self.assertEqual(Poll.history.all().count(), 7)
342+
343+
for h in Poll.history.all()[2:]:
344+
h.history_date -= timedelta(hours=1)
345+
h.save()
346+
347+
management.call_command(
348+
self.command_name,
349+
auto=True,
350+
minutes=50,
351+
stdout=StringIO(),
352+
stderr=StringIO(),
353+
)
354+
# even though only the last 2 entries match the date range
355+
# the "extra_one" (the record before the oldest match)
356+
# is identical to the oldest match, so oldest match is deleted
357+
self.assertEqual(Poll.history.all().count(), 5)

0 commit comments

Comments
 (0)