Skip to content

Commit 1a07924

Browse files
authored
Merge pull request #865 from aaxelb/fix/chunky-delete
[ENG-7922] chunky delete_pretrove_data (and --chunksize param)
2 parents 9bfea80 + 3f158c6 commit 1a07924

File tree

1 file changed

+46
-12
lines changed

1 file changed

+46
-12
lines changed

share/management/commands/delete_pretrove_data.py

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77

88
class Command(BaseShareCommand):
99
def add_arguments(self, parser):
    """Register this command's command-line options on `parser`.

    Options added:
        --chunksize: number of RawData per DELETE (default 1024).
            Must be a positive integer; zero and negative values are
            rejected while parsing, because the chunked delete slices a
            queryset with `chunksize - 1` and Django querysets do not
            support negative indexing.
        --really-really: flag; skip the final confirmation prompt before
            really deleting.
    """
    # local import so this method does not depend on the module's import block
    import argparse

    def _positive_int(value):
        # a non-numeric value raises ValueError from `int`, which argparse
        # reports as an invalid argument value
        _number = int(value)
        if _number < 1:
            raise argparse.ArgumentTypeError(f'must be a positive integer (got {_number})')
        return _number

    parser.add_argument('--chunksize', type=_positive_int, default=1024, help='number of RawData per DELETE')
    parser.add_argument('--really-really', action='store_true', help='skip final confirmation prompt before really deleting')
1112

12-
def handle(self, *args, really_really: bool, **kwargs):
13+
def handle(self, *args, chunksize: int, really_really: bool, **kwargs):
1314
# note: `share.transform` deleted; `transformer_key` always null for trove-ingested rdf
1415
_pretrove_configs = _db.SourceConfig.objects.filter(transformer_key__isnull=False)
1516
_pretrove_configs_with_rawdata = (
@@ -23,17 +24,50 @@ def handle(self, *args, really_really: bool, **kwargs):
2324
if not _pretrove_configs_with_rawdata.exists():
2425
self.stdout.write(self.style.SUCCESS(_('nothing to delete')))
2526
return
27+
_sourceconfig_ids_and_labels = list(
28+
_pretrove_configs_with_rawdata.values_list('id', 'label'),
29+
)
2630
self.stdout.write(self.style.WARNING(_('pre-trove source-configs with deletable rawdata:')))
27-
for _label in _pretrove_configs_with_rawdata.values_list('label', flat=True):
28-
self.stdout.write(f'\t{_label}')
31+
for __, _sourceconfig_label in _sourceconfig_ids_and_labels:
32+
self.stdout.write(f'\t{_sourceconfig_label}')
2933
if really_really or self.input_confirm(self.style.WARNING(_('really DELETE ALL raw metadata records belonging to these source-configs? (y/n)'))):
30-
self.stdout.write(_('deleting...'))
31-
_rawdata_to_delete = (
32-
_db.RawDatum.objects
33-
.filter(suid__source_config_id__in=_pretrove_configs)
34-
)
35-
_deleted_total, _deleted_counts = _rawdata_to_delete.delete()
36-
for _name, _count in _deleted_counts.items():
37-
self.stdout.write(self.style.SUCCESS(f'{_name}: deleted {_count}'))
34+
_total_deleted = 0
35+
for _sourceconfig_id, _sourceconfig_label in _sourceconfig_ids_and_labels:
36+
_total_deleted += self._do_delete_rawdata(_sourceconfig_id, _sourceconfig_label, chunksize)
37+
self.stdout.write(self.style.SUCCESS(_('deleted %(count)s items') % {'count': _total_deleted}))
3838
else:
39-
self.stdout.write(self.style.SUCCESS('deleted nothing'))
39+
self.stdout.write(self.style.SUCCESS(_('deleted nothing')))
40+
41+
def _do_delete_rawdata(self, sourceconfig_id, sourceconfig_label, chunksize) -> int:
    """Delete all RawDatum rows for one source-config, in chunks.

    Args:
        sourceconfig_id: id of the source-config whose rawdata is deleted
            (matched via `suid__source_config_id`).
        sourceconfig_label: label used only in progress messages.
        chunksize: maximum number of RawDatum rows per DELETE; must be >= 1.

    Returns:
        The sum of the per-chunk counts reported by `QuerySet.delete()`.

    Raises:
        ValueError: if `chunksize` is less than 1 (the chunk boundary is
            found by slicing at `chunksize - 1`, which must not be negative).
        RuntimeError: if successive sentinel pks are not strictly ascending,
            or if a delete reports models other than `share.RawDatum`.
            Note the second check runs after that chunk's delete was issued.
    """
    if chunksize < 1:
        raise ValueError(f'chunksize must be a positive integer (got {chunksize})')
    # note: `.delete()` cannot be called on sliced querysets, so chunking is more complicated
    # -- before deleting each chunk, query for its last pk to filter on as a sentinel value
    _prior_sentinel_pk = None
    _deleted_count = 0
    _rawdata_qs = (
        _db.RawDatum.objects
        .filter(suid__source_config_id=sourceconfig_id)
        .order_by('pk')  # for consistent chunking
    )
    self.stdout.write(_('%(label)s: deleting all rawdata...') % {'label': sourceconfig_label})
    while True:  # for each chunk:
        _pk_qs = _rawdata_qs.values_list('pk', flat=True)
        # get the last pk in the chunk: the pk at position `chunksize`, if there is one...
        _sentinel_pk = _pk_qs[chunksize - 1: chunksize].first()
        if _sentinel_pk is None:
            # ...otherwise fewer than `chunksize` rows remain; take the very last pk
            # (explicit `is None` so a falsy-but-valid pk is not mistaken for "no row")
            _sentinel_pk = _pk_qs.last()
        if _sentinel_pk is None:
            break  # nothing left to delete
        # sanity check: each chunk must end strictly after the previous one
        if (_prior_sentinel_pk is not None) and (_sentinel_pk <= _prior_sentinel_pk):
            raise RuntimeError(f'sentinel pks not ascending?? got {_sentinel_pk} after {_prior_sentinel_pk}')
        _prior_sentinel_pk = _sentinel_pk
        _chunk_to_delete = _rawdata_qs.filter(pk__lte=_sentinel_pk)
        _chunk_deleted_count, _by_model = _chunk_to_delete.delete()
        # sanity check: only RawDatum rows are expected in the delete report
        if _by_model and set(_by_model.keys()) != {'share.RawDatum'}:
            raise RuntimeError(f'deleted models other than RawDatum?? {_by_model}')
        self.stdout.write(
            _('%(label)s: deleted %(count)s') % {'label': sourceconfig_label, 'count': _chunk_deleted_count},
        )
        _deleted_count += _chunk_deleted_count
    # end
    self.stdout.write(self.style.SUCCESS(
        _('%(label)s: done; deleted %(count)s') % {'label': sourceconfig_label, 'count': _deleted_count},
    ))
    return _deleted_count

0 commit comments

Comments
 (0)