Skip to content

Commit f5b4dbb

Browse files
committed
data migration for expiration_date
1 parent 242da16 commit f5b4dbb

File tree

8 files changed

+126
-45
lines changed

8 files changed

+126
-45
lines changed

project/settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,8 @@ def route_urgent_task(name, args, kwargs, options, task=None, **kw):
446446
SITE_ID = 1
447447
PUBLIC_SENTRY_DSN = os.environ.get('PUBLIC_SENTRY_DSN')
448448

449+
DATA_MIGRATION_CHUNK_SIZE = int(os.environ.get('DATA_MIGRATION_CHUNK_SIZE', 666))
450+
449451
SHARE_WEB_URL = os.environ.get('SHARE_WEB_URL', 'http://localhost:8003').rstrip('/') + '/'
450452
SHARE_USER_AGENT = os.environ.get('SHARE_USER_AGENT', 'SHAREbot/{} (+{})'.format(VERSION, SHARE_WEB_URL))
451453
SHARE_ADMIN_USERNAME = os.environ.get('SHARE_ADMIN_USERNAME', 'admin')

share/migrations/0078_delete_rawdatum.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class Migration(migrations.Migration):
77

88
dependencies = [
99
('share', '0077_big_cleanup_2025'),
10-
('trove', '0008_no_raw_datum'),
10+
('trove', '0011_no_raw_datum'),
1111
]
1212

1313
operations = [

trove/migrations/0008_no_raw_datum.py

Lines changed: 0 additions & 42 deletions
This file was deleted.

trove/migrations/0009_resource_description_rename.py renamed to trove/migrations/0008_resource_description.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
class Migration(migrations.Migration):
66

77
dependencies = [
8-
('trove', '0008_no_raw_datum'),
8+
('trove', '0007_rawdata_fks_do_nothing'),
99
]
1010

1111
operations = [
@@ -41,4 +41,19 @@ class Migration(migrations.Migration):
4141
name='supplementary_suid',
4242
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_description_set', to='share.sourceuniqueidentifier'),
4343
),
44+
migrations.AddField(
45+
model_name='archivedresourcedescription',
46+
name='expiration_date',
47+
field=models.DateField(blank=True, help_text='An (optional) date when this description will no longer be valid.', null=True),
48+
),
49+
migrations.AddField(
50+
model_name='latestresourcedescription',
51+
name='expiration_date',
52+
field=models.DateField(blank=True, help_text='An (optional) date when this description will no longer be valid.', null=True),
53+
),
54+
migrations.AddField(
55+
model_name='supplementaryresourcedescription',
56+
name='expiration_date',
57+
field=models.DateField(blank=True, help_text='An (optional) date when this description will no longer be valid.', null=True),
58+
),
4459
]

trove/migrations/0010_description_expiration_index.py renamed to trove/migrations/0009_description_expiration_index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ class Migration(migrations.Migration):
66
atomic = False # allow adding indexes concurrently (without locking tables)
77

88
dependencies = [
9-
('trove', '0009_resource_description_rename'),
9+
('trove', '0008_resource_description'),
1010
]
1111

1212
operations = [
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import logging
2+
import time
3+
4+
from django.conf import settings
5+
from django.db import migrations
6+
from django.db.models import F
7+
8+
from trove.util.django import pk_chunked_queryset
9+
10+
_logger = logging.getLogger(__name__)
11+
12+
13+
def migrate_expiration_date(apps, schema_editor):
14+
SupplementaryResourceDescription = apps.get_model('trove', 'SupplementaryResourceDescription')
15+
# on share.osf.io, only supplements have expirations
16+
_supp_qs = SupplementaryResourceDescription.objects.filter(
17+
expiration_date__isnull=True,
18+
from_raw_datum__expiration_date__isnull=False,
19+
)
20+
_total_updated = 0
21+
_before_all = time.perf_counter()
22+
for _chunk_qs in pk_chunked_queryset(_supp_qs, settings.DATA_MIGRATION_CHUNK_SIZE):
23+
_before_chunk = time.perf_counter()
24+
_updated_count = _chunk_qs.update(
25+
expiration_date=F('from_raw_datum__expiration_date'),
26+
)
27+
_chunk_seconds = time.perf_counter() - _before_chunk
28+
_logger.info('migrated %s in %ss...', _updated_count, _chunk_seconds)
29+
_total_updated += _updated_count
30+
_total_seconds = time.perf_counter() - _before_all
31+
_logger.info('done! migrated %s in %ss', _total_updated, _total_seconds)
32+
33+
34+
class Migration(migrations.Migration):
35+
atomic = False # potentially lorg data migration; allow partial progress
36+
37+
dependencies = [
38+
('trove', '0009_description_expiration_index'),
39+
]
40+
41+
operations = [
42+
migrations.RunPython(
43+
code=migrate_expiration_date,
44+
reverse_code=migrations.RunPython.noop,
45+
),
46+
]
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from django.db import migrations
2+
3+
4+
class Migration(migrations.Migration):
5+
6+
dependencies = [
7+
('trove', '0010_data__copy_expiration_date'),
8+
]
9+
10+
operations = [
11+
migrations.RemoveConstraint(
12+
model_name='archivedresourcedescription',
13+
name='trove_archivedindexcardrdf_uniq_archived_version',
14+
),
15+
migrations.RemoveField(
16+
model_name='archivedresourcedescription',
17+
name='from_raw_datum',
18+
),
19+
migrations.RemoveField(
20+
model_name='latestresourcedescription',
21+
name='from_raw_datum',
22+
),
23+
migrations.RemoveField(
24+
model_name='supplementaryresourcedescription',
25+
name='from_raw_datum',
26+
),
27+
]

trove/util/django.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from __future__ import annotations
2+
from collections.abc import Iterator
3+
4+
from django.db.models.query import QuerySet
5+
6+
7+
__all__ = ('pk_chunked_queryset',)
8+
9+
10+
def pk_chunked_queryset(queryset, chunksize: int) -> Iterator[QuerySet]:
11+
'''pk_chunked_queryset: for chunked update/delete on django querysets
12+
13+
since `.update()` or `.delete()` cannot be used on sliced querysets,
14+
this orders the given queryset by pk and yields a filtered (not sliced)
15+
queryset for each chunk
16+
'''
17+
_ordered_qs = queryset.order_by('pk')
18+
_prior_end_pk = None
19+
while True: # for each chunk:
20+
_qs = (
21+
_ordered_qs
22+
if _prior_end_pk is None
23+
else _ordered_qs.filter(pk__gt=_prior_end_pk)
24+
)
25+
# query for the last pk in the chunk
26+
_pk_qs = _qs.values_list('pk', flat=True)
27+
_end_pk = _pk_qs[chunksize - 1: chunksize].first() or _pk_qs.last()
28+
if _end_pk is None:
29+
break # end
30+
if (_prior_end_pk is not None) and (_end_pk <= _prior_end_pk):
31+
raise RuntimeError(f'sentinel pks not ascending?? got {_end_pk} after {_prior_end_pk}')
32+
_prior_end_pk = _end_pk
33+
yield _qs.filter(pk__lte=_end_pk)

0 commit comments

Comments
 (0)