Skip to content

Commit 38aa53c

Browse files
fix(core): metadata v10 migration fixes (#3326)
1 parent 4a52069 commit 38aa53c

File tree

14 files changed

+173
-58
lines changed

14 files changed

+173
-58
lines changed

renku/command/checks/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# limitations under the License.
1818
"""Define repository checks for :program:`renku doctor`."""
1919

20-
from .activities import check_migrated_activity_ids
20+
from .activities import check_activity_dates, check_migrated_activity_ids
2121
from .datasets import (
2222
check_dataset_files_outside_datadir,
2323
check_dataset_old_metadata_location,
@@ -30,7 +30,7 @@
3030
from .project import check_project_id_group
3131
from .storage import check_lfs_info
3232
from .validate_shacl import check_datasets_structure, check_project_structure
33-
from .workflow import check_activity_catalog, check_modification_date
33+
from .workflow import check_activity_catalog, check_plan_modification_date
3434

3535
# Checks will be executed in the order as they are listed in __all__. They are mostly used in ``doctor`` command to
3636
# inspect broken things. The order of operations matters when fixing issues, so, don't sort this list.
@@ -48,5 +48,6 @@
4848
"check_missing_files",
4949
"check_project_id_group",
5050
"check_project_structure",
51-
"check_modification_date",
51+
"check_plan_modification_date",
52+
"check_activity_dates",
5253
)

renku/command/checks/activities.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,61 @@ def check_migrated_activity_ids(fix, activity_gateway: IActivityGateway, **_):
7171
)
7272

7373
return False, problems
74+
75+
76+
@inject.autoparams("activity_gateway")
77+
def check_activity_dates(fix, activity_gateway: IActivityGateway, **_):
78+
"""Check activities have correct start/end/delete dates.
79+
80+
Args:
81+
fix(bool): Whether to fix found issues.
82+
activity_gateway(IActivityGateway): Injected ActivityGateway.
83+
_: keyword arguments.
84+
85+
Returns:
86+
Tuple[bool, Optional[str]]: Tuple of whether there are activities with invalid dates and a string of the problem.
87+
"""
88+
invalid_activities = []
89+
90+
for activity in activity_gateway.get_all_activities(include_deleted=True):
91+
plan = activity.association.plan
92+
if (
93+
activity.started_at_time < plan.date_created
94+
or activity.ended_at_time < activity.started_at_time
95+
or (activity.invalidated_at and activity.invalidated_at < activity.ended_at_time)
96+
):
97+
invalid_activities.append(activity)
98+
99+
if not invalid_activities:
100+
return True, None
101+
if not fix:
102+
ids = [a.id for a in invalid_activities]
103+
message = (
104+
WARNING
105+
+ "The following activities have an incorrect start, end, or delete date (use 'renku doctor --fix' to fix them):"
106+
+ "\n\t"
107+
+ "\n\t".join(ids)
108+
)
109+
return False, message
110+
111+
fix_activity_dates(activities=invalid_activities)
112+
project_context.database.commit()
113+
communication.info("Activity dates were fixed")
114+
115+
return True, None
116+
117+
118+
def fix_activity_dates(activities):
119+
"""Fix activities' start/end/delete dates."""
120+
for activity in activities:
121+
plan = activity.association.plan
122+
activity.unfreeze()
123+
if activity.started_at_time < plan.date_created:
124+
activity.started_at_time = plan.date_created
125+
126+
if activity.ended_at_time < activity.started_at_time:
127+
activity.ended_at_time = activity.started_at_time
128+
129+
if activity.invalidated_at and activity.invalidated_at < activity.ended_at_time:
130+
activity.invalidated_at = activity.ended_at_time
131+
activity.freeze()

renku/command/checks/workflow.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# See the License for the specific language governing permissions and
1717
# limitations under the License.
1818
"""Checks needed to determine integrity of workflows."""
19-
19+
from datetime import timedelta
2020
from typing import List, Optional, Tuple, cast
2121

2222
from renku.command.command_builder import inject
@@ -64,7 +64,7 @@ def check_activity_catalog(fix, force, **_) -> Tuple[bool, Optional[str]]:
6464

6565

6666
@inject.autoparams("plan_gateway")
67-
def check_modification_date(fix, plan_gateway: IPlanGateway, **_) -> Tuple[bool, Optional[str]]:
67+
def check_plan_modification_date(fix, plan_gateway: IPlanGateway, **_) -> Tuple[bool, Optional[str]]:
6868
"""Check if all plans have modification date set for them.
6969
7070
Args:
@@ -88,7 +88,7 @@ def check_modification_date(fix, plan_gateway: IPlanGateway, **_) -> Tuple[bool,
8888
ids = [plan.id for plan in to_be_processed]
8989
message = (
9090
WARNING
91-
+ "The following workflows have incorrect modification date (use 'renku doctor --fix' to fix them).:\n\t"
91+
+ "The following workflows have incorrect modification date (use 'renku doctor --fix' to fix them):\n\t"
9292
+ "\n\t".join(ids)
9393
)
9494
return False, message
@@ -124,4 +124,6 @@ def fix_plan_dates(plans: List[AbstractPlan], plan_gateway):
124124
plan.unfreeze()
125125
plan.date_modified = plan.date_created
126126
plan.date_created = creation_date
127+
if plan.date_removed and plan.date_removed < plan.date_created:
128+
plan.date_removed = plan.date_created + timedelta(seconds=1)
127129
plan.freeze()

renku/core/migration/m_0009__new_metadata_storage.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -655,19 +655,16 @@ def _process_datasets(
655655

656656
for dataset in datasets:
657657
dataset, tags = convert_dataset(dataset=dataset, revision=revision)
658-
if is_last_commit:
659-
datasets_provenance.update_during_migration(
660-
dataset,
661-
commit_sha=revision,
662-
date=date,
663-
tags=tags,
664-
replace=True,
665-
preserve_identifiers=preserve_identifiers,
666-
)
667-
else:
668-
datasets_provenance.update_during_migration(
669-
dataset, commit_sha=revision, date=date, tags=tags, preserve_identifiers=preserve_identifiers
670-
)
658+
659+
datasets_provenance.update_during_migration(
660+
dataset,
661+
commit_sha=revision,
662+
date=date,
663+
tags=tags,
664+
replace=True if is_last_commit else False,
665+
preserve_identifiers=preserve_identifiers,
666+
)
667+
671668
for dataset in deleted_datasets:
672669
dataset, _ = convert_dataset(dataset=dataset, revision=revision)
673670
datasets_provenance.update_during_migration(

renku/core/migration/m_0010__metadata_fixes.py

Lines changed: 74 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# See the License for the specific language governing permissions and
1717
# limitations under the License.
1818
"""Various metadata migrations for v10."""
19+
1920
import io
2021
import json
2122
import os
@@ -26,6 +27,7 @@
2627

2728
import zstandard as zstd
2829

30+
from renku.command.checks.activities import fix_activity_dates
2931
from renku.command.checks.workflow import fix_plan_dates
3032
from renku.command.command_builder import inject
3133
from renku.core.interface.activity_gateway import IActivityGateway
@@ -53,9 +55,12 @@ def migrate(migration_context: MigrationContext):
5355
if MigrationType.WORKFLOWS in migration_context.options.type:
5456
migrate_activity_ids()
5557
fix_plan_times()
58+
fix_activity_times()
5659

5760
migrate_remote_entity_ids()
5861
fix_dataset_date_modified()
62+
fix_dataset_image_ids()
63+
fix_removed_plans()
5964

6065
# NOTE: Rebuild all workflow catalogs since ids and times have changed
6166
communication.echo("Rebuilding workflow metadata")
@@ -76,8 +81,8 @@ def migrate_old_metadata_namespaces():
7681
header = int.from_bytes(file.read(4), "little")
7782
file.seek(0)
7883
if header == zstd.MAGIC_NUMBER:
79-
with decompressor.stream_reader(file) as zfile:
80-
data = json.load(zfile)
84+
with decompressor.stream_reader(file) as compressed_file:
85+
data = json.load(compressed_file)
8186
compressed = True
8287
else:
8388
data = json.load(file)
@@ -99,7 +104,7 @@ def migrate_old_metadata_namespaces():
99104

100105

101106
def nested_update(data: Dict[str, Any], target_key: str, transforms: List[Tuple[str, str]]) -> None:
102-
"""Update a key's value based on tranformations (from, to) in a deeply nested dictionary."""
107+
"""Update a key's value based on transformations (from, to) in a deeply nested dictionary."""
103108
for k in list(data.keys()):
104109
value = data[k]
105110
if isinstance(value, str) and k == target_key:
@@ -232,19 +237,10 @@ def migrate_project_template_data(project_gateway: IProjectGateway):
232237
project_context.database.commit()
233238

234239

235-
@inject.autoparams("activity_gateway", "plan_gateway")
236-
def fix_plan_times(activity_gateway: IActivityGateway, plan_gateway: IPlanGateway):
240+
@inject.autoparams("plan_gateway")
241+
def fix_plan_times(plan_gateway: IPlanGateway):
237242
"""Add timezone to plan invalidations."""
238-
database = project_context.database
239-
240243
plans: List[AbstractPlan] = plan_gateway.get_all_plans()
241-
all_activities = activity_gateway.get_all_activities()
242-
activity_map: Dict[str, Activity] = {}
243-
244-
for activity in all_activities:
245-
plan_id = activity.association.plan.id
246-
if plan_id not in activity_map or activity.started_at_time < activity_map[plan_id].started_at_time:
247-
activity_map[plan_id] = activity
248244

249245
for plan in plans:
250246
plan.unfreeze()
@@ -255,24 +251,41 @@ def fix_plan_times(activity_gateway: IActivityGateway, plan_gateway: IPlanGatewa
255251
plan.date_removed = None
256252

257253
if plan.date_removed is not None:
258-
if plan.date_removed < plan.date_created:
259-
# NOTE: Fix invalidation times set before creation date on plans
260-
plan.date_removed = plan.date_created
261254
if plan.date_removed.tzinfo is None:
262255
# NOTE: There was a bug that caused date_removed to be set without timezone (as UTC time)
263256
# so we patch in the timezone here
264257
plan.date_removed = plan.date_removed.replace(microsecond=0).astimezone(timezone.utc)
265-
if plan.id in activity_map and plan.date_created > activity_map[plan.id].started_at_time:
266-
plan.date_created = activity_map[plan.id].started_at_time
258+
if plan.date_removed < plan.date_created:
259+
# NOTE: Fix invalidation times set before creation date on plans
260+
plan.date_removed = plan.date_created
267261
plan.freeze()
268262

269263
fix_plan_dates(plans=plans, plan_gateway=plan_gateway)
270-
database.commit()
264+
project_context.database.commit()
265+
266+
267+
@inject.autoparams("activity_gateway")
268+
def fix_activity_times(activity_gateway: IActivityGateway):
269+
"""Make sure activities have valid start/end/delete dates."""
270+
fix_activity_dates(activities=activity_gateway.get_all_activities(include_deleted=True))
271+
project_context.database.commit()
271272

272273

273274
@inject.autoparams("dataset_gateway")
274275
def fix_dataset_date_modified(dataset_gateway: IDatasetGateway):
275276
"""Change date_created and date_modified to have correct semantics."""
277+
278+
def fix_creation_date(dataset):
279+
"""Check creation date to make sure that it's after project's creation date."""
280+
if dataset.date_created and dataset.date_created < project_context.project.date_created:
281+
try:
282+
dataset.date_created = min([f.date_added for f in dataset.files])
283+
except (ValueError, TypeError):
284+
dataset.date_created = project_context.project.date_created
285+
else:
286+
if dataset.date_created < project_context.project.date_created:
287+
dataset.date_created = project_context.project.date_created
288+
276289
tails = dataset_gateway.get_provenance_tails()
277290

278291
for dataset_tail in tails:
@@ -281,6 +294,7 @@ def fix_dataset_date_modified(dataset_gateway: IDatasetGateway):
281294
previous_modification_date = local_now()
282295

283296
while dataset.derived_from is not None:
297+
fix_creation_date(dataset)
284298
modification_date = dataset.date_removed or dataset.date_created
285299

286300
if modification_date is not None:
@@ -294,8 +308,9 @@ def fix_dataset_date_modified(dataset_gateway: IDatasetGateway):
294308
found_datasets.append(dataset)
295309
dataset = dataset_gateway.get_by_id(dataset.derived_from.value)
296310

311+
fix_creation_date(dataset)
297312
# NOTE: first dataset in chain
298-
modification_date = dataset.date_created or dataset.date_published
313+
modification_date = dataset.date_published or dataset.date_created
299314
if modification_date is not None:
300315
dataset.unfreeze()
301316
dataset.date_modified = modification_date
@@ -308,3 +323,41 @@ def fix_dataset_date_modified(dataset_gateway: IDatasetGateway):
308323
child.freeze()
309324

310325
project_context.database.commit()
326+
327+
328+
@inject.autoparams("dataset_gateway")
329+
def fix_dataset_image_ids(dataset_gateway: IDatasetGateway):
330+
"""Remove dashes from dataset image IDs."""
331+
for dataset in dataset_gateway.get_provenance_tails():
332+
while True:
333+
if dataset.images:
334+
for image in dataset.images:
335+
image.id = image.id.replace("-", "")
336+
337+
dataset._p_changed = True
338+
339+
if not dataset.derived_from:
340+
break
341+
342+
dataset = dataset_gateway.get_by_id(dataset.derived_from.value)
343+
344+
project_context.database.commit()
345+
346+
347+
@inject.autoparams("plan_gateway")
348+
def fix_removed_plans(plan_gateway: IPlanGateway):
349+
"""Create a derivative if a removed plan doesn't have one."""
350+
plans: List[AbstractPlan] = plan_gateway.get_all_plans()
351+
352+
for plan in plans:
353+
if plan.date_removed and plan.derived_from is None:
354+
derived_plan = plan.derive()
355+
derived_plan.date_modified = plan.date_modified
356+
derived_plan.delete(when=plan.date_removed)
357+
plan_gateway.add(derived_plan)
358+
359+
plan.unfreeze()
360+
plan.date_removed = None
361+
plan.freeze()
362+
363+
project_context.database.commit()

renku/core/util/datetime8601.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def _set_to_local_timezone(value):
7575
return value.replace(tzinfo=local_tz)
7676

7777

78-
def local_now() -> datetime:
78+
def local_now(remove_microseconds: bool = True) -> datetime:
7979
"""Return current datetime in local timezone."""
80-
return datetime.now(timezone.utc).replace(microsecond=0).astimezone()
80+
now = datetime.now(timezone.utc).astimezone()
81+
return now.replace(microsecond=0) if remove_microseconds else now

renku/core/util/git.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ def get_entity_from_revision(
379379
Entity: The Entity for the given path and revision.
380380
381381
"""
382-
from renku.domain_model.entity import Collection, Entity
382+
from renku.domain_model.entity import NON_EXISTING_ENTITY_CHECKSUM, Collection, Entity
383383

384384
def get_directory_members(absolute_path: Path) -> List[Entity]:
385385
"""Return first-level files/directories in a directory."""
@@ -410,7 +410,9 @@ def get_directory_members(absolute_path: Path) -> List[Entity]:
410410
checksum = repository.get_object_hash(revision=revision, path=path)
411411
# NOTE: If object was not found at a revision it's either removed or exists in a different revision; keep the
412412
# entity and use revision as checksum
413-
checksum = checksum or revision or "HEAD"
413+
if isinstance(revision, str) and revision == "HEAD":
414+
revision = repository.head.commit.hexsha
415+
checksum = checksum or revision or NON_EXISTING_ENTITY_CHECKSUM
414416
id = Entity.generate_id(checksum=checksum, path=path)
415417

416418
absolute_path = repository.path / path

renku/data/shacl_shape.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1635,7 +1635,6 @@
16351635
"datatype": {
16361636
"@id": "xsd:string"
16371637
},
1638-
"minCount": 1,
16391638
"maxCount": 1
16401639
},
16411640
{

renku/domain_model/workflow/plan.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,14 +153,14 @@ def is_derivation(self) -> bool:
153153
"""Return if an ``AbstractPlan`` has correct derived_from."""
154154
raise NotImplementedError()
155155

156-
def delete(self):
156+
def delete(self, when: datetime = local_now()):
157157
"""Mark a plan as deleted.
158158
159159
NOTE: Don't call this function for deleting plans since it doesn't delete the whole plan derivatives chain. Use
160160
renku.core.workflow.plan::remove_plan instead.
161161
"""
162162
self.unfreeze()
163-
self.date_removed = local_now()
163+
self.date_removed = when
164164
self.freeze()
165165

166166

@@ -323,6 +323,7 @@ def derive(self, creator: Optional[Person] = None) -> "Plan":
323323
derived.keywords = copy.deepcopy(self.keywords)
324324
derived.outputs = self.outputs.copy()
325325
derived.success_codes = self.success_codes.copy()
326+
derived.creators = self.creators.copy()
326327
derived.assign_new_id()
327328

328329
if creator and hasattr(creator, "email") and not any(c for c in self.creators if c.email == creator.email):

0 commit comments

Comments
 (0)