-
Notifications
You must be signed in to change notification settings - Fork 62
Support functionalities to enhance task traceability with metadata for dependency search. #450
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
79a2881
0cfe7ee
3eee422
ec3bf4f
22a69d0
08e3f59
9b19a1c
accbf1d
6719f4d
0bcc16c
5c41035
0b951ab
10795a2
32b4343
6f70a41
637f5da
b607926
a8059a1
27b1abd
5ac1c4d
f4479da
e71833b
46aabcf
7bde3b0
4c44cea
dd6a629
d884c79
f1418f8
6a1c4c2
0b06455
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,6 +1,7 @@ | ||||||||||
| from __future__ import annotations | ||||||||||
|
|
||||||||||
| import copy | ||||||||||
| import json | ||||||||||
| import re | ||||||||||
| from logging import getLogger | ||||||||||
| from typing import Any, Union | ||||||||||
|
|
@@ -21,7 +22,7 @@ class GCSObjectMetadataClient: | |||||||||
|
|
||||||||||
| @staticmethod | ||||||||||
| def _is_log_related_path(path: str) -> bool: | ||||||||||
| return re.match(r'^log/(processing_time/|task_info/|task_log/|module_versions/|random_seed/|task_params/).+', path) is not None | ||||||||||
| return re.match(r'^gs://.+?/log/(processing_time/|task_info/|task_log/|module_versions/|random_seed/|task_params/).+', path) is not None | ||||||||||
|
|
||||||||||
| # This is the copied method of luigi.gcs._path_to_bucket_and_key(path). | ||||||||||
| @staticmethod | ||||||||||
|
|
@@ -32,7 +33,12 @@ def _path_to_bucket_and_key(path: str) -> tuple[str, str]: | |||||||||
| return netloc, path_without_initial_slash | ||||||||||
|
|
||||||||||
| @staticmethod | ||||||||||
| def add_task_state_labels(path: str, task_params: dict[str, str] | None = None, custom_labels: dict[str, Any] | None = None) -> None: | ||||||||||
| def add_task_state_labels( | ||||||||||
| path: str, | ||||||||||
| task_params: dict[str, str] | None = None, | ||||||||||
| custom_labels: dict[str, Any] | None = None, | ||||||||||
| required_task_outputs: dict[str, str] | None = None, | ||||||||||
| ) -> None: | ||||||||||
| if GCSObjectMetadataClient._is_log_related_path(path): | ||||||||||
| return | ||||||||||
| # In gokart/object_storage.get_time_stamp, could find same call. | ||||||||||
|
|
@@ -42,20 +48,18 @@ def add_task_state_labels(path: str, task_params: dict[str, str] | None = None, | |||||||||
| if _response is None: | ||||||||||
| logger.error(f'failed to get object from GCS bucket {bucket} and object {obj}.') | ||||||||||
| return | ||||||||||
|
|
||||||||||
| response: dict[str, Any] = dict(_response) | ||||||||||
| original_metadata: dict[Any, Any] = {} | ||||||||||
| if 'metadata' in response.keys(): | ||||||||||
| _metadata = response.get('metadata') | ||||||||||
| if _metadata is not None: | ||||||||||
| original_metadata = dict(_metadata) | ||||||||||
|
|
||||||||||
| patched_metadata = GCSObjectMetadataClient._get_patched_obj_metadata( | ||||||||||
| copy.deepcopy(original_metadata), | ||||||||||
| task_params, | ||||||||||
| custom_labels, | ||||||||||
| required_task_outputs if required_task_outputs else None, | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| if original_metadata != patched_metadata: | ||||||||||
| # If we use update api, existing object metadata are removed, so should use patch api. | ||||||||||
| # See the official document descriptions. | ||||||||||
|
|
@@ -71,7 +75,6 @@ def add_task_state_labels(path: str, task_params: dict[str, str] | None = None, | |||||||||
| ) | ||||||||||
| .execute() | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| if update_response is None: | ||||||||||
| logger.error(f'failed to patch object {obj} in bucket {bucket} and object {obj}.') | ||||||||||
|
|
||||||||||
|
|
@@ -84,13 +87,13 @@ def _get_patched_obj_metadata( | |||||||||
| metadata: Any, | ||||||||||
| task_params: dict[str, str] | None = None, | ||||||||||
| custom_labels: dict[str, Any] | None = None, | ||||||||||
| required_task_outputs: dict[str, str] | None = None, | ||||||||||
| ) -> Union[dict, Any]: | ||||||||||
| # If metadata from response when getting bucket and object information is not dictionary, | ||||||||||
| # something wrong might be happened, so return original metadata, no patched. | ||||||||||
| if not isinstance(metadata, dict): | ||||||||||
| logger.warning(f'metadata is not a dict: {metadata}, something wrong was happened when getting response when get bucket and object information.') | ||||||||||
| return metadata | ||||||||||
|
|
||||||||||
| if not task_params and not custom_labels: | ||||||||||
| return metadata | ||||||||||
| # Maximum size of metadata for each object is 8 KiB. | ||||||||||
|
|
@@ -101,23 +104,28 @@ def _get_patched_obj_metadata( | |||||||||
| # However, users who utilize custom_labels are no longer expected to search using the labels generated from task parameters. | ||||||||||
| # Instead, users are expected to search using the labels they provided. | ||||||||||
| # Therefore, in the event of a key conflict, the value registered by the user-provided labels will take precedence. | ||||||||||
| _merged_labels = GCSObjectMetadataClient._merge_custom_labels_and_task_params_labels(normalized_task_params_labels, normalized_custom_labels) | ||||||||||
| normalized_labels = ( | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [imo]
Suggested change
|
||||||||||
| [normalized_custom_labels, normalized_task_params_labels] | ||||||||||
| if not required_task_outputs | ||||||||||
| else [normalized_custom_labels, normalized_custom_labels, {'required_task_outputs': json.dumps(required_task_outputs)}] | ||||||||||
| ) | ||||||||||
| _merged_labels = GCSObjectMetadataClient._merge_custom_labels_and_task_params_labels(normalized_labels) | ||||||||||
| return GCSObjectMetadataClient._adjust_gcs_metadata_limit_size(dict(metadata) | _merged_labels) | ||||||||||
|
|
||||||||||
| @staticmethod | ||||||||||
| def _merge_custom_labels_and_task_params_labels( | ||||||||||
| normalized_task_params: dict[str, str], | ||||||||||
| normalized_custom_labels: dict[str, Any], | ||||||||||
| normalized_labels_list: list[dict[str, Any]], | ||||||||||
| ) -> dict[str, str]: | ||||||||||
| merged_labels = copy.deepcopy(normalized_custom_labels) | ||||||||||
| for label_name, label_value in normalized_task_params.items(): | ||||||||||
| if len(label_value) == 0: | ||||||||||
| logger.warning(f'value of label_name={label_name} is empty. So skip to add as a metadata.') | ||||||||||
| continue | ||||||||||
| if label_name in merged_labels.keys(): | ||||||||||
| logger.warning(f'label_name={label_name} is already seen. So skip to add as a metadata.') | ||||||||||
| continue | ||||||||||
| merged_labels[label_name] = label_value | ||||||||||
| merged_labels: dict[str, str] = {} | ||||||||||
| for normalized_label in normalized_labels_list[:]: | ||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [weak-IMO] I thought this part was a bit difficult to understand, since it is deeply nested. It may get better if you extract the inner merge into a helper. However, the current code is OK though. :)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for the great suggestion! For this specific task of merging labels, the simple nested loop is likely more readable and Pythonic than using functools.reduce. While reduce can be used, in this scenario, the straightforward nested loop (or perhaps the alternative 'flattening' approach) probably offers better clarity and maintainability. What do you think?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I preferred the reduce approach, because it expresses the motivation of making the merged labels directly. In the nested loop, you need to read to L.147 to understand the motivation of building them. However, both approaches are OK, since this is a relatively small loop nest. :)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You're right. |
||||||||||
| for label_name, label_value in normalized_label.items(): | ||||||||||
| if len(label_value) == 0: | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [MUST] This code may fail, since it seems to assume that the label value is a str. I prefer checking if it is a str, and then checking the length, as:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for reviewing my code!
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @TlexCypher
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @TlexCypher Could you check this comment? If you have confirmed that label_value is a str, you should
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I fixed it here: 0b06455 |
||||||||||
| logger.warning(f'value of label_name={label_name} is empty. So skip to add as a metadata.') | ||||||||||
| continue | ||||||||||
| if label_name in merged_labels.keys(): | ||||||||||
| logger.warning(f'label_name={label_name} is already seen. So skip to add as a metadata.') | ||||||||||
| continue | ||||||||||
| merged_labels[label_name] = label_value | ||||||||||
| return merged_labels | ||||||||||
|
|
||||||||||
| # Google Cloud Storage(GCS) has a limitation of metadata size, 8 KiB. | ||||||||||
|
|
@@ -132,10 +140,8 @@ def _get_label_size(label_name: str, label_value: str) -> int: | |||||||||
| 8 * 1024, | ||||||||||
| sum(_get_label_size(label_name, label_value) for label_name, label_value in labels.items()), | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| if current_total_metadata_size <= max_gcs_metadata_size: | ||||||||||
| return labels | ||||||||||
|
|
||||||||||
| for label_name, label_value in reversed(labels.items()): | ||||||||||
| size = _get_label_size(label_name, label_value) | ||||||||||
| del labels[label_name] | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,13 +30,23 @@ def exists(self) -> bool: | |
| def load(self) -> Any: | ||
| return wrap_load_with_lock(func=self._load, task_lock_params=self._get_task_lock_params())() | ||
|
|
||
| def dump(self, obj, lock_at_dump: bool = True, task_params: dict[str, str] | None = None, custom_labels: dict[str, Any] | None = None) -> None: | ||
| def dump( | ||
| self, | ||
| obj, | ||
| lock_at_dump: bool = True, | ||
| task_params: dict[str, str] | None = None, | ||
| custom_labels: dict[str, Any] | None = None, | ||
| required_task_outputs: dict[str, str] | None = None, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [imo] |
||
| ) -> None: | ||
| if lock_at_dump: | ||
| wrap_dump_with_lock(func=self._dump, task_lock_params=self._get_task_lock_params(), exist_check=self.exists)( | ||
| obj=obj, task_params=task_params, custom_labels=custom_labels | ||
| obj=obj, | ||
| task_params=task_params, | ||
| custom_labels=custom_labels, | ||
| required_task_outputs=required_task_outputs, | ||
| ) | ||
| else: | ||
| self._dump(obj=obj, task_params=task_params, custom_labels=custom_labels) | ||
| self._dump(obj=obj, task_params=task_params, custom_labels=custom_labels, required_task_outputs=required_task_outputs) | ||
|
|
||
| def remove(self) -> None: | ||
| if self.exists(): | ||
|
|
@@ -61,7 +71,13 @@ def _load(self) -> Any: | |
| pass | ||
|
|
||
| @abstractmethod | ||
| def _dump(self, obj, task_params: dict[str, str] | None = None, custom_labels: dict[str, Any] | None = None) -> None: | ||
| def _dump( | ||
| self, | ||
| obj, | ||
| task_params: Optional[dict[str, str]] = None, | ||
| custom_labels: dict[str, Any] | None = None, | ||
| required_task_outputs: dict[str, str] | None = None, | ||
| ) -> None: | ||
| pass | ||
|
|
||
| @abstractmethod | ||
|
|
@@ -98,11 +114,19 @@ def _load(self) -> Any: | |
| with self._target.open('r') as f: | ||
| return self._processor.load(f) | ||
|
|
||
| def _dump(self, obj, task_params: dict[str, str] | None = None, custom_labels: dict[str, Any] | None = None) -> None: | ||
| def _dump( | ||
| self, | ||
| obj, | ||
| task_params: dict[str, str] | None = None, | ||
| custom_labels: dict[str, Any] | None = None, | ||
| required_task_outputs: dict[str, str] | None = None, | ||
| ) -> None: | ||
| with self._target.open('w') as f: | ||
| self._processor.dump(obj, f) | ||
| if self.path().startswith('gs://'): | ||
| GCSObjectMetadataClient.add_task_state_labels(path=self.path(), task_params=task_params, custom_labels=custom_labels) | ||
| GCSObjectMetadataClient.add_task_state_labels( | ||
| path=self.path(), task_params=task_params, custom_labels=custom_labels, required_task_outputs=required_task_outputs | ||
| ) | ||
|
|
||
| def _remove(self) -> None: | ||
| self._target.remove() | ||
|
|
@@ -142,10 +166,18 @@ def _load(self) -> Any: | |
| self._remove_temporary_directory() | ||
| return model | ||
|
|
||
| def _dump(self, obj, task_params: dict[str, str] | None = None, custom_labels: dict[str, Any] | None = None) -> None: | ||
| def _dump( | ||
| self, | ||
| obj, | ||
| task_params: dict[str, str] | None = None, | ||
| custom_labels: dict[str, Any] | None = None, | ||
| required_task_outputs: dict[str, str] | None = None, | ||
| ) -> None: | ||
| self._make_temporary_directory() | ||
| self._save_function(obj, self._model_path()) | ||
| make_target(self._load_function_path()).dump(self._load_function, task_params=task_params) | ||
| make_target(self._load_function_path()).dump( | ||
| self._load_function, task_params=task_params, custom_labels=custom_labels, required_task_outputs=required_task_outputs | ||
| ) | ||
| self._zip_client.make_archive() | ||
| self._remove_temporary_directory() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,10 +7,13 @@ | |
| import random | ||
| import sys | ||
| import types | ||
| from dataclasses import dataclass | ||
| from importlib import import_module | ||
| from logging import getLogger | ||
| from typing import Any, Callable, Dict, Generator, Generic, Iterable, List, Optional, Set, TypeVar, Union, overload | ||
|
|
||
| from gokart.utils import map_flattenable_items | ||
|
|
||
| if sys.version_info < (3, 13): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe this part is not needed? |
||
| from typing_extensions import deprecated | ||
| else: | ||
|
|
@@ -362,11 +365,26 @@ def dump(self, obj: Any, target: Union[None, str, TargetOnKart] = None, custom_l | |
| if self.fail_on_empty_dump and isinstance(obj, pd.DataFrame): | ||
| assert not obj.empty | ||
|
|
||
| @dataclass | ||
| class _RequiredTaskOutput: | ||
| task_name: str | ||
| output_path: str | ||
|
|
||
| _required_task_outputs = flatten( | ||
| map_flattenable_items( | ||
| lambda task: map_flattenable_items( | ||
| lambda output: _RequiredTaskOutput(task_name=task.get_task_family(), output_path=output.path()), task.output() | ||
|
kitagry marked this conversation as resolved.
Outdated
|
||
| ), | ||
| self.requires(), | ||
| ) | ||
| ) | ||
| required_task_outputs = {r.task_name: r.output_path for r in _required_task_outputs} | ||
| self._get_output_target(target).dump( | ||
| obj, | ||
| lock_at_dump=self._lock_at_dump, | ||
| task_params=super().to_str_params(only_significant=True, only_public=True), | ||
| custom_labels=custom_labels, | ||
| required_task_outputs=required_task_outputs, | ||
| ) | ||
|
|
||
| @staticmethod | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -3,7 +3,7 @@ | |||||
| import os | ||||||
| import sys | ||||||
| from io import BytesIO | ||||||
| from typing import Any, Iterable, Protocol, TypeVar, Union | ||||||
| from typing import Any, Callable, Iterable, Protocol, TypeVar, Union | ||||||
|
|
||||||
| import dill | ||||||
| import luigi | ||||||
|
|
@@ -71,6 +71,21 @@ def flatten(targets: FlattenableItems[T]) -> list[T]: | |||||
| return flat | ||||||
|
|
||||||
|
|
||||||
| K = TypeVar('K') | ||||||
|
|
||||||
|
|
||||||
| def map_flattenable_items(func: Callable[[T], K], items: FlattenableItems[T]) -> FlattenableItems[K]: | ||||||
| if isinstance(items, dict): | ||||||
| return {k: map_flattenable_items(func, v) for k, v in items.items()} | ||||||
| if isinstance(items, tuple): | ||||||
| return tuple(map_flattenable_items(func, i) for i in items) | ||||||
| if isinstance(items, str): | ||||||
| return func(items) # type: ignore | ||||||
| if isinstance(items, Iterable): | ||||||
| return [map_flattenable_items(func, i) for i in items] | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| return func(items) | ||||||
|
|
||||||
|
|
||||||
| def load_dill_with_pandas_backward_compatibility(file: Union[FileLike, BytesIO]) -> Any: | ||||||
| """Load binary dumped by dill with pandas backward compatibility. | ||||||
| pd.read_pickle can load binary dumped in backward pandas version, and also any objects dumped by pickle. | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems to be redundant