Skip to content

Commit 30787f7

Browse files
fix context resolution for mapped tasks
1 parent f63fb7d commit 30787f7

File tree

5 files changed

+63
-10
lines changed

5 files changed

+63
-10
lines changed

frontend/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "plombery",
3-
"version": "0.5.2-beta2",
3+
"version": "0.6.0-beta1",
44
"description": "",
55
"license": "MIT",
66
"author": {

src/plombery/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Keep it aligned with version in package.json
22

3-
__version__ = "0.5.2-beta2"
3+
__version__ = "0.6.0-beta1"

src/plombery/database/repository.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,17 @@ def create_task_run_output(
187187
) -> models.TaskRunOutput:
188188
"""Creates a new TaskRunOutput record and returns the instance."""
189189
with SessionLocal() as session:
190+
data = (
191+
task_output.data.__dict__
192+
if hasattr(task_output.data, "__dict__")
193+
else task_output.data
194+
)
195+
190196
db_output = models.TaskRunOutput(
191-
**task_output.model_dump(), size=len(task_output.data)
197+
mimetype=task_output.mimetype,
198+
encoding=task_output.encoding,
199+
data=data,
200+
size=0,
192201
)
193202
session.add(db_output)
194203
session.flush()

src/plombery/orchestrator/context.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from plombery.database.repository import get_task_run_output_by_id
33
from plombery.database.models import TaskRun
44
from plombery.logger import get_logger
5+
from plombery.pipeline.context import task_context
6+
from plombery.pipeline.tasks import MappingMode
57

68

79
class Context:
@@ -13,13 +15,25 @@ def __init__(self, _task_run: TaskRun, upstream_task_runs: dict[str, TaskRun]):
1315
self._task_run = _task_run
1416
self._upstream_task_runs = upstream_task_runs
1517
self.logger = get_logger()
18+
self.task = task_context.get()
1619

1720
def get_output_data(self, task_id: str) -> Optional[Any]:
1821
"""
1922
Imperatively retrieves the full TaskRunOutput data (XCom) for a specified
2023
upstream task, performing a database lookup only upon call.
2124
"""
22-
target_task_run = self._upstream_task_runs.get(task_id)
25+
if (
26+
self._task_run.map_index is not None
27+
# if it's a Chained Fan Out the upstream is returning a
28+
# primitive value and not an array, so we retrieve the value of
29+
# the corresponding task + map_index
30+
and self.task.mapping_mode == MappingMode.CHAINED_FAN_OUT
31+
):
32+
task_full_id = f"{task_id}.{self._task_run.map_index}"
33+
else:
34+
task_full_id = task_id
35+
36+
target_task_run = self._upstream_task_runs.get(task_full_id)
2337

2438
if not target_task_run or not target_task_run.task_output_id:
2539
# Task ID not found in upstream dependencies
@@ -29,7 +43,12 @@ def get_output_data(self, task_id: str) -> Optional[Any]:
2943
output_record = get_task_run_output_by_id(target_task_run.task_output_id)
3044

3145
if output_record:
32-
if self._task_run.map_index is not None:
46+
if (
47+
self._task_run.map_index is not None
48+
# If it's a Fan Out task then the upstream is returning a list and we need
49+
# to get the item at the specific index
50+
and self.task.mapping_mode == MappingMode.FAN_OUT
51+
):
3352
return output_record.data[self._task_run.map_index]
3453

3554
# Return the data stored in the 'data' JSON column

src/plombery/orchestrator/executor.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
from dataclasses import dataclass, field
3+
from types import MappingProxyType
34
from typing import Any, Callable, Dict, Optional
45
import inspect
56

@@ -131,7 +132,16 @@ async def execute_task_instance(
131132

132133
# Prepare arguments using the TaskRun's context/inputs determined by the Orchestrator
133134
# The Orchestrator should have resolved all upstream tasks' data into task_run.context
134-
pipeline_params = task_run.context.get("params", None) if task_run.context else None
135+
if task_run.context:
136+
dict_params = task_run.context.get("params", None)
137+
138+
if pipeline.params:
139+
pipeline_params = pipeline.params.model_validate(dict_params)
140+
else:
141+
# TODO: This should raise at least a warning
142+
pipeline_params = dict_params
143+
else:
144+
pipeline_params = None
135145

136146
task_start_time = utcnow()
137147
task_run_status = PipelineRunStatus.FAILED # Assume failure until success
@@ -228,6 +238,7 @@ async def run(
228238

229239
@dataclass
230240
class TaskFunctionSignature:
241+
func_params: MappingProxyType[str, inspect.Parameter]
231242
has_params_arg: bool = False
232243
context_arg: Optional[str] = None
233244
input_arg_names: list[str] = field(default_factory=list)
@@ -245,9 +256,9 @@ def check_task_signature(func: Callable) -> TaskFunctionSignature:
245256
Where the params argument is the Pipeline input params.
246257
"""
247258

248-
result = TaskFunctionSignature()
259+
result = TaskFunctionSignature(inspect.signature(func).parameters)
249260

250-
for name, parameter in inspect.signature(func).parameters.items():
261+
for name, parameter in result.func_params.items():
251262
# Check for special arguments
252263
if name == "params":
253264
result.has_params_arg = True
@@ -294,11 +305,18 @@ async def _execute_task(
294305

295306
# Load the TaskRuns for all upstream dependencies
296307
upstream_runs_metadata = get_task_runs_for_pipeline_run(
297-
task_run.pipeline_run_id, task.upstream_task_ids
308+
task_run.pipeline_run_id, task_ids=task.upstream_task_ids
298309
)
299310

300311
# Build the map of task_id -> TaskRun model instance
301-
metadata_map = {run.task_id: run for run in upstream_runs_metadata}
312+
metadata_map = {
313+
(
314+
f"{run.task_id}.{run.map_index}"
315+
if run.map_index is not None
316+
else run.task_id
317+
): run
318+
for run in upstream_runs_metadata
319+
}
302320
runtime_context = Context(task_run, metadata_map)
303321

304322
# Iterate over arguments required by the function signature
@@ -307,6 +325,13 @@ async def _execute_task(
307325
# - If mapped, resolves to single item if arg_name == map_upstream_id.
308326
# - Otherwise, resolves to the full output of the upstream task named arg_name.
309327
input_data = runtime_context.get_output_data(task_id=arg_name)
328+
329+
arg_annotation = result.func_params[arg_name].annotation
330+
331+
# If the argument is a Pydantic Model, we parse it
332+
if issubclass(arg_annotation, BaseModel):
333+
input_data = arg_annotation.model_validate(input_data)
334+
310335
kwargs[arg_name] = input_data
311336

312337
if pipeline_params and result.has_params_arg:

0 commit comments

Comments
 (0)