
Commit 50905d7

Update: randomization of ESA scope exchange
1 parent e2790f9 commit 50905d7

3 files changed: +56 -29 lines changed


README.md

Lines changed: 8 additions & 4 deletions
@@ -21,7 +21,9 @@ processor. The processor is invoked from the command line. Typing
 will print a detailed usage message to the screen
 
     usage: kaleidoscope [-h] [--chunk-size-lat CHUNK_SIZE_LAT]
-                        [--chunk-size-lon CHUNK_SIZE_LON] [--selector SELECTOR]
+                        [--chunk-size-lon CHUNK_SIZE_LON]
+                        [--product-type {esa-cci-oc,esa-scope-exchange,ghrsst,glorys}]
+                        [--selector SELECTOR]
                         [--engine-reader {h5netcdf,netcdf4,zarr}]
                         [--engine-writer {h5netcdf,netcdf4,zarr}]
                         [--log-level {debug,info,warning,error,off}]
@@ -51,7 +53,10 @@ will print a detailed usage message to the screen
                            value of `-1` refers to full longitudinal chunk size
                            and a value of `0` refers to the chunk size used in
                            the source file. (default: None)
-      --selector SELECTOR  the Monte Carlo stream selector. (default: None)
+      --product-type {esa-cci-oc,esa-scope-exchange,ghrsst,glorys}
+                           the product type. (default: None)
+      --selector SELECTOR  the Monte Carlo stream selector. An integral number
+                           which must not be negative. (default: None)
       --engine-reader {h5netcdf,netcdf4,zarr}
                            specify the engine used to read the source product
                            file. (default: None)
@@ -76,9 +81,8 @@ will print a detailed usage message to the screen
       --tmpdir TMPDIR      specify the path to the temporary directory.
                            (default: None)
       -v, --version        show program's version number and exit
-
-Copyright (c) Brockmann Consult GmbH, 2025. License: MIT
 
+Copyright (c) Brockmann Consult GmbH, 2025. License: MIT
 
 ### Normal operations
 
 TBD.
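
For orientation, a hypothetical invocation using the new option could look like the line below. The flags and their choices are taken from the usage message above; any remaining arguments are placeholders, since the rest of the usage message is not shown in this diff:

    kaleidoscope --product-type esa-scope-exchange --selector 0 --log-level info [further arguments]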

kaleidoscope/algorithms/randomize.py

Lines changed: 8 additions & 5 deletions
@@ -106,16 +106,18 @@ def dropped_axes(self) -> list[int]:
     def randomize(
         self,
         *data: np.ndarray,
-        coverage_factor: Any = 1.0,
+        coverage: Any = 1.0,
         relative: bool = False,
+        clip: tuple[Any, Any] | None = None,
         **kwargs,
     ) -> np.ndarray:
         """
         Randomizes data.
 
         :param data: The data.
-        :param coverage_factor: The uncertainty coverage factor.
+        :param coverage: The uncertainty coverage factor.
         :param relative: Uncertainty is given in relative terms.
+        :param clip: Where to clip measurement errors.
         :return: The measurement values randomized.
         """
         seed = _block_seed(kwargs["block_id"], self._root_seed)
@@ -126,11 +128,10 @@ def randomize(
             if len(data) == 2
             else np.sqrt(np.square(data[1]) - np.square(data[2]))
         )
-        if coverage_factor != 1.0:
-            u = u / coverage_factor
+        if coverage != 1.0:
+            u = u / coverage
         if relative:
             u = u * x
-
         match self._dist:
             case "normal":
                 y = _normal(seed, x, u)
@@ -140,6 +141,8 @@ def randomize(
                 y = _chlorophyll(seed, x, u)
             case _:
                 y = x
+        if clip is not None:
+            y = np.clip(y, a_min=clip[0], a_max=clip[1])
         return y
 
     compute_block = randomize
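
To see the renamed `coverage` parameter and the new `clip` argument in isolation, here is a minimal, self-contained sketch of the scaling and clipping steps from the hunks above. The function name `randomize_sketch` is hypothetical, only a plain normal distribution is used, and the block-seeded random stream of the real `Randomize` class is replaced by an ordinary NumPy generator:

import numpy as np


def randomize_sketch(
    x: np.ndarray,
    u: np.ndarray,
    coverage: float = 1.0,
    relative: bool = False,
    clip: tuple | None = None,
    seed: int = 0,
) -> np.ndarray:
    """Illustrative stand-in for the randomize method (normal case only)."""
    if coverage != 1.0:
        # expanded uncertainties (e.g. k = 2) are scaled back to standard uncertainty
        u = u / coverage
    if relative:
        # relative uncertainties are converted to absolute ones
        u = u * x
    rng = np.random.default_rng(seed)
    y = rng.normal(loc=x, scale=u)
    if clip is not None:
        # either bound may be None to leave that side unbounded
        y = np.clip(y, a_min=clip[0], a_max=clip[1])
    return y


# example: expanded (k = 2) uncertainty, upper bound at 3400.0
values = np.array([2300.0, 2350.0, 2400.0])
uncert = np.array([40.0, 40.0, 40.0])
print(randomize_sketch(values, uncert, coverage=2.0, clip=(None, 3400.0)))

With `coverage=2.0` the stated uncertainty is halved before sampling, and `clip=(None, 3400.0)` bounds the randomized values from above only, mirroring the `ta` entry in the operator configuration below.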

kaleidoscope/operators/randomizeop.py

Lines changed: 40 additions & 20 deletions
@@ -64,6 +64,7 @@ def run(self, source: Dataset) -> Dataset:  # noqa: D102
         for v, x in target.data_vars.items():
             if v not in config:
                 continue
+            get_logger().info(f"starting graph for variable: {v}")
             attrs: dict[str:Any] = config[v]
             f = Randomize(
                 x.dtype,
@@ -87,8 +88,9 @@ def run(self, source: Dataset) -> Dataset:  # noqa: D102
                     data=f.apply_to(
                         x.data,
                         u.data,
-                        coverage_factor=attrs.get("coverage_factor", 1.0),
+                        coverage=attrs.get("coverage", 1.0),
                         relative=attrs.get("relative", False),
+                        clip=attrs.get("clip", None),
                     ),
                     coords=x.coords,
                     dims=x.dims,
@@ -99,37 +101,51 @@ def run(self, source: Dataset) -> Dataset:  # noqa: D102
                 b = target[attrs["bias"]]
                 r = target[attrs["rmsd"]]
                 target[v] = DataArray(
-                    data=f.apply_to(x.data, r.data, b.data),
+                    data=f.apply_to(
+                        x.data,
+                        r.data,
+                        b.data,
+                        clip=attrs.get("clip", [None, None]),
+                    ),
                     coords=x.coords,
                     dims=x.dims,
                     name=x.name,
                     attrs=x.attrs,
                 )
             if get_logger().is_enabled(Logging.DEBUG):
                 get_logger().debug(
-                    f"source[{v}] min: {source[v].min().values.item() :.6f}"
+                    f"source[{v}] min: "
+                    f"{source[v].quantile(0.0001).values.item() :.6f}"
                 )
                 get_logger().debug(
-                    f"target[{v}] min: {target[v].min().values.item() :.6f}"
+                    f"target[{v}] min: "
+                    f"{target[v].quantile(0.0001).values.item() :.6f}"
                 )
                 get_logger().debug(
-                    f"source[{v}] max: {source[v].max().values.item() :.6f}"
+                    f"source[{v}] max: "
+                    f"{source[v].quantile(0.9999).values.item() :.6f}"
                )
                 get_logger().debug(
-                    f"target[{v}] max: {target[v].max().values.item() :.6f}"
+                    f"target[{v}] max: "
+                    f"{target[v].quantile(0.9999).values.item() :.6f}"
                 )
                 get_logger().debug(
-                    f"source[{v}] mean: {source[v].mean().values.item() :.6f}"
+                    f"source[{v}] mean: "
+                    f"{source[v].mean().values.item() :.6f}"
                 )
                 get_logger().debug(
-                    f"target[{v}] mean: {target[v].mean().values.item() :.6f}"
+                    f"target[{v}] mean: "
+                    f"{target[v].mean().values.item() :.6f}"
                 )
                 get_logger().debug(
-                    f"source[{v}] std: {source[v].std().values.item() :.6f}"
+                    f"source[{v}] std: "
+                    f"{source[v].std().values.item() :.6f}"
                 )
                 get_logger().debug(
-                    f"target[{v}] std: {target[v].std().values.item() :.6f}"
+                    f"target[{v}] std: "
+                    f"{target[v].std().values.item() :.6f}"
                 )
+            get_logger().info(f"finished graph for variable: {v}")
         return target
 
     @property
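
The debug summary above now reports the 0.01 % and 99.99 % quantiles instead of the absolute minimum and maximum. A small stand-alone sketch of that summary, with a made-up DataArray in place of `source[v]`, might look like this:

import numpy as np
import xarray as xr

# hypothetical variable, standing in for source[v] or target[v]
da = xr.DataArray(np.random.default_rng(0).normal(2300.0, 40.0, size=10_000))

# extreme quantiles rather than min/max, as in the changed debug output
print(f"min: {da.quantile(0.0001).values.item() :.6f}")
print(f"max: {da.quantile(0.9999).values.item() :.6f}")
print(f"mean: {da.mean().values.item() :.6f}")
print(f"std: {da.std().values.item() :.6f}")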
@@ -168,7 +184,7 @@ def config(self) -> dict[str : dict[str:Any]]:
                 "Rrs_665": {
                     "bias": "Rrs_665_bias",
                     "rmsd": "Rrs_665_rmsd",
-                    "distribution": "lognormal",
+                    "distribution": "normal",
                 },
                 "adg_412": {
                     "bias": "adg_412_bias",
@@ -245,35 +261,39 @@ def config(self) -> dict[str : dict[str:Any]]:
                 "fco2": {
                     "uncertainty": "fco2_tot_unc",
                     # the uncertainty interval coverage factor
-                    "coverage_factor": 2.0,
+                    "coverage": 2.0,
                     "distribution": "lognormal",
                 },
                 "flux": {
                     "uncertainty": "flux_unc",
                     # uncertainty is stated in relative terms
                     "relative": True,
-                    "coverage_factor": 2.0,
+                    "coverage": 2.0,
                     "distribution": "normal",
                 },
                 "ta": {
                     "uncertainty": "ta_tot_unc",
-                    "coverage_factor": 2.0,
+                    "coverage": 2.0,
                     "distribution": "lognormal",
+                    # clip to interval
+                    "clip": (None, 3400.0),
                 },
                 "dic": {
                     "uncertainty": "dic_tot_unc",
-                    "coverage_factor": 2.0,
+                    "coverage": 2.0,
                     "distribution": "lognormal",
+                    "clip": (None, 3200.0),
                 },
-                "ph": {
-                    "uncertainty": "ph_tot_unc",
-                    "coverage_factor": 2.0,
-                    "distribution": "lognormal",
+                "pH": {
+                    "uncertainty": "pH_tot_unc",
+                    "coverage": 2.0,
+                    "distribution": "normal",
                 },
                 "saturation_aragonite": {
                     "uncertainty": "saturation_aragonite_tot_unc",
-                    "coverage_factor": 2.0,
+                    "coverage": 2.0,
                     "distribution": "lognormal",
+                    "clip": (None, 6.0),
                 },
             },
             "ghrsst": {
