Skip to content

Commit c72756a

Browse files
authored
Deprecating loss type mapping (#120)
1 parent ae4d594 commit c72756a

File tree

5 files changed

+6
-176
lines changed

5 files changed

+6
-176
lines changed

src/opentau/configs/default.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
from opentau import (
3232
policies, # noqa: F401
3333
)
34-
from opentau.datasets.standard_data_format_mapping import DATA_FEATURES_NAME_MAPPING, LOSS_TYPE_MAPPING
3534
from opentau.datasets.transforms import ImageTransformsConfig
3635
from opentau.datasets.video_utils import get_safe_default_codec
3736

@@ -70,14 +69,11 @@ class DatasetConfig:
7069
stats: Dictionary of statistics for normalization, keyed by feature name.
7170
Each value is a dictionary with 'mean' and 'std' arrays. Defaults to None.
7271
data_features_name_mapping: Optional mapping from dataset feature names to
73-
standard feature names. Must be provided together with `loss_type_mapping`.
74-
Defaults to None.
75-
loss_type_mapping: Optional loss type mapping for the dataset. Must be
76-
provided together with `data_features_name_mapping`. Defaults to None.
72+
standard feature names. Defaults to None.
7773
7874
Raises:
7975
ValueError: If both or neither of `repo_id` and `grounding` are set, or
80-
if only one of `data_features_name_mapping` and `loss_type_mapping`
76+
if `data_features_name_mapping`
8177
is provided.
8278
"""
8379

@@ -94,7 +90,6 @@ class DatasetConfig:
9490

9591
# optional standard data format mapping for the dataset if mapping is not already in standard_data_format_mapping.py
9692
data_features_name_mapping: dict[str, str] | None = None
97-
loss_type_mapping: str | None = None
9893

9994
# Ratio of the dataset to be used for validation. Please specify a value.
10095
# If `val_freq` is set to 0, a validation dataset will not be created and this value will be ignored.
@@ -106,16 +101,7 @@ def __post_init__(self):
106101
if (self.repo_id is None) == (self.grounding is None):
107102
raise ValueError("Exactly one of `repo_id` or `grounding` for Dataset config should be set.")
108103

109-
# data_features_name_mapping and loss_type_mapping have to be provided together
110-
if (self.data_features_name_mapping is None) != (self.loss_type_mapping is None):
111-
raise ValueError(
112-
"`data_features_name_mapping` and `loss_type_mapping` have to be provided together."
113-
)
114-
115-
# add data_features_name_mapping and loss_type_mapping to standard_data_format_mapping.py if they are provided
116-
if self.data_features_name_mapping is not None and self.loss_type_mapping is not None:
117-
DATA_FEATURES_NAME_MAPPING[self.repo_id] = self.data_features_name_mapping
118-
LOSS_TYPE_MAPPING[self.repo_id] = self.loss_type_mapping
104+
# data_features_name_mapping has to be provided if it is not already in standard_data_format_mapping.py
119105

120106

121107
@dataclass

src/opentau/datasets/lerobot_dataset.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
from opentau.constants import HF_OPENTAU_HOME
109109
from opentau.datasets.compute_stats import aggregate_stats, compute_episode_stats
110110
from opentau.datasets.image_writer import AsyncImageWriter, write_image
111-
from opentau.datasets.standard_data_format_mapping import DATA_FEATURES_NAME_MAPPING, LOSS_TYPE_MAPPING
111+
from opentau.datasets.standard_data_format_mapping import DATA_FEATURES_NAME_MAPPING
112112
from opentau.datasets.utils import (
113113
DEFAULT_FEATURES,
114114
DEFAULT_IMAGE_PATH,
@@ -727,9 +727,6 @@ def _to_standard_data_format(self, item: dict) -> dict:
727727
standard_item["img_is_pad"] = torch.tensor(img_is_pad, dtype=torch.bool)
728728
standard_item["action_is_pad"] = item[name_map["actions"] + "_is_pad"]
729729

730-
# add loss type
731-
standard_item["loss_type"] = LOSS_TYPE_MAPPING[self._get_feature_mapping_key()]
732-
733730
# cast all tensors in standard_item to bfloat16
734731
for key, value in standard_item.items():
735732
if isinstance(value, torch.Tensor) and value.dtype.is_floating_point:

src/opentau/datasets/standard_data_format_mapping.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -47,23 +47,12 @@
4747
- "prompt": Task descriptions or prompts
4848
- "response": Expected responses or labels
4949
50-
LOSS_TYPE_MAPPING
51-
Dictionary mapping dataset repository IDs to loss type strings. Valid
52-
values are:
53-
54-
- "MSE": Mean Squared Error (typically for continuous robotic actions)
55-
- "CE": Cross Entropy (typically for discrete classification tasks
56-
like VQA)
57-
5850
Example:
5951
Access feature name mapping for a dataset:
6052
>>> mapping = DATA_FEATURES_NAME_MAPPING["lerobot/aloha_mobile_cabinet"]
6153
>>> mapping["camera0"] # Returns "observation.images.cam_right_wrist"
6254
>>> mapping["actions"] # Returns "action"
6355
64-
Access loss type for a dataset:
65-
>>> loss_type = LOSS_TYPE_MAPPING["lerobot/aloha_mobile_cabinet"]
66-
>>> loss_type # Returns "MSE"
6756
"""
6857

6958
DATA_FEATURES_NAME_MAPPING = {
@@ -247,32 +236,3 @@
247236
"response": "response",
248237
},
249238
}
250-
251-
"""
252-
Use "MSE" for mean squared error and "CE" for cross entropy.
253-
Usually robotic data with actions will have an MSE loss while
254-
VQA tasks will have a CE loss.
255-
"""
256-
LOSS_TYPE_MAPPING = {
257-
"ML-GOD/mt-button-press": "MSE",
258-
"ML-GOD/libero_spatial_no_noops_1.0.0_lerobot": "MSE",
259-
"ML-GOD/libero": "MSE",
260-
"physical-intelligence/libero": "MSE",
261-
"danaaubakirova/koch_test": "MSE",
262-
"lerobot/droid_100": "MSE",
263-
"lerobot/aloha_mobile_cabinet": "MSE",
264-
"autox/agibot-sample": "MSE",
265-
"bi-so100-block-manipulation": "MSE",
266-
"cube-on-cylinder": "MSE",
267-
"cylinder-on-cube": "MSE",
268-
"l-shape-on-cross-shape": "MSE",
269-
"lerobot/svla_so101_pickplace": "MSE",
270-
"lerobot/svla_so100_pickplace": "MSE",
271-
"lerobot/svla_so100_stacking": "MSE",
272-
"pixmo": "CE",
273-
"dummy": "CE",
274-
"vsr": "CE",
275-
"clevr": "CE",
276-
"cocoqa": "CE",
277-
"lerobot_dummy": "MSE",
278-
}

tests/configs/test_default.py

Lines changed: 1 addition & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pytest
1616

1717
from opentau.configs.default import DatasetConfig, DatasetMixtureConfig
18-
from opentau.datasets.standard_data_format_mapping import DATA_FEATURES_NAME_MAPPING, LOSS_TYPE_MAPPING
18+
from opentau.datasets.standard_data_format_mapping import DATA_FEATURES_NAME_MAPPING
1919

2020

2121
@pytest.mark.parametrize(
@@ -94,121 +94,9 @@ def setup_method(self):
9494
"""Set up test fixtures before each test method."""
9595
# Store original state of global mappings
9696
self.original_data_mapping = DATA_FEATURES_NAME_MAPPING.copy()
97-
self.original_loss_mapping = LOSS_TYPE_MAPPING.copy()
9897

9998
def teardown_method(self):
10099
"""Clean up after each test method."""
101100
# Restore original state of global mappings
102101
DATA_FEATURES_NAME_MAPPING.clear()
103102
DATA_FEATURES_NAME_MAPPING.update(self.original_data_mapping)
104-
LOSS_TYPE_MAPPING.clear()
105-
LOSS_TYPE_MAPPING.update(self.original_loss_mapping)
106-
107-
@pytest.mark.parametrize(
108-
"data_mapping, loss_mapping, should_raise",
109-
[
110-
(None, None, False), # Both None - valid
111-
({"camera0": "image"}, "MSE", False), # Both provided - valid
112-
(None, "MSE", True), # Only loss_mapping provided - invalid
113-
({"camera0": "image"}, None, True), # Only data_mapping provided - invalid
114-
],
115-
)
116-
def test_data_mapping_validation(self, data_mapping, loss_mapping, should_raise):
117-
"""Test that data_features_name_mapping and loss_type_mapping must be provided together."""
118-
if should_raise:
119-
with pytest.raises(
120-
ValueError,
121-
match="`data_features_name_mapping` and `loss_type_mapping` have to be provided together.",
122-
):
123-
DatasetConfig(
124-
repo_id="test_repo",
125-
data_features_name_mapping=data_mapping,
126-
loss_type_mapping=loss_mapping,
127-
)
128-
else:
129-
# Should not raise an error
130-
DatasetConfig(
131-
repo_id="test_repo", data_features_name_mapping=data_mapping, loss_type_mapping=loss_mapping
132-
)
133-
134-
def test_mapping_addition_to_global_dicts(self):
135-
"""Test that mappings are added to global dictionaries when both are provided."""
136-
test_repo_id = "test_custom_repo"
137-
test_data_mapping = {"camera0": "observation.image", "state": "observation.state"}
138-
test_loss_mapping = "MSE"
139-
140-
# Ensure the repo_id is not already in the mappings
141-
assert test_repo_id not in DATA_FEATURES_NAME_MAPPING
142-
assert test_repo_id not in LOSS_TYPE_MAPPING
143-
144-
# Create DatasetConfig with both mappings
145-
config = DatasetConfig( # noqa: F841
146-
repo_id=test_repo_id,
147-
data_features_name_mapping=test_data_mapping,
148-
loss_type_mapping=test_loss_mapping,
149-
)
150-
151-
# Check that mappings were added to global dictionaries
152-
assert test_repo_id in DATA_FEATURES_NAME_MAPPING
153-
assert test_repo_id in LOSS_TYPE_MAPPING
154-
assert DATA_FEATURES_NAME_MAPPING[test_repo_id] == test_data_mapping
155-
assert LOSS_TYPE_MAPPING[test_repo_id] == test_loss_mapping
156-
157-
def test_mapping_not_added_when_both_none(self):
158-
"""Test that mappings are not added to global dictionaries when both are None."""
159-
test_repo_id = "test_none_repo"
160-
161-
# Ensure the repo_id is not already in the mappings
162-
assert test_repo_id not in DATA_FEATURES_NAME_MAPPING
163-
assert test_repo_id not in LOSS_TYPE_MAPPING
164-
165-
# Create DatasetConfig with both mappings as None
166-
config = DatasetConfig(repo_id=test_repo_id, data_features_name_mapping=None, loss_type_mapping=None) # noqa: F841
167-
168-
# Check that mappings were not added to global dictionaries
169-
assert test_repo_id not in DATA_FEATURES_NAME_MAPPING
170-
assert test_repo_id not in LOSS_TYPE_MAPPING
171-
172-
def test_mapping_overwrites_existing(self):
173-
"""Test that providing mappings overwrites existing entries for the same repo_id."""
174-
test_repo_id = "test_overwrite_repo"
175-
original_data_mapping = {"old": "mapping"}
176-
original_loss_mapping = "CE"
177-
new_data_mapping = {"camera0": "observation.image", "state": "observation.state"}
178-
new_loss_mapping = "MSE"
179-
180-
# Add original mappings
181-
DATA_FEATURES_NAME_MAPPING[test_repo_id] = original_data_mapping
182-
LOSS_TYPE_MAPPING[test_repo_id] = original_loss_mapping
183-
184-
# Create DatasetConfig with new mappings
185-
config = DatasetConfig( # noqa: F841
186-
repo_id=test_repo_id,
187-
data_features_name_mapping=new_data_mapping,
188-
loss_type_mapping=new_loss_mapping,
189-
)
190-
191-
# Check that mappings were overwritten
192-
assert DATA_FEATURES_NAME_MAPPING[test_repo_id] == new_data_mapping
193-
assert LOSS_TYPE_MAPPING[test_repo_id] == new_loss_mapping
194-
assert DATA_FEATURES_NAME_MAPPING[test_repo_id] != original_data_mapping
195-
assert LOSS_TYPE_MAPPING[test_repo_id] != original_loss_mapping
196-
197-
def test_empty_mappings(self):
198-
"""Test behavior with empty mappings."""
199-
test_repo_id = "test_empty_repo"
200-
empty_data_mapping = {}
201-
test_loss_mapping = "MSE"
202-
203-
# Create DatasetConfig with empty data mapping
204-
config = DatasetConfig( # noqa: F841
205-
repo_id=test_repo_id,
206-
data_features_name_mapping=empty_data_mapping,
207-
loss_type_mapping=test_loss_mapping,
208-
)
209-
210-
# Check that empty mapping was added
211-
assert test_repo_id in DATA_FEATURES_NAME_MAPPING
212-
assert test_repo_id in LOSS_TYPE_MAPPING
213-
assert DATA_FEATURES_NAME_MAPPING[test_repo_id] == empty_data_mapping
214-
assert LOSS_TYPE_MAPPING[test_repo_id] == test_loss_mapping

tests/datasets/test_datasets.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,6 @@ def check_standard_data_format(item, delta_timestamps_params, dataset, train_pip
310310
("actions", (train_pipeline_config.action_chunk, train_pipeline_config.max_action_dim)),
311311
("prompt", None),
312312
("response", None),
313-
("loss_type", None),
314313
("img_is_pad", (train_pipeline_config.num_cams,)),
315314
("action_is_pad", (train_pipeline_config.action_chunk,)),
316315
]
@@ -329,7 +328,7 @@ def check_standard_data_format(item, delta_timestamps_params, dataset, train_pip
329328
assert item[key].shape == shape, f"{key}"
330329
elif key == "state" or key == "actions":
331330
assert item[key].shape == shape, f"{key}"
332-
elif key == "prompt" or key == "response" or key == "loss_type":
331+
elif key == "prompt" or key == "response":
333332
assert type(item[key]) is str, f"{key}"
334333
elif key == "img_is_pad" or key == "action_is_pad":
335334
assert item[key].shape == shape, f"{key}"

0 commit comments

Comments
 (0)