feat(export): Export XMLID in hybrid read mode

bosd · bosd · commit e94cc817775e · 2025-09-24T21:27:00.000+02:00
This commit enhances the `odoo-data-flow` tool to allow exporting the external XML ID of the main record while using the tool's fast `read` mode.

When a user requests both `.id` and `id` in the fields list, and the export is running in `read` mode, the tool will now perform a secondary enrichment step. After fetching the main data with numeric IDs, it makes another call to Odoo to retrieve the XML IDs (`module.name`) for those numeric IDs.

This is implemented using a Polars left join to merge the XML IDs back into the main DataFrame. The `id` column is then populated with the XML ID, while the `.id` column containing the numeric database ID is preserved in the final output. This allows users to have access to both the database ID and the external ID for transformation steps.

A comprehensive unit test has been added to verify this new functionality, including the case where a record may not have an XML ID.
diff --git a/noxfile.py b/noxfile.py
@@ -122,14 +122,15 @@ def precommit(session: nox.Session) -> None:
     session.run(
         "uv",
         "sync",
+        "--python",
+        session.python,
         "--group",
         "dev",
         "--group",
         "lint",
-        external=True,
     )
     session.install("pydoclint")
-    session.run("pre-commit", *args, external=True)
+    session.run("pre-commit", *args)
     if args and args[0] == "install":
         activate_virtualenv_in_precommit_hooks(session)
 
diff --git a/src/odoo_data_flow/export_threaded.py b/src/odoo_data_flow/export_threaded.py
@@ -408,6 +408,60 @@ def _clean_and_transform_batch(
     return casted_df.select(list(polars_schema.keys()))
 
 
+def _enrich_main_df_with_xml_ids(
+    df: pl.DataFrame, connection: Any, model_name: str
+) -> pl.DataFrame:
+    """Enriches a DataFrame with XML IDs for the main records.
+
+    This function takes a DataFrame containing a '.id' column with numeric
+    database IDs, fetches their corresponding external XML IDs from Odoo,
+    and uses them to populate the 'id' column, preserving the '.id' column.
+
+    Args:
+        df: The Polars DataFrame to enrich. Must contain an '.id' column.
+        connection: The active Odoo connection object.
+        model_name: The name of the Odoo model being exported.
+
+    Returns:
+        The enriched DataFrame with the 'id' column populated with XML IDs
+        and the '.id' column preserved.
+    """
+    if ".id" not in df.columns:
+        log.warning("'.id' column not found, cannot perform main XML ID enrichment.")
+        return df
+
+    db_ids = df.get_column(".id").unique().drop_nulls().to_list()
+    if not db_ids:
+        log.debug("No database IDs found to enrich; ensuring 'id' is empty.")
+        # Overwrite 'id' with nulls, keep '.id'
+        return df.with_columns(pl.lit(None, dtype=pl.String).alias("id"))
+
+    log.info(f"Fetching XML IDs for {len(db_ids)} main records...")
+    ir_model_data = connection.get_model("ir.model.data")
+    xml_id_data = ir_model_data.search_read(
+        [("model", "=", model_name), ("res_id", "in", db_ids)],
+        ["res_id", "module", "name"],
+        context={"active_test": False},
+    )
+
+    if not xml_id_data:
+        log.warning(f"No XML IDs found for the exported {model_name} records.")
+        return df.with_columns(pl.lit(None, dtype=pl.String).alias("id"))
+
+    df_xml_ids = (
+        pl.from_dicts(xml_id_data)
+        .with_columns(
+            pl.format("{}.{}", pl.col("module"), pl.col("name")).alias("xml_id")
+        )
+        .select(pl.col("res_id").cast(pl.Int64), "xml_id")
+        .unique(subset=["res_id"], keep="first")
+    )
+
+    # Left-join the XML IDs on '.id', overwrite 'id', and drop the helper column.
+    df_enriched = df.join(df_xml_ids, left_on=".id", right_on="res_id", how="left")
+    return df_enriched.with_columns(pl.col("xml_id").alias("id")).drop("xml_id")
+
+
 def _process_export_batches(  # noqa: C901
     rpc_thread: "RPCThreadExport",
     total_ids: int,
@@ -419,6 +473,7 @@ def _process_export_batches(  # noqa: C901
     session_dir: Optional[Path],
     is_resuming: bool,
     encoding: str,
+    enrich_main_xml_id: bool = False,
 ) -> Optional[pl.DataFrame]:
     """Processes exported batches.
 
@@ -474,6 +529,11 @@ def _process_export_batches(  # noqa: C901
                         df, field_types, polars_schema
                     )
 
+                    if enrich_main_xml_id:
+                        final_batch_df = _enrich_main_df_with_xml_ids(
+                            final_batch_df, rpc_thread.connection, model_name
+                        )
+
                     if output and streaming:
                         if not header_written:
                             if is_resuming:
@@ -521,6 +581,11 @@ def _process_export_batches(  # noqa: C901
         return None
     if not all_cleaned_dfs:
         log.warning("No data was returned from the export.")
+        # Adjust schema for empty DataFrame if enrichment was active
+        if enrich_main_xml_id:
+            # The .id column keeps its Int64 type. The id column would also be
+            # Int64, but enrichment stores XML ID strings in it, so use String.
+            polars_schema["id"] = pl.String()
         empty_df = pl.DataFrame(schema=polars_schema)
         if output:
             if is_resuming:
@@ -557,6 +622,7 @@ def _determine_export_strategy(
     Optional[dict[str, dict[str, Any]]],
     bool,
     bool,
+    bool,
 ]:
     """Perform pre-flight checks and determine the best export strategy."""
     preliminary_read_mode = technical_names or any(
@@ -567,7 +633,7 @@ def _determine_export_strategy(
     )
 
     if not model_obj or not fields_info:
-        return None, None, None, False, False
+        return None, None, None, False, False, False
 
     has_read_specifiers = any(f.endswith("/.id") or f == ".id" for f in header)
     has_xml_id_specifiers = any(f.endswith("/id") for f in header)
@@ -586,7 +652,7 @@ def _determine_export_strategy(
             f"(e.g., {invalid_fields}) is not supported in hybrid mode. "
             "Only 'field/id' is allowed for enrichment."
         )
-        return None, None, None, False, False
+        return None, None, None, False, False, False
 
     technical_types = {"selection", "binary"}
     has_technical_fields = any(
@@ -597,7 +663,15 @@ def _determine_export_strategy(
         technical_names or has_read_specifiers or is_hybrid or has_technical_fields
     )
 
-    if is_hybrid:
+    # --- New logic for main record XML ID enrichment ---
+    enrich_main_xml_id = ".id" in header and "id" in header and force_read_method
+
+    if enrich_main_xml_id:
+        log.info(
+            "Main record XML ID enrichment activated. "
+            "'.id' will be used to fetch and populate 'id'."
+        )
+    elif is_hybrid:
         log.info("Hybrid export mode activated. Using 'read' with XML ID enrichment.")
     elif has_technical_fields:
         log.info("Read method auto-enabled for 'selection' or 'binary' fields.")
@@ -613,9 +687,16 @@ def _determine_export_strategy(
                 f"Mixing export-style specifiers {invalid_fields} "
                 f"is not supported in pure 'read' mode."
             )
-            return None, None, None, False, False
-
-    return connection, model_obj, fields_info, force_read_method, is_hybrid
+            return None, None, None, False, False, False
+
+    return (
+        connection,
+        model_obj,
+        fields_info,
+        force_read_method,
+        is_hybrid,
+        enrich_main_xml_id,
+    )
 
 
 def _resume_existing_session(
@@ -692,9 +773,14 @@ def export_data(
     if not session_dir:
         return False, session_id, 0, None
 
-    connection, model_obj, fields_info, force_read_method, is_hybrid = (
-        _determine_export_strategy(config, model, header, technical_names)
-    )
+    (
+        connection,
+        model_obj,
+        fields_info,
+        force_read_method,
+        is_hybrid,
+        enrich_main_xml_id,
+    ) = _determine_export_strategy(config, model, header, technical_names)
     if not connection or not model_obj or not fields_info:
         return False, session_id, 0, None
 
@@ -747,6 +833,7 @@ def export_data(
         session_dir=session_dir,
         is_resuming=is_resuming,
         encoding=encoding,
+        enrich_main_xml_id=enrich_main_xml_id,
     )
 
     # --- Finalization and Cleanup ---
diff --git a/tests/test_export_threaded.py b/tests/test_export_threaded.py
@@ -548,6 +548,7 @@ def test_export_data_streaming_no_output(
             {"name": {"type": "char"}},
             False,
             False,
+            False,
         )
 
         success, _, _, result_df = export_data(
@@ -736,50 +737,6 @@ def test_export_hybrid_mode_success(self, mock_conf_lib: MagicMock) -> None:
         )
         assert_frame_equal(result_df, expected_df)
 
-    def test_export_id_and_dot_id_in_read_mode(self, mock_conf_lib: MagicMock) -> None:
-        """Test the read mode.
-
-        Tests that in read() mode, both 'id' and '.id' correctly resolve
-        to the integer database ID.
-        """
-        # --- Arrange ---
-        header = [".id", "id", "name"]
-        mock_model = mock_conf_lib.return_value.get_model.return_value
-        mock_model.search.return_value = [101, 102]
-        mock_model.read.return_value = [
-            {"id": 101, "name": "Record 101"},
-            {"id": 102, "name": "Record 102"},
-        ]
-        mock_model.fields_get.return_value = {
-            "id": {"type": "integer"},
-            "name": {"type": "char"},
-        }
-
-        # --- Act ---
-        _, _, _, result_df = export_data(
-            config="dummy.conf",
-            model="res.partner",
-            domain=[],
-            header=header,
-            output=None,
-            technical_names=True,
-        )
-
-        # --- Assert ---
-        assert result_df is not None
-
-        # *** FIX ***: Use the 'schema' argument to define dtypes on creation.
-        expected_df = pl.DataFrame(
-            {
-                ".id": [101, 102],
-                "id": [101, 102],
-                "name": ["Record 101", "Record 102"],
-            },
-            schema={".id": pl.Int64, "id": pl.Int64, "name": pl.String},
-        )
-
-        assert_frame_equal(result_df, expected_df)
-
     def test_export_id_in_export_data_mode(self, mock_conf_lib: MagicMock) -> None:
         """Test export id in export data.
 
@@ -983,3 +940,69 @@ def test_process_export_batches_handles_inconsistent_schemas(
         )
         final_df = final_df.sort("id")
         assert_frame_equal(final_df, expected_df)
+
+    def test_export_main_record_xml_id_enrichment(
+        self, mock_conf_lib: MagicMock
+    ) -> None:
+        """Test main record xml id.
+
+        Tests that when '.id' and 'id' are requested, the 'id' column is
+        enriched with the main record's XML ID.
+        """
+        # --- Arrange ---
+        header = [".id", "id", "name"]
+        mock_model = mock_conf_lib.return_value.get_model.return_value
+        mock_model.search.return_value = [1, 2]
+
+        # 1. Mock the primary read() call which returns numeric IDs
+        mock_model.read.return_value = [
+            {"id": 1, "name": "Partner A"},
+            {"id": 2, "name": "Partner B"},
+        ]
+
+        # 2. Mock the metadata call
+        mock_model.fields_get.return_value = {
+            "id": {"type": "integer"},
+            ".id": {"type": "integer"},
+            "name": {"type": "char"},
+        }
+
+        # 3. Mock the secondary XML ID lookup on 'ir.model.data'
+        # Note: Partner B (id=2) does not have an XML ID to test the null case.
+        mock_ir_model_data = MagicMock()
+        mock_ir_model_data.search_read.return_value = [
+            {"res_id": 1, "module": "base", "name": "partner_a_xmlid"}
+        ]
+
+        # Make get_model return the main model first, then the ir.model.data mock
+        # This needs to be reset for each test that uses it this way.
+        mock_conf_lib.return_value.get_model.side_effect = [
+            mock_model,
+            mock_ir_model_data,
+        ]
+
+        # --- Act ---
+        success, _, _, result_df = export_data(
+            config="dummy.conf",
+            model="res.partner",
+            domain=[],
+            header=header,
+            output=None,
+        )
+
+        # --- Assert ---
+        assert success is True
+        assert result_df is not None
+
+        # The '.id' column should be preserved, and 'id' should be the XML ID
+        expected_df = pl.DataFrame(
+            {
+                ".id": [1, 2],
+                "id": ["base.partner_a_xmlid", None],
+                "name": ["Partner A", "Partner B"],
+            },
+            schema={".id": pl.Int64, "id": pl.String, "name": pl.String},
+        )
+
+        # Sort by name to ensure consistent order for comparison
+        assert_frame_equal(result_df.sort("name"), expected_df.sort("name"))