@@ -690,6 +690,14 @@ def __init__(
 
         self._params_template = params
         self._backend = "jax"
+        self._compiled_v_fns: Dict[
+            Tuple[Callable[[Tensor], Tensor], str],
+            Callable[[Any, Tensor, Tensor], Tensor],
+        ] = {}
+        self._compiled_vg_fns: Dict[
+            Tuple[Callable[[Tensor], Tensor], str],
+            Callable[[Any, Tensor, Tensor], Tensor],
+        ] = {}
 
         logger.info("Running cotengra pathfinder... (This may take a while)")
         nodes = self.nodes_fn(self._params_template)
@@ -844,20 +852,29 @@ def compute_and_add() -> Tensor:
 
         return device_sum_fn
 
-    # --- Public API ---
-    def value_and_grad(
+    def _get_or_compile_fn(
         self,
-        params: Tensor,
-        aggregate: bool = True,
-        op: Optional[Callable[[Tensor], Tensor]] = None,
-        output_dtype: Optional[str] = None,
-    ) -> Tuple[Tensor, Tensor]:
-        if self._compiled_vg_fn is None:
-            device_sum_fn = self._get_device_sum_vg_fn(op=op, output_dtype=output_dtype)
-            # `tree` is arg 0, `params` is arg 1, `indices` is arg 2
-            # `tree` is static and broadcast to all devices
-            self._compiled_vg_fn = jaxlib.pmap(
-                device_sum_fn,
+        cache: Dict[
+            Tuple[Callable[[Tensor], Tensor], str],
+            Callable[[Any, Tensor, Tensor], Tensor],
+        ],
+        fn_getter: Callable[..., Any],
+        op: Optional[Callable[[Tensor], Tensor]],
+        output_dtype: Optional[str],
+    ) -> Callable[[Any, Tensor, Tensor], Tensor]:
+        """
+        Get a compiled pmap-ed function from the cache, or compile and cache it on a miss.
+
+        The cache key is (op, output_dtype); a fresh lambda passed as `op` is a new object on every call and will always miss the cache.
+
+        Returns:
+            The compiled, pmap-ed JAX function.
+        """
+        cache_key = (op, output_dtype)
+        if cache_key not in cache:
+            device_fn = fn_getter(op=op, output_dtype=output_dtype)
+            compiled_fn = jaxlib.pmap(
+                device_fn,
                 in_axes=(
                     None,
                     None,
@@ -866,10 +883,39 @@ def value_and_grad(
                 static_broadcasted_argnums=(0,),  # arg 0 (tree) is a static argument
                 devices=self.devices,
             )
-        # Pass `self.tree` as the first argument
-        device_values, device_grads = self._compiled_vg_fn(  # type: ignore
+            cache[cache_key] = compiled_fn  # type: ignore
+        return cache[cache_key]  # type: ignore
+
+    def value_and_grad(
+        self,
+        params: Tensor,
+        aggregate: bool = True,
+        op: Optional[Callable[[Tensor], Tensor]] = None,
+        output_dtype: Optional[str] = None,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Calculate the value and gradient, compiling and caching the pmap-ed function on the first call for each (op, output_dtype) pair.
+
+        :param params: Parameters for the `nodes_fn` input
+        :type params: Tensor
+        :param aggregate: Whether to aggregate (sum) the results across devices, defaults to True
+        :type aggregate: bool, optional
+        :param op: Optional post-processing function for the output, defaults to None (corresponding to `backend.real`)
+        :type op: Optional[Callable[[Tensor], Tensor]], optional
+        :param output_dtype: dtype str for the output of `nodes_fn`, defaults to None (corresponding to `rdtypestr`)
+        :type output_dtype: Optional[str], optional
+        """
+        compiled_vg_fn = self._get_or_compile_fn(
+            cache=self._compiled_vg_fns,
+            fn_getter=self._get_device_sum_vg_fn,
+            op=op,
+            output_dtype=output_dtype,
+        )
+
+        device_values, device_grads = compiled_vg_fn(
             self.tree, params, self.batched_slice_indices
         )
+
         if aggregate:
             total_value = backend.sum(device_values)
             total_grad = jaxlib.tree_util.tree_map(
@@ -885,17 +931,27 @@ def value(
         op: Optional[Callable[[Tensor], Tensor]] = None,
         output_dtype: Optional[str] = None,
     ) -> Tensor:
-        if self._compiled_v_fn is None:
-            device_sum_fn = self._get_device_sum_v_fn(op=op, output_dtype=output_dtype)
-            self._compiled_v_fn = jaxlib.pmap(
-                device_sum_fn,
-                in_axes=(None, None, 0),
-                static_broadcasted_argnums=(0,),
-                devices=self.devices,
-            )
-        device_values = self._compiled_v_fn(  # type: ignore
-            self.tree, params, self.batched_slice_indices
+        """
+        Calculate the value, compiling and caching the pmap-ed function on the first call for each (op, output_dtype) pair.
+
+        :param params: Parameters for the `nodes_fn` input
+        :type params: Tensor
+        :param aggregate: Whether to aggregate (sum) the results across devices, defaults to True
+        :type aggregate: bool, optional
+        :param op: Optional post-processing function for the output, defaults to None (corresponding to identity)
+        :type op: Optional[Callable[[Tensor], Tensor]], optional
+        :param output_dtype: dtype str for the output of `nodes_fn`, defaults to None (corresponding to `dtypestr`)
+        :type output_dtype: Optional[str], optional
+        """
+        compiled_v_fn = self._get_or_compile_fn(
+            cache=self._compiled_v_fns,
+            fn_getter=self._get_device_sum_v_fn,
+            op=op,
+            output_dtype=output_dtype,
         )
+
+        device_values = compiled_v_fn(self.tree, params, self.batched_slice_indices)
+
         if aggregate:
             return backend.sum(device_values)
         return device_values
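
Not part of the diff: a minimal, self-contained sketch of the caching pattern that `_get_or_compile_fn` introduces, and of the lambda caveat in its docstring. Here `make_device_fn`, `get_or_compile`, and the module-level `cache` dict are illustrative stand-ins for the class's `_get_device_sum_v_fn` / `_get_device_sum_vg_fn` builders and `_compiled_*_fns` attributes, and `jax.jit` stands in for `jaxlib.pmap` so the sketch runs on a single device.

```python
from typing import Any, Callable, Dict, Optional, Tuple

import jax
import jax.numpy as jnp


def make_device_fn(
    op: Optional[Callable[[jnp.ndarray], jnp.ndarray]],
    output_dtype: Optional[str],
) -> Callable[[jnp.ndarray], jnp.ndarray]:
    # Stand-in for the per-device function builders: returns a fresh function
    # for a given post-processing op and output dtype.
    post = op if op is not None else (lambda x: x)

    def device_fn(x: jnp.ndarray) -> jnp.ndarray:
        y = post(jnp.sum(x))
        return y.astype(output_dtype) if output_dtype else y

    return device_fn


# Same shape as the new _compiled_v_fns / _compiled_vg_fns: keyed by (op, output_dtype).
cache: Dict[Tuple[Optional[Callable[..., Any]], Optional[str]], Callable[..., Any]] = {}


def get_or_compile(op, output_dtype):
    key = (op, output_dtype)
    if key not in cache:  # compile once per distinct (op, output_dtype) pair
        cache[key] = jax.jit(make_device_fn(op, output_dtype))
    return cache[key]


def real_op(x):
    return jnp.real(x)


x = jnp.ones(4)
f1 = get_or_compile(real_op, "float32")
f2 = get_or_compile(real_op, "float32")  # cache hit: same compiled function object
g1 = get_or_compile(lambda t: jnp.real(t), "float32")  # fresh lambda -> new key -> recompile
g2 = get_or_compile(lambda t: jnp.real(t), "float32")  # another fresh lambda -> recompile again
print(f1 is f2, g1 is g2)  # True False
print(f1(x))  # 4.0
```

Keying the cache on the (op, output_dtype) pair, instead of a single `_compiled_v_fn` / `_compiled_vg_fn` attribute as before, also means later calls with a different post-processing op or dtype no longer silently reuse a function compiled for the first call's arguments.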
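
Likewise not from the diff: a toy, single-host sketch of the unchanged pmap argument layout that the cached functions wrap (arg 0 is static and broadcast, arg 1 is broadcast, arg 2 is split across devices along axis 0), and of what `aggregate=True` corresponds to. The `device_sum` function and its data are made up for illustration; the real per-device functions operate on `self.tree`, `params`, and `self.batched_slice_indices`.

```python
import jax
import jax.numpy as jnp

n_dev = jax.local_device_count()


def device_sum(scale, params, indices):
    # Toy per-device computation: sum a slice of params, scaled by a static constant.
    return scale * jnp.sum(params[indices])


pmapped = jax.pmap(
    device_sum,
    # Mirrors the diff: arg 0 and arg 1 are broadcast, arg 2 is split over devices.
    in_axes=(None, None, 0),
    static_broadcasted_argnums=(0,),  # arg 0 must be hashable; it is a static argument
)

params = jnp.arange(float(2 * n_dev))
batched_indices = jnp.arange(2 * n_dev).reshape(n_dev, 2)  # one batch of slice indices per device

device_values = pmapped(2.0, params, batched_indices)  # shape (n_dev,): one value per device
total = jnp.sum(device_values)  # aggregate=True in the diff corresponds to this sum
print(device_values.shape, total)
```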