@@ -737,7 +737,12 @@ def __init__(
 
         logger.info("Initialization complete.")
 
-    def _get_single_slice_contraction_fn(self) -> Callable[[Any, Tensor, int], Tensor]:
+    def _get_single_slice_contraction_fn(
+        self, op: Optional[Callable[[Tensor], Tensor]] = None
+    ) -> Callable[[Any, Tensor, int], Tensor]:
+        if op is None:
+            op = backend.sum
+
         def single_slice_contraction(
             tree: ctg.ContractionTree, params: Tensor, slice_idx: int
         ) -> Tensor:
@@ -746,16 +751,25 @@ def single_slice_contraction(
             input_arrays = [node.tensor for node in standardized_nodes]
             sliced_arrays = tree.slice_arrays(input_arrays, slice_idx)
             result = tree.contract_core(sliced_arrays, backend=self._backend)
-            return backend.sum(backend.real(result))
+            return op(result)
 
         return single_slice_contraction
 
     def _get_device_sum_vg_fn(
         self,
+        op: Optional[Callable[[Tensor], Tensor]] = None,
+        output_dtype: Optional[str] = None,
     ) -> Callable[[Any, Tensor, Tensor], Tuple[Tensor, Tensor]]:
-        base_fn = self._get_single_slice_contraction_fn()
+        post_processing = lambda x: backend.real(backend.sum(x))
+        if op is None:
+            op = post_processing
+        base_fn = self._get_single_slice_contraction_fn(op=op)
+        # ensure the output is real so that it can be differentiated
         single_slice_vg_fn = jaxlib.value_and_grad(base_fn, argnums=1)
 
+        if output_dtype is None:
+            output_dtype = rdtypestr
+
         def device_sum_fn(
             tree: ctg.ContractionTree, params: Tensor, slice_indices_for_device: Tensor
         ) -> Tuple[Tensor, Tensor]:
@@ -785,7 +799,7 @@ def do_nothing() -> Tuple[Tensor, Tensor]:
             )
 
             initial_carry = (
-                backend.cast(backend.convert_to_tensor(0.0), dtype=rdtypestr),
+                backend.cast(backend.convert_to_tensor(0.0), dtype=output_dtype),
                 jaxlib.tree_util.tree_map(lambda x: jaxlib.numpy.zeros_like(x), params),
             )
             (final_value, final_grads), _ = jaxlib.lax.scan(
@@ -795,21 +809,14 @@ def do_nothing() -> Tuple[Tensor, Tensor]:
             )
             return final_value, final_grads
 
         return device_sum_fn
 
-    def _compile_value_and_grad(self) -> None:
-        if self._compiled_vg_fn is not None:
-            return
-        device_sum_fn = self._get_device_sum_vg_fn()
-        # `tree` is arg 0, `params` is arg 1, `indices` is arg 2
-        # `tree` is static and broadcast to all devices
-        self._compiled_vg_fn = jaxlib.pmap(
-            device_sum_fn,
-            in_axes=(None, None, 0),  # tree: broadcast, params: broadcast, indices: map
-            static_broadcasted_argnums=(0,),  # arg 0 (tree) is a static argument
-            devices=self.devices,
-        )
-
-    def _get_device_sum_v_fn(self) -> Callable[[Any, Tensor, Tensor], Tensor]:
-        base_fn = self._get_single_slice_contraction_fn()
+    def _get_device_sum_v_fn(
+        self,
+        op: Optional[Callable[[Tensor], Tensor]] = None,
+        output_dtype: Optional[str] = None,
+    ) -> Callable[[Any, Tensor, Tensor], Tensor]:
+        base_fn = self._get_single_slice_contraction_fn(op=op)
+        if output_dtype is None:
+            output_dtype = dtypestr
 
         def device_sum_fn(
             tree: ctg.ContractionTree, params: Tensor, slice_indices_for_device: Tensor
@@ -828,7 +835,7 @@ def compute_and_add() -> Tensor:
             )
 
             initial_carry = backend.cast(
-                backend.convert_to_tensor(0.0), dtype=rdtypestr
+                backend.convert_to_tensor(0.0), dtype=output_dtype
             )
             final_value, _ = jaxlib.lax.scan(
                 scan_body, initial_carry, slice_indices_for_device
@@ -837,22 +844,28 @@ def compute_and_add() -> Tensor:
             )
             return final_value
 
         return device_sum_fn
 
-    def _compile_value(self) -> None:
-        if self._compiled_v_fn is not None:
-            return
-        device_sum_fn = self._get_device_sum_v_fn()
-        self._compiled_v_fn = jaxlib.pmap(
-            device_sum_fn,
-            in_axes=(None, None, 0),
-            static_broadcasted_argnums=(0,),
-            devices=self.devices,
-        )
-
     # --- Public API ---
     def value_and_grad(
-        self, params: Tensor, aggregate: bool = True
+        self,
+        params: Tensor,
+        aggregate: bool = True,
+        op: Optional[Callable[[Tensor], Tensor]] = None,
+        output_dtype: Optional[str] = None,
     ) -> Tuple[Tensor, Tensor]:
-        self._compile_value_and_grad()
+        if self._compiled_vg_fn is None:
+            device_sum_fn = self._get_device_sum_vg_fn(op=op, output_dtype=output_dtype)
+            # `tree` is arg 0, `params` is arg 1, `indices` is arg 2
+            # `tree` is static and broadcast to all devices
+            self._compiled_vg_fn = jaxlib.pmap(
+                device_sum_fn,
+                in_axes=(
+                    None,
+                    None,
+                    0,
+                ),  # tree: broadcast, params: broadcast, indices: map
+                static_broadcasted_argnums=(0,),  # arg 0 (tree) is a static argument
+                devices=self.devices,
+            )
         # Pass `self.tree` as the first argument
         device_values, device_grads = self._compiled_vg_fn(  # type: ignore
             self.tree, params, self.batched_slice_indices
@@ -865,15 +878,36 @@ def value_and_grad(
             return total_value, total_grad
         return device_values, device_grads
 
-    def value(self, params: Tensor, aggregate: bool = True) -> Tensor:
-        self._compile_value()
+    def value(
+        self,
+        params: Tensor,
+        aggregate: bool = True,
+        op: Optional[Callable[[Tensor], Tensor]] = None,
+        output_dtype: Optional[str] = None,
+    ) -> Tensor:
+        if self._compiled_v_fn is None:
+            device_sum_fn = self._get_device_sum_v_fn(op=op, output_dtype=output_dtype)
+            self._compiled_v_fn = jaxlib.pmap(
+                device_sum_fn,
+                in_axes=(None, None, 0),
+                static_broadcasted_argnums=(0,),
+                devices=self.devices,
+            )
         device_values = self._compiled_v_fn(  # type: ignore
             self.tree, params, self.batched_slice_indices
         )
         if aggregate:
             return backend.sum(device_values)
         return device_values
 
-    def grad(self, params: Tensor, aggregate: bool = True) -> Tensor:
-        _, grad = self.value_and_grad(params, aggregate=aggregate)
+    def grad(
+        self,
+        params: Tensor,
+        aggregate: bool = True,
+        op: Optional[Callable[[Tensor], Tensor]] = None,
+        output_dtype: Optional[str] = None,
+    ) -> Tensor:
+        _, grad = self.value_and_grad(
+            params, aggregate=aggregate, op=op, output_dtype=output_dtype
+        )
         return grad
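
Net effect of the diff: the separate _compile_value_and_grad / _compile_value helpers are folded into the public methods, and value, grad, and value_and_grad gain two optional knobs. op post-processes each sliced contraction result before accumulation, defaulting to backend.sum on the value path and to backend.real(backend.sum(x)) on the gradient path (jaxlib.value_and_grad requires a real scalar output). output_dtype sets the dtype of the lax.scan carry, defaulting to rdtypestr for gradients and dtypestr for plain values. A minimal usage sketch, assuming `contractor` is an initialized instance of the class patched here and `params` its parameter tensors (both names are hypothetical; neither appears in the diff):

    # Hypothetical names: `contractor` is an instance of the sliced-contraction
    # class modified above; `params` are its input tensors.

    # Default path: per-slice results are summed; the gradient path wraps them
    # as real(sum(.)) so the differentiated output is a real scalar.
    value, grads = contractor.value_and_grad(params)

    # Custom reduction: `op` is applied to every sliced contraction result and
    # `output_dtype` types the scan accumulator ("float32" is an assumed
    # backend dtype string, analogous to rdtypestr/dtypestr above).
    real_value = contractor.value(
        params,
        op=lambda x: backend.real(backend.sum(x)),
        output_dtype="float32",
    )

    # With aggregate=False, the per-device partial sums are returned instead
    # of being reduced across devices.
    device_values, device_grads = contractor.value_and_grad(params, aggregate=False)

One caveat visible in the new control flow: the pmapped function is built only when _compiled_vg_fn / _compiled_v_fn is None, so op and output_dtype passed on later calls are silently ignored once compilation has happened.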