Commit 4a53d86

multi host support
1 parent 0f89bb7 commit 4a53d86

File tree

3 files changed: +360 -57 lines changed


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 
 - Add `su4` as a generic parameterized two-qubit gate.
 
+- Add multi-controller jax support for distributed contraction.
+
 ### Fixed
 
 - Fix the breaking logic change in jax from dlpack API, dlcapsule -> tensor.
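
For orientation, a minimal sketch of driving the distributed contraction API announced above. The names `DistributedContractor`, `value_and_grad`, and the `cotengra_options` keys are taken from the example file added in this commit; the plain single-process setting, toy circuit, parameter values, and reduced option set are illustrative assumptions, not part of the commit.

import jax
import jax.numpy as jnp
from jax.sharding import Mesh

import tensorcircuit as tc
from tensorcircuit.experimental import DistributedContractor

tc.set_backend("jax")


def nodes_fn(params):
    # Tensor network whose contraction gives <psi|psi> for a tiny ansatz.
    c = tc.Circuit(4)
    for j in range(4):
        c.rx(j, theta=params[j])
    psi = c.get_quvector()
    return (psi.adjoint() @ psi).nodes


params = jnp.ones([4])
mesh = Mesh(jax.devices(), axis_names=("devices",))
dc = DistributedContractor(
    nodes_fn=nodes_fn,
    params=params,
    mesh=mesh,
    cotengra_options={"max_repeats": 16, "minimize": "write", "progbar": False},
)
value, grads = dc.value_and_grad(params)  # sliced, device-distributed contraction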

examples/multihost_vqe.py

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
import os
import time
import argparse
import logging

import jax
import jax.distributed
import numpy as np
import optax
import tensornetwork as tn
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

import tensorcircuit as tc
from tensorcircuit.experimental import DistributedContractor, broadcast_py_object


# --- Static Configuration ---
NUM_DEVICES_TOTAL = 4
os.environ["XLA_FLAGS"] = f"--xla_force_host_platform_device_count={NUM_DEVICES_TOTAL}"
# delete the above fake lines when using GPU devices
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

K = tc.set_backend("jax")
tc.set_dtype("complex64")

N_QUBITS = 10
DEPTH = 4


def circuit_ansatz(n, d, params):
    c = tc.Circuit(n)
    c.h(range(n))
    for i in range(d):
        for j in range(0, n - 1):
            c.rzz(j, j + 1, theta=params[j, i, 0])
        for j in range(n):
            c.rx(j, theta=params[j, i, 1])
        for j in range(n):
            c.ry(j, theta=params[j, i, 2])
    return c


def get_tfi_mpo(n):
    Jx = np.ones(n - 1)
    Bz = -1.0 * np.ones(n)
    tn_mpo = tn.matrixproductstates.mpo.FiniteTFI(Jx, Bz, dtype=np.complex64)
    return tc.quantum.tn2qop(tn_mpo)


def get_nodes_fn(n, d, mpo):
    def nodes_fn(params):
        psi = circuit_ansatz(n, d, params).get_quvector()
        expression = psi.adjoint() @ mpo @ psi
        return expression.nodes

    return nodes_fn


def run_vqe_main(coordinator_address: str, num_processes: int, process_id: int):
    """
    Main logic run by ALL processes.
    """
    jax.distributed.initialize(
        coordinator_address=coordinator_address,
        num_processes=num_processes,
        process_id=process_id,
    )
    print(
        f"[Process {process_id}] Initialized. jax.process_index() reports: {jax.process_index()}"
    )

    global_mesh = Mesh(jax.devices(), axis_names=("devices",))
    if jax.process_index() == 0:
        print(f"--- Global mesh created with devices: {global_mesh.devices}")

    tfi_mpo = get_tfi_mpo(N_QUBITS)
    nodes_fn = get_nodes_fn(N_QUBITS, DEPTH, tfi_mpo)
    params_shape = [N_QUBITS, DEPTH, 3]

    # --- KEY CHANGE: Create params on host 0 and broadcast to all others ---
    params_cpu = None
    if jax.process_index() == 0:
        key = jax.random.PRNGKey(42)
        params_cpu = (
            jax.random.normal(key, shape=params_shape, dtype=tc.rdtypestr) * 0.1
        )

    # Broadcast the CPU array. Now all processes have a concrete `params_cpu`.
    # This is CRITICAL to prevent the NoneType error upon contractor initialization.
    params_cpu = broadcast_py_object(params_cpu)

    # Now that all processes have `params_cpu`, we can initialize the contractor safely.
    # The contractor will use this concrete array to run its (now internal)
    # "find path on 0 and broadcast" logic.
    DC = DistributedContractor(
        nodes_fn=nodes_fn,
        params=params_cpu,
        mesh=global_mesh,
        cotengra_options={
            "slicing_reconf_opts": {"target_size": 2**8},
            "max_repeats": 16,
            "progbar": True,
            "minimize": "write",
            "parallel": 4,
        },
    )

    # Shard the parameters onto devices for the actual GPU/TPU computation.
    params_sharding = NamedSharding(global_mesh, P(*([None] * len(params_shape))))
    params = jax.device_put(params_cpu, params_sharding)

    # Initialize the optimizer and its state.
    optimizer = optax.adam(2e-2)
    opt_state = optimizer.init(params)  # Can init directly with sharded params

    @jax.jit
    def opt_update(params, opt_state, grads):
        updates, new_opt_state = optimizer.update(grads, opt_state, params)
        new_params = optax.apply_updates(params, updates)
        return new_params, new_opt_state

    # Run the optimization loop.
    n_steps = 100
    if jax.process_index() == 0:
        print("\nStarting VQE optimization loop...")

    for i in range(n_steps):
        t0 = time.time()
        loss, grads = DC.value_and_grad(params)
        params, opt_state = opt_update(params, opt_state, grads)
        t1 = time.time()

        if jax.process_index() == 0:
            print(f"Step {i+1:03d} | " f"Loss: {loss:.8f} | " f"Time: {t1 - t0:.4f} s")

    jax.distributed.shutdown()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="JAX Multi-Host VQE Simulation")
    parser.add_argument(
        "--process_id",
        type=int,
        required=True,
        help="Rank of the current process (e.g., 0, 1).",
    )
    parser.add_argument(
        "--num_processes", type=int, default=2, help="Total number of processes."
    )
    parser.add_argument(
        "--coordinator_address",
        type=str,
        default="127.0.0.1:8888",
        help="IP address and port of the coordinator (process 0).",
    )
    args = parser.parse_args()

    print(f"--- Starting Process {args.process_id}/{args.num_processes} ---")

    if args.process_id == 0:
        print(
            "\n>>> This is the coordinator process. Waiting for other processes to connect."
        )

    run_vqe_main(
        coordinator_address=args.coordinator_address,
        num_processes=args.num_processes,
        process_id=args.process_id,
    )

# 5090: CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 python multihost_vqe.py --process_id=0
# CUDA_VISIBLE_DEVICES=2,3 NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 python multihost_vqe.py --process_id=1
# H200: no need to disable P2P and SHM due to well-configured NVLink
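# A possible launch across two separate machines (an illustrative sketch, not part of this commit;
# the host names and reachable coordinator address below are assumptions):
#   host A (coordinator, process 0):
#     python multihost_vqe.py --process_id=0 --num_processes=2 --coordinator_address=hostA:8888
#   host B (process 1):
#     python multihost_vqe.py --process_id=1 --num_processes=2 --coordinator_address=hostA:8888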
