Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/source/cli_workflows.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ Example completed result:
.. code-block:: text

INFO - Polling job...
BSEQResult(largest_connected_size=100, fraction_connected=0.7874)
BSEQResult(largest_connected_size=BenchmarkScore(value=100.0, uncertainty=3.5))

The ``largest_connected_size`` metric now carries a ``BenchmarkScore`` so the CLI reports values and Monte Carlo uncertainties side by side.

If the job is still queued, the CLI reports the current queue position and asks you to try again later.

Expand Down
68 changes: 49 additions & 19 deletions metriq_gym/benchmarks/bseq.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,56 @@
""" "Bell state effective qubits" BSEQ benchmark for the Metriq Gym
(credit to Paul Nation for the original code for IBM devices).

This benchmark evaluates a quantum device's ability to produce entangled states and measure correlations that violate
the CHSH inequality. The violation of this inequality indicates successful entanglement between qubits.
"""BSEQ benchmark (Bell state effective qubits) for the Metriq Gym.

The benchmark evaluates a device's ability to produce entangled pairs that violate
the CHSH inequality. For every qubit pair we estimate the CHSH mean and uncertainty;
edges whose mean exceeds the violation threshold are treated as successfully entangled.
The reported ``largest_connected_size`` metric equals the size of the largest connected
component built from those successful edges, while the uncertainty is obtained via a
bootstrap: we resample every edge from its mean/std Gaussian, rebuild the active graph,
and compute the largest connected component. The standard deviation across those
Monte Carlo draws becomes the error bar returned in the ``BenchmarkScore``.

(Credit to Paul Nation for the original IBM-oriented implementation.)
"""

from dataclasses import dataclass
from typing import TYPE_CHECKING

import networkx as nx
import rustworkx as rx
import numpy as np
import rustworkx as rx

from qiskit import QuantumCircuit
from qiskit.result import marginal_counts, sampled_expectation_value

from pydantic import Field
from metriq_gym.benchmarks.benchmark import (
Benchmark,
BenchmarkData,
BenchmarkResult,
BenchmarkScore,
MetricDirection,
)
from metriq_gym.helpers.task_helpers import flatten_counts
from pydantic import Field
from metriq_gym.helpers.graph_helpers import (
GraphColoring,
device_graph_coloring,
largest_connected_size,
)
from metriq_gym.helpers.statistics import bootstrap_largest_component_stddev
from metriq_gym.helpers.task_helpers import flatten_counts
from metriq_gym.qplatform.device import connectivity_graph

if TYPE_CHECKING:
from qbraid import GateModelResultData, QuantumDevice, QuantumJob
from qbraid.runtime.result_data import MeasCount


CHSH_THRESHOLD = 2.0


class BSEQResult(BenchmarkResult):
    """Result payload for the BSEQ benchmark.

    Attributes:
        largest_connected_size: Size of the largest component built from
            CHSH-violating edges, reported as a ``BenchmarkScore`` that carries
            both the value and its bootstrap-estimated uncertainty.
    """

    # Higher values mean more qubits are joined by entangled (CHSH-violating) pairs.
    largest_connected_size: BenchmarkScore = Field(
        ..., json_schema_extra={"direction": MetricDirection.HIGHER}
    )


@dataclass
Expand Down Expand Up @@ -107,45 +121,57 @@ def generate_chsh_circuit_sets(coloring: GraphColoring) -> list[QuantumCircuit]:
return exp_sets


def chsh_subgraph(
    coloring: GraphColoring, counts: list["MeasCount"]
) -> tuple[rx.PyGraph, dict[tuple[int, int], tuple[float, float]]]:
    """Construct the subgraph of qubit pairs that violate the CHSH inequality.

    For every edge the four CHSH correlator terms are accumulated together with
    a shot-noise variance estimate. Edges whose CHSH mean exceeds
    ``CHSH_THRESHOLD`` form the returned subgraph; the per-edge statistics are
    returned as well so a bootstrap can propagate the uncertainty later.

    Args:
        coloring: The benchmark graph-coloring metadata.
        counts: Flattened measurement counts returned from the device.

    Returns:
        A tuple with the subgraph of successful edges and per-edge (mean, std)
        CHSH scores keyed by the (u, v) qubit pair.
    """
    # A subgraph is constructed containing only the edges (qubit pairs) that
    # successfully violate the CHSH inequality. The size of the largest
    # connected component in this subgraph measures the device's performance.
    good_edges = []
    edge_stats: dict[tuple[int, int], tuple[float, float]] = {}
    for color_idx in range(coloring.num_colors):
        num_meas_pairs = len(
            {key for key, val in coloring.edge_color_map.items() if val == color_idx}
        )
        exp_vals: np.ndarray = np.zeros(num_meas_pairs, dtype=float)
        variances: np.ndarray = np.zeros(num_meas_pairs, dtype=float)

        for idx in range(4):
            for pair in range(num_meas_pairs):
                sub_counts = marginal_counts(counts[color_idx * 4 + idx], [2 * pair, 2 * pair + 1])
                exp_val = sampled_expectation_value(sub_counts, "ZZ")
                # The third measurement setting enters the CHSH sum with a minus sign.
                exp_vals[pair] += exp_val if idx != 2 else -exp_val
                shots = sum(sub_counts.values())
                if shots > 0:
                    # Shot-noise variance of the sampled mean of a +/-1 observable.
                    var_term = max((1 - exp_val**2) / shots, 0.0)
                    variances[pair] += var_term
                else:
                    # No shots observed for this pair: the uncertainty is unknown.
                    variances[pair] = float("nan")

        for idx, edge_idx in enumerate(
            key for key, val in coloring.edge_color_map.items() if val == color_idx
        ):
            edge = (coloring.edge_index_map[edge_idx][0], coloring.edge_index_map[edge_idx][1])
            # Clamp before sqrt to guard against tiny negative accumulations
            # caused by floating-point round-off.
            std = (
                float(np.sqrt(max(variances[idx], 0.0)))
                if not np.isnan(variances[idx])
                else float("nan")
            )
            edge_stats[edge] = (float(exp_vals[idx]), std)
            # The CHSH inequality is violated (entanglement demonstrated) when
            # the correlator sum exceeds the classical bound of 2.
            if exp_vals[idx] > CHSH_THRESHOLD:
                good_edges.append(edge)

    good_graph = rx.PyGraph(multigraph=False)
    good_graph.add_nodes_from(list(range(coloring.num_nodes)))
    for edge in good_edges:
        good_graph.add_edge(*edge, 1)
    return good_graph, edge_stats


class BSEQ(Benchmark):
Expand Down Expand Up @@ -193,9 +219,13 @@ def poll_handler(

if isinstance(job_data.coloring, dict):
job_data.coloring = GraphColoring.from_dict(job_data.coloring)
good_graph = chsh_subgraph(job_data.coloring, flatten_counts(result_data))
good_graph, edge_stats = chsh_subgraph(job_data.coloring, flatten_counts(result_data))
lcs = largest_connected_size(good_graph)
lcs_std = bootstrap_largest_component_stddev(
edge_stats,
job_data.coloring.num_nodes,
threshold=CHSH_THRESHOLD,
)
return BSEQResult(
largest_connected_size=lcs,
fraction_connected=lcs / job_data.coloring.num_nodes,
largest_connected_size=BenchmarkScore(value=float(lcs), uncertainty=lcs_std),
)
87 changes: 87 additions & 0 deletions metriq_gym/helpers/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from math import sqrt
from typing import Mapping

import numpy as np


def effective_shot_count(shots: int, count_results: Mapping[str, int]) -> int:
total_measurements = sum(count_results.values())
Expand All @@ -31,3 +33,88 @@ def binary_expectation_stddev(
expectation = binary_expectation_value(shots, count_results, outcome=outcome)
variance = expectation * (1 - expectation) / effective_shots
return float(sqrt(max(variance, 0.0)))


def _largest_component_from_edges(edges: list[tuple[int, int]], num_nodes: int) -> int:
"""Return the size of the largest connected component for an edge list."""
if num_nodes <= 0:
return 0

parent = list(range(num_nodes))
sizes = [1] * num_nodes

def find(node: int) -> int:
while parent[node] != node:
parent[node] = parent[parent[node]]
node = parent[node]
return node

def union(u: int, v: int) -> None:
root_u = find(u)
root_v = find(v)
if root_u == root_v:
return
if sizes[root_u] < sizes[root_v]:
root_u, root_v = root_v, root_u
parent[root_v] = root_u
sizes[root_u] += sizes[root_v]

for u, v in edges:
union(u, v)

largest = 1
for idx in range(num_nodes):
root = find(idx)
if sizes[root] > largest:
largest = sizes[root]
return largest
Comment on lines +65 to +70
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initialization to 1 is incorrect when there are no edges. For a graph with isolated nodes, the largest component size should be 1, but if num_nodes is 0 or edges is empty with num_nodes > 0, this will return 1 instead of the correct value. When num_nodes <= 0, the function already returns 0 at line 40-41. However, when num_nodes > 0 but edges is empty, the largest component should still be 1 (single isolated node), so the current behavior is actually correct for non-empty graphs. The issue is semantic: initialize to 0 and use max(sizes) instead of tracking during iteration to make the logic clearer.

Suggested change
largest = 1
for idx in range(num_nodes):
root = find(idx)
if sizes[root] > largest:
largest = sizes[root]
return largest
# The size of the largest component is the maximum in sizes
return max(sizes) if num_nodes > 0 else 0

Copilot uses AI. Check for mistakes.


def bootstrap_largest_component_stddev(
    edge_stats: Mapping[tuple[int, int], tuple[float, float]],
    num_nodes: int,
    *,
    threshold: float,
    num_samples: int = 512,
    rng: np.random.Generator | None = None,
) -> float:
    """Estimate the uncertainty of the largest connected component via Monte Carlo.

    Each edge ``(u, v)`` is modelled as a Gaussian random variable with mean/std
    taken from ``edge_stats[(u, v)]``. A ``NaN`` standard deviation marks a
    missing uncertainty estimate and is treated as zero, i.e. the edge is
    sampled at its mean. During every Monte Carlo draw we sample a synthetic
    value for every edge, mark the edge as active whenever that draw exceeds the
    violation ``threshold``, and compute the size of the largest connected
    component assembled from the active edges. The reported uncertainty is the
    sample standard deviation of those component sizes after ``num_samples``
    repetitions; it is exactly zero when every draw yields the same size,
    indicating the metric is insensitive to the provided per-edge noise.

    Args:
        edge_stats: Mapping from edge to a (mean, stddev) tuple of metric values
            (e.g., CHSH scores). A stddev of ``NaN`` is interpreted as unknown
            and treated as zero.
        num_nodes: Total number of nodes in the graph.
        threshold: Threshold that determines whether an edge is considered active.
        num_samples: Number of Monte Carlo samples to draw (must exceed 1).
        rng: Optional numpy random generator for deterministic sampling.

    Returns:
        The sample standard deviation of the largest connected component size.
    """
    # Degenerate inputs admit no spread: nothing meaningful to resample.
    if num_nodes <= 0 or not edge_stats or num_samples <= 1:
        return 0.0

    rng = rng or np.random.default_rng()
    edges = list(edge_stats.items())
    samples = np.empty(num_samples, dtype=float)

    for sample_idx in range(num_samples):
        active_edges: list[tuple[int, int]] = []
        for (u, v), (mean, std) in edges:
            # NaN marks a missing uncertainty estimate; fall back to the mean.
            sigma = 0.0 if np.isnan(std) else float(std)
            value = float(mean) if sigma == 0.0 else float(rng.normal(mean, sigma))
            if value > threshold:
                active_edges.append((u, v))
        samples[sample_idx] = _largest_component_from_edges(active_edges, num_nodes)

    # Identical draws can still differ by float round-off; report exactly zero.
    if np.allclose(samples, samples[0]):
        return 0.0
    return float(samples.std(ddof=1))
40 changes: 40 additions & 0 deletions tests/unit/benchmarks/test_bseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import numpy as np
import pytest

from metriq_gym.benchmarks.bseq import BSEQResult, CHSH_THRESHOLD
from metriq_gym.benchmarks.benchmark import BenchmarkScore
from metriq_gym.helpers.statistics import bootstrap_largest_component_stddev


def test_bootstrap_stddev_returns_zero_without_edges():
    # With no edge statistics there is nothing to resample: uncertainty is zero.
    result = bootstrap_largest_component_stddev({}, num_nodes=4, threshold=CHSH_THRESHOLD)
    assert result == 0.0


def test_bootstrap_stddev_zero_variance_is_deterministic():
    # A single edge with zero variance is always sampled at its mean, so every
    # Monte Carlo draw yields the same component size and the spread is zero.
    stats = {(0, 1): (CHSH_THRESHOLD + 0.5, 0.0)}
    generator = np.random.default_rng(0)
    std = bootstrap_largest_component_stddev(
        stats, num_nodes=3, threshold=CHSH_THRESHOLD, rng=generator
    )
    assert std == 0.0


def test_bootstrap_stddev_reflects_sampling_noise():
    # An edge hovering just above the threshold flips between active and
    # inactive across draws, so the spread is non-zero but bounded by the
    # 3-node graph size.
    stats = {(0, 1): (CHSH_THRESHOLD + 0.1, 0.3)}
    generator = np.random.default_rng(1234)
    std = bootstrap_largest_component_stddev(
        stats,
        num_nodes=3,
        threshold=CHSH_THRESHOLD,
        rng=generator,
        num_samples=256,
    )
    assert 0.0 < std < 3.0


def test_bseq_result_exposes_benchmark_score():
    # The BenchmarkScore wrapper must surface both the value and its
    # uncertainty through the result's standard accessors.
    score = BenchmarkScore(value=8.0, uncertainty=1.5)
    result = BSEQResult(largest_connected_size=score)
    assert result.values == pytest.approx({"largest_connected_size": 8.0})
    assert result.uncertainties == pytest.approx({"largest_connected_size": 1.5})