Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/source/cli_workflows.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ Example completed result:
.. code-block:: text

INFO - Polling job...
BSEQResult(largest_connected_size=100, fraction_connected=0.7874)
BSEQResult(largest_connected_size=BenchmarkScore(value=100.0, uncertainty=3.5))

The ``largest_connected_size`` metric now carries a ``BenchmarkScore`` so the CLI reports values and Monte Carlo uncertainties side by side.

If the job is still queued, the CLI reports the current queue position and asks you to try again later.

Expand Down
68 changes: 49 additions & 19 deletions metriq_gym/benchmarks/bseq.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,56 @@
""" "Bell state effective qubits" BSEQ benchmark for the Metriq Gym
(credit to Paul Nation for the original code for IBM devices).

This benchmark evaluates a quantum device's ability to produce entangled states and measure correlations that violate
the CHSH inequality. The violation of this inequality indicates successful entanglement between qubits.
"""BSEQ benchmark (Bell state effective qubits) for the Metriq Gym.

The benchmark evaluates a device's ability to produce entangled pairs that violate
the CHSH inequality. For every qubit pair we estimate the CHSH mean and uncertainty;
edges whose mean exceeds the violation threshold are treated as successfully entangled.
The reported ``largest_connected_size`` metric equals the size of the largest connected
component built from those successful edges, while the uncertainty is obtained via a
bootstrap: we resample every edge from its mean/std Gaussian, rebuild the active graph,
and compute the largest connected component. The standard deviation across those
Monte Carlo draws becomes the error bar returned in the ``BenchmarkScore``.

(Credit to Paul Nation for the original IBM-oriented implementation.)
"""

from dataclasses import dataclass
from typing import TYPE_CHECKING

import networkx as nx
import rustworkx as rx
import numpy as np
import rustworkx as rx

from qiskit import QuantumCircuit
from qiskit.result import marginal_counts, sampled_expectation_value

from pydantic import Field
from metriq_gym.benchmarks.benchmark import (
Benchmark,
BenchmarkData,
BenchmarkResult,
BenchmarkScore,
MetricDirection,
)
from metriq_gym.helpers.task_helpers import flatten_counts
from pydantic import Field
from metriq_gym.helpers.graph_helpers import (
GraphColoring,
device_graph_coloring,
largest_connected_size,
)
from metriq_gym.helpers.statistics import bootstrap_largest_component_stddev
from metriq_gym.helpers.task_helpers import flatten_counts
from metriq_gym.qplatform.device import connectivity_graph

if TYPE_CHECKING:
from qbraid import GateModelResultData, QuantumDevice, QuantumJob
from qbraid.runtime.result_data import MeasCount


CHSH_THRESHOLD = 2.0


class BSEQResult(BenchmarkResult):
    """Result payload for the BSEQ benchmark.

    Attributes:
        largest_connected_size: Size of the largest component built from
            CHSH-violating edges, reported as a ``BenchmarkScore`` that carries
            both the value and its bootstrap-estimated uncertainty.
    """

    # Higher values mean more qubits are joined by entangled (CHSH-violating) pairs.
    largest_connected_size: BenchmarkScore = Field(
        ..., json_schema_extra={"direction": MetricDirection.HIGHER}
    )


@dataclass
Expand Down Expand Up @@ -107,45 +121,57 @@ def generate_chsh_circuit_sets(coloring: GraphColoring) -> list[QuantumCircuit]:
return exp_sets


def chsh_subgraph(
    coloring: GraphColoring, counts: list["MeasCount"]
) -> tuple[rx.PyGraph, dict[tuple[int, int], tuple[float, float]]]:
    """Construct the subgraph of qubit pairs that violate the CHSH inequality.

    For every edge the four CHSH correlator terms are accumulated together with
    a shot-noise variance estimate. Edges whose CHSH mean exceeds
    ``CHSH_THRESHOLD`` form the returned subgraph; the per-edge statistics are
    returned as well so a bootstrap can propagate the uncertainty later.

    Args:
        coloring: The benchmark graph-coloring metadata.
        counts: Flattened measurement counts returned from the device.

    Returns:
        A tuple with the subgraph of successful edges and per-edge (mean, std)
        CHSH scores keyed by the (u, v) qubit pair.
    """
    # A subgraph is constructed containing only the edges (qubit pairs) that
    # successfully violate the CHSH inequality. The size of the largest
    # connected component in this subgraph measures the device's performance.
    good_edges = []
    edge_stats: dict[tuple[int, int], tuple[float, float]] = {}
    for color_idx in range(coloring.num_colors):
        num_meas_pairs = len(
            {key for key, val in coloring.edge_color_map.items() if val == color_idx}
        )
        exp_vals: np.ndarray = np.zeros(num_meas_pairs, dtype=float)
        variances: np.ndarray = np.zeros(num_meas_pairs, dtype=float)

        for idx in range(4):
            for pair in range(num_meas_pairs):
                sub_counts = marginal_counts(counts[color_idx * 4 + idx], [2 * pair, 2 * pair + 1])
                exp_val = sampled_expectation_value(sub_counts, "ZZ")
                # The third measurement setting enters the CHSH sum with a minus sign.
                exp_vals[pair] += exp_val if idx != 2 else -exp_val
                shots = sum(sub_counts.values())
                if shots > 0:
                    # Shot-noise variance of the sampled mean of a +/-1 observable.
                    var_term = max((1 - exp_val**2) / shots, 0.0)
                    variances[pair] += var_term
                else:
                    # No shots observed for this pair: the uncertainty is unknown.
                    variances[pair] = float("nan")

        for idx, edge_idx in enumerate(
            key for key, val in coloring.edge_color_map.items() if val == color_idx
        ):
            edge = (coloring.edge_index_map[edge_idx][0], coloring.edge_index_map[edge_idx][1])
            # Clamp before sqrt to guard against tiny negative accumulations
            # caused by floating-point round-off.
            std = (
                float(np.sqrt(max(variances[idx], 0.0)))
                if not np.isnan(variances[idx])
                else float("nan")
            )
            edge_stats[edge] = (float(exp_vals[idx]), std)
            # The CHSH inequality is violated (entanglement demonstrated) when
            # the correlator sum exceeds the classical bound of 2.
            if exp_vals[idx] > CHSH_THRESHOLD:
                good_edges.append(edge)

    good_graph = rx.PyGraph(multigraph=False)
    good_graph.add_nodes_from(list(range(coloring.num_nodes)))
    for edge in good_edges:
        good_graph.add_edge(*edge, 1)
    return good_graph, edge_stats


class BSEQ(Benchmark):
Expand Down Expand Up @@ -193,9 +219,13 @@ def poll_handler(

if isinstance(job_data.coloring, dict):
job_data.coloring = GraphColoring.from_dict(job_data.coloring)
good_graph = chsh_subgraph(job_data.coloring, flatten_counts(result_data))
good_graph, edge_stats = chsh_subgraph(job_data.coloring, flatten_counts(result_data))
lcs = largest_connected_size(good_graph)
lcs_std = bootstrap_largest_component_stddev(
edge_stats,
job_data.coloring.num_nodes,
threshold=CHSH_THRESHOLD,
)
return BSEQResult(
largest_connected_size=lcs,
fraction_connected=lcs / job_data.coloring.num_nodes,
largest_connected_size=BenchmarkScore(value=float(lcs), uncertainty=lcs_std),
)
87 changes: 87 additions & 0 deletions metriq_gym/helpers/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from math import sqrt
from typing import Mapping

import numpy as np


def effective_shot_count(shots: int, count_results: Mapping[str, int]) -> int:
total_measurements = sum(count_results.values())
Expand All @@ -31,3 +33,88 @@ def binary_expectation_stddev(
expectation = binary_expectation_value(shots, count_results, outcome=outcome)
variance = expectation * (1 - expectation) / effective_shots
return float(sqrt(max(variance, 0.0)))


def _largest_component_from_edges(edges: list[tuple[int, int]], num_nodes: int) -> int:
"""Return the size of the largest connected component for an edge list."""
if num_nodes <= 0:
return 0

parent = list(range(num_nodes))
sizes = [1] * num_nodes

def find(node: int) -> int:
while parent[node] != node:
parent[node] = parent[parent[node]]
node = parent[node]
return node

def union(u: int, v: int) -> None:
root_u = find(u)
root_v = find(v)
if root_u == root_v:
return
if sizes[root_u] < sizes[root_v]:
root_u, root_v = root_v, root_u
parent[root_v] = root_u
sizes[root_u] += sizes[root_v]

for u, v in edges:
union(u, v)

largest = 1
for idx in range(num_nodes):
root = find(idx)
if sizes[root] > largest:
largest = sizes[root]
return largest
Comment on lines +65 to +70
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initialization to 1 is incorrect when there are no edges. For a graph with isolated nodes, the largest component size should be 1, but if num_nodes is 0 or edges is empty with num_nodes > 0, this will return 1 instead of the correct value. When num_nodes <= 0, the function already returns 0 at line 40-41. However, when num_nodes > 0 but edges is empty, the largest component should still be 1 (single isolated node), so the current behavior is actually correct for non-empty graphs. The issue is semantic: initialize to 0 and use max(sizes) instead of tracking during iteration to make the logic clearer.

Suggested change
largest = 1
for idx in range(num_nodes):
root = find(idx)
if sizes[root] > largest:
largest = sizes[root]
return largest
# The size of the largest component is the maximum in sizes
return max(sizes) if num_nodes > 0 else 0

Copilot uses AI. Check for mistakes.


def bootstrap_largest_component_stddev(
    edge_stats: Mapping[tuple[int, int], tuple[float, float]],
    num_nodes: int,
    *,
    threshold: float,
    num_samples: int = 512,
    rng: np.random.Generator | None = None,
) -> float:
    """Estimate the uncertainty of the largest connected component via Monte Carlo.

    Each edge ``(u, v)`` is modelled as a Gaussian random variable with mean/std
    taken from ``edge_stats[(u, v)]``. A ``NaN`` standard deviation marks a
    missing uncertainty estimate and is treated as zero, i.e. the edge is
    sampled at its mean. During every Monte Carlo draw we sample a synthetic
    value for every edge, mark the edge as active whenever that draw exceeds the
    violation ``threshold``, and compute the size of the largest connected
    component assembled from the active edges. The reported uncertainty is the
    sample standard deviation of those component sizes after ``num_samples``
    repetitions; it is exactly zero when every draw yields the same size,
    indicating the metric is insensitive to the provided per-edge noise.

    Args:
        edge_stats: Mapping from edge to a (mean, stddev) tuple of metric values
            (e.g., CHSH scores). A stddev of ``NaN`` is interpreted as unknown
            and treated as zero.
        num_nodes: Total number of nodes in the graph.
        threshold: Threshold that determines whether an edge is considered active.
        num_samples: Number of Monte Carlo samples to draw (must exceed 1).
        rng: Optional numpy random generator for deterministic sampling.

    Returns:
        The sample standard deviation of the largest connected component size.
    """
    # Degenerate inputs admit no spread: nothing meaningful to resample.
    if num_nodes <= 0 or not edge_stats or num_samples <= 1:
        return 0.0

    rng = rng or np.random.default_rng()
    edges = list(edge_stats.items())
    samples = np.empty(num_samples, dtype=float)

    for sample_idx in range(num_samples):
        active_edges: list[tuple[int, int]] = []
        for (u, v), (mean, std) in edges:
            # NaN marks a missing uncertainty estimate; fall back to the mean.
            sigma = 0.0 if np.isnan(std) else float(std)
            value = float(mean) if sigma == 0.0 else float(rng.normal(mean, sigma))
            if value > threshold:
                active_edges.append((u, v))
        samples[sample_idx] = _largest_component_from_edges(active_edges, num_nodes)

    # Identical draws can still differ by float round-off; report exactly zero.
    if np.allclose(samples, samples[0]):
        return 0.0
    return float(samples.std(ddof=1))
40 changes: 40 additions & 0 deletions tests/unit/benchmarks/test_bseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import numpy as np
import pytest

from metriq_gym.benchmarks.bseq import BSEQResult, CHSH_THRESHOLD
from metriq_gym.benchmarks.benchmark import BenchmarkScore
from metriq_gym.helpers.statistics import bootstrap_largest_component_stddev


def test_bootstrap_stddev_returns_zero_without_edges():
    # With no edge statistics there is nothing to resample: uncertainty is zero.
    result = bootstrap_largest_component_stddev({}, num_nodes=4, threshold=CHSH_THRESHOLD)
    assert result == 0.0


def test_bootstrap_stddev_zero_variance_is_deterministic():
    # A single edge with zero variance is always sampled at its mean, so every
    # Monte Carlo draw yields the same component size and the spread is zero.
    stats = {(0, 1): (CHSH_THRESHOLD + 0.5, 0.0)}
    generator = np.random.default_rng(0)
    std = bootstrap_largest_component_stddev(
        stats, num_nodes=3, threshold=CHSH_THRESHOLD, rng=generator
    )
    assert std == 0.0


def test_bootstrap_stddev_reflects_sampling_noise():
    # An edge hovering just above the threshold flips between active and
    # inactive across draws, so the spread is non-zero but bounded by the
    # 3-node graph size.
    stats = {(0, 1): (CHSH_THRESHOLD + 0.1, 0.3)}
    generator = np.random.default_rng(1234)
    std = bootstrap_largest_component_stddev(
        stats,
        num_nodes=3,
        threshold=CHSH_THRESHOLD,
        rng=generator,
        num_samples=256,
    )
    assert 0.0 < std < 3.0


def test_bseq_result_exposes_benchmark_score():
    # The BenchmarkScore wrapper must surface both the value and its
    # uncertainty through the result's standard accessors.
    score = BenchmarkScore(value=8.0, uncertainty=1.5)
    result = BSEQResult(largest_connected_size=score)
    assert result.values == pytest.approx({"largest_connected_size": 8.0})
    assert result.uncertainties == pytest.approx({"largest_connected_size": 1.5})