Sparse OpenCL Gray code search

WrathfulSpatula · WrathfulSpatula · commit b40457e3677b · 2025-11-14T17:47:26.000-05:00
diff --git a/pyqrackising/kernels.cl b/pyqrackising/kernels.cl
@@ -627,10 +627,8 @@ real1 cut_worker_sparse_segmented(
 
     for (int u = 0; u < n; ++u) {
         const bool u_bit = get_bit(theta, u);
-        const uint row_start = G_rows[u];
         const uint row_end = G_rows[u + 1];
-
-        for (uint col = row_start; col < row_end; ++col) {
+        for (uint col = G_rows[u]; col < row_end; ++col) {
             const int v = G_cols[col];
             const real1 val = get_G_m(G_data, col, segment_size);
             const bool v_bit = get_bit(theta, v);
@@ -707,10 +705,8 @@ real1 single_bit_flip_worker_sparse_segmented(
         if (u == k) {
             u_bit = !u_bit;
         }
-        const uint row_start = G_rows[u];
         const uint row_end = G_rows[u + 1];
-
-        for (uint col = row_start; col < row_end; ++col) {
+        for (uint col = G_rows[u]; col < row_end; ++col) {
             const int v = G_cols[col];
             const real1 val = get_G_m(G_data, col, segment_size);
             bool v_bit = get_const_bit(theta, v);
@@ -783,10 +779,8 @@ real1 double_bit_flip_worker_sparse_segmented(
         if ((u == k) || (u == l)) {
             u_bit = !u_bit;
         }
-        const uint row_start = G_rows[u];
         const uint row_end = G_rows[u + 1];
-
-        for (uint col = row_start; col < row_end; ++col) {
+        for (uint col = G_rows[u]; col < row_end; ++col) {
             const int v = G_cols[col];
             const real1 val = get_G_m(G_data, col, segment_size);
             bool v_bit = get_const_bit(theta, v);
@@ -945,11 +939,11 @@ __kernel void gray(
             real1 energy = ZERO_R1;
             for (uint u = 0; u < n; u++) {
                 const size_t u_offset = u * n;
-                int bit_u = get_local_bit(theta_local, u);
+                int u_bit = get_local_bit(theta_local, u);
                 for (uint v = u + 1; v < n; v++) {
-                    const int bit_v = get_local_bit(theta_local, v);
+                    const int v_bit = get_local_bit(theta_local, v);
                     const real1 val = G_m[u_offset + v];
-                    if (bit_u != bit_v) {
+                    if (u_bit != v_bit) {
                         energy += val;
                     } else if (is_spin_glass) {
                         energy -= val;
@@ -1016,11 +1010,154 @@ __kernel void gray_segmented(
             real1 energy = ZERO_R1;
             for (uint u = 0; u < n; u++) {
                 const size_t u_offset = u * n;
-                int bit_u = get_local_bit(theta_local, u);
+                int u_bit = get_local_bit(theta_local, u);
                 for (uint v = u + 1; v < n; v++) {
-                    const int bit_v = get_local_bit(theta_local, v);
+                    const int v_bit = get_local_bit(theta_local, v);
                     const real1 val = get_G_m(G_m, u_offset + v, segment_size);
-                    if (bit_u != bit_v) {
+                    if (u_bit != v_bit) {
+                        energy += val;
+                    } else if (is_spin_glass) {
+                        energy -= val;
+                    }
+                }
+            }
+
+            if (energy > best_energy) {
+                best_energy = energy;
+                best_i = i;
+            } else {
+                theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
+            }
+        }
+    }
+
+    i = get_global_id(0);
+    const size_t offset = i * blocks;
+    for (int b = 0; b < blocks; ++b) {
+        theta_out[offset + b] = theta_local[b];
+    }
+    energy_out[i] = best_energy;
+}
+
+__kernel void gray_sparse( // Per-work-item Gray-code bit-flip search over a CSR-style sparse graph
+    __global const real1* G_data,   // stored edge weights, indexed in parallel with G_cols
+    __global const uint* G_rows,    // row u's entries span [G_rows[u], G_rows[u + 1])
+    __global const uint* G_cols,    // neighbor index v for each stored entry
+    __constant ulong* theta,        // packed starting state, 64 bits per word
+    __constant int* args,           // [0] = n, [1] = is_spin_glass, [2] = gray_iterations
+    __global ulong* theta_out,      // receives `blocks` words per work item
+    __global real1* energy_out      // receives one best energy per work item
+) {
+    const int n = args[0];
+    const bool is_spin_glass = args[1];
+    const int gray_iterations = args[2];
+    const int blocks = (n + 63) / 64;
+    const int last_block = blocks - 1;
+
+    int i = get_global_id(0); // iteration counter: starts at the global id, advanced by max_i below
+    const int max_i = get_global_size(0);
+    // Per-work-item working copy; 2048 words cap n at 131072 bits (blocks is not range-checked here).
+    ulong theta_local[2048];
+    for (int b = 0; b < blocks; ++b) {
+        theta_local[b] = theta[b];
+    }
+    // Initialize different seed per thread; i ^ (i >> 1) is the binary-reflected Gray code of i.
+    // NOTE(review): seed is a 32-bit int but the shift counts reach 63 -- confirm the intended bit mixing.
+    const int seed = i ^ (i >> 1);
+    for (int b = 0; b < 64; ++b) {
+        theta_local[last_block] ^= (seed >> (63U - b)) << b;
+    }
+    // NOTE(review): best_i and best_block are assigned but never read anywhere in this kernel.
+    real1 best_energy = -INFINITY;
+    int best_i = i;
+    int best_block = 0U;
+    for (; i < gray_iterations; i += max_i) {
+        for (int block = 0; block < blocks; ++block) {
+            const size_t flip_bit = gray_code_next(theta_local, i, block << 6U); // presumably flips one bit and returns its index -- TODO confirm
+            real1 energy = ZERO_R1; // recomputed from scratch over every stored entry
+            for (uint u = 0; u < n; u++) {
+                int u_bit = get_local_bit(theta_local, u);
+                const size_t mCol = G_rows[u + 1];
+                for (int col = G_rows[u]; col < mCol; ++col) {
+                    const int v = G_cols[col];
+                    const real1 val = G_data[col];
+                    bool v_bit = get_local_bit(theta_local, v);
+                    if (u_bit != v_bit) {
+                        energy += val; // endpoints differ: the entry's weight is added
+                    } else if (is_spin_glass) {
+                        energy -= val; // endpoints match: subtracted only in spin-glass mode
+                    }
+                }
+            }
+            // Keep the flip only on a strict improvement; otherwise toggle flip_bit back.
+            if (energy > best_energy) {
+                best_energy = energy;
+                best_i = i;
+            } else {
+                theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
+            }
+        }
+    }
+    // Write this work item's final packed state and best energy at its own output slot.
+    i = get_global_id(0);
+    const size_t offset = i * blocks;
+    for (int b = 0; b < blocks; ++b) {
+        theta_out[offset + b] = theta_local[b];
+    }
+    energy_out[i] = best_energy;
+}
+
+__kernel void gray_sparse_segmented(
+    __global const real1* G_data0,
+    __global const real1* G_data1,
+    __global const real1* G_data2,
+    __global const real1* G_data3,
+    __global const uint* G_rows,
+    __global const uint* G_cols,
+    __constant ulong* theta,
+    __constant int* args,
+    __global ulong* theta_out,
+    __global real1* energy_out
+) {
+    __global const real1* G_data[4] = { G_data0, G_data1, G_data2, G_data3 };
+
+    const int n = args[0];
+    const bool is_spin_glass = args[1];
+    const int gray_iterations = args[2];
+    const int segment_size = args[3];
+    const int blocks = (n + 63) / 64;
+    const int last_block = blocks - 1;
+
+    int i = get_global_id(0);
+    const int max_i = get_global_size(0);
+
+    ulong theta_local[2048];
+    for (int b = 0; b < blocks; ++b) {
+        theta_local[b] = theta[b];
+    }
+
+    // Initialize different seed per thread
+    const int seed = i ^ (i >> 1);
+    for (int b = 0; b < 64; ++b) {
+        theta_local[last_block] ^= (seed >> (63U - b)) << b;
+    }
+
+    real1 best_energy = -INFINITY;
+    int best_i = i;
+    int best_block = 0U;
+    for (; i < gray_iterations; i += max_i) {
+        for (int block = 0; block < blocks; ++block) {
+            const size_t flip_bit = gray_code_next(theta_local, i, block << 6U);
+            real1 energy = ZERO_R1;
+            for (uint u = 0; u < n; u++) {
+                const size_t u_offset = u * n;
+                int u_bit = get_local_bit(theta_local, u);
+                const uint row_end = G_rows[u + 1];
+                for (uint col = G_rows[u]; col < row_end; ++col) {
+                    const int v = G_cols[col];
+                    const real1 val = get_G_m(G_data, col, segment_size);
+                    const bool v_bit = get_local_bit(theta_local, v);
+                    if (u_bit != v_bit) {
                         energy += val;
                     } else if (is_spin_glass) {
                         energy -= val;
diff --git a/pyqrackising/maxcut_tfim_util.py b/pyqrackising/maxcut_tfim_util.py
@@ -7,7 +7,7 @@
 
 
 class OpenCLContext:
-    def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s, t):
+    def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s, t, u, v):
         self.GRAY_NODE_LIMIT = 131072
         self.MAX_GPU_PROC_ELEM = a
         self.IS_OPENCL_AVAILABLE = b
@@ -31,6 +31,8 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
         self.double_bit_flips_sparse_segmented_kernel = w
         self.gray_kernel = s
         self.gray_segmented_kernel = t
+        self.gray_sparse_kernel = u
+        self.gray_sparse_segmented_kernel = v
 
 IS_OPENCL_AVAILABLE = True
 ctx = None
@@ -54,6 +56,8 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
 double_bit_flips_sparse_segmented_kernel = None
 gray_kernel = None
 gray_segmented_kernel = None
+gray_sparse_kernel = None
+gray_sparse_segmented_kernel = None
 
 dtype_bits = int(os.getenv('PYQRACKISING_FPPOW', '5'))
 kernel_src = ''
@@ -116,6 +120,8 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
     double_bit_flips_sparse_segmented_kernel = program.double_bit_flips_sparse_segmented
     gray_kernel = program.gray
     gray_segmented_kernel = program.gray_segmented
+    gray_sparse_kernel = program.gray_sparse
+    gray_sparse_segmented_kernel = program.gray_sparse_segmented
 
     work_group_size = calculate_cut_kernel.get_work_group_info(
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
@@ -127,7 +133,7 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
     IS_OPENCL_AVAILABLE = False
     print("PyOpenCL not installed. (If you have any OpenCL accelerator devices with available ICDs, you might want to optionally install pyopencl.)")
 
-opencl_context = OpenCLContext(compute_units, IS_OPENCL_AVAILABLE, work_group_size, dtype, epsilon, max_alloc, ctx, queue, calculate_cut_kernel, calculate_cut_sparse_kernel, calculate_cut_segmented_kernel, calculate_cut_sparse_segmented_kernel, single_bit_flips_kernel, single_bit_flips_sparse_kernel, single_bit_flips_segmented_kernel, single_bit_flips_sparse_segmented_kernel, double_bit_flips_kernel, double_bit_flips_sparse_kernel, double_bit_flips_segmented_kernel, double_bit_flips_sparse_segmented_kernel, gray_kernel, gray_segmented_kernel)
+opencl_context = OpenCLContext(compute_units, IS_OPENCL_AVAILABLE, work_group_size, dtype, epsilon, max_alloc, ctx, queue, calculate_cut_kernel, calculate_cut_sparse_kernel, calculate_cut_segmented_kernel, calculate_cut_sparse_segmented_kernel, single_bit_flips_kernel, single_bit_flips_sparse_kernel, single_bit_flips_segmented_kernel, single_bit_flips_sparse_segmented_kernel, double_bit_flips_kernel, double_bit_flips_sparse_kernel, double_bit_flips_segmented_kernel, double_bit_flips_sparse_segmented_kernel, gray_kernel, gray_segmented_kernel, gray_sparse_kernel, gray_sparse_segmented_kernel)
 heuristic_threshold = 24
 heuristic_threshold_sparse = 23
 
diff --git a/pyqrackising/spin_glass_solver_sparse.py b/pyqrackising/spin_glass_solver_sparse.py
@@ -1,5 +1,5 @@
 from .maxcut_tfim_sparse import maxcut_tfim_sparse
-from .maxcut_tfim_util import compute_cut_sparse, compute_energy_sparse, get_cut, gray_code_next, gray_mutation, heuristic_threshold_sparse, int_to_bitstring, make_G_m_csr_buf, make_best_theta_buf, opencl_context, setup_opencl, to_scipy_sparse_upper_triangular
+from .maxcut_tfim_util import compute_cut_sparse, compute_energy_sparse, get_cut, gray_code_next, gray_mutation, heuristic_threshold_sparse, int_to_bitstring, make_G_m_csr_buf, make_best_theta_buf, make_best_theta_buf_64, opencl_context, setup_opencl, to_scipy_sparse_upper_triangular
 import networkx as nx
 import numpy as np
 from numba import njit, prange
@@ -15,6 +15,7 @@
 
 dtype = opencl_context.dtype
 wgs = opencl_context.work_group_size
+gnl = opencl_context.GRAY_NODE_LIMIT
 
 
 @njit(parallel=True)
@@ -259,6 +260,68 @@ def run_bit_flips_opencl(is_double, n, kernel, best_energy, theta, theta_buf, G_
     return energy, theta
 
 
+def run_gray_search_opencl(n, kernel, best_energy, theta, theta_buf, G_data_buf, G_rows_buf, G_cols_buf, is_segmented, local_size, global_size, args_buf, max_energy_host, max_theta_host, max_energy_buf, max_theta_buf):
+    queue = opencl_context.queue
+
+    # Set kernel args
+    if is_segmented:
+        kernel.set_args(
+            G_data_buf[0],
+            G_data_buf[1],
+            G_data_buf[2],
+            G_data_buf[3],
+            G_rows_buf,
+            G_cols_buf,
+            theta_buf,
+            args_buf,
+            max_theta_buf,
+            max_energy_buf
+        )
+    else:
+        kernel.set_args(
+            G_data_buf,
+            G_rows_buf,
+            G_cols_buf,
+            theta_buf,
+            args_buf,
+            max_theta_buf,
+            max_energy_buf
+        )
+
+    cl.enqueue_nd_range_kernel(queue, kernel, (global_size,), (local_size,))
+
+    # Read results
+    cl.enqueue_copy(queue, max_energy_host, max_energy_buf)
+    queue.finish()
+
+    # Queue read for results we might not need
+    cl.enqueue_copy(queue, max_theta_host, max_theta_buf)
+
+    # Find global minimum
+    best_x = np.argmax(max_energy_host)
+    energy = max_energy_host[best_x]
+
+    if energy <= best_energy:
+        # No improvement: we can exit early
+        return best_energy, theta
+
+    # We need the best index
+    queue.finish()
+
+    blocks = (n + 63) // 64
+    offset = best_x * blocks
+    for b in range(blocks):
+        s = max_theta_host[best_x]
+        b_offset = b << 6
+        for i in range(64):
+            j = b_offset + i
+            if j >= n:
+                break
+            theta[j] = (s >> i) & 1
+
+    return energy, theta
+
+
 def spin_glass_solver_sparse(
     G,
     quality=None,
@@ -351,6 +414,11 @@ def spin_glass_solver_sparse(
             single_bit_flips_kernel = opencl_context.single_bit_flips_sparse_kernel
             double_bit_flips_kernel = opencl_context.double_bit_flips_sparse_kernel
 
+        if n_qubits <= gnl:
+            gray_work_group_size = opencl_context.MAX_GPU_PROC_ELEM
+            gray_args = setup_opencl(1, gray_work_group_size, np.array([n_qubits, is_spin_glass, (gray_iterations + gray_work_group_size - 1) // gray_work_group_size, segment_size]), True)
+            gray_kernel = opencl_context.gray_sparse_segmented_kernel if is_segmented else opencl_context.gray_sparse_kernel
+
     thread_count = os.cpu_count() ** 2
     improved = True
     while improved:
@@ -380,21 +448,25 @@ def spin_glass_solver_sparse(
             improved = True
             continue
 
-        # Gray code with default O(n^3)
-        iterators, energies = pick_gray_seeds(best_theta, thread_count, gray_seed_multiple, G_m.data, G_m.indptr, G_m.indices, n_qubits, is_spin_glass)
-        energy, state = energies[0], iterators[0]
-        if energy > max_energy:
-            max_energy = energy
-            best_theta = state
-            improved = True
-            continue
-
-        energy, state = run_gray_optimization(best_theta, iterators, energies, gray_iterations, thread_count, is_spin_glass, G_m.data, G_m.indptr, G_m.indices)
-        if energy > max_energy:
-            max_energy = energy
-            best_theta = state
-            improved = True
-            continue
+        if is_opencl and (n_qubits <= gnl):
+            theta_buf_64 = make_best_theta_buf_64(best_theta)
+            energy, state = run_gray_search_opencl(n_qubits, gray_kernel, max_energy, best_theta, theta_buf_64, G_data_buf, G_rows_buf, G_cols_buf, is_segmented, *gray_args)
+        else:
+            # Gray code with default O(n^3)
+            iterators, energies = pick_gray_seeds(best_theta, thread_count, gray_seed_multiple, G_m.data, G_m.indptr, G_m.indices, n_qubits, is_spin_glass)
+            energy, state = energies[0], iterators[0]
+            if energy > max_energy:
+                max_energy = energy
+                best_theta = state
+                improved = True
+                continue
+
+            energy, state = run_gray_optimization(best_theta, iterators, energies, gray_iterations, thread_count, is_spin_glass, G_m.data, G_m.indptr, G_m.indices)
+            if energy > max_energy:
+                max_energy = energy
+                best_theta = state
+                improved = True
+                continue
 
         # Post-reheat phase
         reheat_theta = state