Skip to content

Commit b40457e

Browse files
Sparse OpenCL Gray code search
1 parent 46a205c commit b40457e

File tree

3 files changed

+248
-33
lines changed

3 files changed

+248
-33
lines changed

pyqrackising/kernels.cl

Lines changed: 152 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -627,10 +627,8 @@ real1 cut_worker_sparse_segmented(
627627

628628
for (int u = 0; u < n; ++u) {
629629
const bool u_bit = get_bit(theta, u);
630-
const uint row_start = G_rows[u];
631630
const uint row_end = G_rows[u + 1];
632-
633-
for (uint col = row_start; col < row_end; ++col) {
631+
for (uint col = G_rows[u]; col < row_end; ++col) {
634632
const int v = G_cols[col];
635633
const real1 val = get_G_m(G_data, col, segment_size);
636634
const bool v_bit = get_bit(theta, v);
@@ -707,10 +705,8 @@ real1 single_bit_flip_worker_sparse_segmented(
707705
if (u == k) {
708706
u_bit = !u_bit;
709707
}
710-
const uint row_start = G_rows[u];
711708
const uint row_end = G_rows[u + 1];
712-
713-
for (uint col = row_start; col < row_end; ++col) {
709+
for (uint col = G_rows[u]; col < row_end; ++col) {
714710
const int v = G_cols[col];
715711
const real1 val = get_G_m(G_data, col, segment_size);
716712
bool v_bit = get_const_bit(theta, v);
@@ -783,10 +779,8 @@ real1 double_bit_flip_worker_sparse_segmented(
783779
if ((u == k) || (u == l)) {
784780
u_bit = !u_bit;
785781
}
786-
const uint row_start = G_rows[u];
787782
const uint row_end = G_rows[u + 1];
788-
789-
for (uint col = row_start; col < row_end; ++col) {
783+
for (uint col = G_rows[u]; col < row_end; ++col) {
790784
const int v = G_cols[col];
791785
const real1 val = get_G_m(G_data, col, segment_size);
792786
bool v_bit = get_const_bit(theta, v);
@@ -945,11 +939,11 @@ __kernel void gray(
945939
real1 energy = ZERO_R1;
946940
for (uint u = 0; u < n; u++) {
947941
const size_t u_offset = u * n;
948-
int bit_u = get_local_bit(theta_local, u);
942+
int u_bit = get_local_bit(theta_local, u);
949943
for (uint v = u + 1; v < n; v++) {
950-
const int bit_v = get_local_bit(theta_local, v);
944+
const int v_bit = get_local_bit(theta_local, v);
951945
const real1 val = G_m[u_offset + v];
952-
if (bit_u != bit_v) {
946+
if (u_bit != v_bit) {
953947
energy += val;
954948
} else if (is_spin_glass) {
955949
energy -= val;
@@ -1016,11 +1010,154 @@ __kernel void gray_segmented(
10161010
real1 energy = ZERO_R1;
10171011
for (uint u = 0; u < n; u++) {
10181012
const size_t u_offset = u * n;
1019-
int bit_u = get_local_bit(theta_local, u);
1013+
int u_bit = get_local_bit(theta_local, u);
10201014
for (uint v = u + 1; v < n; v++) {
1021-
const int bit_v = get_local_bit(theta_local, v);
1015+
const int v_bit = get_local_bit(theta_local, v);
10221016
const real1 val = get_G_m(G_m, u_offset + v, segment_size);
1023-
if (bit_u != bit_v) {
1017+
if (u_bit != v_bit) {
1018+
energy += val;
1019+
} else if (is_spin_glass) {
1020+
energy -= val;
1021+
}
1022+
}
1023+
}
1024+
1025+
if (energy > best_energy) {
1026+
best_energy = energy;
1027+
best_i = i;
1028+
} else {
1029+
theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
1030+
}
1031+
}
1032+
}
1033+
1034+
i = get_global_id(0);
1035+
const size_t offset = i * blocks;
1036+
for (int b = 0; b < blocks; ++b) {
1037+
theta_out[offset + b] = theta_local[b];
1038+
}
1039+
energy_out[i] = best_energy;
1040+
}
1041+
1042+
__kernel void gray_sparse(
1043+
__global const real1* G_data,
1044+
__global const uint* G_rows,
1045+
__global const uint* G_cols,
1046+
__constant ulong* theta,
1047+
__constant int* args,
1048+
__global ulong* theta_out,
1049+
__global real1* energy_out
1050+
) {
1051+
const int n = args[0];
1052+
const bool is_spin_glass = args[1];
1053+
const int gray_iterations = args[2];
1054+
const int blocks = (n + 63) / 64;
1055+
const int last_block = blocks - 1;
1056+
1057+
int i = get_global_id(0);
1058+
const int max_i = get_global_size(0);
1059+
1060+
ulong theta_local[2048];
1061+
for (int b = 0; b < blocks; ++b) {
1062+
theta_local[b] = theta[b];
1063+
}
1064+
1065+
// Initialize different seed per thread
1066+
const int seed = i ^ (i >> 1);
1067+
for (int b = 0; b < 64; ++b) {
1068+
theta_local[last_block] ^= (seed >> (63U - b)) << b;
1069+
}
1070+
1071+
real1 best_energy = -INFINITY;
1072+
int best_i = i;
1073+
int best_block = 0U;
1074+
for (; i < gray_iterations; i += max_i) {
1075+
for (int block = 0; block < blocks; ++block) {
1076+
const size_t flip_bit = gray_code_next(theta_local, i, block << 6U);
1077+
real1 energy = ZERO_R1;
1078+
for (uint u = 0; u < n; u++) {
1079+
int u_bit = get_local_bit(theta_local, u);
1080+
const size_t mCol = G_rows[u + 1];
1081+
for (int col = G_rows[u]; col < mCol; ++col) {
1082+
const int v = G_cols[col];
1083+
const real1 val = G_data[col];
1084+
bool v_bit = get_local_bit(theta_local, v);
1085+
if (u_bit != v_bit) {
1086+
energy += val;
1087+
} else if (is_spin_glass) {
1088+
energy -= val;
1089+
}
1090+
}
1091+
}
1092+
1093+
if (energy > best_energy) {
1094+
best_energy = energy;
1095+
best_i = i;
1096+
} else {
1097+
theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
1098+
}
1099+
}
1100+
}
1101+
1102+
i = get_global_id(0);
1103+
const size_t offset = i * blocks;
1104+
for (int b = 0; b < blocks; ++b) {
1105+
theta_out[offset + b] = theta_local[b];
1106+
}
1107+
energy_out[i] = best_energy;
1108+
}
1109+
1110+
__kernel void gray_sparse_segmented(
1111+
__global const real1* G_data0,
1112+
__global const real1* G_data1,
1113+
__global const real1* G_data2,
1114+
__global const real1* G_data3,
1115+
__global const uint* G_rows,
1116+
__global const uint* G_cols,
1117+
__constant ulong* theta,
1118+
__constant int* args,
1119+
__global ulong* theta_out,
1120+
__global real1* energy_out
1121+
) {
1122+
__global const real1* G_data[4] = { G_data0, G_data1, G_data2, G_data3 };
1123+
1124+
const int n = args[0];
1125+
const bool is_spin_glass = args[1];
1126+
const int gray_iterations = args[2];
1127+
const int segment_size = args[3];
1128+
const int blocks = (n + 63) / 64;
1129+
const int last_block = blocks - 1;
1130+
1131+
int i = get_global_id(0);
1132+
const int max_i = get_global_size(0);
1133+
1134+
ulong theta_local[2048];
1135+
for (int b = 0; b < blocks; ++b) {
1136+
theta_local[b] = theta[b];
1137+
}
1138+
1139+
// Initialize different seed per thread
1140+
const int seed = i ^ (i >> 1);
1141+
for (int b = 0; b < 64; ++b) {
1142+
theta_local[last_block] ^= (seed >> (63U - b)) << b;
1143+
}
1144+
1145+
real1 best_energy = -INFINITY;
1146+
int best_i = i;
1147+
int best_block = 0U;
1148+
for (; i < gray_iterations; i += max_i) {
1149+
for (int block = 0; block < blocks; ++block) {
1150+
const size_t flip_bit = gray_code_next(theta_local, i, block << 6U);
1151+
real1 energy = ZERO_R1;
1152+
for (uint u = 0; u < n; u++) {
1153+
const size_t u_offset = u * n;
1154+
int u_bit = get_local_bit(theta_local, u);
1155+
const uint row_end = G_rows[u + 1];
1156+
for (uint col = G_rows[u]; col < row_end; ++col) {
1157+
const int v = G_cols[col];
1158+
const real1 val = get_G_m(G_data, col, segment_size);
1159+
const bool v_bit = get_local_bit(theta_local, v);
1160+
if (u_bit != v_bit) {
10241161
energy += val;
10251162
} else if (is_spin_glass) {
10261163
energy -= val;

pyqrackising/maxcut_tfim_util.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88

99
class OpenCLContext:
10-
def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s, t):
10+
def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s, t, u, v):
1111
self.GRAY_NODE_LIMIT = 131072
1212
self.MAX_GPU_PROC_ELEM = a
1313
self.IS_OPENCL_AVAILABLE = b
@@ -31,6 +31,8 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
3131
self.double_bit_flips_sparse_segmented_kernel = w
3232
self.gray_kernel = s
3333
self.gray_segmented_kernel = t
34+
self.gray_sparse_kernel = u
35+
self.gray_sparse_segmented_kernel = v
3436

3537
IS_OPENCL_AVAILABLE = True
3638
ctx = None
@@ -54,6 +56,8 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
5456
double_bit_flips_sparse_segmented_kernel = None
5557
gray_kernel = None
5658
gray_segmented_kernel = None
59+
gray_sparse_kernel = None
60+
gray_sparse_segmented_kernel = None
5761

5862
dtype_bits = int(os.getenv('PYQRACKISING_FPPOW', '5'))
5963
kernel_src = ''
@@ -116,6 +120,8 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
116120
double_bit_flips_sparse_segmented_kernel = program.double_bit_flips_sparse_segmented
117121
gray_kernel = program.gray
118122
gray_segmented_kernel = program.gray_segmented
123+
gray_sparse_kernel = program.gray_sparse
124+
gray_sparse_segmented_kernel = program.gray_sparse_segmented
119125

120126
work_group_size = calculate_cut_kernel.get_work_group_info(
121127
cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
@@ -127,7 +133,7 @@ def __init__(self, a, b, g, d, e, f, c, q, i, j, k, l, m, n, o, p, x, y, z, w, s
127133
IS_OPENCL_AVAILABLE = False
128134
print("PyOpenCL not installed. (If you have any OpenCL accelerator devices with available ICDs, you might want to optionally install pyopencl.)")
129135

130-
opencl_context = OpenCLContext(compute_units, IS_OPENCL_AVAILABLE, work_group_size, dtype, epsilon, max_alloc, ctx, queue, calculate_cut_kernel, calculate_cut_sparse_kernel, calculate_cut_segmented_kernel, calculate_cut_sparse_segmented_kernel, single_bit_flips_kernel, single_bit_flips_sparse_kernel, single_bit_flips_segmented_kernel, single_bit_flips_sparse_segmented_kernel, double_bit_flips_kernel, double_bit_flips_sparse_kernel, double_bit_flips_segmented_kernel, double_bit_flips_sparse_segmented_kernel, gray_kernel, gray_segmented_kernel)
136+
opencl_context = OpenCLContext(compute_units, IS_OPENCL_AVAILABLE, work_group_size, dtype, epsilon, max_alloc, ctx, queue, calculate_cut_kernel, calculate_cut_sparse_kernel, calculate_cut_segmented_kernel, calculate_cut_sparse_segmented_kernel, single_bit_flips_kernel, single_bit_flips_sparse_kernel, single_bit_flips_segmented_kernel, single_bit_flips_sparse_segmented_kernel, double_bit_flips_kernel, double_bit_flips_sparse_kernel, double_bit_flips_segmented_kernel, double_bit_flips_sparse_segmented_kernel, gray_kernel, gray_segmented_kernel, gray_sparse_kernel, gray_sparse_segmented_kernel)
131137
heuristic_threshold = 24
132138
heuristic_threshold_sparse = 23
133139

pyqrackising/spin_glass_solver_sparse.py

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .maxcut_tfim_sparse import maxcut_tfim_sparse
2-
from .maxcut_tfim_util import compute_cut_sparse, compute_energy_sparse, get_cut, gray_code_next, gray_mutation, heuristic_threshold_sparse, int_to_bitstring, make_G_m_csr_buf, make_best_theta_buf, opencl_context, setup_opencl, to_scipy_sparse_upper_triangular
2+
from .maxcut_tfim_util import compute_cut_sparse, compute_energy_sparse, get_cut, gray_code_next, gray_mutation, heuristic_threshold_sparse, int_to_bitstring, make_G_m_csr_buf, make_best_theta_buf, make_best_theta_buf_64, opencl_context, setup_opencl, to_scipy_sparse_upper_triangular
33
import networkx as nx
44
import numpy as np
55
from numba import njit, prange
@@ -15,6 +15,7 @@
1515

1616
dtype = opencl_context.dtype
1717
wgs = opencl_context.work_group_size
18+
gnl = opencl_context.GRAY_NODE_LIMIT
1819

1920

2021
@njit(parallel=True)
@@ -259,6 +260,68 @@ def run_bit_flips_opencl(is_double, n, kernel, best_energy, theta, theta_buf, G_
259260
return energy, theta
260261

261262

263+
def run_gray_search_opencl(n, kernel, best_energy, theta, theta_buf, G_data_buf, G_rows_buf, G_cols_buf, is_segmented, local_size, global_size, args_buf, max_energy_host, max_theta_host, max_energy_buf, max_theta_buf):
264+
queue = opencl_context.queue
265+
266+
# Set kernel args
267+
if is_segmented:
268+
kernel.set_args(
269+
G_data_buf[0],
270+
G_data_buf[1],
271+
G_data_buf[2],
272+
G_data_buf[3],
273+
G_rows_buf,
274+
G_cols_buf,
275+
theta_buf,
276+
args_buf,
277+
max_theta_buf,
278+
max_energy_buf
279+
)
280+
else:
281+
kernel.set_args(
282+
G_data_buf,
283+
G_rows_buf,
284+
G_cols_buf,
285+
theta_buf,
286+
args_buf,
287+
max_theta_buf,
288+
max_energy_buf
289+
)
290+
291+
cl.enqueue_nd_range_kernel(queue, kernel, (global_size,), (local_size,))
292+
293+
# Read results
294+
cl.enqueue_copy(queue, max_energy_host, max_energy_buf)
295+
queue.finish()
296+
297+
# Queue read for results we might not need
298+
cl.enqueue_copy(queue, max_theta_host, max_theta_buf)
299+
300+
# Find global minimum
301+
best_x = np.argmax(max_energy_host)
302+
energy = max_energy_host[best_x]
303+
304+
if energy <= best_energy:
305+
# No improvement: we can exit early
306+
return best_energy, theta
307+
308+
# We need the best index
309+
queue.finish()
310+
311+
blocks = (n + 63) // 64
312+
offset = best_x * blocks
313+
for b in range(blocks):
314+
s = max_theta_host[best_x]
315+
b_offset = b << 6
316+
for i in range(64):
317+
j = b_offset + i
318+
if j >= n:
319+
break
320+
theta[j] = (s >> i) & 1
321+
322+
return energy, theta
323+
324+
262325
def spin_glass_solver_sparse(
263326
G,
264327
quality=None,
@@ -351,6 +414,11 @@ def spin_glass_solver_sparse(
351414
single_bit_flips_kernel = opencl_context.single_bit_flips_sparse_kernel
352415
double_bit_flips_kernel = opencl_context.double_bit_flips_sparse_kernel
353416

417+
if n_qubits <= gnl:
418+
gray_work_group_size = opencl_context.MAX_GPU_PROC_ELEM
419+
gray_args = setup_opencl(1, gray_work_group_size, np.array([n_qubits, is_spin_glass, (gray_iterations + gray_work_group_size - 1) // gray_work_group_size, segment_size]), True)
420+
gray_kernel = opencl_context.gray_sparse_segmented_kernel if is_segmented else opencl_context.gray_sparse_kernel
421+
354422
thread_count = os.cpu_count() ** 2
355423
improved = True
356424
while improved:
@@ -380,21 +448,25 @@ def spin_glass_solver_sparse(
380448
improved = True
381449
continue
382450

383-
# Gray code with default O(n^3)
384-
iterators, energies = pick_gray_seeds(best_theta, thread_count, gray_seed_multiple, G_m.data, G_m.indptr, G_m.indices, n_qubits, is_spin_glass)
385-
energy, state = energies[0], iterators[0]
386-
if energy > max_energy:
387-
max_energy = energy
388-
best_theta = state
389-
improved = True
390-
continue
391-
392-
energy, state = run_gray_optimization(best_theta, iterators, energies, gray_iterations, thread_count, is_spin_glass, G_m.data, G_m.indptr, G_m.indices)
393-
if energy > max_energy:
394-
max_energy = energy
395-
best_theta = state
396-
improved = True
397-
continue
451+
if is_opencl and (n_qubits <= gnl):
452+
theta_buf_64 = make_best_theta_buf_64(best_theta)
453+
energy, state = run_gray_search_opencl(n_qubits, gray_kernel, max_energy, best_theta, theta_buf_64, G_data_buf, G_rows_buf, G_cols_buf, is_segmented, *gray_args)
454+
else:
455+
# Gray code with default O(n^3)
456+
iterators, energies = pick_gray_seeds(best_theta, thread_count, gray_seed_multiple, G_m.data, G_m.indptr, G_m.indices, n_qubits, is_spin_glass)
457+
energy, state = energies[0], iterators[0]
458+
if energy > max_energy:
459+
max_energy = energy
460+
best_theta = state
461+
improved = True
462+
continue
463+
464+
energy, state = run_gray_optimization(best_theta, iterators, energies, gray_iterations, thread_count, is_spin_glass, G_m.data, G_m.indptr, G_m.indices)
465+
if energy > max_energy:
466+
max_energy = energy
467+
best_theta = state
468+
improved = True
469+
continue
398470

399471
# Post-reheat phase
400472
reheat_theta = state

0 commit comments

Comments
 (0)