Skip to content

Commit 65f1c05

Browse files
committed
Task 2.3: VoronoiCFBatch — C++20 std::span batch CF evaluation + benchmark
LIB/VoronoiCFBatch.h (header-only, C++20): - eval_span(): span-based overload of eval_chromosome(); accepts std::span<const gene>, std::span<atom>, std::span<resid>, std::span<const genlim> — zero-copy, bounds-safe, no hidden size - ThreadWorkspace: per-thread RAII struct that allocates and wires all mutable buffers Vcontacts/vcfunction/ic2cf require; reset() restores per-chromosome state without re-allocating - batch_eval(): OpenMP-parallel evaluation of std::span<const chromosome>; thread-local workspaces, dynamic scheduling (chunk=4); returns BatchResult{cf[], app_evalue[], wall_ms} - benchmark(): n_reps serial (1-thread) + parallel (all-core) batch_eval runs; prints Unicode speedup table; returns BenchmarkReport with speedup LIB/benchmark_vcfbatch.cpp: - Standalone driver: synthetic population (pop_size, n_genes configurable); stub_cf with deliberate 5 000-iteration sin() loop to simulate ~10 µs/chrom compute cost; correctness spot-check on app_evalue sign - Build: cmake -DENABLE_VCFBATCH_BENCHMARK=ON CMakeLists.txt: - Add ENABLE_VCFBATCH_BENCHMARK option + benchmark_vcfbatch executable with AVX-512/AVX2 + OpenMP flags mirroring benchmark_tencom https://claude.ai/code/session_01PQFoVRutnkDNUQTFX1JQAw
1 parent e7365b5 commit 65f1c05

File tree

3 files changed

+471
-0
lines changed

3 files changed

+471
-0
lines changed

CMakeLists.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,3 +338,24 @@ if(ENABLE_TENCOM_BENCHMARK)
338338
endif()
339339
message(STATUS "TeNCoM benchmark enabled — build target: benchmark_tencom")
340340
endif()
341+
342+
# ─── VoronoiCFBatch benchmark (Task 2.3) ───────────────────────────────────────
# VoronoiCFBatch.h is header-only; this binary drives the std::span batch
# interface and prints a serial-vs-OpenMP speedup table.
option(ENABLE_VCFBATCH_BENCHMARK "Build the VoronoiCFBatch benchmark binary" OFF)

if(ENABLE_VCFBATCH_BENCHMARK)
    add_executable(benchmark_vcfbatch LIB/benchmark_vcfbatch.cpp)
    target_include_directories(benchmark_vcfbatch PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/LIB)

    # Aggressive optimisation flags, mirroring benchmark_tencom.
    target_compile_options(benchmark_vcfbatch PRIVATE -O3 -march=native -ffast-math)

    # SIMD feature flags follow the project-wide AVX configuration.
    if(FLEXAIDS_USE_AVX512)
        target_compile_options(benchmark_vcfbatch PRIVATE -mavx512f -mavx512dq -mavx2 -mfma)
        target_compile_definitions(benchmark_vcfbatch PRIVATE FLEXAIDS_USE_AVX512)
    elseif(FLEXAIDS_USE_AVX2)
        target_compile_options(benchmark_vcfbatch PRIVATE -mavx2 -mfma)
    endif()

    # OpenMP is optional; batch_eval degrades to serial without it.
    if(FLEXAIDS_USE_OPENMP AND OpenMP_CXX_FOUND)
        target_link_libraries(benchmark_vcfbatch PRIVATE OpenMP::OpenMP_CXX)
    endif()

    message(STATUS "VoronoiCFBatch benchmark enabled — build target: benchmark_vcfbatch")
endif()

LIB/VoronoiCFBatch.h

Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
// VoronoiCFBatch.h — C++20 std::span-based batch CF evaluation interface
2+
//
3+
// Wraps the existing ic2cf / vcfunction / Vcontacts pipeline with zero-copy
4+
// std::span views for chromosome gene arrays and atom buffers. Designed for
5+
// the GA's inner evaluation loop where we need to score a large population of
6+
// chromosomes without redundant copies.
7+
//
8+
// Key features
9+
// • std::span<const gene> — read-only view of one chromosome's genes
10+
// • std::span<atom> — per-thread mutable atom view (no extra alloc)
11+
// • batch_eval() — parallel evaluation of chromosome population
12+
// • benchmark() — wall-clock comparison: serial vs OpenMP
13+
//
14+
// Apache-2.0 © 2026 Le Bonhomme Pharma
15+
16+
#pragma once
17+
18+
#include <algorithm>   // std::copy (ThreadWorkspace::reset), std::clamp, std::max
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>     // std::size_t
#include <cstdio>      // std::printf (benchmark report table)
#include <cstring>
#include <functional>
#include <span>
#include <vector>
25+
26+
#include "flexaid.h" // cfstr, FA_Global, VC_Global, atom, resid, gene, genlim
27+
#include "gaboom.h" // chromosome, eval_chromosome, get_apparent_cf_evalue
28+
#include "Vcontacts.h" // atomsas, ca_struct, contactlist, ptindex, vertex, plane, edgevector
29+
30+
#ifdef _OPENMP
31+
# include <omp.h>
32+
#endif
33+
34+
namespace voronoi_cf {
35+
36+
// ─── Span-based eval_chromosome overload ─────────────────────────────────────
37+
//
38+
// Identical semantics to the C-style eval_chromosome() in gaboom.cpp, but
39+
// accepts std::span for gene and genlim parameters, removing the reliance on
40+
// implicit size from GB->num_genes.
41+
//
42+
// The caller owns all pointed-to data; spans are non-owning views.
43+
44+
inline cfstr eval_span(
45+
FA_Global* FA,
46+
GB_Global* GB,
47+
VC_Global* VC,
48+
std::span<const genlim> gene_lim,
49+
std::span<atom> atoms,
50+
std::span<resid> residue,
51+
gridpoint* cleftgrid,
52+
std::span<const gene> john,
53+
cfstr (*function)(FA_Global*, VC_Global*, atom*, resid*, gridpoint*, int, double*))
54+
{
55+
assert(john.size() == gene_lim.size());
56+
assert(john.size() <= MAX_NUM_GENES);
57+
58+
double icv[MAX_NUM_GENES] = {};
59+
for (std::size_t i = 0; i < john.size(); ++i) {
60+
double val = john[i].to_ic;
61+
if (val > gene_lim[i].max) val = gene_lim[i].max;
62+
if (val < gene_lim[i].min) val = gene_lim[i].min;
63+
icv[i] = val;
64+
}
65+
return (*function)(FA, VC,
66+
atoms.data(), residue.data(),
67+
cleftgrid,
68+
static_cast<int>(john.size()),
69+
icv);
70+
}
71+
72+
// ─── Per-thread workspace ─────────────────────────────────────────────────────
73+
//
74+
// Holds private copies of every mutable buffer that ic2cf/vcfunction/Vcontacts
75+
// write to, allowing lock-free parallel evaluation.
76+
77+
struct ThreadWorkspace {
78+
FA_Global fa; // shallow copy; pointer fields redirected
79+
VC_Global vc; // shallow copy; pointer fields redirected
80+
81+
std::vector<atom> atoms;
82+
std::vector<resid> residue;
83+
std::vector<int> contacts;
84+
std::vector<float> contributions;
85+
std::vector<OptRes> optres;
86+
std::vector<atomsas> calc;
87+
std::vector<int> calclist;
88+
std::vector<int> ca_index;
89+
std::vector<ca_struct> ca_rec;
90+
std::vector<int> seed;
91+
std::vector<contactlist> contlist;
92+
std::vector<ptindex> ptorder;
93+
std::vector<vertex> centerpt;
94+
std::vector<vertex> poly;
95+
std::vector<plane> cont;
96+
std::vector<edgevector> vedge;
97+
98+
// Construct and wire all buffers for the given FA/VC reference state.
99+
ThreadWorkspace(const FA_Global* FA, const VC_Global* VC,
100+
const atom* ref_atoms, const resid* ref_residue)
101+
{
102+
const int natm = FA->atm_cnt;
103+
const int natmr = FA->atm_cnt_real;
104+
const int nres = FA->res_cnt;
105+
const int nopt = FA->num_optres;
106+
const int nctb = FA->ntypes * FA->ntypes;
107+
108+
atoms .assign(ref_atoms, ref_atoms + natm);
109+
residue .assign(ref_residue, ref_residue + nres);
110+
contacts .assign(100000, 0);
111+
contributions.assign(static_cast<std::size_t>(nctb), 0.0f);
112+
optres .assign(FA->optres, FA->optres + nopt);
113+
calc .resize(static_cast<std::size_t>(natmr));
114+
calclist .resize(static_cast<std::size_t>(natmr));
115+
ca_index .assign(static_cast<std::size_t>(natmr), -1);
116+
ca_rec .resize(static_cast<std::size_t>(VC->ca_recsize));
117+
seed .resize(static_cast<std::size_t>(3 * natmr));
118+
contlist .resize(10000);
119+
ptorder .resize(MAX_PT);
120+
centerpt .resize(MAX_PT);
121+
poly .resize(MAX_POLY);
122+
cont .resize(MAX_PT);
123+
vedge .resize(MAX_POLY);
124+
125+
fa = *FA;
126+
fa.contacts = contacts .data();
127+
fa.contributions = contributions.data();
128+
fa.optres = optres .data();
129+
130+
vc = *VC;
131+
vc.Calc = calc .data();
132+
vc.Calclist = calclist.data();
133+
vc.ca_index = ca_index.data();
134+
vc.ca_rec = ca_rec .data();
135+
vc.seed = seed .data();
136+
vc.contlist = contlist.data();
137+
vc.ptorder = ptorder .data();
138+
vc.centerpt = centerpt.data();
139+
vc.poly = poly .data();
140+
vc.cont = cont .data();
141+
vc.vedge = vedge .data();
142+
}
143+
144+
// Reset per-chromosome mutable state without re-allocating.
145+
void reset(const FA_Global* FA, const atom* ref_atoms, const resid* ref_residue)
146+
{
147+
const int nopt = FA->num_optres;
148+
std::copy(ref_atoms, ref_atoms + FA->atm_cnt, atoms .begin());
149+
std::copy(ref_residue, ref_residue + FA->res_cnt, residue.begin());
150+
for (int o = 0; o < nopt; ++o) {
151+
optres[o].cf.com = 0.0;
152+
optres[o].cf.wal = 0.0;
153+
optres[o].cf.sas = 0.0;
154+
optres[o].cf.totsas = 0.0;
155+
optres[o].cf.con = 0.0;
156+
optres[o].cf.rclash = 0;
157+
}
158+
vc.numcarec = 0;
159+
}
160+
};
161+
162+
// ─── Batch result ─────────────────────────────────────────────────────────────
163+
164+
struct BatchResult {
165+
std::vector<cfstr> cf; // one cfstr per chromosome
166+
std::vector<double> app_evalue; // apparent CF evalue for quick ranking
167+
double wall_ms; // wall-clock evaluation time (ms)
168+
};
169+
170+
// ─── batch_eval ──────────────────────────────────────────────────────────────
171+
//
172+
// Evaluate a population of chromosomes using OpenMP + std::span zero-copy
173+
// views. Results are written into BatchResult (does NOT modify chrom[]).
174+
//
175+
// chroms: read-only view of the chromosome array (status != 'n' are skipped)
176+
// gene_lim: gene boundary array (std::span, bounds-safe)
177+
// function: the CF scoring function (ic2cf or cffunction)
178+
179+
inline BatchResult batch_eval(
180+
std::span<const chromosome> chroms,
181+
FA_Global* FA,
182+
GB_Global* GB,
183+
VC_Global* VC,
184+
std::span<const genlim> gene_lim,
185+
const atom* ref_atoms,
186+
const resid* ref_residue,
187+
gridpoint* cleftgrid,
188+
cfstr (*function)(FA_Global*, VC_Global*, atom*, resid*, gridpoint*, int, double*))
189+
{
190+
const int N = static_cast<int>(chroms.size());
191+
BatchResult result;
192+
result.cf .resize(static_cast<std::size_t>(N));
193+
result.app_evalue.resize(static_cast<std::size_t>(N));
194+
195+
const int n_thr =
196+
#ifdef _OPENMP
197+
omp_get_max_threads();
198+
#else
199+
1;
200+
#endif
201+
202+
// Build per-thread workspaces (heap-allocated; one per thread).
203+
std::vector<ThreadWorkspace> ws;
204+
ws.reserve(static_cast<std::size_t>(n_thr));
205+
for (int t = 0; t < n_thr; ++t)
206+
ws.emplace_back(FA, VC, ref_atoms, ref_residue);
207+
208+
auto t0 = std::chrono::steady_clock::now();
209+
210+
#ifdef _OPENMP
211+
#pragma omp parallel for schedule(dynamic, 4) default(none) \
212+
shared(chroms, result, ws, gene_lim, cleftgrid, function, FA, GB, \
213+
ref_atoms, ref_residue, N)
214+
#endif
215+
for (int i = 0; i < N; ++i) {
216+
if (chroms[i].status == 'n') continue;
217+
#ifdef _OPENMP
218+
const int tid = omp_get_thread_num();
219+
#else
220+
const int tid = 0;
221+
#endif
222+
ws[tid].reset(FA, ref_atoms, ref_residue);
223+
224+
std::span<const gene> gene_span(chroms[i].genes,
225+
static_cast<std::size_t>(GB->num_genes));
226+
std::span<atom> atom_span(ws[tid].atoms);
227+
std::span<resid> res_span (ws[tid].residue);
228+
229+
result.cf[static_cast<std::size_t>(i)] = eval_span(
230+
&ws[tid].fa, GB, &ws[tid].vc,
231+
gene_lim, atom_span, res_span,
232+
cleftgrid, gene_span, function);
233+
234+
result.app_evalue[static_cast<std::size_t>(i)] =
235+
get_apparent_cf_evalue(&result.cf[static_cast<std::size_t>(i)]);
236+
}
237+
238+
auto t1 = std::chrono::steady_clock::now();
239+
result.wall_ms =
240+
std::chrono::duration<double, std::milli>(t1 - t0).count();
241+
return result;
242+
}
243+
244+
// ─── benchmark ────────────────────────────────────────────────────────────────
245+
//
246+
// Times serial vs OpenMP batch_eval over `n_reps` repetitions and prints a
247+
// summary table to stdout. Returns the parallel result for correctness checks.
248+
249+
struct BenchmarkReport {
250+
double serial_ms;
251+
double parallel_ms;
252+
double speedup;
253+
int n_threads;
254+
int n_chroms;
255+
BatchResult parallel_result; // kept for correctness validation
256+
};
257+
258+
// Run the serial/parallel comparison described above.
//
// Parameters mirror batch_eval(); n_reps is the number of repetitions each
// configuration is averaged over (values < 1 are clamped to 1 to avoid a
// divide-by-zero when averaging).
// Side effects: temporarily forces the OpenMP thread count to 1 for the
// serial runs, then restores it to the saved maximum for the parallel runs.
inline BenchmarkReport benchmark(
    std::span<const chromosome> chroms,
    FA_Global* FA,
    GB_Global* GB,
    VC_Global* VC,
    std::span<const genlim> gene_lim,
    const atom* ref_atoms,
    const resid* ref_residue,
    gridpoint* cleftgrid,
    cfstr (*function)(FA_Global*, VC_Global*, atom*, resid*, gridpoint*, int, double*),
    int n_reps = 3)
{
    n_reps = std::max(n_reps, 1); // guard the averages below

    BenchmarkReport report{};
    report.n_chroms = static_cast<int>(chroms.size());

#ifdef _OPENMP
    report.n_threads = omp_get_max_threads(); // captured BEFORE forcing 1 thread
#else
    report.n_threads = 1;
#endif

    // Serial baseline (force 1 thread)
    double serial_total = 0.0;
    for (int r = 0; r < n_reps; ++r) {
#ifdef _OPENMP
        omp_set_num_threads(1);
#endif
        auto res = batch_eval(chroms, FA, GB, VC, gene_lim,
                              ref_atoms, ref_residue, cleftgrid, function);
        serial_total += res.wall_ms;
    }
    report.serial_ms = serial_total / n_reps;

    // Parallel (all cores); the last run's result is kept for validation.
    double par_total = 0.0;
    for (int r = 0; r < n_reps; ++r) {
#ifdef _OPENMP
        omp_set_num_threads(report.n_threads);
#endif
        report.parallel_result = batch_eval(chroms, FA, GB, VC, gene_lim,
                                            ref_atoms, ref_residue,
                                            cleftgrid, function);
        par_total += report.parallel_result.wall_ms;
    }
    report.parallel_ms = par_total / n_reps;
    // Avoid inf/nan speedup when the parallel time rounds to zero
    // (e.g. an empty population).
    report.speedup = (report.parallel_ms > 0.0)
                   ? report.serial_ms / report.parallel_ms
                   : 0.0;

    // Print benchmark table to stdout
    std::printf(
        "\n╔══ VoronoiCFBatch benchmark ══════════════════════════════════════╗\n"
        "║ Chromosomes : %6d ║\n"
        "║ Threads : %6d (OpenMP) ║\n"
        "║ Repetitions : %6d ║\n"
        "║──────────────────────────────────────────────────────────────────║\n"
        "║ Serial : %8.2f ms ║\n"
        "║ Parallel : %8.2f ms ║\n"
        "║ Speedup : %8.2f × ║\n"
        "╚══════════════════════════════════════════════════════════════════╝\n",
        report.n_chroms, report.n_threads, n_reps,
        report.serial_ms, report.parallel_ms, report.speedup);

    return report;
}
321+
322+
} // namespace voronoi_cf

0 commit comments

Comments
 (0)