diff --git a/baseHistogram.mjs b/baseHistogram.mjs new file mode 100644 index 0000000..b95fc21 --- /dev/null +++ b/baseHistogram.mjs @@ -0,0 +1,428 @@ +import { range, arrayProd, datatypeToTypedArray, datatypeToBytes } from "./util.mjs"; +import { + BasePrimitive, + Kernel, + AllocateBuffer, +} from "./primitive.mjs"; +import { BaseTestSuite } from "./testsuite.mjs"; +import { BinOpAddU32, BinOpAddF32 } from "./binop.mjs"; +import { arithmeticBinCPU, lookupBinCPU, buildHistogramStrategy } from "./histogramStrategies.mjs"; +import { generateHistogramInitKernel, generateHistogramSweepKernel } from "./histogramKernels.mjs"; + +/* GPU histogram primitive (even or custom bins). Exported because histogram.mjs imports { BaseHistogram } from this module. */ +export class BaseHistogram extends BasePrimitive { + + constructor(args) { + super(args); + + // Required parameters + for (const required of ["datatype"]) { + if (!this[required]) { + throw new Error(`${this.constructor.name}: ${required} is required`); + } + } + if (args.bins) { + if (args.bins.type === 'even') { + this.binType = 'even'; + this.numBins = args.bins.numBins; + if (!this.numBins) { + throw new Error(`${this.constructor.name}: bins must have 'numBins'`); + } + this.minValue = args.bins.min ?? 0.0; + this.maxValue = args.bins.max ?? 1.0; + } else if (args.bins.type === 'custom') { + this.binType = 'custom'; + this.binEdges = args.bins.bin_edges; + + this.numBins = this.binEdges.length - 1; + this.minValue = this.binEdges[0]; + this.maxValue = this.binEdges[this.binEdges.length - 1]; + } else { + throw new Error(`${this.constructor.name}: unknown bins type ${args.bins.type}`); + } + + } else { + if (!args.numBins) { + throw new Error(`${this.constructor.name}: numBins is required`); + } + this.binType = 'even'; + this.numBins = args.numBins; + this.minValue = args.minValue ?? 0.0; + this.maxValue = args.maxValue ?? 1.0; + } + + + // Optional: binop parameter (defaults to BinOpAddU32 for counting)--can be sum, min, max(for further extensions) + this.binop = args.binop ??
BinOpAddU32; + + if (!this.binop) { + throw new Error(`${this.constructor.name}: binop is required`); + } + + this.knownBuffers = [ + "inputBuffer", + "outputBuffer", + "uniforms" + ]; + + for (const knownBuffer of this.knownBuffers) { + if (knownBuffer in args) { + this.registerBuffer({ + label: knownBuffer, + buffer: args[knownBuffer], + device: this.device, + }); + delete this[knownBuffer]; + } + } + + this.outputDatatype = this.binop.datatype; + + } + //if bandwidth is low check this + get bytesTransferred() { + // Histogram memory traffic: + // 1. Init kernel: Write entire output buffer (numBins * 4 bytes) + // 2. Sweep kernel: + // - Read entire input buffer (inputLength * datatype_size) + // - Atomic RMW on output buffer: Each input does read+write (2 * 4 bytes * inputLength) + // Conservative estimate: read input once + write output once + const inputBytes = this.getBuffer("inputBuffer").size; + const outputBytes = this.getBuffer("outputBuffer").size; + + // For histogram: input read once + output init + output atomic writes + return inputBytes + (2 * outputBytes); // Read input, init output, update output + } + + finalizeRuntimeParameters() { + const inputBuffer = this.getBuffer("inputBuffer"); + const inputSize = inputBuffer.size / datatypeToBytes(this.datatype); + + //rebuild config to ensure correct min/max values- buildStrategy.mjs + + this.config = buildHistogramStrategy({ + bins: this.binType === 'custom' + ? 
{ type: 'custom', bin_edges: this.binEdges, min: this.minValue, max: this.maxValue } + : { type: 'even', numBins: this.numBins, min: this.minValue, max: this.maxValue }, + datatype: this.datatype, + inputSize: inputSize, + binOp: this.binop + }); + this.config.initKernel = generateHistogramInitKernel(this.config.numBins); + this.config.sweepKernel = generateHistogramSweepKernel(this.config, this.datatype); + } + + + + compute() { + this.finalizeRuntimeParameters(); + const operations = []; + + operations.push( + new AllocateBuffer({ + label: "outputBuffer", + size: this.config.buffers.output.size, + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC, + }) + ); + + + // Unified uniform buffer for both init and sweep kernels + operations.push( + new AllocateBuffer({ + label: "uniforms", + size: this.config.buffers.uniforms.size, + usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, + populateWith: new Uint32Array(this.config.buffers.uniforms.data) + }) + ); + + // Bin edges buffer (if custom bins) + if (this.config.buffers.bin_edges) { + operations.push( + new AllocateBuffer({ + label: "bin_edges", + size: this.config.buffers.bin_edges.size, + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST, + populateWith: this.config.buffers.bin_edges.data + }) + ); + } + + operations.push( + new Kernel({ + kernel: () => this.config.initKernel, + entryPoint: "histogramInitKernel", + bufferTypes: [["storage", "uniform"]], + bindings: [["outputBuffer", "uniforms"]], + label: "histogram init", + getDispatchGeometry: () => this.config.dispatchDimensions + }) + + ); + + // Sweep kernel + operations.push( + new Kernel({ + kernel: () => this.config.sweepKernel, + entryPoint: "histogramKernel", + bufferTypes: this.config.buffers.bin_edges + ? [["read-only-storage", "storage", "uniform", "read-only-storage"]] + : [["read-only-storage", "storage", "uniform"]], + bindings: this.config.buffers.bin_edges + ? 
[["inputBuffer", "outputBuffer", "uniforms", "bin_edges"]] + : [["inputBuffer", "outputBuffer", "uniforms"]], + label: "histogram sweep", + getDispatchGeometry: () => this.config.dispatchDimensions + }) + ); + + return operations; + } + + validate = (args = {}) => { + const memsrc = args.inputBuffer ?? this.getBuffer("inputBuffer")?.cpuBuffer; + const memdest = args.outputBuffer ?? this.getBuffer("outputBuffer")?.cpuBuffer; + + if (!memsrc || !memdest) { + return ""; + } + + const OutputArrayType = { + 'u32': Uint32Array, + 'i32': Int32Array, + 'f32': Float32Array + }[this.outputDatatype] ?? Uint32Array; + + let referenceOutput = new OutputArrayType(this.numBins); + + // Initialize based on binop + for (let bin = 0; bin < this.numBins; bin++) { + referenceOutput[bin] = this.binop.identity ?? 0; + } + + // Process each sample + for (let i = 0; i < memsrc.length; i++) { + const value = memsrc[i]; + let binIndex = -1; + //binary serach for custom bins + if (this.binType === 'custom') { + binIndex = lookupBinCPU(value, this.binEdges); + } else { + // Arithmetic binning + binIndex = arithmeticBinCPU(value, this.minValue, this.maxValue, this.numBins); + } + + // Ignore out-of-bounds inputs + if (binIndex >= 0 && binIndex < this.numBins) { + // For histogram: increment count by 1 (not by the value!) 
+ referenceOutput[binIndex] = this.binop.op( + referenceOutput[binIndex], + 1 + ); + } + } + + // Compare results + let returnString = ""; + let allowedErrors = 5; + + for (let bin = 0; bin < this.numBins; bin++) { + if (allowedErrors == 0) break; + if (referenceOutput[bin] != memdest[bin]) { + returnString += `\nBin ${bin}: expected ${referenceOutput[bin]}, got ${memdest[bin]}.`; + allowedErrors--; + } + } + + return returnString; + }; + + + getBinMetadata() { + if (this.binType === 'even') { + const binWidth = (this.maxValue - this.minValue) / this.numBins; + return Array.from({ length: this.numBins }, (_, i) => ({ + index: i, + range: [ + this.minValue + i * binWidth, + this.minValue + (i + 1) * binWidth + ], + label: `[${(this.minValue + i * binWidth).toFixed(3)}, ${(this.minValue + (i + 1) * binWidth).toFixed(3)})` + })); + } else { + // Custom bins + return Array.from({ length: this.numBins }, (_, i) => ({ + index: i, + range: [this.binEdges[i], this.binEdges[i + 1]], + label: `[${this.binEdges[i]}, ${this.binEdges[i + 1]})` + })); + } + } + + + getHistogramResult(customData = null) { + const counts = customData || this.getBuffer("outputBuffer").cpuBuffer; + + if (!counts) { + console.warn('getHistogramResult: No histogram data available'); + return []; + } + + const metadata = this.getBinMetadata(); + + return metadata.map((bin, i) => ({ + bin: i, + range: bin.range, + label: bin.label, + count: counts[i] + })); + } + /** + */ + getBinIndex(value) { + if (this.binType === 'even') { + // Arithmetic binning (same as GPU kernel) + const scale = this.numBins / (this.maxValue - this.minValue); + const normalized = (value - this.minValue) * scale; + const bin = Math.floor(normalized); + + // Clamp to valid range + if (bin < 0 || bin >= this.numBins) { + return -1; + } + return bin; + } else { + // Custom bins - binary search (same as GPU kernel) + if (value < this.binEdges[0] || value >= this.binEdges[this.binEdges.length - 1]) { + return -1; // Out of bounds + } 
+ + // Binary search to find bin + let left = 0; + let right = this.numBins; + + while (left < right) { + const mid = Math.floor((left + right) / 2); + if (value < this.binEdges[mid]) { + right = mid; + } else if (value >= this.binEdges[mid + 1]) { + left = mid + 1; + } else { + return mid; + } + } + + return -1; // Shouldn't reach here + } + } + + /**/ + getBinIndices(values) { + const indices = new Uint32Array(values.length); + + for (let i = 0; i < values.length; i++) { + const binIndex = this.getBinIndex(values[i]); + indices[i] = binIndex === -1 ? 0xFFFFFFFF : binIndex; // Use max uint32 for out-of-bounds + } + + return indices; + } +} + + +// Test Suite Configuration + +export const histogramBandwidthPlot = { + x: { field: "inputBytes", label: "Input array size (B)" }, + y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" }, + fx: { field: "timing" }, + stroke: { field: "inputLength" }, + test_br: "gpuinfo.description", + caption: "Lines are input length", +}; + +export const histogramTimePlot = { + x: { field: "inputBytes", label: "Input array size (B)" }, + y: { field: "cpuTotalTimeMS", label: "Time (ms)" }, + fx: { field: "timing" }, + stroke: { field: "inputLength" }, + test_br: "gpuinfo.description", + caption: "Lines are input length", +}; + +export const histogramBinTypePlot = { + x: { field: "inputBytes", label: "Input array size (B)" }, + y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" }, + fx: { field: "timing" }, + stroke: { field: "binType" }, + test_br: "gpuinfo.description", + caption: "Lines are bin type (even/custom)", +}; + +export const HistogramEvenBinsBWPlot = { + x: { field: "inputBytes", label: "Input array size (B)" }, + y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" }, + stroke: { field: "timing" }, + test_br: "gpuinfo.description", + caption: "CPU timing (performance.now), GPU timing (timestamps)", +}; + +export const HistogramCustomBinsBWPlot = { + x: { field: "inputBytes", label: "Input array size 
(B)" }, + y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" }, + stroke: { field: "timing" }, + test_br: "gpuinfo.description", + caption: "CPU timing (performance.now), GPU timing (timestamps)", +}; + +const HistogramParams = { + inputLength: range(8, 27).map((i) => 2 ** i), + numBins: [64], +}; + +// eslint-disable-next-line no-unused-vars +const HistogramParamsSingleton = { + inputLength: [2 ** 27], + numBins: [64], +}; + +export const EvenBinsHistogramTestSuite = new BaseTestSuite({ + category: "histogram", + testSuite: "even bins histogram", + trials: 20, + params: HistogramParams, + uniqueRuns: ["inputLength", "numBins"], + primitive: BaseHistogram, + primitiveArgs: { + datatype: "f32", + binop: BinOpAddU32, + bins: { type: 'even', numBins: 64, min: 0.0, max: 100.0 }, + gputimestamps: true, + }, + plots: [ + HistogramEvenBinsBWPlot, + ], +}); + +export const CustomBinsHistogramTestSuite = new BaseTestSuite({ + category: "histogram", + testSuite: "custom bins histogram", + trials: 10, + params: HistogramParams, + uniqueRuns: ["inputLength", "numBins"], + primitive: BaseHistogram, + primitiveArgs: { + datatype: "f32", + binop: BinOpAddU32, + bins: { + type: 'custom', + bin_edges: [0.0, 10.0, 25.0, 50.0, 75.0, 90.0, 100.0] + }, + gputimestamps: true, + }, + plots: [ + HistogramCustomBinsBWPlot, + ], +}); + diff --git a/benchmarking.mjs b/benchmarking.mjs index 62770df..9e16c57 100644 --- a/benchmarking.mjs +++ b/benchmarking.mjs @@ -20,21 +20,12 @@ if (typeof process !== "undefined" && process.release.name === "node") { Plot = await import( "https://cdn.jsdelivr.net/npm/@observablehq/plot@0.6/+esm" ); - /* begin https://github.com/sharonchoong/svg-exportJS */ - /* svg-exportJS prerequisite: canvg */ - await import("https://cdnjs.cloudflare.com/ajax/libs/canvg/3.0.9/umd.js"); - /* svg-exportJS plugin */ - await import("https://sharonchoong.github.io/svg-exportJS/svg-export.min.js"); - /* end https://github.com/sharonchoong/svg-exportJS */ const 
urlParams = new URL(window.location.href).searchParams; - saveJSON = urlParams.get("saveJSON"); // string or undefined + saveJSON = urlParams.get("saveJSON"); if (saveJSON === "false") { saveJSON = false; } - saveSVG = urlParams.get("saveSVG"); // string or undefined - if (saveSVG === "false") { - saveSVG = false; - } + saveSVG = false; if (Window.crossOriginIsolated) { console.info("IS cross-origin isolated"); } else { @@ -45,7 +36,7 @@ if (typeof process !== "undefined" && process.release.name === "node") { // tests // import { NoAtomicPKReduceTestSuite } from "./reduce.mjs"; // import { HierarchicalScanTestSuite } from "./scan.mjs"; -import { +/*import { // DLDFScanTestSuite, // DLDFReduceTestSuite, DLDFScanAccuracyRegressionSuite, @@ -64,7 +55,8 @@ import { SortOneSweepFunctionalRegressionSuite, SortOneSweep64v32Suite, SortOneSweep64v321MNoPlotSuite, -} from "./onesweep.mjs"; +} from "./onesweep.mjs";*/ +import { EvenBinsHistogramTestSuite, CustomBinsHistogramTestSuite } from "./baseHistogram.mjs" import { BasePrimitive } from "./primitive.mjs"; async function main(navigator) { @@ -135,7 +127,7 @@ async function main(navigator) { //); // let testSuites = [DLDFScanMiniSuite]; // let testSuites = [DLDFScanAccuracyRegressionSuite]; - let testSuites = [DLDFPerfSuite]; + let testSuites = [EvenBinsHistogramTestSuite, CustomBinsHistogramTestSuite]; // let testSuites = [DLDFDottedCachePerfTestSuite]; // let testSuites = [DLDFDottedCachePerf2TestSuite]; // let testSuites = [DLDFSingletonWithTimingSuite]; @@ -240,7 +232,7 @@ async function main(navigator) { device, datatype: testSuite.category === "subgroups" && - testSuite.testSuite === "subgroupBallot" + testSuite.testSuite === "subgroupBallot" ? "vec4u" : primitive.datatype, length: @@ -409,10 +401,8 @@ async function main(navigator) { } if (validations.done > 0) { console.info( - `${validations.done} validation${ - validations.done === 1 ? "" : "s" - } complete${validations?.tested ? 
` (${validations.tested})` : ""}, ${ - validations.errors + `${validations.done} validation${validations.done === 1 ? "" : "s" + } complete${validations?.tested ? ` (${validations.tested})` : ""}, ${validations.errors } error${validations.errors === 1 ? "" : "s"}.` ); } @@ -424,9 +414,9 @@ async function main(navigator) { /* default: if filter not specified, only take expts from the last test we ran */ let filteredExpts = expts.filter( plot.filter ?? - ((row) => - row.testSuite === lastTestSeen.testSuite && - row.category === lastTestSeen.category) + ((row) => + row.testSuite === lastTestSeen.testSuite && + row.category === lastTestSeen.category) ); const mark = plot.mark ?? "lineY"; console.info( @@ -500,12 +490,6 @@ async function main(navigator) { const div = document.querySelector("#plot"); div.append(plotted); if (saveSVG) { - // eslint-disable-next-line no-undef - svgExport.downloadSvg( - div.lastChild, - `${testSuite.testsuite}-${testSuite.category}`, // chart title: file name of exported image - {} - ); } div.append(document.createElement("hr")); } @@ -579,6 +563,7 @@ function processAndRecordResults( bandwidth: result.bandwidthCPU, inputItemsPerSecondE9: result.inputItemsPerSecondE9CPU, }); + console.log(`[${testSuite.category}] GPU BW: ${result.bandwidthGPU.toFixed(2)} GB/s | CPU BW: ${result.bandwidthCPU.toFixed(2)} GB/s | Input: ${result.inputBytes} bytes`); } export { main }; diff --git a/histogram.mjs b/histogram.mjs new file mode 100644 index 0000000..2109554 --- /dev/null +++ b/histogram.mjs @@ -0,0 +1,7 @@ +import { BaseHistogram } from "./baseHistogram.mjs"; +export class Histogram extends BaseHistogram { + constructor(args) { + super(args); + } +} + diff --git a/histogramKernels.mjs b/histogramKernels.mjs new file mode 100644 index 0000000..da8ab91 --- /dev/null +++ b/histogramKernels.mjs @@ -0,0 +1,134 @@ +/* */ + + +export function generateHistogramInitKernel(numBins) { + return ` +@group(0) @binding(0) var output: array>; // ← Add @ symbol! 
+ +struct Uniforms { + inputSize: u32, + numBins: u32, + minValue: f32, + binScale: f32, +} + +@group(0) @binding(1) var uniforms: Uniforms; + +@compute @workgroup_size(256) +fn histogramInitKernel(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + + if (idx < uniforms.numBins) { + atomicStore(&output[idx], 0u); + } +} +`; +} + +/** + * Generate WGSL for histogram sweep kernel (KERNEL 2) + * Purpose: Compute histogram using 3-phase: + * Phase 1: Initialize workgroup-private histogram (shared memory) + * Phase 2: Process input with grid-stride loop + * Phase 3: Merge workgroup results to global histogram + + */ +export function generateHistogramSweepKernel(config, datatype) { + const { strategyType, numBins, useSharedMemory, workgroupSize } = config; + const hasBinEdges = strategyType === 'lookup'; + + // Get binning function from strategy(It's a function that generates the binning logic code where it tells the GPU HOW to decide which bin a value belongs to.) + const binningFunctionWGSL = config.generateBinningWGSL(); + + const sharedMemoryDecl = useSharedMemory ? ` +var privateHistogram: array, ${numBins}>; +` : ''; + + const bindings = ` +@group(0) @binding(0) var input: array<${datatype}>; +@group(0) @binding(1) var output: array>; +@group(0) @binding(2) var uniforms: Uniforms; +${hasBinEdges ? '@group(0) @binding(3) var bin_edges: array;' : ''} +`; + + + const uniformsStruct = ` +struct Uniforms { + inputSize: u32, + numBins: u32, + minValue: f32, + binScale: f32, +} +`; + + const mainKernel = ` +@compute @workgroup_size(${workgroupSize}) +fn histogramKernel( + @builtin(global_invocation_id) gid: vec3, + @builtin(local_invocation_id) lid: vec3, + @builtin(workgroup_id) wgid: vec3, + @builtin(num_workgroups) numWorkgroups: vec3 +) { + let localIdx = lid.x; + let globalIdx = gid.x; + + ${useSharedMemory ? 
` + // Each thread initializes multiple bins using stride loop + var b = localIdx; + while (b < uniforms.numBins) { + atomicStore(&privateHistogram[b], 0u); + b += ${workgroupSize}u; + } + workgroupBarrier(); + ` : ''} + + // PHASE 2: Process samples (grid-stride loop) + var idx = globalIdx; + let stride = numWorkgroups.x * ${workgroupSize}u; + + while (idx < uniforms.inputSize) { + let sample = input[idx]; + + ${strategyType === 'lookup' + ? 'let bin = bin_values(sample, &bin_edges);' + : 'let bin = bin_value(sample);' + } + + + if (bin != 0xFFFFFFFFu && bin < uniforms.numBins) { + ${useSharedMemory + ? 'atomicAdd(&privateHistogram[bin], 1u);' + : 'atomicAdd(&output[bin], 1u);'} + } + + idx += stride; + } + + // + // Each thread merges multiple bins using stride loop. + ${useSharedMemory ? ` + workgroupBarrier(); + + var bin = localIdx; + while (bin < uniforms.numBins) { + let count = atomicLoad(&privateHistogram[bin]); + if (count > 0u) { + atomicAdd(&output[bin], count); + } + bin += ${workgroupSize}u; + } + ` : ''} +} +`; + + const generatedCode = ` +${bindings} +${uniformsStruct} +${sharedMemoryDecl} +${binningFunctionWGSL} +${mainKernel} +`; + + + return generatedCode; +} diff --git a/histogramStrategies.mjs b/histogramStrategies.mjs new file mode 100644 index 0000000..14c6938 --- /dev/null +++ b/histogramStrategies.mjs @@ -0,0 +1,358 @@ +/*Histogram Strategy Builder + +/*Maximum bins that fit in workgroup shared memory */ +export const MAX_SHARED_MEMORY_BINS = 256; + +/*Default workgroup size*/ +export const DEFAULT_WORKGROUP_SIZE = 256; + +/*Values processed per thread (coalesced memory access)*/ +export const VAL_PER_THREAD = 1; + +/*Target GPU occupancy multiplier */ +const OCCUPANCY_MULTIPLIER = 4; // GPU will handle grid-stride efficiently + +/*Maximum workgroups - WebGPU limit per dimension */ +const MAX_WORKGROUPS = 65535; + + +/*Datatype helper class*/ + +function createUnifiedUniformsBuffer(inputSize, numBins, minValue, binScale) { + const data = 
new ArrayBuffer(16); + const view = new DataView(data); + view.setUint32(0, inputSize, true); + view.setUint32(4, numBins, true); + view.setFloat32(8, minValue, true); + view.setFloat32(12, binScale, true); + return data; +} + + + +export function arithmeticBinCPU(inputValue, min, max, numBins) { + // Accept all values, clamp to range (matches GPU behavior) + const scale = numBins / (max - min); + const normalized = (inputValue - min) * scale; + let bin = Math.floor(normalized); + + // Clamp for out-of-bounds and floating-point edge cases + bin = Math.max(0, Math.min(bin, numBins - 1)); + + return bin; +} + +export function lookupBinCPU(inputValue, bin_edges) { + if (inputValue < bin_edges[0] || inputValue >= bin_edges[bin_edges.length - 1]) { + return -1; + } + + + // Binary search for upper bound + let left = 0; + let right = bin_edges.length - 1; + + + while (left < right - 1) { + const mid = Math.floor((left + right) / 2); + if (inputValue < bin_edges[mid]) { + right = mid; + } else { + left = mid; + } + } + + // Verify inputValue is in valid range [left, right) + if (inputValue >= bin_edges[left] && inputValue < bin_edges[right]) { + return left; + } + + return -1; // Out of bounds +} + + +// WGSL generation + + +/* Generate WGSL code for arithmetic binning (even bins). Every interpolated constant is wrapped in f32(...): JS prints 100.0 as "100", which WGSL reads as an integer literal, so the unwrapped form "64 / (100 - 0)" would be integer division and yield a scale of 0. */ + +function generateArithmeticWGSL(min, max, numBins, datatype) { + return ` +fn bin_value(value: ${datatype}) -> u32 { + let min_value: f32 = f32(${min}); + let bin_scale: f32 = f32(${numBins}) / (f32(${max}) - f32(${min})); + + let normalized: f32 = (f32(value) - min_value) * bin_scale; + var bin: i32 = i32(floor(normalized)); + + bin = clamp(bin, 0, i32(uniforms.numBins) - 1); + + return u32(bin); +} +`; +} +/*fn bin_value(value: ${datatype}) -> u32 { + // Read min and scale from uniforms (allows dynamic configuration) + let min_value: f32 = uniforms.minValue; + let bin_scale: f32 = uniforms.binScale; + + // Convert value to normalized bin index + let normalized: f32 = (f32(value) - min_value) * bin_scale; +
var bin: i32 = i32(floor(normalized)); + + // Clamp to valid range [0, numBins) + bin = clamp(bin, 0, i32(uniforms.numBins) - 1); + + return u32(bin); +} + +/* Generate WGSL code for lookup binning (binary search)*/ + +function generateLookupWGSL(bin_edges, datatype) { + return ` +fn bin_values(value: ${datatype}, bin_edges: ptr>) -> u32 { + let num_edges: u32 = ${bin_edges.length}u; + let value_f32: f32 = f32(value); + if(value_f32=bin_edges[num_edges-1u]){ + return 0xFFFFFFFFu; + } + + var left:u32=0u; + var right:u32=num_edges-1u; + while(left=bin_edges[left] && value_f32= bins.max) { + throw new Error( + `HistogramStrategy: bins.min (${bins.min}) must be < bins.max (${bins.max})` + ); + } + + // Check for arithmetic overflow in scale computation(not very small ranges or not very large ranges to avoid infinities) + const range = bins.max - bins.min; + const scale = bins.numBins / range; + + if (!Number.isFinite(scale)) { + throw new Error( + `HistogramStrategy: Bin scale computation overflow. ` + + `Range [${bins.min}, ${bins.max}] with ${bins.numBins} bins is too large.` + ); + } + + // Warn if too many bins-const (MAX_SHARED_MEMORY_BINS = 256); + if (bins.numBins > MAX_SHARED_MEMORY_BINS) { + console.warn( + `HistogramStrategy: ${bins.numBins} bins exceeds shared memory limit (${MAX_SHARED_MEMORY_BINS}). ` + + `Will use slower global atomics. 
Consider reducing bin count.` + ); + } +} + +/*Validate custom bins configuration*/ + +function validateCustomBins(bins) { + + if (!Array.isArray(bins.bin_edges)) { + throw new Error('HistogramStrategy: bins.bin_edges must be an array'); + } + + if (bins.bin_edges.length < 2) { + throw new Error( + `HistogramStrategy: bins.bin_edges must have at least 2 boundaries, got ${bins.bin_edges.length}` + ); + } + for (let i = 1; i < bins.bin_edges.length; i++) { + if (bins.bin_edges[i] <= bins.bin_edges[i - 1]) { + throw new Error(`HistogramStrategy: bin_edges must be monotonically increasing (found ${bins.bin_edges[i - 1]} >= ${bins.bin_edges[i]} at index ${i})`); + } + } + + + // Check all bin_edges are finite + for (let i = 0; i < bins.bin_edges.length; i++) { + if (!Number.isFinite(bins.bin_edges[i])) { + throw new Error( + `HistogramStrategy: bins.bin_edges[${i}] = ${bins.bin_edges[i]} is not finite` + ); + } + } + + // Check strictly ascending order + for (let i = 1; i < bins.bin_edges.length; i++) { + if (bins.bin_edges[i] <= bins.bin_edges[i - 1]) { + throw new Error( + `HistogramStrategy: bins.bin_edges must be strictly ascending. ` + + `Found bin_edges[${i - 1}] = ${bins.bin_edges[i - 1]} >= bin_edges[${i}] = ${bins.bin_edges[i]}` + ); + } + } + + // Warn if many bin_edges (slow binary search) + if (bins.bin_edges.length > 1000) { + console.warn( + `HistogramStrategy: ${bins.bin_edges.length} bin edges may cause slow binary search. 
` + + `Consider reducing number of bins.` + ); + } +} + +/*Calculate workgroup dimensions */ + +function calculateDimensions(inputSize, workgroupSize = DEFAULT_WORKGROUP_SIZE) { + + //workgroupSize=256 + const valuesPerWorkgroup = workgroupSize * VAL_PER_THREAD; + + // Calculate tiles needed to cover all input values + const tilesNeeded = Math.ceil(inputSize / valuesPerWorkgroup); + // Let small inputs use fewer workgroups naturally + let workgroupCount = Math.min(tilesNeeded, MAX_WORKGROUPS); + + // Estimate occupancy + const estimatedSMCount = 100; + const targetOccupancy = estimatedSMCount * OCCUPANCY_MULTIPLIER; + const estimatedOccupancy = Math.min(workgroupCount / targetOccupancy, 1.0); + + return { + workgroupSize, + workgroupCount, + valuesPerThread: VAL_PER_THREAD, + valuesPerWorkgroup, + dispatchDimensions: [workgroupCount, 1, 1], + estimatedOccupancy + }; +} + + + + +/*Build histogram execution configuration*/ +function planBuffers(numBins, inputSize, bins) { + const buffers = { + output: { + size: numBins * 4, // u32 = 4 bytes per bin + usage: 'STORAGE | COPY_SRC' + } + }; + + if (bins.type === 'custom') { + const edgesArray = new Float32Array(bins.bin_edges); + buffers.bin_edges = { + size: edgesArray.byteLength, + usage: 'STORAGE | COPY_DST', + data: edgesArray + }; + } + + return buffers; +} + +export function buildHistogramStrategy(userConfig) { + const { bins, datatype, inputSize } = userConfig; + + let strategyType; + let numBins; + let binningFunction; + let generateWGSL; + + if (bins.type === 'even') { + validateEvenBins(bins); + strategyType = 'arithmetic'; + numBins = bins.numBins; + binningFunction = (value) => arithmeticBinCPU(value, bins.min, bins.max, numBins); + generateWGSL = () => generateArithmeticWGSL(bins.min, bins.max, numBins, datatype); + + } else if (bins.type === 'custom') { + validateCustomBins(bins); + strategyType = 'lookup'; + numBins = bins.bin_edges.length - 1; + binningFunction = (value) => lookupBinCPU(value, 
bins.bin_edges); + generateWGSL = () => generateLookupWGSL(bins.bin_edges, datatype); + + } else { + throw new Error( + `HistogramStrategy: Unknown bins.type '${bins.type}'. Expected 'even' or 'custom'.` + ); + } + + const useSharedMemory = (numBins <= MAX_SHARED_MEMORY_BINS); + const dimensions = calculateDimensions(inputSize); + + // Plan buffers + const buffers = planBuffers(numBins, inputSize, bins); + + // Create unified uniform buffer + if (bins.type === 'even') { + const binScale = numBins / (bins.max - bins.min); + buffers.uniforms = { + size: 16, + usage: 'UNIFORM | COPY_DST', + data: createUnifiedUniformsBuffer(inputSize, numBins, bins.min, binScale) + }; + } else { + buffers.uniforms = { + size: 16, + usage: 'UNIFORM | COPY_DST', + data: createUnifiedUniformsBuffer(inputSize, numBins, 0.0, 0.0) + }; + } + + return { + strategyType, + numBins, + useSharedMemory, + binningFunction, + generateBinningWGSL: generateWGSL, + ...dimensions, + buffers, + metadata: { + strategyType, + numBins, + useSharedMemory, + workgroupSize: dimensions.workgroupSize, + workgroupCount: dimensions.workgroupCount, + estimatedOccupancy: dimensions.estimatedOccupancy, + } + }; +} + + + +