Skip to content

Commit 543f3b2

Browse files
committed
Add instructions on how to use the cuda version to the staging folder
1 parent a8d63ff commit 543f3b2

File tree

6 files changed

+1795
-0
lines changed

6 files changed

+1795
-0
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import jcuda.*;
2+
import jcuda.driver.*;
3+
import jcuda.nvrtc.*;
4+
import jcuda.runtime.JCuda;
5+
6+
import java.io.IOException;
7+
import java.nio.charset.StandardCharsets;
8+
import java.nio.file.Files;
9+
import java.nio.file.Paths;
10+
11+
import static jcuda.driver.JCudaDriver.cuCtxCreate;
12+
13+
public class PhiloxJNvrtcExample {
14+
15+
public static void main(String[] args) {
16+
// Enable exceptions and omit error checks
17+
JCuda.setExceptionsEnabled(true);
18+
JCudaDriver.setExceptionsEnabled(true);
19+
JNvrtc.setExceptionsEnabled(true);
20+
21+
String ptx = "";
22+
try {
23+
ptx = new String(Files.readAllBytes(Paths.get("philox_kernel.ptx")));
24+
} catch (IOException e) {
25+
System.out.println(e.getMessage());
26+
}
27+
28+
// Print the PTX for debugging
29+
//System.out.println("Generated PTX:");
30+
// System.out.println(ptx);
31+
32+
// Initialize the driver API and create a context
33+
JCudaDriver.cuInit(0);
34+
CUdevice device = new CUdevice();
35+
JCudaDriver.cuDeviceGet(device, 0);
36+
CUcontext context = new CUcontext();
37+
cuCtxCreate(context, 0, device);
38+
39+
CUmodule module = new CUmodule();
40+
JCudaDriver.cuModuleLoadData(module, ptx);
41+
42+
// Get a function pointer to the kernel
43+
CUfunction function = new CUfunction();
44+
JCudaDriver.cuModuleGetFunction(function, module, "philox_4_64");
45+
46+
// Prepare data
47+
int n = 1000; // Number of random numbers to generate
48+
long[] hostOut = new long[n];
49+
CUdeviceptr deviceOut = new CUdeviceptr();
50+
JCudaDriver.cuMemAlloc(deviceOut, n * Sizeof.LONG);
51+
52+
// Direkte Werte für seed und startingCounter
53+
long seed = 0L; // Fester Seed-Wert
54+
long startingCounter = 0L; // Startwert für Counter
55+
56+
Pointer kernelParameters = Pointer.to(
57+
Pointer.to(deviceOut), // ulong* output
58+
Pointer.to(new long[]{seed}), // uint64_t seed
59+
Pointer.to(new long[]{startingCounter}), // uint64_t startingCounter
60+
Pointer.to(new long[]{n}) // size_t numElements
61+
);
62+
63+
// Launch the kernel
64+
int blockSizeX = 128;
65+
int gridSizeX = (int) Math.ceil((double)n / blockSizeX);
66+
JCudaDriver.cuLaunchKernel(
67+
function,
68+
gridSizeX, 1, 1, // Grid dimension
69+
blockSizeX, 1, 1, // Block dimension
70+
0, null, // Shared memory size and stream
71+
kernelParameters, null // Kernel- und extra parameters
72+
);
73+
JCudaDriver.cuCtxSynchronize();
74+
75+
// Copy result back
76+
JCudaDriver.cuMemcpyDtoH(Pointer.to(hostOut), deviceOut, n * Sizeof.LONG);
77+
78+
// Print results
79+
System.out.println("Generated random numbers with seed=" +
80+
String.format("0x%016X", seed) +
81+
" and startingCounter=" + startingCounter);
82+
for (int i = 0; i < Math.min(10, n); i++) {
83+
System.out.printf("hostOut[%d] = 0x%016X\n", i, hostOut[i]);
84+
}
85+
86+
// Cleanup
87+
JCudaDriver.cuMemFree(deviceOut);
88+
JCudaDriver.cuCtxDestroy(context);
89+
}
90+
}
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
import jcuda.*;
2+
import jcuda.driver.*;
3+
4+
import java.io.BufferedReader;
5+
import java.io.File;
6+
import java.io.FileWriter;
7+
import java.io.InputStreamReader;
8+
import java.util.ArrayList;
9+
import java.util.List;
10+
import java.util.Random;
11+
12+
import static java.nio.file.Files.readAllBytes;
13+
import static jcuda.driver.JCudaDriver.*;
14+
15+
public class PhiloxRuntimeCompilationExample implements AutoCloseable {
16+
private static String philox4x64KernelSource = "#include <cuda_runtime.h>\n" +
17+
"#include <Random123/philox.h>\n" +
18+
"extern \"C\" __global__ void philox_4_64(ulong* output, uint64_t startingCounter, uint64_t seed, size_t numElements) {\n"
19+
+
20+
" uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n" +
21+
" if (idx * 4 < numElements) {\n" +
22+
" r123::Philox4x64 rng;\n" +
23+
" r123::Philox4x64::ctr_type ctr = {{startingCounter + idx, 0, 0, 0}};\n" +
24+
" r123::Philox4x64::key_type key = {{seed}};\n" +
25+
" r123::Philox4x64::ctr_type result = rng(ctr, key);\n" +
26+
" for (int i = 0; i < 4; ++i) {\n" +
27+
" size_t outputIdx = idx * 4 + i;\n" +
28+
" if (outputIdx < numElements) {\n" +
29+
" output[outputIdx] = result[i];\n" +
30+
" }\n" +
31+
" }\n" +
32+
" }\n" +
33+
"}\n";
34+
35+
private final CUcontext context;
36+
private final CUmodule module;
37+
private final CUfunction function;
38+
private final int blockSize;
39+
40+
public PhiloxRuntimeCompilationExample() {
41+
JCudaDriver.setExceptionsEnabled(true);
42+
// Initialize CUDA
43+
cuInit(0);
44+
CUdevice device = new CUdevice();
45+
cuDeviceGet(device, 0);
46+
context = new CUcontext();
47+
int result = cuCtxCreate(context, 0, device);
48+
if (result != CUresult.CUDA_SUCCESS) {
49+
throw new RuntimeException(
50+
"Kontext-Erstellung fehlgeschlagen: " + result + ", " + CUresult.stringFor(result));
51+
}
52+
53+
// Compile to PTX
54+
String ptx = compileToTPX(philox4x64KernelSource);
55+
56+
// Load the PTX
57+
module = new CUmodule();
58+
cuModuleLoadData(module, ptx);
59+
function = new CUfunction();
60+
cuModuleGetFunction(function, module, "philox_4_64");
61+
62+
// Set block size based on device capabilities
63+
blockSize = 64; // Can be adjusted based on device properties
64+
}
65+
66+
private String compileToTPX(String source) {
67+
try {
68+
// Temporäre Dateien erstellen
69+
File sourceFile = File.createTempFile("philox_kernel", ".cu");
70+
File outputFile = File.createTempFile("philox_kernel", ".ptx");
71+
72+
// CUDA-Quellcode in temporäre Datei schreiben
73+
try (FileWriter writer = new FileWriter(sourceFile)) {
74+
writer.write(philox4x64KernelSource);
75+
}
76+
77+
// nvcc Kommando zusammenbauen
78+
List<String> command = new ArrayList<>();
79+
command.add("/usr/local/cuda/bin/nvcc");
80+
command.add("-ccbin");
81+
command.add("gcc-8");
82+
command.add("--ptx"); // PTX-Output generieren
83+
command.add("-o");
84+
command.add(outputFile.getAbsolutePath());
85+
command.add("-I");
86+
command.add("./lib/random123/include");
87+
command.add(sourceFile.getAbsolutePath());
88+
89+
// Prozess erstellen und ausführen
90+
ProcessBuilder pb = new ProcessBuilder(command);
91+
pb.redirectErrorStream(true);
92+
Process process = pb.start();
93+
94+
// Output des Kompilers lesen
95+
try (BufferedReader reader = new BufferedReader(
96+
new InputStreamReader(process.getInputStream()))) {
97+
String line;
98+
StringBuilder output = new StringBuilder();
99+
while ((line = reader.readLine()) != null) {
100+
output.append(line).append("\n");
101+
}
102+
System.out.println("Compiler Output: " + output.toString());
103+
}
104+
105+
// Auf Prozessende warten
106+
int exitCode = process.waitFor();
107+
if (exitCode != 0) {
108+
throw new RuntimeException("nvcc Kompilierung fehlgeschlagen mit Exit-Code: " + exitCode);
109+
}
110+
111+
// PTX-Datei einlesen
112+
String ptxCode = new String(readAllBytes(outputFile.toPath()));
113+
114+
// Aufräumen
115+
sourceFile.delete();
116+
outputFile.delete();
117+
118+
return ptxCode;
119+
120+
} catch (Exception e) {
121+
throw new RuntimeException("Fehler bei der CUDA-Kompilierung: " + e.getMessage(), e);
122+
}
123+
}
124+
125+
/**
126+
* Generates random numbers using the Philox4x64 algorithm
127+
*
128+
* @param startingCounter Initial counter value
129+
* @param seed Random seed
130+
* @param numElements Number of random numbers to generate
131+
* @return Array of random numbers
132+
*/
133+
public CUdeviceptr Philox4x64(long startingCounter, long seed, int numElements) {
134+
// Allocate host memory for results
135+
// long[] hostOutput = new long[numElements];
136+
137+
// Allocate device memory
138+
CUdeviceptr deviceOutput = new CUdeviceptr();
139+
cuMemAlloc(deviceOutput, (long) numElements * Sizeof.LONG);
140+
141+
try {
142+
// Set up kernel parameters mit Debugging
143+
System.out.printf("numElements: %d, seed: %d, startingCounter: %d%n",
144+
numElements, seed, startingCounter);
145+
146+
Pointer kernelParams = Pointer.to(
147+
Pointer.to(deviceOutput),
148+
Pointer.to(new long[] { startingCounter }),
149+
Pointer.to(new long[] { seed }),
150+
Pointer.to(new long[] { numElements }));
151+
152+
// Calculate grid size
153+
int gridSize = (numElements + (blockSize * 4) - 1) / (blockSize * 4);
154+
155+
// Launch kernel mit Fehlerprüfung
156+
int kernelResult = cuLaunchKernel(function,
157+
gridSize, 1, 1, // Grid dimension
158+
blockSize, 1, 1, // Block dimension
159+
0, null, // Shared memory size and stream
160+
kernelParams, null // Kernel parameters and extra parameters
161+
);
162+
if (kernelResult != CUresult.CUDA_SUCCESS) {
163+
throw new RuntimeException(
164+
"Kernel-Launch fehlgeschlagen: " + kernelResult + ", " + CUresult.stringFor(kernelResult));
165+
}
166+
167+
// Copy results back to host
168+
// cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, (long) numElements *
169+
// Sizeof.LONG);
170+
} finally {
171+
// Free device memory
172+
// cuMemFree(deviceOutput);
173+
}
174+
175+
// return hostOutput;
176+
return deviceOutput;
177+
}
178+
179+
/**
180+
* Cleans up CUDA resources
181+
*/
182+
public void close() {
183+
cuModuleUnload(module);
184+
cuCtxDestroy(context);
185+
}
186+
187+
// Example usage
188+
public static void main(String[] args) {
189+
try (PhiloxRuntimeCompilationExample generator = new PhiloxRuntimeCompilationExample()) {
190+
// Generate 1 million random numbers
191+
int numElements = 1_000_000;
192+
long seed = 0L;
193+
long startingCounter = 0L;
194+
195+
CUdeviceptr randomNumbers = generator.Philox4x64(startingCounter, seed, numElements);
196+
197+
long[] elements = new long[10];
198+
cuMemcpyDtoH(Pointer.to(elements), randomNumbers, 10L * Sizeof.LONG);
199+
cuMemFree(randomNumbers);
200+
201+
// Print first few numbers
202+
System.out.println("First 10 random numbers:");
203+
for (int i = 0; i < 10; i++) {
204+
System.out.printf("%d: %x%n", i, elements[i]);
205+
}
206+
207+
int size = 10_000_000;
208+
long start = System.currentTimeMillis();
209+
CUdeviceptr ptr = generator.Philox4x64(0L, 0L, size);
210+
long end = System.currentTimeMillis();
211+
System.out.println("philox4x64 speed test: " + (end - start) * 1000 + " microseconds");
212+
cuMemFree(ptr);
213+
Random r = new Random();
214+
long javaStart = System.currentTimeMillis();
215+
for (int i = 0; i < size; i++) {
216+
r.nextLong();
217+
}
218+
long javaEnd = System.currentTimeMillis();
219+
System.out.println("java speed test: " + (javaEnd - javaStart) * 1000 + " microseconds");
220+
System.out.println("philox4x64 is " + (double) (javaEnd - javaStart) / (double) (end - start)
221+
+ " times faster than java");
222+
223+
}
224+
}
225+
}

0 commit comments

Comments
 (0)