Use CLPtr instead

blegat · blegat · commit 3cd5966104cb · 2025-03-21T15:26:24.000+01:00
diff --git a/examples/hands_on_opencl/ex08/matmul.jl b/examples/hands_on_opencl/ex08/matmul.jl
@@ -151,6 +151,8 @@ else
 
 @info("=== OpenCL, matrix mult, C row, priv A, B, cols loc, order $Ndim ====")
 
+ORDER = 2
+
 for i in 1:COUNT
     fill!(h_C, 0.0)
     localmem = cl.LocalMem(Float32, Pdim)
diff --git a/examples/hands_on_opencl/ex09/pi_ocl.cl b/examples/hands_on_opencl/ex09/pi_ocl.cl
@@ -35,6 +35,8 @@ __kernel void pi(
 
    for(i= istart; i<iend; i++){
        x = (i+0.5f)*step_size;
+       // arctan(x)' = 1 / (1 + x^2)
+       // pi/4 = arctan(1)
        accum += 4.0f/(1.0f+x*x);
    }
 
diff --git a/examples/hands_on_opencl/ex09/pi_ocl.jl b/examples/hands_on_opencl/ex09/pi_ocl.jl
@@ -18,15 +18,15 @@ src_dir = dirname(Base.source_path())
 
 #
 # Some constant values
-const INSTEPS = 512*512*512
-const ITERS = 262144
+INSTEPS = 512*512*512
+ITERS = 262144
 
 # Set some default values:
 # Default number of steps (updated later to device prefereable)
-const in_nsteps = INSTEPS
+in_nsteps = INSTEPS
 
 # Default number of iterations
-const niters = ITERS
+niters = ITERS
 
 kernelsource = read(joinpath(src_dir, "pi_ocl.cl"), String)
 program = cl.Program(source=kernelsource) |> cl.build!
diff --git a/examples/hands_on_opencl/exA/pi_vocl.jl b/examples/hands_on_opencl/exA/pi_vocl.jl
@@ -27,6 +27,7 @@ if length(ARGS) < 1
     exit(1)
 end
 vector_size = parse(Int, ARGS[1])
+vector_size = 8
 
 if vector_size == 1
         ITERS = 262144
@@ -68,13 +69,11 @@ end
 nwork_groups = in_nsteps ÷ (work_group_size * niters)
 
 # get the max work group size for the kernel on our device
-if vector_size == 1
-    max_size = cl.work_group_info(pi_kernel, cl.device()).size
-elseif vector_size == 4
-    max_size = cl.work_group_info(pi_kernel, cl.device()).size
-elseif vector_size == 8
-    max_size = cl.work_group_info(pi_kernel, cl.device()).size
-end
+max_size = cl.work_group_info(pi_kernel, cl.device()).size
+cl.work_group_info(pi_kernel, cl.device()).prefered_size_multiple
+cl.work_group_info(pi_kernel, cl.device()).private_mem_size
+cl.work_group_info(pi_kernel, cl.device()).local_mem_size
+cl.work_group_info(pi_kernel, cl.device()).compile_size
 
 if max_size > work_group_size
     work_group_size = max_size
diff --git a/examples/notebooks/Transpose.ipynb b/examples/notebooks/Transpose.ipynb
diff --git a/examples/performance.jl b/examples/performance.jl
@@ -106,6 +106,6 @@ end
 # N_WORKERS has to be less than or equal to the device's max work group size
 # ex. N_WORKERS = 1 is non parallel execution on the gpu
 
-const N_DATA_PTS = Int(2^23) # ~8 million
-const N_WORKERS  = Int(2^7)
+const N_DATA_PTS = Int(2^2) # 4 data points (previously 2^23, ~8 million)
+const N_WORKERS  = Int(2^2)
 cl_performance(N_DATA_PTS, N_WORKERS)
diff --git a/test/Project.toml b/test/Project.toml
@@ -3,6 +3,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,8 @@ __kernel void pi(`
`35`	`35`
`36`	`36`	`for(i= istart; i<iend; i++){`
`37`	`37`	`x = (i+0.5f)*step_size;`
	`38`	`+ // arctan(x)' = 1 / (1 + x^2)`
	`39`	`+ // pi/4 = arctan(1)`
`38`	`40`	`accum += 4.0f/(1.0f+x*x);`
`39`	`41`	`}`
`40`	`42`