From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/4] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/4] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: 
[PATCH 3/4] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From 4ea109e69186b3d1c0dfa4a95826b0bfd582423a Mon Sep 17 00:00:00 2001 From: Crystal Lim Date: Sat, 21 Nov 2015 07:18:51 -0500 Subject: [PATCH 4/4] first commit --- HW3/P3/P3.txt | 31 ++++++++++++++++++++ HW3/P3/sum.cl | 55 ++++++++++++++++++++++++++++------ HW3/P4/median_filter.cl | 65 +++++++++++++++++++++++++++++++++++++++++ HW3/P5/P5.txt | 46 +++++++++++++++++++++++++++++ HW3/P5/label_regions.cl | 49 +++++++++++++++++++++++++++---- 5 files changed, 231 insertions(+), 15 deletions(-) create mode 100644 HW3/P3/P3.txt create mode 100644 HW3/P5/P5.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..383e5fbd --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,31 @@ +RESULTS + +fastest: +coalesced reads, workgroups: 8, num_workers: 4, 0.076327157 seconds + + + +Note: +getting segmentation fault, so i am probably not checking well for +threads going out of bounds, though i do check that things are < N +my results are from printing and looking 
at times manually. it +segfaults in the middle of workgroups:128 + +Results for test.py + +The platforms detected are: +--------------------------- +AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8) +The devices detected on platform AMD Accelerated Parallel Processing are: +--------------------------- +Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz [Type: CPU ] +Maximum clock Frequency: 2394 MHz +Maximum allocable memory size: 1073 MB +Maximum work group size 1024 +Maximum work item dimensions 3 +Maximum work item size [1024, 1024, 1024] +--------------------------- +This context is associated with 1 devices +The queue is using the device: Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz +The device memory bandwidth is 1.46333327639 GB/s +The host-device bandwidth is 5.07974888261 GB/s diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..827ace6b 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -8,8 +8,12 @@ __kernel void sum_coalesced(__global float* x, // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int i = get_global_id(0); + int k = get_global_size(0); + + for (int j = 0; (i + j*k) < N; j++) { // YOUR CODE HERE + sum = sum + x[i + j*k]; // YOUR CODE HERE } fast[local_id] = sum; @@ -24,8 +28,22 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int ls = get_local_size(0); + int offset = 0; + int limit = 0; + + // calculate log_2(local_size) + // = to number of shifts + while (ls > 1) { + ls = ls >> 1; + limit = limit + 1; + } + + for (int j=1; j < limit; j++) { // YOUR CODE HERE + offset = (get_local_size(0) >> j); + if (i+offset < N) + fast[i] = fast[i] + fast[i + offset]; // YOUR CODE HERE } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -40,16 +58,21 @@ __kernel void sum_blocked(__global float* x, size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); + int i = get_global_id(0); + int val = 0; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 // thread with global_id 2 should add 2k..3k-1 // ... // with k = ceil(N / get_global_size()). - // + // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + + for (val = k*i; val <= k*(i+1)-1; val++) { // YOUR CODE HERE + if (val < N) + sum = sum + x[val]; // YOUR CODE HERE } fast[local_id] = sum; @@ -64,8 +87,22 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int ls = get_local_size(0); + int offset = 0; + int limit = 0; + + // calculate log_2(local_size) + // = to number of shifts + while (ls > 1) { + ls = ls >> 1; + limit = limit + 1; + } + + for (int j=1; j < limit; j++) { // YOUR CODE HERE + offset = (get_local_size(0) >> j); + if (i + offset < N) + fast[i] = fast[i] + fast[i + offset]; // YOUR CODE HERE } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..b8332f66 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -22,12 +22,77 @@ median_3x3(__global __read_only float *in_values, // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + int row; + + // Here is the actual loading of the buffer + // with 1D indexing + if (idx_1D < buf_w) + for (row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = \ + FETCH(in_values, w, h, + buf_corner_x + idx_1D, + buf_corner_y + row); + } + + barrier(CLK_LOCAL_MEM_FENCE); // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. 
 + // median9(...) gives the median value of a 3x3 space + // advice from halo_load.cl: Should only use buffer, buf_x, buf_y. + + //if ((y > 0 && y < h-1 ) && (x > 0 && x < w-1)) // stay in bounds + if ((x < w) && (y < h)) + out_values[y * w + x] = \ + median9(buffer[(buf_y * buf_w + buf_x) -buf_w-1], + buffer[(buf_y * buf_w + buf_x) -buf_w], + buffer[(buf_y * buf_w + buf_x) -buf_w+1], + buffer[(buf_y * buf_w + buf_x) -1], + buffer[(buf_y * buf_w + buf_x)], + buffer[(buf_y * buf_w + buf_x) +1], + buffer[(buf_y * buf_w + buf_x) +buf_w-1], + buffer[(buf_y * buf_w + buf_x) +buf_w], + buffer[(buf_y * buf_w + buf_x) +buf_w+1]); + + // take care of corners + /* + if (x==0 && y==0){ + output[0] = buffer[(buf_y * buf_w + buf_x)]; + } + else if (x==0){ + } + else if (x==w-1){ + } + else if (y==0){ + } + else if (y==h-1){ + } + */ + // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..12c4c702 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,46 @@ +Iteration counts and average kernel times after each change for Parts 1-4, +an explanation for Part 4 as to why a single thread is a good (or bad) choice for this +operation, and the explanation of Part 5. 
+ +Was getting errors on part 1, but continued on anyways + +Some of Error Output: +"/tmp/OCL8gbLMr.cl", line 31: error: a parameter cannot be allocated in a + named address space + propagate_labels(__global __read_write int *labels, + ^ + +"/tmp/OCL8gbLMr.cl", line 31: error: expected a ")" + propagate_labels(__global __read_write int *labels, + ^ + +"/tmp/OCL8gbLMr.cl", line 36: warning: parsing restarts here after previous + syntax error + const int halo) + ^ + +"/tmp/OCL8gbLMr.cl", line 50: error: identifier "halo" is undefined + const int buf_corner_x = x - lx - halo; + ^ + +"/tmp/OCL8gbLMr.cl", line 65: error: identifier "buf_w" is undefined + if (idx_1D < buf_w) { + ^ + +"/tmp/OCL8gbLMr.cl", line 66: error: identifier "buf_h" is undefined + for (int row = 0; row < buf_h; row++) { + ^ + + + +Part 4 Explanation +It seems valuable to use a single thread for a work group as it has its +own memory and won't have to wastefully repeat calculations over same +indices, which is what may be happening in part 2. + + +Part 5 Explanation +If we used min instead of atomic_min our answer might not be correct. +Atomic_min serializes so it will get the true minimum, while min in +parallel will cause values to be overwritten. However, this serialization +makes atomic_min slower. 
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..4ad4b040 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -56,15 +56,15 @@ propagate_labels(__global __read_write int *labels, // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; - + int old_label; // Will store the output value int new_label; - - // Load the relevant labels to a local buffer with a halo + + // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { for (int row = 0; row < buf_h; row++) { - buffer[row * buf_w + idx_1D] = + buffer[row * buf_w + idx_1D] = get_clamped_value(labels, w, h, buf_corner_x + idx_1D, buf_corner_y + row); @@ -80,20 +80,57 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + // part 2 + // perform this operation by replacing each value in + // buffer (at index offset) with labels[buffer[offset]] + buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]]; + + + // need to add barrier before min neighbors part 1 calculation + barrier(CLK_LOCAL_MEM_FENCE); + // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. 
 + + // get min of current pixel and its 4 neighbors new_label = old_label; + // check upper neighbor + if (buffer[buf_y * buf_w + buf_x - buf_w] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x - buf_w]; + } + // check lower neighbor + if (buffer[buf_y * buf_w + buf_x + buf_w] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x + buf_w]; + } + // check right neighbor + if (buffer[buf_y * buf_w + buf_x + 1] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x + 1]; + } + // check left neighbor + if (buffer[buf_y * buf_w + buf_x - 1] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x - 1]; + } + + + if (new_label != old_label) { // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. *(changed_flag) += 1; - labels[y * w + x] = new_label; + + //part 3 calls for not directly assigning + //labels[y * w + x] = new_label; + + // use atomic_min to make sure pixel's value in labels + // never increases + atomic_min(&labels[y * w + x], new_label); + } } }