diff --git a/HW3/P2/mandelbrot2.cl b/HW3/P2/mandelbrot2.cl new file mode 100644 index 00000000..ea0911ef --- /dev/null +++ b/HW3/P2/mandelbrot2.cl @@ -0,0 +1,39 @@ +// Mandelbrot escape-time kernel; one work-item per output pixel. +// Iterates z = z*z+c from z = 0, where c comes from coords_real/coords_imag, +// and stores the number of iterations completed before |z|^2 exceeded 4 (at most max_iter). +__kernel void +mandelbrot(__global __read_only float *coords_real, + __global __read_only float *coords_imag, + __global __write_only int *out_counts, + int w, int h, int max_iter) +{ + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + float c_real, c_imag; + float z_real, z_imag; + float z_temp; + int iter=0; + + if ((x < w) && (y < h)) { + // Start at z = 0 and load this pixel's c (row-major index x+y*w) + z_real = 0; + z_imag = 0; + c_real = coords_real[x+y*w]; + c_imag = coords_imag[x+y*w]; + while(iter < max_iter){ + if (z_real*z_real + z_imag*z_imag > 4){ + break; + } + else{ + z_temp = z_real; // keep the old real part for the imaginary update + z_real = z_real*z_real - z_imag*z_imag + c_real; + z_imag = 2*z_imag*z_temp + c_imag; + iter = iter + 1; + } + } + // Single store after the loop; also covers max_iter <= 0 + out_counts[x+y*w] = iter; + } +} \ No newline at end of file diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..096ea09c --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,29 @@ +PROBLEM 3 + +The best configuration and time for my hardware is: +configuration ('coalesced', 512, 64): 0.0030328 seconds + +This is my computer information: + +--------------------------- +Apple Apple version: OpenCL 1.2 (Feb 27 2015 01:29:10) +The devices detected on platform Apple are: +--------------------------- +Intel(R) Core(TM) i7-3540M CPU @ 3.00GHz [Type: CPU ] +Maximum clock Frequency: 3000 MHz +Maximum allocable memory size: 2147 MB +Maximum work group size 1024 +Maximum work item dimensions 3 +Maximum work item size [1024, 1, 1] +--------------------------- +HD Graphics 4000 [Type: GPU ] +Maximum clock Frequency: 1300 MHz +Maximum allocable memory size: 268 MB +Maximum work group size 512 +Maximum work item dimensions 3 +Maximum work item size [512, 512, 512] +--------------------------- +This context is associated with 2 devices +The queue is using the device: HD Graphics 4000 +The device memory bandwidth is 11.0345233454 
GB/s +The host-device bandwidth is 5.02189546422 GB/s diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..942043a8 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -8,8 +8,11 @@ __kernel void sum_coalesced(__global float* x, // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int k = get_global_size(0); + int i = get_global_id(0); + for(int s = i; s 0; s >>= 1) { + if(local_id< s) { + fast[local_id] += fast[local_id+s]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,7 +47,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +57,13 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int i = get_global_id(0); + + for (int s=k*i;s 0; s >>= 1) { + if(local_id< s) { + fast[local_id] += fast[local_id+s]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..51f4b360 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,34 +1,74 @@ #include "median9.h" // 3x3 median filter -__kernel void -median_3x3(__global __read_only float *in_values, - __global __write_only float *out_values, - __local float *buffer, - int w, int h, - int buf_w, int buf_h, - const int halo) -{ - // Note: It may be easier for you to implement median filtering - // without using the local buffer, first, then adjust your code to - // use such a buffer after you have that working. - - - // Load into buffer (with 1-pixel halo). 
- // - // It may be helpful to consult HW3 Problem 5, and - // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl - // - // Note that globally out-of-bounds pixels should be replaced - // with the nearest valid pixel's value. - - - // Compute 3x3 median for each pixel in core (non-halo) pixels - // - // We've given you median9.h, and included it above, so you can - // use the median9() function. - - - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. -} +__kernel void median_3x3(__global __read_only float *in_values, + __global __write_only float *out_values, + __local float *buffer, + int w, int h, + int buf_w, int buf_h, + const int halo){ + // Stages a (buf_w x buf_h) tile of in_values (work-group core plus halo) in local memory, + // then writes the median9() of each in-image pixel's 3x3 neighbourhood to out_values. + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + // Load the tile one column per work-item, clamping coordinates to the image. + // No (x < w, y < h) guard here: work-items past the image edge must still load, or edge pixels read unset halo cells. + if (idx_1D < buf_w) { + for (int row = 0; row < buf_h; row++) { + + int new_x = buf_corner_x + idx_1D; + int new_y = buf_corner_y + row; + + if (new_x < 0){ + new_x = 0; + } + else if (new_x >= w){ + new_x = w-1; + } + + if (new_y < 0){ + new_y = 0; + } + else if (new_y >= h){ + new_y = h-1; + } + + buffer[row * buf_w + idx_1D] = in_values[new_y * w + new_x]; + + } + } + + // All loads must finish before any work-item reads the tile + barrier(CLK_LOCAL_MEM_FENCE); + + if ((y < h) && (x < w)) { + float s0 = buffer[buf_w *(buf_y - 1) + (buf_x - 1)]; + float s1 = buffer[buf_w *(buf_y - 1) + (buf_x)]; + float s2 = 
buffer[buf_w *(buf_y - 1) + (buf_x + 1)]; + float s3 = buffer[buf_w *(buf_y) + (buf_x - 1)]; + float s4 = buffer[buf_w *(buf_y) + (buf_x)]; + float s5 = buffer[buf_w *(buf_y) + (buf_x + 1)]; + float s6 = buffer[buf_w *(buf_y + 1) + (buf_x - 1)]; + float s7 = buffer[buf_w *(buf_y + 1) + (buf_x)]; + float s8 = buffer[buf_w *(buf_y + 1) + (buf_x + 1)]; + out_values[y * w + x] = median9(s0, s1, s2, s3, s4, s5, s6, s7, s8); + } +} \ No newline at end of file diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..aa50915a --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,51 @@ +Christian Junge helped me with this problem. 
+ +###Part 1 + +- Maze 1: +Finished after 875 iterations, 574.0908 ms total, 0.656103771429 ms per iteration +Found 2 regions + +- Maze 2: +Finished after 507 iterations, 345.4796 ms total, 0.681419329389 ms per iteration +Found 35 regions + +###Part 2 + +- Maze 1: +Finished after 529 iterations, 337.87184 ms total, 0.638699130435 ms per iteration +Found 2 regions + +- Maze 2: +Finished after 272 iterations, 180.02288 ms total, 0.661848823529 ms per iteration +Found 35 regions + +###Part 3: + +- Maze 1: +Finished after 8 iterations, 6.87728 ms total, 0.85966 ms per iteration +Found 2 regions + +- Maze 2: +Finished after 8 iterations, 6.70104 ms total, 0.83763 ms per iteration +Found 35 regions + +###Part 4: + +- Maze 1: + +- Maze 2: + +###Part 5: + +If we used the min() function instead of the atomic_min() operation, the final result would still be correct. +Nevertheless, when two threads try to replace the same "old_label" with different "new_label" values, the performance of the algorithm would suffer. Let's look at an example: +thread1: old_label1 = 15 new_label1 = 10 +thread2: old_label2 = 15 new_label2 = 8 +In the first step, Thread1 compares old_label1 to new_label1, calculates the minimum and chooses new_label1 (10). +In parallel, Thread2 compares old_label2 to new_label2, calculates the minimum and chooses new_label2 (8). +For memory reasons, Thread2 is faster and writes its label before Thread1. Thread1 writes next, using its already selected +value (10). +As we can see, this is not optimal and we would probably end up doing more iterations. Nevertheless, a value in labels +will never increase. 
+ diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..5fce5ad7 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -80,20 +80,52 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + + // Part 2 + if ((x < w) && (y < h) && old_label < w*h) { + buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int left; + int right; + int up; + int down; + // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + + // We get the values for the 4 neighbors + left = buffer[buf_y * buf_w + buf_x-1]; + right = buffer[buf_y * buf_w + buf_x+1]; + up = buffer[(buf_y-1) * buf_w + buf_x]; + down = buffer[(buf_y+1) * buf_w + buf_x]; + + // If it's not a wall, we find the minimum value of its 4 neighboring pixels and itself + if (old_label < w*h) { + new_label = min(left, right); + new_label = min(new_label, up); + new_label = min(new_label, down); + new_label = min(new_label, old_label); + } + else { + new_label = old_label; + } if (new_label != old_label) { // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. *(changed_flag) += 1; - labels[y * w + x] = new_label; + atomic_min(&labels[old_label], labels[new_label]); + labels[y * w + x] = labels[old_label]; + // labels[y * w + x] = new_label; + } } }