harvard-cs205 · isadoranun · Nov 3, 2015 · Nov 5, 2015 · Nov 5, 2015 · Nov 21, 2015
diff --git a/HW3/P2/mandelbrot2.cl b/HW3/P2/mandelbrot2.cl
@@ -0,0 +1,35 @@
+// Mandelbrot escape-time kernel: one work-item per pixel of a w x h grid.
+// coords_real/coords_imag give c for each pixel (index x + y*w);
+// out_counts receives the number of z <- z*z + c steps (from z = 0) taken
+// before |z|^2 exceeded 4, capped at max_iter.
+__kernel void
+mandelbrot(__global __read_only float *coords_real,
+           __global __read_only float *coords_imag,
+           __global __write_only int *out_counts,
+           int w, int h, int max_iter)
+{
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items outside the w x h grid have no pixel to compute.
+    if ((x >= w) || (y >= h)) {
+        return;
+    }
+
+    const int idx = x + y * w;
+    const float c_real = coords_real[idx];
+    const float c_imag = coords_imag[idx];
+    float z_real = 0.0f;
+    float z_imag = 0.0f;
+    int iter = 0;
+
+    while ((iter < max_iter) && !(z_real * z_real + z_imag * z_imag > 4.0f)) {
+        const float z_temp = z_real;  // old real part, used by the imaginary update
+        z_real = z_real * z_real - z_imag * z_imag + c_real;
+        z_imag = 2.0f * z_imag * z_temp + c_imag;
+        iter++;
+    }
+    // Single store after the loop; also covers max_iter <= 0 (iter stays 0).
+    out_counts[idx] = iter;
+}
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
@@ -0,0 +1,29 @@
+PROBLEM 3
+
+The best configuration and its time on my hardware are:
+configuration ('coalesced', 512, 64): 0.0030328 seconds
+
+This is my hardware information:
+
+---------------------------
+Apple Apple version: OpenCL 1.2 (Feb 27 2015 01:29:10)
+The devices detected on platform Apple are:
+---------------------------
+Intel(R) Core(TM) i7-3540M CPU @ 3.00GHz [Type: CPU ]
+Maximum clock Frequency: 3000 MHz
+Maximum allocable memory size: 2147 MB
+Maximum work group size 1024
+Maximum work item dimensions 3
+Maximum work item size [1024, 1, 1]
+---------------------------
+HD Graphics 4000 [Type: GPU ]
+Maximum clock Frequency: 1300 MHz
+Maximum allocable memory size: 268 MB
+Maximum work group size 512
+Maximum work item dimensions 3
+Maximum work item size [512, 512, 512]
+---------------------------
+This context is associated with  2 devices
+The queue is using the device: HD Graphics 4000
+The device memory bandwidth is 11.0345233454 GB/s
+The host-device bandwidth is 5.02189546422 GB/s
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
@@ -8,8 +8,11 @@ __kernel void sum_coalesced(__global float* x,
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
+
+    int k = get_global_size(0);
+    int i = get_global_id(0);
+    for(int s = i; s<N; s+=k){
+        sum += x[s];
     }
 
     fast[local_id] = sum;
@@ -24,8 +27,14 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    size_t gs = get_local_size(0);
+
+    for(uint s = gs/2; s > 0; s >>= 1) {
+        if(local_id< s) {
+          fast[local_id] += fast[local_id+s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +47,7 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +57,13 @@ __kernel void sum_blocked(__global float* x,
     // 
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    int i = get_global_id(0);
+
+    for (int s=k*i;s<k*(i+1);s++) { // YOUR CODE HERE
+        if(s<N) {
+            sum += x[s]; 
+        }
     }
 
     fast[local_id] = sum;
@@ -64,8 +78,13 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    size_t gs = get_local_size(0);
+
+    for(uint s = gs/2; s > 0; s >>= 1) {
+        if(local_id< s) {
+          fast[local_id] += fast[local_id+s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];

diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                   format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)

diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
@@ -1,34 +1,74 @@
 #include "median9.h"
 
 // 3x3 median filter
-__kernel void
-median_3x3(__global __read_only float *in_values,
-           __global __write_only float *out_values,
-           __local float *buffer,
-           int w, int h,
-           int buf_w, int buf_h,
-           const int halo)
-{
-    // Note: It may be easier for you to implement median filtering
-    // without using the local buffer, first, then adjust your code to
-    // use such a buffer after you have that working.
-
-
-    // Load into buffer (with 1-pixel halo).
-    //
-    // It may be helpful to consult HW3 Problem 5, and
-    // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
-    //
-    // Note that globally out-of-bounds pixels should be replaced
-    // with the nearest valid pixel's value.
-
-
-    // Compute 3x3 median for each pixel in core (non-halo) pixels
-    //
-    // We've given you median9.h, and included it above, so you can
-    // use the median9() function.
-
-
-    // Each thread in the valid region (x < w, y < h) should write
-    // back its 3x3 neighborhood median.
-}
+// Each work-group stages its tile of the image plus a halo border in local
+// memory, then every in-image work-item writes the median of its 3x3
+// neighborhood. Neighbors outside the image use the nearest valid pixel.
+//
+//   in_values  : input image, indexed as y * w + x
+//   out_values : output image, same indexing
+//   buffer     : local scratch, indexed as row * buf_w + column
+//   buf_w/h    : buffer size including halo -- presumably local size plus
+//                2 * halo, set by the host (confirm against the caller)
+//   halo       : border width; the 3x3 reads below need halo >= 1
+//
+// Assumes the work-group has at least buf_w work-items, since only the
+// first buf_w of them load buffer columns (not checked here).
+__kernel void median_3x3(__global __read_only float *in_values,
+                         __global __write_only float *out_values,
+                         __local float *buffer,
+                         int w, int h,
+                         int buf_w, int buf_h,
+                         const int halo){
+
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    // Cooperative load: the first buf_w work-items each fill one buffer
+    // column. This must NOT be restricted to work-items with x < w and
+    // y < h: in a work-group that overhangs the image edge, the loaders
+    // of some columns lie outside the image, and skipping them would leave
+    // buffer cells unwritten that in-image neighbors read below.
+    if (idx_1D < buf_w) {
+        // Clamp to the image so out-of-bounds cells replicate the nearest
+        // valid pixel; the source column is the same for every row.
+        const int src_x = clamp(buf_corner_x + idx_1D, 0, w - 1);
+        for (int row = 0; row < buf_h; row++) {
+            const int src_y = clamp(buf_corner_y + row, 0, h - 1);
+            buffer[row * buf_w + idx_1D] = in_values[src_y * w + src_x];
+        }
+    }
+
+    // Every work-item reaches this barrier; the buffer is filled after it.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Only in-image work-items produce output.
+    if ((y < h) && (x < w)) {
+        // Buffer indices of the centers of the three window rows.
+        const int above = buf_w * (buf_y - 1) + buf_x;
+        const int here  = buf_w * buf_y + buf_x;
+        const int below = buf_w * (buf_y + 1) + buf_x;
+
+        out_values[y * w + x] =
+            median9(buffer[above - 1], buffer[above], buffer[above + 1],
+                    buffer[here - 1],  buffer[here],  buffer[here + 1],
+                    buffer[below - 1], buffer[below], buffer[below + 1]);
+    }
+}
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
@@ -0,0 +1,51 @@
+Christian Junge helped me with this problem.
+
+###Part 1
+
+- Maze 1:
+Finished after 875 iterations, 574.0908 ms total, 0.656103771429 ms per iteration
+Found 2 regions
+
+- Maze 2:
+Finished after 507 iterations, 345.4796 ms total, 0.681419329389 ms per iteration
+Found 35 regions
+
+###Part 2
+
+- Maze 1:
+Finished after 529 iterations, 337.87184 ms total, 0.638699130435 ms per iteration
+Found 2 regions
+
+- Maze 2:
+Finished after 272 iterations, 180.02288 ms total, 0.661848823529 ms per iteration
+Found 35 regions
+
+###Part 3:
+
+- Maze 1:
+Finished after 8 iterations, 6.87728 ms total, 0.85966 ms per iteration
+Found 2 regions
+
+- Maze 2:
+Finished after 8 iterations, 6.70104 ms total, 0.83763 ms per iteration
+Found 35 regions
+
+###Part 4:
+
+- Maze 1:
+
+- Maze 2:
+
+###Part 5:
+
+If, instead of the atomic_min() operation, we used the min() function, the final result would still be correct.
+Nevertheless, when two threads try to replace the same "old_label" with different "new_label" values, the performance of the algorithm would suffer. Consider the following example:
+thread1: old_label1 = 15    new_label1 = 10
+thread2: old_label2 = 15	new_label2 = 8
+In the first step, Thread1 compares old_label1 to new_label1, computes the minimum, and chooses new_label1 (10).
+In parallel, Thread2 compares old_label2 to new_label2, computes the minimum, and chooses new_label2 (8).
+Because of memory access timing, Thread2 is faster and writes its label before Thread1. Thread1 writes next, using the value
+it had already selected (10).
+As we can see, this is not optimal, and we would probably end up doing more iterations. Nevertheless, a value in labels
+will never increase.
+
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
@@ -80,20 +80,52 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
-
+
+
+    // Part 2
+    if ((x < w) && (y < h) && old_label < w*h) {
+        buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int left;
+    int right;
+    int up;
+    int down;
+
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
-        new_label = old_label;
+
+        // We get the values for the 4 neighbors
+        left = buffer[buf_y * buf_w + buf_x-1];
+        right = buffer[buf_y * buf_w + buf_x+1];
+        up = buffer[(buf_y-1) * buf_w + buf_x];
+        down = buffer[(buf_y+1) * buf_w + buf_x];
+
+        // If it's not a wall, we find the minimum value of its 4 neighboring pixels and itself 
+        if (old_label < w*h) {
+            new_label = min(left, right);
+            new_label = min(new_label, up);
+            new_label = min(new_label, down);
+            new_label = min(new_label, old_label);
+        }
+        else {
+            new_label = old_label;
+        }
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], labels[new_label]);
+            labels[y * w + x] = labels[old_label];
+            // labels[y * w + x] = new_label;
+
         }
     }
 }