From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/4] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/4] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: 
[PATCH 3/4] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From 4ea109e69186b3d1c0dfa4a95826b0bfd582423a Mon Sep 17 00:00:00 2001 From: Crystal Lim Date: Sat, 21 Nov 2015 07:18:51 -0500 Subject: [PATCH 4/4] first commit --- HW3/P3/P3.txt | 31 ++++++++++++++++++++ HW3/P3/sum.cl | 55 ++++++++++++++++++++++++++++------ HW3/P4/median_filter.cl | 65 +++++++++++++++++++++++++++++++++++++++++ HW3/P5/P5.txt | 46 +++++++++++++++++++++++++++++ HW3/P5/label_regions.cl | 49 +++++++++++++++++++++++++++---- 5 files changed, 231 insertions(+), 15 deletions(-) create mode 100644 HW3/P3/P3.txt create mode 100644 HW3/P5/P5.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..383e5fbd --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,31 @@ +RESULTS + +fastest: +coalesced reads, workgroups: 8, num_workers: 4, 0.076327157 seconds + + + +Note: +getting segmentation fault, so i am probably not checking well for +threads going out of bounds, though i do check that things are < N +my results are from printing and looking 
at times manually. it +segfaults in the middle of workgroups:128 + +Results for test.py + +The platforms detected are: +--------------------------- +AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8) +The devices detected on platform AMD Accelerated Parallel Processing are: +--------------------------- +Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz [Type: CPU ] +Maximum clock Frequency: 2394 MHz +Maximum allocable memory size: 1073 MB +Maximum work group size 1024 +Maximum work item dimensions 3 +Maximum work item size [1024, 1024, 1024] +--------------------------- +This context is associated with 1 devices +The queue is using the device: Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz +The device memory bandwidth is 1.46333327639 GB/s +The host-device bandwidth is 5.07974888261 GB/s diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..827ace6b 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -8,8 +8,12 @@ __kernel void sum_coalesced(__global float* x, // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int i = get_global_id(0); + int k = get_global_size(0); + + for (int j = 0; (i + j*k) < N; j++) { // YOUR CODE HERE + sum = sum + x[i + j*k]; // YOUR CODE HERE } fast[local_id] = sum; @@ -24,8 +28,22 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int ls = get_local_size(0); + int offset = 0; + int limit = 0; + + // calculate log_2(local_size) + // = to number of shifts + while (ls > 1) { + ls = ls >> 1; + limit = limit + 1; + } + + for (int j=1; j < limit; j++) { // YOUR CODE HERE + offset = (get_local_size(0) >> j); + if (i+offset < N) + fast[i] = fast[i] + fast[i + offset]; // YOUR CODE HERE } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -40,16 +58,21 @@ __kernel void sum_blocked(__global float* x, size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); + int i = get_global_id(0); + int val = 0; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 // thread with global_id 2 should add 2k..3k-1 // ... // with k = ceil(N / get_global_size()). - // + // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + + for (val = k*i; val <= k*(i+1)-1; val++) { // YOUR CODE HERE + if (val < N) + sum = sum + x[val]; // YOUR CODE HERE } fast[local_id] = sum; @@ -64,8 +87,22 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + int ls = get_local_size(0); + int offset = 0; + int limit = 0; + + // calculate log_2(local_size) + // = to number of shifts + while (ls > 1) { + ls = ls >> 1; + limit = limit + 1; + } + + for (int j=1; j < limit; j++) { // YOUR CODE HERE + offset = (get_local_size(0) >> j); + if (i + offset < N) + fast[i] = fast[i] + fast[i + offset]; // YOUR CODE HERE } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..b8332f66 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -22,12 +22,77 @@ median_3x3(__global __read_only float *in_values, // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + int row; + + // Here is the actual loading of the buffer + // with 1D indexing + if (idx_1D < buf_w) + for (row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = \ + FETCH(in_values, w, h, + buf_corner_x + idx_1D, + buf_corner_y + row); + } + + barrier(CLK_LOCAL_MEM_FENCE); // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. 
 + // median9(...) gives the median value of a 3x3 space + // advice from halo_load.cl: Should only use buffer, buf_x, buf_y. + + //if ((y > 0 && y < h-1 ) && (x > 0 && x < w-1)) // stay in bounds + if ((x < w) && (y < h)) + out_values[y * w + x] = \ + median9(buffer[(buf_y * buf_w + buf_x) -buf_w-1], + buffer[(buf_y * buf_w + buf_x) -buf_w], + buffer[(buf_y * buf_w + buf_x) -buf_w+1], + buffer[(buf_y * buf_w + buf_x) -1], + buffer[(buf_y * buf_w + buf_x)], + buffer[(buf_y * buf_w + buf_x) +1], + buffer[(buf_y * buf_w + buf_x) +buf_w-1], + buffer[(buf_y * buf_w + buf_x) +buf_w], + buffer[(buf_y * buf_w + buf_x) +buf_w+1]); + + // take care of corners + /* + if (x==0 && y==0){ + output[0] = buffer[(buf_y * buf_w + buf_x)]; + } + else if (x==0){ + } + else if (x==w-1){ + } + else if (y==0){ + } + else if (y==h-1){ + } + */ + // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..12c4c702 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,46 @@ +Iteration counts and average kernel times after each change for Parts 1-4, +an explanation for Part 4 as to why a single thread is a good (or bad) choice for this +operation, and the explanation of Part 5. 
+ +Was getting errors on part 1, but continued on anyways + +Some of Error Output: +"/tmp/OCL8gbLMr.cl", line 31: error: a parameter cannot be allocated in a + named address space + propagate_labels(__global __read_write int *labels, + ^ + +"/tmp/OCL8gbLMr.cl", line 31: error: expected a ")" + propagate_labels(__global __read_write int *labels, + ^ + +"/tmp/OCL8gbLMr.cl", line 36: warning: parsing restarts here after previous + syntax error + const int halo) + ^ + +"/tmp/OCL8gbLMr.cl", line 50: error: identifier "halo" is undefined + const int buf_corner_x = x - lx - halo; + ^ + +"/tmp/OCL8gbLMr.cl", line 65: error: identifier "buf_w" is undefined + if (idx_1D < buf_w) { + ^ + +"/tmp/OCL8gbLMr.cl", line 66: error: identifier "buf_h" is undefined + for (int row = 0; row < buf_h; row++) { + ^ + + + +Part 4 Explanation +It seems valuable to use a single thread for a work group as it has its +own memory and won't have to wastefully repeat calculations over same +indices, which is what may be happening in part 2. + + +Part 5 Explanation +If we used min instead of atomic_min our answer might not be correct. +Atomic_min serializes so it will get the true minimum, while min in +parallel will cause values to be overwritten. However, this serialization +makes atomic_min slower. 
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..4ad4b040 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -56,15 +56,15 @@ propagate_labels(__global __read_write int *labels, // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; - + int old_label; // Will store the output value int new_label; - - // Load the relevant labels to a local buffer with a halo + + // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { for (int row = 0; row < buf_h; row++) { - buffer[row * buf_w + idx_1D] = + buffer[row * buf_w + idx_1D] = get_clamped_value(labels, w, h, buf_corner_x + idx_1D, buf_corner_y + row); @@ -80,20 +80,57 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + // part 2 + // perform this operation by replacing each value in + // buffer (at index offset) with labels[buffer[offset]] + buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]]; + + + // need to add barrier before min neighbors part 1 calculation + barrier(CLK_LOCAL_MEM_FENCE); + // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. 
 + + // get min of current pixel and its 4 neighbors new_label = old_label; + // check upper neighbor + if (buffer[buf_y * buf_w + buf_x - buf_w] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x - buf_w]; + } + // check lower neighbor + if (buffer[buf_y * buf_w + buf_x + buf_w] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x + buf_w]; + } + // check right neighbor + if (buffer[buf_y * buf_w + buf_x + 1] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x + 1]; + } + // check left neighbor + if (buffer[buf_y * buf_w + buf_x - 1] < new_label){ + new_label = buffer[buf_y * buf_w + buf_x - 1]; + } + + + if (new_label != old_label) { // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. *(changed_flag) += 1; - labels[y * w + x] = new_label; + + //part 3 calls for not directly assigning + //labels[y * w + x] = new_label; + + // use atomic_min to make sure pixel's value in labels + // never increases + atomic_min(&labels[y * w + x], new_label); + } } }