Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,30 @@ mandelbrot(__global __read_only float *coords_real,
float c_real, c_imag;
float z_real, z_imag;
int iter;
float z_new_r = 0;
float z_new_i = 0;
z_real = 0;
z_imag = 0;

c_real = coords_real[w*y + x];
c_imag = coords_imag[w*y + x];

if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
}
iter = 0;
for (int i = 0; i < max_iter; ++i) {
if (z_real*z_real + z_imag * z_imag > 4.0) {
break;
}
iter++;
// z = z * z + c
z_new_r = z_real * z_real - z_imag * z_imag;
z_new_i = 2.0 * z_real * z_imag;
z_new_i += c_imag;
z_new_r += c_real;
z_real = z_new_r;
z_imag = z_new_i;
}
out_counts[w * y + x] = iter;
}
}
86 changes: 86 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
coalesced reads, workgroups: 8, num_workers: 4, 0.07258504 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.03673464 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.01913392 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.01039344 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.00695472 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.0043812 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.06001544 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.0310272 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.01570544 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.00764992 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00431152 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.00333848 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.03370544 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.01284968 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.00768184 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00646128 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00485712 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.0032364 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.02550672 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.0136236 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00706304 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00393336 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00425824 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00466488 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.05210832 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.02586976 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.01191712 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00492728 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.0037256 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00343488 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.03066424 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.01158904 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.00556592 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00309936 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00240584 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00236136 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.02249872 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.01201176 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.00612176 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00309512 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00238664 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00222632 seconds
blocked reads, workgroups: 8, num_workers: 4, 0.08008432 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.05314272 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.04055664 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.01864704 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.0086916 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.01332264 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.04286488 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.02522048 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.02002256 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.00785048 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.01318408 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.03491952 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.02088688 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.01281304 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.00847784 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.01432488 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.03823888 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.04483016 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.02112496 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01133616 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.008448 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.01471376 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.05186944 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.0606752 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.02310928 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.01269576 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.00979584 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.01802176 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.05757784 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.0513108 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.02101336 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.0134932 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.0085828 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.02292672 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.04824048 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.03361944 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.02185112 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.01406848 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.01055336 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.02269744 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.03258384 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.02003504 seconds

configuration ('coalesced', 512, 128): 0.00222632 seconds
28 changes: 19 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
uint k = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0); i < N; i+= k) {
sum += x[i];
}

fast[local_id] = sum;
Expand All @@ -24,10 +25,14 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
// a reduction kernel is easy to implement here
uint gs = get_local_size(0);
for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id < s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

Expand All @@ -48,8 +53,9 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
uint gid = get_global_id(0);
for (int i = k * gid; i < k * gid + k && i < N; ++i) { // YOUR CODE HERE
sum += x[i]; // YOUR CODE HERE
}

fast[local_id] = sum;
Expand All @@ -64,8 +70,12 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
uint gs = get_local_size(0);
for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id < s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
50 changes: 49 additions & 1 deletion HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ median_3x3(__global __read_only float *in_values,
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

// global outer pixel position
const int x = get_global_id(0);
const int y = get_global_id(1);

// Load into buffer (with 1-pixel halo).
//
Expand All @@ -22,6 +24,36 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// Looked at P5 for this code
int xt, yt;
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
xt = max(0, buf_corner_x + idx_1D);
xt = min(xt, w - 1);
yt = max(0, buf_corner_y + row);
yt = min(h - 1, yt);
buffer[row * buf_w + idx_1D] = in_values[w*yt + xt];
}
}


barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
Expand All @@ -31,4 +63,20 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

if ((x < w) && (y < h)) {
out_values[y * w + x] = median9(buffer[(buf_y - 1) * (buf_w) + (buf_x - 1)],
buffer[(buf_y) * (buf_w) + (buf_x - 1)],
buffer[(buf_y + 1) * (buf_w) + (buf_x - 1)],
buffer[(buf_y - 1) * (buf_w) + (buf_x)],
buffer[(buf_y) * (buf_w) + (buf_x)],
buffer[(buf_y + 1) * (buf_w) + (buf_x)],
buffer[(buf_y - 1) * (buf_w) + (buf_x + 1)],
buffer[(buf_y) * (buf_w) + (buf_x + 1)],
buffer[(buf_y + 1) * (buf_w) + (buf_x + 1)]);
}
}

// (GETPIX(input_image, i-1, j-1), GETPIX(input_image, i-1, j), GETPIX(input_image, i-1, j+1),
// GETPIX(input_image, i, j-1), GETPIX(input_image, i, j), GETPIX(input_image, i, j+1),
// GETPIX(input_image, i+1, j-1), GETPIX(input_image, i+1, j), GETPIX(input_image, i+1, j+1))
58 changes: 58 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
***Part 1***

maze1:
Finished after 875 iterations, 182.42808 ms total, 0.208489234286 ms per iteration
Found 2 regions

maze2:
Finished after 507 iterations, 105.83096 ms total, 0.208739566075 ms per iteration
Found 35 regions

***Part 2***

maze1:
Finished after 529 iterations, 109.4868 ms total, 0.206969376181 ms per iteration
Found 2 regions

maze2:
Finished after 273 iterations, 56.38032 ms total, 0.206521318681 ms per iteration
Found 35 regions

***Part 3***

maze1:
Finished after 10 iterations, 2.8588 ms total, 0.28588 ms per iteration
Found 2 regions

maze2:
Finished after 9 iterations, 2.35744 ms total, 0.261937777778 ms per iteration
Found 35 regions

***Part 4***

It seems that on my architecture, using a single thread to perform this step is
pretty slow. This implies that we are compute bound, since speeding up
the memory accesses was not helpful. This makes sense because my GPU (an Intel HD 4000) is
pretty slow; on a significantly faster GPU, possibly with many more cores, this effect
may reverse. Fundamentally, having a single thread perform the reads allows
more efficient memory access patterns, which should be helpful on certain architectures. If there
were more redundancy in the labels, this optimization would also be more worthwhile.

maze1:
Finished after 10 iterations, 5.46928 ms total, 0.546928 ms per iteration
Found 2 regions

maze2:
Finished after 9 iterations, 5.10248 ms total, 0.566942222222 ms per iteration
Found 35 regions

***Part 5***

If we instead used the non-atomic standard min function, we would run into the following
problem. Suppose one thread is trying to replace label a with b while another thread is
trying to replace a with c, where b < c. Both threads read the original value a before
either writes, so the write of b could be followed by the write of c, leaving the larger
label c in place even though the smaller label b was also written. Thus, a stored label
could effectively increase within an iteration, though never between iterations (because
my code never attempts to replace a label with a larger one). This would
not produce incorrect results, but it might take more iterations than necessary to converge.

46 changes: 43 additions & 3 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,60 @@ propagate_labels(__global __read_write int *labels,
// Fetch the value from the buffer that corresponds to
// the pixel for this thread
old_label = buffer[buf_y * buf_w + buf_x];
new_label = old_label;
// Part 2 Grandparents checking
// if (new_label != w * h) {
// buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
// }
// barrier(CLK_LOCAL_MEM_FENCE);
// Part 4
// This if condition guarantees one thread per work group.
int cur_label;
if (lx == 0 && ly == 0) {
// Loop over the whole buffer and get grandparents
int last_fetched_ind = w*h;
int last_val = -1;
for (int i = halo; i < buf_h - halo; ++i) {
for (int j = halo; j < buf_w - halo; ++j) {
cur_label = buffer[buf_w * i + j];
if (cur_label == w * h)
continue;
if (cur_label == last_fetched_ind) {
buffer[buf_w * i + j] = last_val;
}
else {
last_fetched_ind = cur_label;
last_val = labels[cur_label];
buffer[buf_w * i + j] = last_val;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

// w*h is the max label, used for background pixels.
if (new_label != w * h) {
new_label = min(buffer[(buf_y + 1) * buf_w + buf_x], new_label);
new_label = min(buffer[(buf_y - 1) * buf_w + buf_x], new_label);
new_label = min(buffer[buf_y * buf_w + (buf_x - 1)], new_label);
new_label = min(buffer[buf_y * buf_w + (buf_x + 1)], new_label);
}
if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
// Read the 32-bit value (referred to as old) stored at location pointed
// by p. Compute min (old, val) and store result at location pointed by
// p. The function returns old.
// int atomic_min (volatile __global int *p, int val)
atomic_min(&labels[old_label], new_label);
//labels[old_label] = min(labels[old_label], new_label);
*(changed_flag) += 1;
labels[y * w + x] = new_label;
}
Expand Down