Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added HW3/HW3.pdf
Binary file not shown.
29 changes: 27 additions & 2 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,35 @@ mandelbrot(__global __read_only float *coords_real,

float c_real, c_imag;
float z_real, z_imag;
float newZr, newZi;

int iter;

// if inside of the boundaries then do the computation
if ((x < w) && (y < h)) {
// YOUR CODE HERE
;

// initializing the variables
z_real = 0;
z_imag = 0;
c_real = coords_real[(y * w) + x];
c_imag = coords_imag[(y * w) + x];

// generate the mandelbrot set
for (iter = 0; iter < max_iter; iter++) {

//computing the squared magnitude; if greater than 4, quit the computation
if (z_real*z_real + z_imag*z_imag > 4.0) {
break;
}

// get the new values for the real and imaginary parts separately
newZr = (z_real*z_real) - (z_imag*z_imag);
newZi = 2*(z_imag*z_real);
z_real = newZr + c_real;
z_imag = newZi + c_imag;

}
// Transfer iteration data to output
out_counts[(y * w) + x] = iter;
}
}
25 changes: 25 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
The platforms detected are:
---------------------------
Apple Apple version: OpenCL 1.2 (Sep 21 2015 19:24:11)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i7-5557U CPU @ 3.10GHz [Type: CPU ]
Maximum clock Frequency: 3100 MHz
Maximum allocable memory size: 4294 MB
Maximum work group size 1024
---------------------------
Intel(R) Iris(TM) Graphics 6100 [Type: GPU ]
Maximum clock Frequency: 1100 MHz
Maximum allocable memory size: 402 MB
Maximum work group size 256
---------------------------
This context is associated with 2 devices
The queue is using the device: Intel(R) Iris(TM) Graphics 6100
The device memory bandwidth is 10.5842506351 GB/s
The host-device bandwidth is 8.8087100525 GB/s

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Best configuration and time for hardware:
configuration ('coalesced', 64, 64): 0.0030028 seconds

35 changes: 26 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t i = get_global_id(0);
int globalSize = get_global_size(0);
int localSize = get_local_size(0);
int ii;

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (ii = 0; i + ii * globalSize < N; ii++) {
sum += x[i+ii*globalSize];
}

fast[local_id] = sum;
Expand All @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (ii = localSize/2; ii>0; ii >>=1) {
if(local_id<ii){
fast[local_id] += fast[local_id+ii];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -39,6 +46,9 @@ __kernel void sum_blocked(__global float* x,
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil((float)N / get_global_size(0));
int localSize = get_local_size(0);
int globalId = get_global_id(0);
int ii;

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
Expand All @@ -48,8 +58,11 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (ii = globalId*k; ii<(globalId+1)*k;ii++) {
if(ii<N){
sum += x[ii];
}

}

fast[local_id] = sum;
Expand All @@ -63,9 +76,13 @@ __kernel void sum_blocked(__global float* x,
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (ii = localSize/2; ii>0 ; ii>>=1) {
if(local_id<ii){
fast[local_id] += fast[local_id+ii];

}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
81 changes: 78 additions & 3 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,83 @@ median_3x3(__global __read_only float *in_values,
__local float *buffer,
int w, int h,
int buf_w, int buf_h,
const int halo)
{
const int halo){

//Initial variable definitions for the problem
//this code is obtained from the load halo example done in class

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// // coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// OBTAIN BUFFER BY LOADING RELEVANT LABELS TO LOCAL BUFFER WITH HALO
if (idx_1D<buf_w){

int row,mXX,mYY,hNew,wNew,mat_index,buf_index;
// GO OVER EACH ROW AND GO DOWN IN EACH COLUMN TO OBTAIN THE VALUES OF THE
// BUFFER FOR EACH INDIVIDUAL NODE
for(row=0;row<buf_h;row++){
//GET THE XX AND YY, ROW AND COLUMN RESPECTIVE CORNERS WITH ADJUSTMENTS
mYY=buf_corner_y+row;
mXX=buf_corner_x+idx_1D;
//OBTAIN THE NEW INDICES FOR THE HEIGHT AND THE WIDTH
hNew=h-1;
wNew=w-1;

//GET THE INDICES FOR THE VALUES THAT ARE TO BE LOADED INTO THE BUFFER
mat_index=min(max(0, mYY), hNew) * w + min(max(0, mXX), wNew);
buf_index=row * buf_w + idx_1D;
//LOAD DESIRED VALUES FROM GLOBAL MEMORY INTO THE BUFFER FOR FURTHER COMPUTATION
buffer[buf_index] = in_values[mat_index];
}

}

// if (idx_1D < buf_w)
// for (row = 0; row < buf_h; row++) {
// buffer[row * buf_w + idx_1D] =
// get_clamped_value(labels,
// w, h,
// buf_corner_x + idx_1D, buf_corner_y + row);
// }

barrier(CLK_LOCAL_MEM_FENCE);

// Processing code here...
//
// Should only use buffer, buf_x, buf_y

// write output
// if ((y < img_h) && (x < img_w)) // stay in bounds
// output[y * img_w + x] = \
// buffer[buf_y * buf_w + buf_x];
//STATEMENT FOR SMOOTHING THE PICTURE WITHIN THE SPACE THAT WE CARE ABOUT
if ((y < h) && (x < w)) {
//THIS WRITES THE VALUE OUT AS DESIRED AND DESCRIBED IN THE .H FILE
out_values[y * w + x] = median9(buffer[(buf_y - 1) * buf_w + (buf_x - 1)], buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y - 1) * buf_w + (buf_x + 1)],
buffer[buf_y * buf_w + (buf_x - 1)], buffer[buf_y * buf_w + buf_x], buffer[buf_y * buf_w + (buf_x + 1)],
buffer[(buf_y + 1) * buf_w + (buf_x - 1)], buffer[(buf_y + 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + (buf_x + 1)]);
}
}



// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.
Expand All @@ -31,4 +106,4 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
}

3 changes: 1 addition & 2 deletions HW3/P4/median_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
program = cl.Program(context, open('median_filter.cl').read()).build(options='')

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
Expand Down
77 changes: 77 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
Problem 5 Write Up
By: Matheus C. Fernandes

Code Outputs and Questions Explanations:

Part 1: implement updates from neighbors
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 915 iterations, 187.40696 ms total, 0.204816349727 ms per iteration
Found 2 regions

Maze 2:
Finished after 532 iterations, 108.11128 ms total, 0.203216691729 ms per iteration
Found 35 regions


Part 2: fetch grandparents
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 529 iterations, 103.94128 ms total, 0.196486351607 ms per iteration
Found 2 regions


Maze 2:
Finished after 273 iterations, 54.20424 ms total, 0.19855032967 ms per iteration
Found 35 regions


Part 3: merge parent regions
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 10 iterations, 2.94152 ms total, 0.294152 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 2.61712 ms total, 0.290791111111 ms per iteration
Found 35 regions


Part 4: efficient grandparents
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 10 iterations, 5.62464 ms total, 0.562464 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 4.9628 ms total, 0.551422222222 ms per iteration
Found 35 regions

Explanation of questions:
-----------------------------------------
Questions Statement:
Explain, in terms of compute vs. memory, why using a single thread to perform this step is or is not a reasonable choice. Note that there is some variation in GPUs, so you may want to discuss your empirical results as well as speculate under what conditions those results might be different.

-----------------------------------------
Answer:
I found that using one thread to perform this step did not improve the overall performance of the computation. The results on my GPU showed a significant slowdown in performance. I believe that even though we decrease the number of accesses to global memory, serializing this part of the algorithm gives away a large benefit in compute through parallelism. Thus, I don’t think the compute vs. memory trade-off here is worth making, as the balance is heavily skewed in the direction of the serialization overhead. Perhaps it would be beneficial to serialize such a task if the memory-read overhead cost were much larger than the compute, or if the compute for one of these iterations were much less expensive than the memory access. Thus, for this particular problem, I do not think this trade-off is worthwhile, as the compute portion is too expensive relative to the read.


Part 5: no atomic operations
=========================================
Explanation of questions:
-----------------------------------------
Question Statement:
Atomic operations are also inefficient, and virtually guarantee serialization of memory access. Explain what would happen if instead of using the atomic min() operation, one would use the min() function. Your explanation should consider whether the final result would still be correct, what might be the impact on the performance of the algorithm (time and iterations), could a value in labels ever increase, and could it increase between iterations?

-----------------------------------------
Answer:
I believe that using the regular min() function would have a positive impact on the performance of the code, because the atomic min() function serializes access (by essentially locking the memory location) to make sure everything is loaded properly. This serialization process makes the computation slower. Thus, the computation with the regular min() function can greatly benefit from parallelization, although you run the risk of getting incorrect answers. If we used the min() function instead of the atomic min() function, we would introduce the possibility of updating old_label redundantly, since two threads could be updating the output at the same time. This could introduce error into our calculations by overwriting a particular value (while not guaranteed to happen, the error could be introduced). However, I also believe it is possible that no error would be introduced, because whenever we check whether something has changed, that check is independent of what happens in the writing of the minimum. Thus, I believe the overall result may still be correct, but the iteration count might change slightly depending on the conditions. In any case, if min() is used instead of atomic min(), then we could potentially see a value in labels[] increase. This can happen if two threads attempt to change the same location in memory: one thread writes the new value, and the other thread, not seeing the new value, writes back the older value. This essentially means that the write from the first thread is completely lost for that iteration.
47 changes: 46 additions & 1 deletion HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ propagate_labels(__global __read_write int *labels,
// Will store the output value
int new_label;

int upNeighbor,rightNeighbor,downNeighbor,leftNeighbor,minNeighbors;

int gpartent, xx, yy;
// Load the relevant labels to a local buffer with a halo
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
Expand All @@ -80,20 +83,62 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
// This makes sure that we have only the first thread running
if ((lx==0) && (ly==0)){
//OBTAIN RELATIVE GRANDPARENT
if(old_label<w*h){
gpartent = labels[old_label];
}
// ITERATE OVER THE ROWS--XX AND COLUMNS--YY
for(xx = halo;xx < buf_h - halo; xx++){
for (yy= halo; yy < buf_w-halo; yy++){
// ONLY PROCEED IF THE NODE IS NOT A WALL OF THE MAZE
if(buffer[(ly+xx)*buf_w+(lx+yy)]<w*h){
// SKIP IF THE NODE ALREADY HAS THIS GRANDPARENT
if (buffer[(ly+xx)*buf_w +(lx+yy)]!=gpartent){
//GIVE BUFFER THE DESIRED GRANDPARENT
buffer[(ly+xx)*buf_w+(lx+yy)]=labels[buffer[(ly+xx)*buf_w+(lx+yy)]];
}


}
}
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

if (old_label<w*h){

//OBTAIN THE BUFFERS FOR THE NEIGHBORS ON EACH OF THE 4 SIDES
upNeighbor = buffer[(buf_y + 1) * buf_w + (buf_x)];
rightNeighbor = buffer[(buf_y) * buf_w + (buf_x + 1)];
downNeighbor = buffer[(buf_y - 1) * buf_w + (buf_x)];
leftNeighbor = buffer[(buf_y) * buf_w + (buf_x - 1)];
//OBTAIN THE LABEL OF THE NEIGHBOR WITH THE SMALLEST VALUE OF THE BUFFER
minNeighbors=min(old_label,(min(upNeighbor,min(rightNeighbor,min(downNeighbor,leftNeighbor)))));
//COMPARE THE MINIMUM OF THE NEIGHBORS TO THE PREVIOUS LABEL
new_label=min(minNeighbors,new_label);
}

if (new_label != old_label) {
atomic_min(&labels[old_label],new_label);
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.

//UPDATING GLOBAL POSITION OF THE CHILD REGIONS
//THIS ENABLES MERGING OF OLD AND NEW PARENTS WHEN WE
//CHANGE THE LABELS FROM OLD TO NEW LABELS
*(changed_flag) += 1;
labels[y * w + x] = new_label;
atomic_min(&labels[y * w + x], new_label);
}
}
}
Loading