Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added HW3/HW3.pdf
Binary file not shown.
29 changes: 27 additions & 2 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,35 @@ mandelbrot(__global __read_only float *coords_real,

float c_real, c_imag;
float z_real, z_imag;
float newZr, newZi;

int iter;

// if inside of the boundaries then do the computation
if ((x < w) && (y < h)) {
// YOUR CODE HERE
;

// initializing the variables
z_real = 0;
z_imag = 0;
c_real = coords_real[(y * w) + x];
c_imag = coords_imag[(y * w) + x];

// generate the mandelbrot set
for (iter = 0; iter < max_iter; iter++) {

//computing the squared magnitude; if greater than 4, quit the computation
if (z_real*z_real + z_imag*z_imag > 4.0) {
break;
}

// get the new values for the real and imaginary parts separately
newZr = (z_real*z_real) - (z_imag*z_imag);
newZi = 2*(z_imag*z_real);
z_real = newZr + c_real;
z_imag = newZi + c_imag;

}
// Transfer iteration data to output
out_counts[(y * w) + x] = iter;
}
}
25 changes: 25 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
The platforms detected are:
---------------------------
Apple Apple version: OpenCL 1.2 (Sep 21 2015 19:24:11)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i7-5557U CPU @ 3.10GHz [Type: CPU ]
Maximum clock Frequency: 3100 MHz
Maximum allocable memory size: 4294 MB
Maximum work group size 1024
---------------------------
Intel(R) Iris(TM) Graphics 6100 [Type: GPU ]
Maximum clock Frequency: 1100 MHz
Maximum allocable memory size: 402 MB
Maximum work group size 256
---------------------------
This context is associated with 2 devices
The queue is using the device: Intel(R) Iris(TM) Graphics 6100
The device memory bandwidth is 10.5842506351 GB/s
The host-device bandwidth is 8.8087100525 GB/s

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Best configuration and time for hardware:
configuration ('coalesced', 64, 64): 0.0030028 seconds

35 changes: 26 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t i = get_global_id(0);
int globalSize = get_global_size(0);
int localSize = get_local_size(0);
int ii;

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (ii = 0; i + ii * globalSize < N; ii++) {
sum += x[i+ii*globalSize];
}

fast[local_id] = sum;
Expand All @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (ii = localSize/2; ii>0; ii >>=1) {
if(local_id<ii){
fast[local_id] += fast[local_id+ii];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -39,6 +46,9 @@ __kernel void sum_blocked(__global float* x,
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil((float)N / get_global_size(0));
int localSize = get_local_size(0);
int globalId = get_global_id(0);
int ii;

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
Expand All @@ -48,8 +58,11 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (ii = globalId*k; ii<(globalId+1)*k;ii++) {
if(ii<N){
sum += x[ii];
}

}

fast[local_id] = sum;
Expand All @@ -63,9 +76,13 @@ __kernel void sum_blocked(__global float* x,
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (ii = localSize/2; ii>0 ; ii>>=1) {
if(local_id<ii){
fast[local_id] += fast[local_id+ii];

}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
81 changes: 78 additions & 3 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,83 @@ median_3x3(__global __read_only float *in_values,
__local float *buffer,
int w, int h,
int buf_w, int buf_h,
const int halo)
{
const int halo){

//Initial variable definitions for the problem
//this code is obtained from the load halo example done in class

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// // coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// OBTAIN BUFFER BY LOADING RELEVANT LABELS TO LOCAL BUFFER WITH HALO
if (idx_1D<buf_w){

int row,mXX,mYY,hNew,wNew,mat_index,buf_index;
// GO OVER EACH ROW AND GO DOWN IN EACH COLUMN TO OBTAIN THE VALUES OF THE
// BUFFER FOR EACH INDIVIDUAL NODE
for(row=0;row<buf_h;row++){
//GET THE XX AND YY, ROW AND COLUMN RESPECTIVE CORNERS WITH ADJUSTMENTS
mYY=buf_corner_y+row;
mXX=buf_corner_x+idx_1D;
//OBTAIN THE NEW INDICES FOR THE HEIGHT AND THE WIDTH
hNew=h-1;
wNew=w-1;

//GET THE INDICES FOR THE VALUES THAT ARE TO BE LOADED INTO THE BUFFER
mat_index=min(max(0, mYY), hNew) * w + min(max(0, mXX), wNew);
buf_index=row * buf_w + idx_1D;
//LOAD DESIRED VALUES FROM GLOBAL MEMORY INTO THE BUFFER FOR FURTHER COMPUTATION
buffer[buf_index] = in_values[mat_index];
}

}

// if (idx_1D < buf_w)
// for (row = 0; row < buf_h; row++) {
// buffer[row * buf_w + idx_1D] =
// get_clamped_value(labels,
// w, h,
// buf_corner_x + idx_1D, buf_corner_y + row);
// }

barrier(CLK_LOCAL_MEM_FENCE);

// Processing code here...
//
// Should only use buffer, buf_x, buf_y

// write output
// if ((y < img_h) && (x < img_w)) // stay in bounds
// output[y * img_w + x] = \
// buffer[buf_y * buf_w + buf_x];
//STATEMENT FOR SMOOTHING THE PICTURE WITHIN THE SPACE THAT WE CARE ABOUT
if ((y < h) && (x < w)) {
//THIS WRITES THE VALUE OUT AS DESIRED AND DESCRIBED IN THE .H FILE
out_values[y * w + x] = median9(buffer[(buf_y - 1) * buf_w + (buf_x - 1)], buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y - 1) * buf_w + (buf_x + 1)],
buffer[buf_y * buf_w + (buf_x - 1)], buffer[buf_y * buf_w + buf_x], buffer[buf_y * buf_w + (buf_x + 1)],
buffer[(buf_y + 1) * buf_w + (buf_x - 1)], buffer[(buf_y + 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + (buf_x + 1)]);
}
}



// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.
Expand All @@ -31,4 +106,4 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
}

3 changes: 1 addition & 2 deletions HW3/P4/median_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
program = cl.Program(context, open('median_filter.cl').read()).build(options='')

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
Expand Down
77 changes: 77 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
Problem 5 Write Up
By: Matheus C. Fernandes

Code Outputs and Questions Explanations:

Part 1: implement updates from neighbors
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 915 iterations, 187.40696 ms total, 0.204816349727 ms per iteration
Found 2 regions

Maze 2:
Finished after 532 iterations, 108.11128 ms total, 0.203216691729 ms per iteration
Found 35 regions


Part 2: fetch grandparents
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 529 iterations, 103.94128 ms total, 0.196486351607 ms per iteration
Found 2 regions


Maze 2:
Finished after 273 iterations, 54.20424 ms total, 0.19855032967 ms per iteration
Found 35 regions


Part 3: merge parent regions
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 10 iterations, 2.94152 ms total, 0.294152 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 2.61712 ms total, 0.290791111111 ms per iteration
Found 35 regions


Part 4: efficient grandparents
=========================================
Code Outputs:
-----------------------------------------
Maze 1:
Finished after 10 iterations, 5.62464 ms total, 0.562464 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 4.9628 ms total, 0.551422222222 ms per iteration
Found 35 regions

Explanation of questions:
-----------------------------------------
Questions Statement:
Explain, in terms of compute vs. memory, why using a single thread to perform this step is or is not a reasonable choice. Note that there is some variation in GPUs, so you may want to discuss your empirical results as well as speculate under what conditions those results might be different.

-----------------------------------------
Answer:
I found that using one thread to perform this step did not improve the overall performance of the computation. The results on my GPU showed a significant slowdown in performance. I believe that even though we decrease the number of accesses to global memory, serializing this part of the algorithm gives away a large benefit in compute through parallelism. Thus, I don’t think the compute vs. memory trade-off here is worth making, as the balance is heavily skewed in the direction of the serialization overhead. Perhaps it would be beneficial to serialize such a task if the memory-read overhead cost were much larger than the compute, or if the compute for one of these iterations were much less expensive than the memory access. Thus, for this particular problem, I do not think this trade-off is worthwhile, as the compute portion is too expensive relative to the read.


Part 5: no atomic operations
=========================================
Explanation of questions:
-----------------------------------------
Question Statement:
Atomic operations are also inefficient, and virtually guarantee serialization of memory access. Explain what would happen if instead of using the atomic min() operation, one would use the min() function. Your explanation should consider whether the final result would still be correct, what might be the impact on the performance of the algorithm (time and iterations), could a value in labels ever increase, and could it increase between iterations?

-----------------------------------------
Answer:
I believe that using the regular min() function would have a positive impact on the performance of the code, because the atomic min() function serializes access (by essentially locking the memory location) to make sure everything is loaded properly. This serialization process makes the computation slower. Thus, the computation with the regular min() function can greatly benefit from parallelization, although you run the risk of getting incorrect answers. If we used the min() function instead of the atomic min() function, we would introduce the possibility of updating old_label redundantly, since two threads could be updating the output at the same time. This could introduce error into our calculations by overwriting a particular value (while not guaranteed to happen, the error could be introduced). However, I also believe it is possible that no error would be introduced, because whenever we check whether something has changed, that check is independent of what happens in the writing of the minimum. Thus, I believe the overall result may still be correct, but the iteration count might change slightly depending on the conditions. In any case, if min() is used instead of atomic min(), then we could potentially see a value in labels[] increase. This can happen if two threads attempt to change the same location in memory: one thread writes the new value, and the other thread, not seeing the new value, writes back the older value. This essentially means that the write from the first thread is completely lost for that iteration.
47 changes: 46 additions & 1 deletion HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ propagate_labels(__global __read_write int *labels,
// Will store the output value
int new_label;

int upNeighbor,rightNeighbor,downNeighbor,leftNeighbor,minNeighbors;

int gpartent, xx, yy;
// Load the relevant labels to a local buffer with a halo
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
Expand All @@ -80,20 +83,62 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
// This makes sure that we have only the first thread running
if ((lx==0) && (ly==0)){
//OBTAIN RELATIVE GRANDPARENT
if(old_label<w*h){
gpartent = labels[old_label];
}
// ITERATE OVER THE ROWS--XX AND COLUMNS--YY
for(xx = halo;xx < buf_h - halo; xx++){
for (yy= halo; yy < buf_w-halo; yy++){
// ONLY PROCEED IF THE NODE IS NOT A WALL OF THE MAZE
if(buffer[(ly+xx)*buf_w+(lx+yy)]<w*h){
// SKIP IF THE NODE ALREADY HAS THIS GRANDPARENT
if (buffer[(ly+xx)*buf_w +(lx+yy)]!=gpartent){
//GIVE BUFFER THE DESIRED GRANDPARENT
buffer[(ly+xx)*buf_w+(lx+yy)]=labels[buffer[(ly+xx)*buf_w+(lx+yy)]];
}


}
}
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

if (old_label<w*h){

//OBTAIN THE BUFFERS FOR THE NEIGHBORS ON EACH OF THE 4 SIDES
upNeighbor = buffer[(buf_y + 1) * buf_w + (buf_x)];
rightNeighbor = buffer[(buf_y) * buf_w + (buf_x + 1)];
downNeighbor = buffer[(buf_y - 1) * buf_w + (buf_x)];
leftNeighbor = buffer[(buf_y) * buf_w + (buf_x - 1)];
//OBTAIN THE LABEL OF THE NEIGHBOR WITH THE SMALLEST VALUE OF THE BUFFER
minNeighbors=min(old_label,(min(upNeighbor,min(rightNeighbor,min(downNeighbor,leftNeighbor)))));
//COMPARE THE MINIMUM OF THE NEIGHBORS TO THE PREVIOUS LABEL
new_label=min(minNeighbors,new_label);
}

if (new_label != old_label) {
atomic_min(&labels[old_label],new_label);
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.

//UPDATING GLOBAL POSITION OF THE CHILD REGIONS
//THIS ENABLES MERGING OF OLD AND NEW PARENTS WHEN WE
//CHANGE THE LABELS FROM OLD TO NEW LABELS
*(changed_flag) += 1;
labels[y * w + x] = new_label;
atomic_min(&labels[y * w + x], new_label);
}
}
}
Loading