Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,30 @@ mandelbrot(__global __read_only float *coords_real,
float c_real, c_imag;
float z_real, z_imag;
int iter;
float z_new_r = 0;
float z_new_i = 0;
z_real = 0;
z_imag = 0;

c_real = coords_real[w*y + x];
c_imag = coords_imag[w*y + x];

if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
}
iter = 0;
for (int i = 0; i < max_iter; ++i) {
if (z_real*z_real + z_imag * z_imag > 4.0) {
break;
}
iter++;
// z = z * z + c
z_new_r = z_real * z_real - z_imag * z_imag;
z_new_i = 2.0 * z_real * z_imag;
z_new_i += c_imag;
z_new_r += c_real;
z_real = z_new_r;
z_imag = z_new_i;
}
out_counts[w * y + x] = iter;
}
}
86 changes: 86 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
coalesced reads, workgroups: 8, num_workers: 4, 0.07258504 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.03673464 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.01913392 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.01039344 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.00695472 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.0043812 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.06001544 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.0310272 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.01570544 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.00764992 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00431152 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.00333848 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.03370544 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.01284968 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.00768184 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00646128 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00485712 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.0032364 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.02550672 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.0136236 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00706304 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00393336 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00425824 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00466488 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.05210832 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.02586976 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.01191712 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00492728 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.0037256 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00343488 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.03066424 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.01158904 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.00556592 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00309936 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00240584 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00236136 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.02249872 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.01201176 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.00612176 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00309512 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00238664 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00222632 seconds
blocked reads, workgroups: 8, num_workers: 4, 0.08008432 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.05314272 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.04055664 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.01864704 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.0086916 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.01332264 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.04286488 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.02522048 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.02002256 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.00785048 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.01318408 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.03491952 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.02088688 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.01281304 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.00847784 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.01432488 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.03823888 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.04483016 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.02112496 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01133616 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.008448 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.01471376 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.05186944 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.0606752 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.02310928 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.01269576 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.00979584 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.01802176 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.05757784 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.0513108 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.02101336 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.0134932 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.0085828 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.02292672 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.04824048 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.03361944 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.02185112 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.01406848 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.01055336 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.02269744 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.03258384 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.02003504 seconds

configuration ('coalesced', 512, 128): 0.00222632 seconds
28 changes: 19 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
uint k = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0); i < N; i+= k) {
sum += x[i];
}

fast[local_id] = sum;
Expand All @@ -24,10 +25,14 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
// a reduction kernel is easy to implement here
uint gs = get_local_size(0);
for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id < s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

Expand All @@ -48,8 +53,9 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
uint gid = get_global_id(0);
for (int i = k * gid; i < k * gid + k && i < N; ++i) { // YOUR CODE HERE
sum += x[i]; // YOUR CODE HERE
}

fast[local_id] = sum;
Expand All @@ -64,8 +70,12 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
uint gs = get_local_size(0);
for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id < s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
50 changes: 49 additions & 1 deletion HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ median_3x3(__global __read_only float *in_values,
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

// global outer pixel position
const int x = get_global_id(0);
const int y = get_global_id(1);

// Load into buffer (with 1-pixel halo).
//
Expand All @@ -22,6 +24,36 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// Looked at P5 for this code
int xt, yt;
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
xt = max(0, buf_corner_x + idx_1D);
xt = min(xt, w - 1);
yt = max(0, buf_corner_y + row);
yt = min(h - 1, yt);
buffer[row * buf_w + idx_1D] = in_values[w*yt + xt];
}
}


barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
Expand All @@ -31,4 +63,20 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

if ((x < w) && (y < h)) {
out_values[y * w + x] = median9(buffer[(buf_y - 1) * (buf_w) + (buf_x - 1)],
buffer[(buf_y) * (buf_w) + (buf_x - 1)],
buffer[(buf_y + 1) * (buf_w) + (buf_x - 1)],
buffer[(buf_y - 1) * (buf_w) + (buf_x)],
buffer[(buf_y) * (buf_w) + (buf_x)],
buffer[(buf_y + 1) * (buf_w) + (buf_x)],
buffer[(buf_y - 1) * (buf_w) + (buf_x + 1)],
buffer[(buf_y) * (buf_w) + (buf_x + 1)],
buffer[(buf_y + 1) * (buf_w) + (buf_x + 1)]);
}
}

// (GETPIX(input_image, i-1, j-1), GETPIX(input_image, i-1, j), GETPIX(input_image, i-1, j+1),
// GETPIX(input_image, i, j-1), GETPIX(input_image, i, j), GETPIX(input_image, i, j+1),
// GETPIX(input_image, i+1, j-1), GETPIX(input_image, i+1, j), GETPIX(input_image, i+1, j+1))
58 changes: 58 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
***Part 1***

maze1:
Finished after 875 iterations, 182.42808 ms total, 0.208489234286 ms per iteration
Found 2 regions

maze2:
Finished after 507 iterations, 105.83096 ms total, 0.208739566075 ms per iteration
Found 35 regions

***Part 2***

maze1:
Finished after 529 iterations, 109.4868 ms total, 0.206969376181 ms per iteration
Found 2 regions

maze2:
Finished after 273 iterations, 56.38032 ms total, 0.206521318681 ms per iteration
Found 35 regions

***Part 3***

maze1:
Finished after 10 iterations, 2.8588 ms total, 0.28588 ms per iteration
Found 2 regions

maze2:
Finished after 9 iterations, 2.35744 ms total, 0.261937777778 ms per iteration
Found 35 regions

***Part 4***

It seems that on my architecture, using a single thread to perform this step is
pretty slow. This implies that we are compute bound, since speeding up
the memory accesses was not helpful. This makes sense because my GPU (an Intel HD 4000) is
pretty slow; on a significantly faster GPU, possibly with many more cores, this effect
may reverse. Fundamentally, having a single thread perform the reads allows
more efficient memory access patterns, which should be helpful on certain architectures. If there
were more redundancy in the labels, this optimization would also be more worthwhile.

maze1:
Finished after 10 iterations, 5.46928 ms total, 0.546928 ms per iteration
Found 2 regions

maze2:
Finished after 9 iterations, 5.10248 ms total, 0.566942222222 ms per iteration
Found 35 regions

***Part 5***

If we instead used the non-atomic standard min function, we would run into the following
problem. Suppose one thread is trying to replace label a with b while another thread is
trying to replace a with c, where b < c. Both threads read the original value a before
either writes, so the write of b could be followed by the write of c, leaving the larger
label c in place even though the smaller label b was also written. Thus, a stored label
could effectively increase within an iteration, though never between iterations (because
my code never attempts to replace a label with a larger one). This would
not produce incorrect results, but it might take more iterations than necessary to converge.

46 changes: 43 additions & 3 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,60 @@ propagate_labels(__global __read_write int *labels,
// Fetch the value from the buffer that corresponds to
// the pixel for this thread
old_label = buffer[buf_y * buf_w + buf_x];
new_label = old_label;
// Part 2 Grandparents checking
// if (new_label != w * h) {
// buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
// }
// barrier(CLK_LOCAL_MEM_FENCE);
// Part 4
// This if condition guarantees one thread per work group.
int cur_label;
if (lx == 0 && ly == 0) {
// Loop over the whole buffer and get grandparents
int last_fetched_ind = w*h;
int last_val = -1;
for (int i = halo; i < buf_h - halo; ++i) {
for (int j = halo; j < buf_w - halo; ++j) {
cur_label = buffer[buf_w * i + j];
if (cur_label == w * h)
continue;
if (cur_label == last_fetched_ind) {
buffer[buf_w * i + j] = last_val;
}
else {
last_fetched_ind = cur_label;
last_val = labels[cur_label];
buffer[buf_w * i + j] = last_val;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

// w*h is the max label, used for background pixels.
if (new_label != w * h) {
new_label = min(buffer[(buf_y + 1) * buf_w + buf_x], new_label);
new_label = min(buffer[(buf_y - 1) * buf_w + buf_x], new_label);
new_label = min(buffer[buf_y * buf_w + (buf_x - 1)], new_label);
new_label = min(buffer[buf_y * buf_w + (buf_x + 1)], new_label);
}
if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
// Read the 32-bit value (referred to as old) stored at location pointed
// by p. Compute min (old, val) and store result at location pointed by
// p. The function returns old.
// int atomic_min (volatile __global int *p, int val)
atomic_min(&labels[old_label], new_label);
//labels[old_label] = min(labels[old_label], new_label);
*(changed_flag) += 1;
labels[y * w + x] = new_label;
}
Expand Down