Skip to content
Open

Hw3 #438

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
RESULTS

fastest:
coalesced reads, workgroups: 8, num_workers: 4, 0.076327157 seconds



Note:
I am getting a segmentation fault, so I am probably not checking
correctly for threads going out of bounds, though I do check that
indices are < N. My results come from printing and reading the times
manually; it segfaults in the middle of workgroups: 128.

Results for test.py

The platforms detected are:
---------------------------
AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8)
The devices detected on platform AMD Accelerated Parallel Processing are:
---------------------------
Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz [Type: CPU ]
Maximum clock Frequency: 2394 MHz
Maximum allocable memory size: 1073 MB
Maximum work group size 1024
Maximum work item dimensions 3
Maximum work item size [1024, 1024, 1024]
---------------------------
This context is associated with 1 devices
The queue is using the device: Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz
The device memory bandwidth is 1.46333327639 GB/s
The host-device bandwidth is 5.07974888261 GB/s
57 changes: 47 additions & 10 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ __kernel void sum_coalesced(__global float* x,

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

int i = get_global_id(0);
int k = get_global_size(0);

for (int j = 0; (i + j*k) < N; j++) { // YOUR CODE HERE
sum = sum + x[i + j*k]; // YOUR CODE HERE
}

fast[local_id] = sum;
Expand All @@ -24,8 +28,22 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

int ls = get_local_size(0);
int offset = 0;
int limit = 0;

// calculate log_2(local_size)
// = to number of shifts
while (ls > 1) {
ls = ls >> 1;
limit = limit + 1;
}

for (int j=1; j < limit; j++) { // YOUR CODE HERE
offset = (get_local_size(0) >> j);
if (i+offset < N)
fast[i] = fast[i] + fast[i + offset]; // YOUR CODE HERE
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -38,18 +56,23 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
int k = ceil((float)N / get_global_size(0));

int i = get_global_id(0);
int val = 0;
// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
// thread with global_id 2 should add 2k..3k-1
// ...
// with k = ceil(N / get_global_size()).
//
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE


for (val = k*i; val <= k*(i+1)-1; val++) { // YOUR CODE HERE
if (val < N)
sum = sum + x[val]; // YOUR CODE HERE
}

fast[local_id] = sum;
Expand All @@ -64,8 +87,22 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

int ls = get_local_size(0);
int offset = 0;
int limit = 0;

// calculate log_2(local_size)
// = to number of shifts
while (ls > 1) {
ls = ls >> 1;
limit = limit + 1;
}

for (int j=1; j < limit; j++) { // YOUR CODE HERE
offset = (get_local_size(0) >> j);
if (i + offset < N)
fast[i] = fast[i] + fast[i + offset]; // YOUR CODE HERE
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand All @@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand Down
65 changes: 65 additions & 0 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,77 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;


// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int row;

// Here is the actual loading of the buffer
// with 1D indexing
if (idx_1D < buf_w)
for (row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
FETCH(in_values, w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}

barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

// median9(...) gives the median value of a 3x3 space
// advice from halo_load.cl: Should only use buffer, buf_x, buf_y.

//if ((y > 0 && y < h-1 ) && (x > 0 && x < w-1)) // stay in bounds
if ((x < w) && (y < h))
out_values[y * w + x] = \
meadian9(buffer[(buf_y * buf_w + buf_x) -buf_x-1],
buffer[(buf_y * buf_w + buf_x) -buf_x],
buffer[(buf_y * buf_w + buf_x) -buf_x+1],
buffer[(buf_y * buf_w + buf_x) -1],
buffer[(buf_y * buf_w + buf_x)],
buffer[(buf_y * buf_w + buf_x) +1],
buffer[(buf_y * buf_w + buf_x) +buf_x-1],
buffer[(buf_y * buf_w + buf_x) +buf_x],
buffer[(buf_y * buf_w + buf_x) +buf_x+1]);

// take care of corners
/*
if (x==0 && y==0){
output[0] = buffer[(buf_y * buf_w + buf_x)];
}
else if (x==0){
}
else if (x==w-1){
}
else if (y==0){
}
else if (y==h-1){
}
*/


// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
Expand Down
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
Expand Down Expand Up @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

program = cl.Program(context, open('median_filter.cl').read()).build(options='')
curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
Expand Down
46 changes: 46 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Iteration counts and average kernel times after each change for Parts 1-4,
an explanation for Part 4 as to why a single thread is a good (or bad) choice for this
operation, and the explanation of Part 5.

I was getting errors on Part 1, but continued on anyway.

Some of Error Output:
"/tmp/OCL8gbLMr.cl", line 31: error: a parameter cannot be allocated in a
named address space
propagate_labels(__global __read_write int *labels,
^

"/tmp/OCL8gbLMr.cl", line 31: error: expected a ")"
propagate_labels(__global __read_write int *labels,
^

"/tmp/OCL8gbLMr.cl", line 36: warning: parsing restarts here after previous
syntax error
const int halo)
^

"/tmp/OCL8gbLMr.cl", line 50: error: identifier "halo" is undefined
const int buf_corner_x = x - lx - halo;
^

"/tmp/OCL8gbLMr.cl", line 65: error: identifier "buf_w" is undefined
if (idx_1D < buf_w) {
^

"/tmp/OCL8gbLMr.cl", line 66: error: identifier "buf_h" is undefined
for (int row = 0; row < buf_h; row++) {
^



Part 4 Explanation
It seems valuable to use a single thread per work group, since that
thread has its own memory and won't wastefully repeat calculations
over the same indices, which may be what is happening in Part 2.


Part 5 Explanation
If we used min instead of atomic_min, our answer might not be correct.
atomic_min serializes the read-modify-write, so it yields the true
minimum, while a plain min executed concurrently by multiple threads
can race: two threads may read the same old value and one thread's
update gets overwritten (a lost update). However, this serialization
makes atomic_min slower than an unsynchronized min.
49 changes: 43 additions & 6 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,15 @@ propagate_labels(__global __read_write int *labels,

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int old_label;
// Will store the output value
int new_label;
// Load the relevant labels to a local buffer with a halo

// Load the relevant labels to a local buffer with a halo
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] =
buffer[row * buf_w + idx_1D] =
get_clamped_value(labels,
w, h,
buf_corner_x + idx_1D, buf_corner_y + row);
Expand All @@ -80,20 +80,57 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)


// part 2
// perform this operation by replacing each value in
// buffer (at index offset) with label[buffer[offset]]
buffer[buf_y * buf_w + buf_x] = label[buffer[buf_y * buf_w + buf_x]];


// need to add barrier before min neighbors part 1 calculation
barrier(CLK LOCAL MEM FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.

// get min of current pixel and its 4 neighbors
new_label = old_label;

// check upper neighbor
if (buffer[buf_y * buf_w + buf_x - buf_w] < new_label){
new_label = buffer[buf_y * buf_w + buf_x - buf_w];
}
// check lower neighbor
else if (buffer[buf_y * buf_w + buf_x + buf_w] < new_label){
new_label = buffer[buf_y * buf_w + buf_x + buf_w];
}
// check right neighbor
else if (buffer[buf_y * buf_w + buf_x + 1] < new_label){
new_label = buffer[buf_y * buf_w + buf_x + 1];
}
// check left neighbor
else if (buffer[buf_y * buf_w + buf_x - 1] < new_label){
new_label = buffer[buf_y * buf_w + buf_x - 1];
}



if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;

//part 3 calls for not directly assigning
//labels[y * w + x] = new_label;

// use atomic_min to make sure pixel's value in labels
// never increases
labels[y * w + x] = atomic_min(labels,new_label)

}
}
}