Skip to content
Open

Hw3 #432

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions HW3/P2/mandelbrot2.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
 * Mandelbrot escape-time kernel; each work-item handles one pixel (x, y).
 *
 * coords_real, coords_imag: real / imaginary part of c for every pixel,
 *                           indexed as x + y * w.
 * out_counts:               receives, per pixel, the number of iterations
 *                           z -> z*z + c performed before |z|^2 exceeded 4
 *                           (max_iter if it never did).
 * w, h:                     bounds of the valid pixel range; work-items
 *                           outside it do nothing.
 * max_iter:                 iteration cap.
 */
__kernel void
mandelbrot(__global __read_only float *coords_real,
           __global __read_only float *coords_imag,
           __global __write_only int *out_counts,
           int w, int h, int max_iter)
{
    // Global position of output pixel
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Work-items outside the w x h range have nothing to compute.
    if ((x >= w) || (y >= h)) {
        return;
    }

    const int idx = x + y * w;
    const float c_real = coords_real[idx];
    const float c_imag = coords_imag[idx];

    float z_real = 0.0f;
    float z_imag = 0.0f;
    int iter = 0;

    // Iterate until the cap is hit or the point escapes (|z|^2 > 4).
    // Written as !(... > 4) so the comparison behaves exactly like the
    // original break test, including for NaN inputs.
    while ((iter < max_iter) &&
           !(z_real * z_real + z_imag * z_imag > 4.0f)) {
        const float z_prev_real = z_real;
        z_real = z_real * z_real - z_imag * z_imag + c_real;
        z_imag = 2.0f * z_imag * z_prev_real + c_imag;
        iter = iter + 1;
    }

    // Store the result once, after the loop: this avoids a global-memory
    // write per iteration and guarantees the pixel is written even when
    // max_iter <= 0 (the loop body then never runs).
    out_counts[idx] = iter;
}
29 changes: 29 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
PROBLEM 3

The best configuration and time for my hardware is:
configuration ('coalesced', 512, 64): 0.0030328 seconds

This is my computer information:

---------------------------
Apple Apple version: OpenCL 1.2 (Feb 27 2015 01:29:10)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i7-3540M CPU @ 3.00GHz [Type: CPU ]
Maximum clock Frequency: 3000 MHz
Maximum allocable memory size: 2147 MB
Maximum work group size 1024
Maximum work item dimensions 3
Maximum work item size [1024, 1, 1]
---------------------------
HD Graphics 4000 [Type: GPU ]
Maximum clock Frequency: 1300 MHz
Maximum allocable memory size: 268 MB
Maximum work group size 512
Maximum work item dimensions 3
Maximum work item size [512, 512, 512]
---------------------------
This context is associated with 2 devices
The queue is using the device: HD Graphics 4000
The device memory bandwidth is 11.0345233454 GB/s
The host-device bandwidth is 5.02189546422 GB/s
37 changes: 28 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ __kernel void sum_coalesced(__global float* x,

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

int k = get_global_size(0);
int i = get_global_id(0);
for(int s = i; s<N; s+=k){
sum += x[s];
}

fast[local_id] = sum;
Expand All @@ -24,8 +27,14 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

size_t gs = get_local_size(0);

for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id< s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -38,7 +47,7 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
Expand All @@ -48,8 +57,13 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

int i = get_global_id(0);

for (int s=k*i;s<k*(i+1);s++) { // YOUR CODE HERE
if(s<N) {
sum += x[s];
}
}

fast[local_id] = sum;
Expand All @@ -64,8 +78,13 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
size_t gs = get_local_size(0);

for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id< s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand All @@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand Down
102 changes: 71 additions & 31 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
@@ -1,34 +1,74 @@
#include "median9.h"

// 3x3 median filter
// Template for the 3x3 median-filter kernel. The body below holds only
// instructional comments (load a local buffer with a 1-pixel halo, take
// the median of each 3x3 neighbourhood with median9(), write the result
// for pixels with x < w and y < h); it contains no executable statements.
__kernel void
median_3x3(__global __read_only float *in_values,
__global __write_only float *out_values,
__local float *buffer,
int w, int h,
int buf_w, int buf_h,
const int halo)
{
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.


// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.


// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.


// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
}
/*
 * 3x3 median filter using a work-group-local tile with a halo.
 *
 * in_values:    input image, indexed as y * w + x.
 * out_values:   output image, same indexing; each pixel receives the
 *               median9() of its 3x3 neighbourhood.
 * buffer:       local scratch of buf_w * buf_h floats, indexed as
 *               row * buf_w + col.
 * w, h:         image size; work-items with x >= w or y >= h write nothing.
 * buf_w, buf_h: dimensions of the local tile, halo included.
 * halo:         halo width used to place the tile; the neighbourhood
 *               reads below use fixed +/-1 offsets, so a halo of at
 *               least 1 is expected.
 *
 * Pixels outside the image are replaced by the nearest valid pixel
 * (coordinates are clamped to [0, w-1] x [0, h-1]).
 */
__kernel void median_3x3(__global __read_only float *in_values,
                         __global __write_only float *out_values,
                         __local float *buffer,
                         int w, int h,
                         int buf_w, int buf_h,
                         const int halo){

    // Global position of output pixel
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Local position relative to (0, 0) in workgroup
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // coordinates of the upper left corner of the buffer in image
    // space, including halo
    const int buf_corner_x = x - lx - halo;
    const int buf_corner_y = y - ly - halo;

    // coordinates of our pixel in the local buffer
    const int buf_x = lx + halo;
    const int buf_y = ly + halo;

    // 1D index of thread within our work-group, and work-group size
    const int idx_1D = ly * get_local_size(0) + lx;
    const int group_threads = get_local_size(0) * get_local_size(1);

    // Fill the tile column by column. This must NOT be restricted to
    // work-items with (x < w) && (y < h): a work-item past the image edge
    // may still own a column that in-bounds pixels of the same group read,
    // and skipping it would leave that column uninitialised. Clamping the
    // source coordinates keeps every global read inside the image.
    // Striding by the group size also covers groups that have fewer
    // work-items than the tile has columns.
    for (int col = idx_1D; col < buf_w; col += group_threads) {
        const int src_x = clamp(buf_corner_x + col, 0, w - 1);
        for (int row = 0; row < buf_h; row++) {
            const int src_y = clamp(buf_corner_y + row, 0, h - 1);
            buffer[row * buf_w + col] = in_values[src_y * w + src_x];
        }
    }

    // Every work-item of the group reaches this barrier, so the whole tile
    // is visible before any neighbourhood is read.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Only work-items that map to a real pixel write a result.
    if ((y < h) && (x < w)) {
        const int above = buf_w * (buf_y - 1);
        const int here  = buf_w * buf_y;
        const int below = buf_w * (buf_y + 1);

        out_values[y * w + x] = median9(
            buffer[above + buf_x - 1], buffer[above + buf_x], buffer[above + buf_x + 1],
            buffer[here  + buf_x - 1], buffer[here  + buf_x], buffer[here  + buf_x + 1],
            buffer[below + buf_x - 1], buffer[below + buf_x], buffer[below + buf_x + 1]);
    }
}
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
Expand Down Expand Up @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

program = cl.Program(context, open('median_filter.cl').read()).build(options='')
curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
Expand Down
51 changes: 51 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Christian Junge helped me with this problem.

###Part 1

- Maze 1:
Finished after 875 iterations, 574.0908 ms total, 0.656103771429 ms per iteration
Found 2 regions

- Maze 2:
Finished after 507 iterations, 345.4796 ms total, 0.681419329389 ms per iteration
Found 35 regions

###Part 2

- Maze 1:
Finished after 529 iterations, 337.87184 ms total, 0.638699130435 ms per iteration
Found 2 regions

- Maze 2:
Finished after 272 iterations, 180.02288 ms total, 0.661848823529 ms per iteration
Found 35 regions

###Part 3:

- Maze 1:
Finished after 8 iterations, 6.87728 ms total, 0.85966 ms per iteration
Found 2 regions

- Maze 2:
Finished after 8 iterations, 6.70104 ms total, 0.83763 ms per iteration
Found 35 regions

###Part 4:

- Maze 1:

- Maze 2:

###Part 5:

If we used the min() function instead of the atomic_min() operation, the final result would still be correct.
However, when two threads try to replace the same "old_label" with different "new_label" values, the performance of the algorithm would suffer. For example:
thread1: old_label1 = 15 new_label1 = 10
thread2: old_label2 = 15 new_label2 = 8
First, Thread1 compares old_label1 to new_label1, computes the minimum and chooses new_label1 (10).
In parallel, Thread2 compares old_label2 to new_label2, computes the minimum and chooses new_label2 (8).
Because of memory timing, Thread2 finishes first and writes its label (8) before Thread1. Thread1 then overwrites it with its already selected
value (10).
As we can see, the stored label (10) is not the smallest one available (8), so we would probably end up doing more iterations. Nevertheless, a value in labels
will never rise above the value it had at the start of the iteration.

38 changes: 35 additions & 3 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,52 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)



// Part 2
if ((x < w) && (y < h) && old_label < w*h) {
buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
}

barrier(CLK_LOCAL_MEM_FENCE);

int left;
int right;
int up;
int down;

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

// We get the values for the 4 neighbors
left = buffer[buf_y * buf_w + buf_x-1];
right = buffer[buf_y * buf_w + buf_x+1];
up = buffer[(buf_y-1) * buf_w + buf_x];
down = buffer[(buf_y+1) * buf_w + buf_x];

// If it's not a wall, we find the minimum value of its 4 neighboring pixels and itself
if (old_label < w*h) {
new_label = min(left, right);
new_label = min(new_label, up);
new_label = min(new_label, down);
new_label = min(new_label, old_label);
}
else {
new_label = old_label;
}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;
atomic_min(&labels[old_label], labels[new_label]);
labels[y * w + x] = labels[old_label];
// labels[y * w + x] = new_label;

}
}
}