diff --git a/.git.bfg-report/2015-11-03/11-37-30/cache-stats.txt b/.git.bfg-report/2015-11-03/11-37-30/cache-stats.txt new file mode 100644 index 00000000..b4fb9dd7 --- /dev/null +++ b/.git.bfg-report/2015-11-03/11-37-30/cache-stats.txt @@ -0,0 +1,4 @@ +(apply,CacheStats{hitCount=112, missCount=70, loadSuccessCount=70, loadExceptionCount=0, totalLoadTime=185199690, evictionCount=0}) +(tree,CacheStats{hitCount=156, missCount=125, loadSuccessCount=120, loadExceptionCount=0, totalLoadTime=209773886, evictionCount=0}) +(commit,CacheStats{hitCount=36, missCount=36, loadSuccessCount=36, loadExceptionCount=0, totalLoadTime=120618566, evictionCount=0}) +(tag,CacheStats{hitCount=0, missCount=0, loadSuccessCount=0, loadExceptionCount=0, totalLoadTime=0, evictionCount=0}) \ No newline at end of file diff --git a/.git.bfg-report/2015-11-03/11-37-30/deleted-files.txt b/.git.bfg-report/2015-11-03/11-37-30/deleted-files.txt new file mode 100644 index 00000000..9cc53747 --- /dev/null +++ b/.git.bfg-report/2015-11-03/11-37-30/deleted-files.txt @@ -0,0 +1 @@ +2c8c249ccb705d86464062748879e243832d8748 176626884 image.npz \ No newline at end of file diff --git a/.git.bfg-report/2015-11-03/11-37-30/object-id-map.old-new.txt b/.git.bfg-report/2015-11-03/11-37-30/object-id-map.old-new.txt new file mode 100644 index 00000000..b5816079 --- /dev/null +++ b/.git.bfg-report/2015-11-03/11-37-30/object-id-map.old-new.txt @@ -0,0 +1,27 @@ +091da4a92561434ea3f83e168d774dbc823a71cd afdfad4da10a213564adc019c73daeb080c290a0 +0caec684df270136f8994b7a440b8887b021cdb1 0dd0f85e4a361c1f8fd3b7891e7f0ff00ad0dd12 +0df4ba3d1ed8a6d000c73824771848d548811852 d57f19df041e727a091b36f620749c4e90825222 +132c45fec95b1f06e4b1dacc4d038924df13e3fa 69cdd1004db36b05ba68e5c84e2b431398c9c5c4 +296463fa9ca352573ad27c9f0a62e655e2879c1c d73744ccd976030a9c5747f55e4dce4dc01df1e0 +2b503b3cc9a6af6b025068d5762da9f94c2bcc35 f4d29581c2968b5600a123c46cd4fc80d1455dd9 +2fefccc72fbdc602a98e6a8863a7cdf6fbddf486 cf408e405f649226421180edf98460af7b789476 +46fde603928535d91b6c20d7493143f182a9184f a67a3b65c2c256336da432066e3d545f7715915f +4bb1c42e82f65f2a7e11e9962aa440b593d21591 c9deb5a4f6ea76943f3402e792201c6c0eb1b35a +578aebef8837a52a9f3e5e2a1067e36853aa595f eb74eae1d92ca72455a4a7350800dbbcae27338d +5bfee183d6a3b393f000f3da7120f84b3e4d4453 dcd8b9481bea25b9fb0f1cf7688dc296bfd04459 +5c983813b314e1fac2acbcb9cd39a9970a1d9e4b 7b1e7fc39ea621bf61b13cdd66583e2f10bc4be0 +5d5e3d32301a8c0c9e27639a660b08ddf64a5789 99b4dc236c9d8af19db510a00a369ad6782abb99 +6d98fe5bb47c0eae6314f4f989dfeb75be9f0e66 3c948f6c78933d19cd8f057b5bb60cac4e23a19f +748b9880bc9e50c870c346f232e10516ce64ce2e 40bde769b599c15cce591f27adf38d1dba12901e +7e5250aa6b8adaaa9ce0b540b7078d786c7cefdd 8892eaa24bd6e1c45d71db9a2eb14ffba27534df +99f98ab588d3b23bac83f48f60f9409c9ca212fc 4db29f94190931a9a5ba5d33739a72dac8d831cf +9b33a4f616f08d2ac4287b276356359b14007995 606b3fd3348545e369b86adbd9174c49bd3d3e45 +9d4d1e802c5a532bf35d08b8da3453961b254839 f449d7d26ed71c06375cd37b101a74fb97ba3fbf +a24b78d0e8f0ab353fb87b587ec19a0b068b7747 8ea78ff263a3e5d33e63c5d844a76f4759f97987 +c6561bc15047eb9a747df5191dca6fd82d5ef1a5 0f838e1096934173d1d32381fefc268f7aecebb0 +cc57ed6753b50abcca1fc445d6463afba82e1649 eaeab959cb57844deade137f9a8af0b15c501e47 +dfe20862f1d2c717304f1d954e08f8c66bfd75a2 2b4100f071b42206517dec46c3164fd87119d357 +e7b96f1a622b61522e20ba3dedea59db270620a6 8100ff1fc52e7d3fed6a970266048e9453bf8e78 +eb2cc6604a44f567dc5e8f938113a4afe085aa48 a665be5a65b8d8ed1b957bc07f53603f7aeda456 +f25e969fe1ebbc63a2dca79e220b524cb570a0f0 3535c11c926356a8874f6446bbd72a718727439b +f75fc3591a08d64048d6c3d5d379a69cf45c69df 7cbba8ca1361e48e1502ee09bb1275c8c076c255 \ No newline at end of file diff --git a/.gitignore b/.gitignore index f3e1a9d9..e307b460 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *~ *.pyc *.pyxbldc +*.npz diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..c51773ad 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -10,10 +10,21 @@ mandelbrot(__global __read_only float *coords_real, float c_real, c_imag; float z_real, z_imag; + float temp; int iter; if ((x < w) && (y < h)) { - // YOUR CODE HERE - ; + z_real = 0.; + c_real = coords_real[y*w+x]; + z_imag = 0.; + c_imag = coords_imag[y*w+x]; + iter = 0; + while(((z_real*z_real+z_imag*z_imag) < 4) && (iter < max_iter)){ + temp = z_real*z_real -z_imag*z_imag+c_real; + z_imag = 2*z_real*z_imag+c_imag; + z_real = temp; + iter ++; + } + out_counts[y*w+x] = iter; } } diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..a7dcdfbc --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,91 @@ +Best configuration: +configuration ('coalesced', 128, 128): 0.0022836 seconds + +---------------------- +#0: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz on Apple +#1: Iris on Apple +coalesced reads, workgroups: 8, num_workers: 4, 0.13820736 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.07522864 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.03894968 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.01939744 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.01012384 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.00510888 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.07443104 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.0354476 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.01942088 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.0095256 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.00377312 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.0030688 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.03869232 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.01968616 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.00996952 seconds +coalesced reads, workgroups: 32, num_workers: 32, 0.0052508 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.002854 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.00294048 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.019068 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.0098072 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.00523128 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00287544 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.00305128 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.002908 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.01902536 seconds +coalesced reads, workgroups: 128, num_workers: 8, 0.00996336 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.00519712 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.0028336 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00265088 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.0022836 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.01938496 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.01029512 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00518696 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.00288824 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.00251904 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.0024184 seconds +coalesced reads, workgroups: 512, num_workers: 4, 0.02045848 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.0104252 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.00544816 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00294584 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.00273072 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.00257088 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.21957688 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.13403528 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.08049544 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.04249248 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.01939344 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.01121992 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.12281136 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.0711032 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.040288 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.01929832 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.01115656 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.02065632 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.06618664 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.03625648 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.01941776 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.0112864 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.020172 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.05790488 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.03469184 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.018126 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01120904 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.02006592 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.0561036 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.06538024 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.034544 seconds +blocked reads, workgroups: 128, num_workers: 8, 0.01909208 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01207088 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.02292568 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.06212224 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.04734296 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.0348528 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.01931432 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.0112232 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.02347664 seconds +blocked reads, workgroups: 256, num_workers: 64, 0.04581624 seconds +blocked reads, workgroups: 256, num_workers: 128, 0.03153152 seconds +blocked reads, workgroups: 512, num_workers: 4, 0.03445376 seconds +blocked reads, workgroups: 512, num_workers: 8, 0.01904256 seconds +blocked reads, workgroups: 512, num_workers: 16, 0.01207264 seconds +blocked reads, workgroups: 512, num_workers: 32, 0.02433416 seconds +blocked reads, workgroups: 512, num_workers: 64, 0.03053264 seconds +blocked reads, workgroups: 512, num_workers: 128, 0.02399624 seconds +configuration ('coalesced', 128, 128): 0.0022836 seconds \ No newline at end of file diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..f4a92ff1 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -8,8 +8,8 @@ __kernel void sum_coalesced(__global float* x, // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint i=get_global_id(0); i 0; s >>= 1) { + if(local_id < s) { + fast[local_id] += fast[local_id+s]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,7 +43,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +53,9 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint i=k*get_global_id(0);i 0; s >>= 1) { + if(local_id < s) { + fast[local_id] += fast[local_id+s]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..c4bfd3ac 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -31,4 +31,56 @@ median_3x3(__global __read_only float *in_values, // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. + + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + int row; + + if (idx_1D < buf_w) + for (row = 0; row < buf_h; row++) { + + // Handle boundary case, use the closest pixel + int this_x = buf_corner_x + idx_1D; + int this_y = buf_corner_y + row; + if(this_x>=w) this_x = w-1; + else if(this_x<0) this_x = 0; + if(this_y>=h) this_y = h-1; + else if(this_y<0) this_y = 0; + buffer[row * buf_w + idx_1D] = in_values[this_y * w + this_x]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // write output + if((y