From a31a24a2bd724cbfc66271a5d5ca4c9b1c54bd67 Mon Sep 17 00:00:00 2001
From: pjewell
Date: Sun, 12 Sep 2021 23:22:04 -0400
Subject: [PATCH] main submission for project 1 >w<

---
 README.md     |  92 +++++-
 src/kernel.cu | 760 ++++++++++++++++++++++++++++++++++++++++++++++----
 src/main.cpp  |   2 +-
 3 files changed, 785 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index d63a6a1..56585c3 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,91 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Paul (San) Jewell
+  * [LinkedIn](https://www.linkedin.com/in/paul-jewell-2aba7379), [work website](https://www.biociphers.org/paul-jewell-lab-member), [personal website](https://gitlab.com/inklabapp), [twitter](https://twitter.com/inklabapp), etc.
+* Tested on: Linux pop-os 5.11.0-7614-generic, i7-9750H CPU @ 2.60GHz 32GB, GeForce GTX 1650 Mobile / Max-Q 4GB

-### (TODO: Your README)
-
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+- For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+| Number of Boids | ~FPS Brute Force | ~FPS Scatter Grid | ~FPS Coherent Grid |
+|-----------------|------------------|-------------------|--------------------|
+| 1000            | 116              | 22                | --                 |
+| 2000            | 59               | 9                 | --                 |
+| 3000            | 33               | 287               | 329                |
+| 4000            | 22               | 268               | 294                |
+| 5000            | 14               | 146               | 176                |
+| 10000           | 3                | 88                | 114                |
+| 13000           | 1.6              | 61                | 89                 |
+| 15000           | 1                | 55                | 75                 |
+| 20000           | 0.7              | 41                | 58                 |
+
+In general, increasing the number of boids worsens performance in every implementation:
+each frame of motion simply requires more calculations to process, which is expected.
+
+There is an anomaly at the low boid counts in the second and third implementations,
+where an unresolved error in the thrust sort occurred; I would expect the trends above
+to continue there otherwise.
+
+![boidstats](images/boidstats.png)
+
+- For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+| Block Size | Scatter FPS (5000 boids) |
+|------------|--------------------------|
+| 1          | 65                       |
+| 2          | 85                       |
+| 4          | 103                      |
+| 8          | 122                      |
+| 16         | 138                      |
+| 32         | 164                      |
+| 64         | 164                      |
+| 128        | 161                      |
+| 256        | 155                      |
+| 512        | 153                      |
+| 1024       | < out of memory >        |
+
+![blockstats](images/blockstats.png)
+
+There seems to be an ideal block size around 32 on my system, which yields a very good
+speedup. Increasing the size beyond that appears to saturate memory, and some thrashing
+may be occurring, though it's not terrible. I'm guessing it's a similar concept to
+multithreading in general: there will always be a bandwidth bottleneck somewhere. A
+block size of 32 seems like a good rule of thumb, though I'm not sure how much this
+varies between problems and systems.
+
+- For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+From the earlier plot, we can see there are minor performance improvements using the
+coherent grid over the scattered one. This highlights the importance of keeping your
+memory accesses (from GPU main memory) as close together as you can, to maximize the
+useful data read into the cache at a time. (Because each read pulls in more than just
+one byte at a time!)
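+To make that concrete, here is a minimal sketch of the one-indirection difference in
+the neighbor loop's reads (hypothetical helper names, not code from this patch):
+
+```cpp
+// Scattered grid: one extra indirection per neighbor, so adjacent threads hit
+// pos[] at effectively random offsets and few loads coalesce.
+__device__ glm::vec3 neighborPosScattered(const int *particleArrayIndices,
+                                          const glm::vec3 *pos, int sortedSlot) {
+  return pos[particleArrayIndices[sortedSlot]];
+}
+
+// Coherent grid: pos was reshuffled into cell order first, so threads walking a
+// cell read consecutive addresses, which coalesce into far fewer transactions.
+__device__ glm::vec3 neighborPosCoherent(const glm::vec3 *posSorted, int sortedSlot) {
+  return posSorted[sortedSlot];
+}
+```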
+- Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+|                                       | FPS |
+|---------------------------------------|-----|
+| Baseline (double width) (2744 cells)  | 162 |
+| Single width (17576 cells)            | 3   |
+| Quad width (512 cells)                | 106 |
+| 8x width (64 cells)                   | 40  |
+| ~~~                                   |     |
+| Baseline with max 8 neighbor cells    | 162 |
+| Max 26 neighbor cells (27-cell check) | 116 |
+
+Changing cell width is a balancing exercise. The performance here is tied to the max
+rule distances used, so your mileage may vary. The takeaway is that your cell widths
+should generally be around double the max rule distance, as indicated, because
+otherwise you will be checking distances to extra boids which have no chance of
+falling within any rule. This problem gets worse as you make the cells larger and
+larger.
+
+Single width just doesn't make any sense here, because with only 8 cells checked you
+will miss most boid interactions.
+
+Checking all neighbor cells in all directions is a bit slower. While those
+farther-away cells do contain additional boids to check distances against, based on
+our cell sizing a boid in one of the far cells can never pass the distance threshold
+for any rule, so the extra checks always equate to wasted work.
\ No newline at end of file
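A rough way to see the balancing act, assuming this project's max rule distance of 8
(a standalone sketch, not project code):

```cpp
#include <cmath>
#include <cstdio>

// Worst-case number of cells a neighborhood of radius R can touch along one
// axis, given cell width w.
static int cellsPerAxis(float R, float w) { return (int)std::ceil(2.0f * R / w) + 1; }

int main() {
    const float R = 8.0f;  // max rule distance in this project
    for (float w : {8.0f, 16.0f, 32.0f, 64.0f}) {  // single, double, quad, 8x width
        int n = cellsPerAxis(R, w);
        // each candidate cell holds boids in proportion to its volume w^3
        std::printf("width %4.0f: up to %d^3 = %2d cells, candidate volume ~%.0f\n",
                    w, n, n * n * n, n * n * n * w * w * w);
    }
    return 0;
}
```

Width below 2R forces a 3-cell span per axis (the 27-cell case), while widening the
cells shrinks the cell count but grows the candidate volume, and with it the number
of pointless distance checks.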
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..a227a52 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -6,6 +6,7 @@
 #include "utilityCore.hpp"
 #include "kernel.h"
 
+
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
 #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
@@ -41,12 +42,15 @@ void checkCUDAError(const char *msg, int line = -1) {
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
-#define rule1Distance 5.0f
+//#define rule1Distance 10.0f
+//#define rule2Distance 3.0f
+//#define rule3Distance 5.0f
+#define rule1Distance 8.0f
 #define rule2Distance 3.0f
 #define rule3Distance 5.0f
 
 #define rule1Scale 0.01f
-#define rule2Scale 0.1f
+#define rule2Scale 0.2f
 #define rule3Scale 0.1f
 
 #define maxSpeed 1.0f
@@ -86,6 +90,10 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos2;  // cell-ordered (post-sort) copy of dev_pos
+glm::vec3 *dev_vel3;  // cell-ordered velocity read buffer
+glm::vec3 *dev_vel4;  // cell-ordered velocity write buffer
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
 int gridSideCount;
@@ -94,6 +102,7 @@ float gridCellWidth;
 float gridInverseCellWidth;
 glm::vec3 gridMinimum;
 
+
 /******************
 * initSimulation *
 ******************/
@@ -137,6 +146,7 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo
 * Initialize memory, update some globals
 */
 void Boids::initSimulation(int N) {
   numObjects = N;
   dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
@@ -156,12 +166,18 @@ void Boids::initSimulation(int N) {
     dev_pos, scene_scale);
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
+
   // LOOK-2.1 computing grid params
   gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
   gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+
+  std::cout << "GRID CELL COUNT " << gridCellCount << '\n';
+
   gridInverseCellWidth = 1.0f / gridCellWidth;
   float halfGridWidth = gridCellWidth * halfSideCount;
   gridMinimum.x -= halfGridWidth;
   gridMinimum.y -= halfGridWidth;
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  // These two arrays are indexed by grid cell, so they need gridCellCount
+  // entries; sizing them by N overflows whenever gridCellCount > N, which is
+  // exactly the low-boid-count configuration flagged as anomalous above.
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!");
+  cudaMalloc((void**)&dev_vel3, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel3 failed!");
+  cudaMalloc((void**)&dev_vel4, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel4 failed!");
+
   cudaDeviceSynchronize();
 }
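As a sanity check on these formulas, assuming the starter code's scene_scale of 100,
they reproduce the 2744-cell baseline quoted in the README:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const float scene_scale   = 100.0f;                               // starter-code default
    const float gridCellWidth = 2.0f * std::max({8.0f, 3.0f, 5.0f}); // 2 * rule1Distance = 16
    const int halfSideCount   = (int)(scene_scale / gridCellWidth) + 1; // 7
    const int gridSideCount   = 2 * halfSideCount;                      // 14
    std::printf("%d cells\n", gridSideCount * gridSideCount * gridSideCount); // 2744
    return 0;
}
```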
@@ -229,11 +261,64 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * Compute the new velocity on the body with index `iSelf` due to the `N` boids
 * in the `pos` and `vel` arrays.
 */
+
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+  glm::vec3 new_velocity = glm::vec3(0, 0, 0);
+
+  glm::vec3 center = glm::vec3(0, 0, 0);
+  glm::vec3 separate = glm::vec3(0, 0, 0);
+  glm::vec3 cohesion = glm::vec3(0, 0, 0);
+  int numNeighborsR1 = 0;
+  int numNeighborsR2 = 0;
+  int numNeighborsR3 = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) continue;
+
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule1Distance) {
+      numNeighborsR1++;
+      center += pos[i];
+    }
+    if (distance < rule2Distance) {
+      numNeighborsR2++;
+      separate -= (pos[i] - pos[iSelf]);
+    }
+    if (distance < rule3Distance) {
+      numNeighborsR3++;
+      cohesion += vel[i];
+    }
+  }
+
+  if (numNeighborsR1 > 0) {
+    center /= numNeighborsR1;
+    new_velocity += (center - pos[iSelf]) * rule1Scale;
+  }
+  if (numNeighborsR2 > 0) {
+    new_velocity += separate * rule2Scale;
+  }
+  if (numNeighborsR3 > 0) {
+    //cohesion /= numNeighborsR3; // ?
+    //new_velocity += (cohesion - vel[iSelf]) * rule3Scale;
+    new_velocity += cohesion * rule3Scale;
+  }
+
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  return new_velocity;
 }
 
 /**
@@ -245,6 +330,19 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  // Base the update on vel1, last frame's velocity; vel2 is only the write
+  // target (reading vel2 here would mix in a velocity from two frames back).
+  glm::vec3 vel2_ = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+  float speed = glm::length(vel2_);
+  if (speed > maxSpeed) {
+    vel2_ = vel2_ / speed * maxSpeed;
+  }
+
+  vel2[index] = vel2_;
 }
 
@@ -284,11 +382,49 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
+  glm::vec3 *pos, int *particleArrayIndices, int *particleGridIndices) {
   // TODO-2.1
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
+
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {  // guard the straggler threads in the final block
+    return;
+  }
+
+  glm::ivec3 _3d_grid_idx = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+  int _1d_grid_idx = gridIndex3Dto1D(_3d_grid_idx.x, _3d_grid_idx.y, _3d_grid_idx.z, gridResolution);
+
+  particleGridIndices[index] = _1d_grid_idx;
+  particleArrayIndices[index] = index;
+}
+
+__global__ void kernComputeIndicesCoherent(int N, int gridResolution,
+  glm::vec3 gridMin, float inverseCellWidth,
+  glm::vec3 *pos, int *particleGridIndices) {
+
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  glm::ivec3 _3d_grid_idx = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+  int _1d_grid_idx = gridIndex3Dto1D(_3d_grid_idx.x, _3d_grid_idx.y, _3d_grid_idx.z, gridResolution);
+
+  particleGridIndices[index] = _1d_grid_idx;
+}
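A worked example of this labeling, using the parameters above (cell width 16,
gridMinimum = (-112, -112, -112), gridSideCount = 14) and a made-up boid position:

```cpp
#include <cassert>

int main() {
    // boid at p = (-92, 4, 30); gridMinimum is -112 on every axis, cell width 16
    int x = (int)((-92.0f + 112.0f) / 16.0f);  // floor(1.25)  = 1
    int y = (int)((  4.0f + 112.0f) / 16.0f);  // floor(7.25)  = 7
    int z = (int)(( 30.0f + 112.0f) / 16.0f);  // floor(8.875) = 8
    // gridIndex3Dto1D with gridResolution = 14: x + y*14 + z*14*14
    assert(x + y * 14 + z * 14 * 14 == 1667);
    return 0;
}
```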
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +442,21 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   int *gridCellStartIndices, int *gridCellEndIndices) {
   // TODO-2.1
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  // These checks must be independent ifs, not an else-if chain: index 0 also
+  // needs the transition test below in case boid 0 is alone in its cell.
+  if (index == 0) {
+    gridCellStartIndices[particleGridIndices[0]] = 0;
+  }
+  if (index == N - 1) {
+    gridCellEndIndices[particleGridIndices[index]] = index;
+    return;
+  }
+  if (particleGridIndices[index] != particleGridIndices[index + 1]) {
+    gridCellEndIndices[particleGridIndices[index]] = index;
+    gridCellStartIndices[particleGridIndices[index + 1]] = index + 1;
+  }
 }
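On a tiny made-up sorted array, the kernel produces the following; note that the end
index is inclusive, which is why the neighbor-search loops below iterate while
`_i <= particleIdxEnd`:

```cpp
// sorted particleGridIndices: [4, 4, 4, 6, 6, 9]   (N = 6)
//   gridCellStartIndices[4] = 0   gridCellEndIndices[4] = 2
//   gridCellStartIndices[6] = 3   gridCellEndIndices[6] = 4
//   gridCellStartIndices[9] = 5   gridCellEndIndices[9] = 5
// every other cell keeps the -1 written by kernResetIntBuffer and is skipped
```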
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +473,242 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   int *particleArrayIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  // get grid cell
+  glm::ivec3 _3d_grid_idx = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+
+  int iSelf = index;  // pos/vel are unsorted here, so the thread index IS the boid index
+
+  glm::vec3 center = glm::vec3(0, 0, 0);
+  glm::vec3 separate = glm::vec3(0, 0, 0);
+  glm::vec3 cohesion = glm::vec3(0, 0, 0);
+  int numNeighborsR1 = 0;
+  int numNeighborsR2 = 0;
+  int numNeighborsR3 = 0;
+
+  // Visit this cell plus the 7 nearer neighbors: each set bit of aj_cell_i
+  // offsets one axis toward whichever adjacent cell this boid's half faces.
+  for (int aj_cell_i = 7; aj_cell_i >= 0; aj_cell_i--) {
+    glm::ivec3 _chk_cell = _3d_grid_idx;
+    if (aj_cell_i & 1) {
+      _chk_cell.z = round((pos[index].z - gridMin.z) / cellWidth) == _3d_grid_idx.z ? _3d_grid_idx.z - 1 : _3d_grid_idx.z + 1;
+    }
+    if (aj_cell_i & 2) {
+      _chk_cell.y = round((pos[index].y - gridMin.y) / cellWidth) == _3d_grid_idx.y ? _3d_grid_idx.y - 1 : _3d_grid_idx.y + 1;
+    }
+    if (aj_cell_i & 4) {
+      _chk_cell.x = round((pos[index].x - gridMin.x) / cellWidth) == _3d_grid_idx.x ? _3d_grid_idx.x - 1 : _3d_grid_idx.x + 1;
+    }
+    if (_chk_cell.x < 0 || _chk_cell.y < 0 || _chk_cell.z < 0 ||
+        _chk_cell.x >= gridResolution || _chk_cell.y >= gridResolution || _chk_cell.z >= gridResolution) {
+      continue;
+    }
+
+    int _1d_grid_idx = gridIndex3Dto1D(_chk_cell.x, _chk_cell.y, _chk_cell.z, gridResolution);
+
+    int particleIdxStart = gridCellStartIndices[_1d_grid_idx];
+    if (particleIdxStart == -1) {  // empty cell
+      continue;
+    }
+    int particleIdxEnd = gridCellEndIndices[_1d_grid_idx];
+
+    for (int _i = particleIdxStart; _i <= particleIdxEnd; _i++) {  // end index is inclusive
+      int other = particleArrayIndices[_i];  // sorted slot -> actual boid index
+      if (other == iSelf) continue;
+
+      float distance = glm::distance(pos[other], pos[iSelf]);
+      if (distance < rule1Distance) {
+        numNeighborsR1++;
+        center += pos[other];
+      }
+      if (distance < rule2Distance) {
+        numNeighborsR2++;
+        separate -= (pos[other] - pos[iSelf]);
+      }
+      if (distance < rule3Distance) {
+        numNeighborsR3++;
+        cohesion += vel1[other];
+      }
+    }
+  }
+
+  // Apply the rules once, over the neighbors gathered from all cells.
+  glm::vec3 vel2_ = vel1[iSelf];
+  if (numNeighborsR1 > 0) {
+    center /= numNeighborsR1;
+    vel2_ += (center - pos[iSelf]) * rule1Scale;
+  }
+  if (numNeighborsR2 > 0) {
+    vel2_ += separate * rule2Scale;
+  }
+  if (numNeighborsR3 > 0) {
+    //cohesion /= numNeighborsR3; // ?
+    vel2_ += cohesion * rule3Scale;
+  }
+
+  float speed = glm::length(vel2_);
+  if (speed > maxSpeed) {
+    vel2_ = vel2_ / speed * maxSpeed;
+  }
+
+  vel2[iSelf] = vel2_;
+}
+
+__global__ void kernUpdateVelNeighborSearchScatteredAllNeighbors(
+  int N, int gridResolution, glm::vec3 gridMin,
+  float inverseCellWidth, float cellWidth,
+  int *gridCellStartIndices, int *gridCellEndIndices,
+  int *particleArrayIndices,
+  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  // Used as part of the performance analysis: checks all 27 cells
+  // (self plus 26 neighbors) instead of the nearest 8.
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  // get grid cell
+  glm::ivec3 _3d_grid_idx = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+
+  int iSelf = index;
+
+  glm::vec3 center = glm::vec3(0, 0, 0);
+  glm::vec3 separate = glm::vec3(0, 0, 0);
+  glm::vec3 cohesion = glm::vec3(0, 0, 0);
+  int numNeighborsR1 = 0;
+  int numNeighborsR2 = 0;
+  int numNeighborsR3 = 0;
+
+  for (int dz = -1; dz <= 1; dz++) {
+    for (int dy = -1; dy <= 1; dy++) {
+      for (int dx = -1; dx <= 1; dx++) {
+        glm::ivec3 _chk_cell = _3d_grid_idx + glm::ivec3(dx, dy, dz);
+
+        if (_chk_cell.x < 0 || _chk_cell.y < 0 || _chk_cell.z < 0 ||
+            _chk_cell.x >= gridResolution || _chk_cell.y >= gridResolution || _chk_cell.z >= gridResolution) {
+          continue;
+        }
+
+        int _1d_grid_idx = gridIndex3Dto1D(_chk_cell.x, _chk_cell.y, _chk_cell.z, gridResolution);
+
+        int particleIdxStart = gridCellStartIndices[_1d_grid_idx];
+        if (particleIdxStart == -1) {
+          continue;
+        }
+        int particleIdxEnd = gridCellEndIndices[_1d_grid_idx];
+
+        for (int _i = particleIdxStart; _i <= particleIdxEnd; _i++) {
+          int other = particleArrayIndices[_i];
+          if (other == iSelf) continue;
+
+          float distance = glm::distance(pos[other], pos[iSelf]);
+          if (distance < rule1Distance) {
+            numNeighborsR1++;
+            center += pos[other];
+          }
+          if (distance < rule2Distance) {
+            numNeighborsR2++;
+            separate -= (pos[other] - pos[iSelf]);
+          }
+          if (distance < rule3Distance) {
+            numNeighborsR3++;
+            cohesion += vel1[other];
+          }
+        }
+      }
+    }
+  }
+
+  glm::vec3 vel2_ = vel1[iSelf];
+  if (numNeighborsR1 > 0) {
+    center /= numNeighborsR1;
+    vel2_ += (center - pos[iSelf]) * rule1Scale;
+  }
+  if (numNeighborsR2 > 0) {
+    vel2_ += separate * rule2Scale;
+  }
+  if (numNeighborsR3 > 0) {
+    //cohesion /= numNeighborsR3; // ?
+    vel2_ += cohesion * rule3Scale;
+  }
+
+  float speed = glm::length(vel2_);
+  if (speed > maxSpeed) {
+    vel2_ = vel2_ / speed * maxSpeed;
+  }
+
+  vel2[iSelf] = vel2_;
+}
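The `round()` comparison that selects those 8 cells is easiest to see in one
dimension (hypothetical helper, same logic as the kernels above):

```cpp
// With cell width w, the fractional part of (p - min) / w says which half of
// its cell a boid occupies. round() equals the cell index exactly when the
// fraction is below 0.5, i.e. the boid sits in the lower half, so its
// neighborhood can only spill into the cell below; otherwise only the cell above.
__device__ int nearerNeighborCell(float p, float min, float w, int cell) {
    return (roundf((p - min) / w) == (float)cell) ? cell - 1 : cell + 1;
}
```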
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +728,127 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  // get grid cell
+  glm::ivec3 _3d_grid_idx = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+
+  int iSelf = index;  // pos/vel1 arrive already gathered into cell order
+
+  glm::vec3 center = glm::vec3(0, 0, 0);
+  glm::vec3 separate = glm::vec3(0, 0, 0);
+  glm::vec3 cohesion = glm::vec3(0, 0, 0);
+  int numNeighborsR1 = 0;
+  int numNeighborsR2 = 0;
+  int numNeighborsR3 = 0;
+
+  // Same 8-cell walk as the scattered version.
+  for (int aj_cell_i = 7; aj_cell_i >= 0; aj_cell_i--) {
+    glm::ivec3 _chk_cell = _3d_grid_idx;
+    if (aj_cell_i & 1) {
+      _chk_cell.z = round((pos[index].z - gridMin.z) / cellWidth) == _3d_grid_idx.z ? _3d_grid_idx.z - 1 : _3d_grid_idx.z + 1;
+    }
+    if (aj_cell_i & 2) {
+      _chk_cell.y = round((pos[index].y - gridMin.y) / cellWidth) == _3d_grid_idx.y ? _3d_grid_idx.y - 1 : _3d_grid_idx.y + 1;
+    }
+    if (aj_cell_i & 4) {
+      _chk_cell.x = round((pos[index].x - gridMin.x) / cellWidth) == _3d_grid_idx.x ? _3d_grid_idx.x - 1 : _3d_grid_idx.x + 1;
+    }
+    if (_chk_cell.x < 0 || _chk_cell.y < 0 || _chk_cell.z < 0 ||
+        _chk_cell.x >= gridResolution || _chk_cell.y >= gridResolution || _chk_cell.z >= gridResolution) {
+      continue;
+    }
+
+    int _1d_grid_idx = gridIndex3Dto1D(_chk_cell.x, _chk_cell.y, _chk_cell.z, gridResolution);
+
+    int particleIdxStart = gridCellStartIndices[_1d_grid_idx];
+    if (particleIdxStart == -1) {  // empty cell
+      continue;
+    }
+    int particleIdxEnd = gridCellEndIndices[_1d_grid_idx];
+
+    // One less level of indirection than the scattered version: cell members
+    // are contiguous in pos/vel1, so _i is itself the boid's index.
+    for (int _i = particleIdxStart; _i <= particleIdxEnd; _i++) {
+      if (_i == iSelf) continue;
+
+      float distance = glm::distance(pos[_i], pos[iSelf]);
+      if (distance < rule1Distance) {
+        numNeighborsR1++;
+        center += pos[_i];
+      }
+      if (distance < rule2Distance) {
+        numNeighborsR2++;
+        separate -= (pos[_i] - pos[iSelf]);
+      }
+      if (distance < rule3Distance) {
+        numNeighborsR3++;
+        cohesion += vel1[_i];
+      }
+    }
+  }
+
+  glm::vec3 vel2_ = vel1[iSelf];
+  if (numNeighborsR1 > 0) {
+    center /= numNeighborsR1;
+    vel2_ += (center - pos[iSelf]) * rule1Scale;
+  }
+  if (numNeighborsR2 > 0) {
+    vel2_ += separate * rule2Scale;
+  }
+  if (numNeighborsR3 > 0) {
+    //cohesion /= numNeighborsR3; // ?
+    vel2_ += cohesion * rule3Scale;
+  }
+
+  float speed = glm::length(vel2_);
+  if (speed > maxSpeed) {
+    vel2_ = vel2_ / speed * maxSpeed;
+  }
+
+  vel2[iSelf] = vel2_;
 }
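The swap at the end of each step function below also answers the "why NOT vel1?"
question the kernels keep asking; a sketch of the hazard being avoided:

```cpp
// If a kernel updated vel1 in place, thread A could read neighbor B's velocity
// before or after B writes it, depending on scheduling: a data race whose
// outcome varies frame to frame. Writing into vel2 and swapping pointers on
// the host afterwards guarantees every read sees a "previous frame" value.
glm::vec3* tmp = dev_vel1;  // the same three-line swap this patch uses
dev_vel1 = dev_vel2;
dev_vel2 = tmp;
```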
 /**
@@ -349,6 +857,18 @@ void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  // ping-pong: the freshly written velocities become next frame's read buffer
+  glm::vec3* tmp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tmp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +884,47 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+  // reset the per-cell bookkeeping; these buffers hold gridCellCount entries
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(
+    numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_particleArrayIndices,
+    dev_pos, dev_vel1, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3* tmp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tmp;
+}
+
+__global__ void kernGather(int N, int *intBuffer, glm::vec3 *pos_s, glm::vec3 *pos_d,
+  glm::vec3 *vel_s, glm::vec3 *vel_d,
+  glm::vec3 *vel2_s, glm::vec3 *vel2_d) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    // reshuffle the positions and both velocity buffers into cell (sorted) order
+    pos_d[index] = pos_s[intBuffer[index]];
+    vel_d[index] = vel_s[intBuffer[index]];
+    vel2_d[index] = vel2_s[intBuffer[index]];
+  }
+}
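On made-up data, kernGather performs one gather per buffer:

```cpp
// intBuffer (sorted boid indices): [2, 0, 1]
// pos_s (unsorted)               : [p0, p1, p2]
// pos_d (gathered, cell order)   : [p2, p0, p1]
// i.e. pos_d[i] = pos_s[intBuffer[i]], and likewise for vel_d and vel2_d.
```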
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,76 +943,151 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+  glm::vec3* tmp;
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+  // buffer roles:
+  //   vel1: pre-sort read    vel2: pre-sort write
+  //   vel3: post-sort read   vel4: post-sort write
+  //   pos:  pre-sort r/w     pos2: post-sort r/w
+  kernGather<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices,
+    dev_pos, dev_pos2,
+    dev_vel1, dev_vel3,
+    dev_vel2, dev_vel4);
+
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(
+    numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_pos2, dev_vel3, dev_vel4);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos2, dev_vel4);
+
+  // rotate buffers so dev_pos / dev_vel1 hold the updated state again
+  // post to pre
+  tmp = dev_pos;
+  dev_pos = dev_pos2;
+  dev_pos2 = tmp;
+
+  // read to write
+  tmp = dev_vel4;
+  dev_vel4 = dev_vel3;
+  dev_vel3 = tmp;
+
+  // post to pre
+  tmp = dev_vel2;
+  dev_vel2 = dev_vel4;
+  dev_vel4 = tmp;
+
+  // post to pre
+  tmp = dev_vel1;
+  dev_vel1 = dev_vel3;
+  dev_vel3 = tmp;
+}
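Assuming copyBoidsToVBO still reads dev_pos and dev_vel1 as in the starter code, the
net effect of that rotation is:

```cpp
// after the four swaps:
//   dev_pos  <- updated, cell-ordered positions  (was dev_pos2)
//   dev_vel1 <- updated, cell-ordered velocities (the vel3/vel4 swap moved the
//               freshly written values into dev_vel3 first)
// so the next frame, and the renderer, treat them exactly like the unsorted
// buffers in the other implementations; the re-sort at the top of each frame
// makes the ordering irrelevant.
```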
-}
-
-void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
-}
-
-void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  std::unique_ptr<int[]> intKeys{ new int[N] };
-  std::unique_ptr<int[]> intValues{ new int[N] };
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << " key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << " key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
-}
+
+void Boids::endSimulation() {
+  cudaFree(dev_vel1);
+  cudaFree(dev_vel2);
+  cudaFree(dev_pos);
+
+  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_pos2);
+  cudaFree(dev_vel3);
+  cudaFree(dev_vel4);
+}
+
+//void Boids::unitTest() {
+//  // LOOK-1.2 Feel free to write additional tests here.
+//
+//  // test unstable sort
+//  int *dev_intKeys;
+//  int *dev_intValues;
+//  int N = 10;
+//
+//  std::unique_ptr<int[]> intKeys{ new int[N] };
+//  std::unique_ptr<int[]> intValues{ new int[N] };
+//
+//  intKeys[0] = 0; intValues[0] = 0;
+//  intKeys[1] = 1; intValues[1] = 1;
+//  intKeys[2] = 0; intValues[2] = 2;
+//  intKeys[3] = 3; intValues[3] = 3;
+//  intKeys[4] = 0; intValues[4] = 4;
+//  intKeys[5] = 2; intValues[5] = 5;
+//  intKeys[6] = 2; intValues[6] = 6;
+//  intKeys[7] = 0; intValues[7] = 7;
+//  intKeys[8] = 5; intValues[8] = 8;
+//  intKeys[9] = 6; intValues[9] = 9;
+//
+//  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+//  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+//
+//  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+//  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+//
+//  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+//
+//  std::cout << "before unstable sort: " << std::endl;
+//  for (int i = 0; i < N; i++) {
+//    std::cout << " key: " << intKeys[i];
+//    std::cout << " value: " << intValues[i] << std::endl;
+//  }
+//
+//  // How to copy data to the GPU
+//  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+//  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+//
+//  // Wrap device vectors in thrust iterators for use with thrust.
+//  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+//  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+//  // LOOK-2.1 Example for using thrust::sort_by_key
+//  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+//
+//  // How to copy data back to the CPU side from the GPU
+//  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+//  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+//  checkCUDAErrorWithLine("memcpy back failed!");
+//
+//  std::cout << "after unstable sort: " << std::endl;
+//  for (int i = 0; i < N; i++) {
+//    std::cout << " key: " << intKeys[i];
+//    std::cout << " value: " << intValues[i] << std::endl;
+//  }
+//
+//  // cleanup
+//  cudaFree(dev_intKeys);
+//  cudaFree(dev_intValues);
+//  checkCUDAErrorWithLine("cudaFree failed!");
+//  return;
+//}
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..303583f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -217,7 +217,7 @@ void initShaders(GLuint * program) {
   double timebase = 0;
   int frame = 0;
 
-  Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
+  //Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
   // your CUDA development setup is ready to go.
 
   while (!glfwWindowShouldClose(window)) {