diff --git a/README.md b/README.md
index d63a6a1..d654a21 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,55 @@
**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Matt Elser
+ * [LinkedIn](https://www.linkedin.com/in/matt-elser-97b8151ba/)
+* Tested on: Ubuntu 20.04, i3-10100F @ 3.6GHz 16GB, GeForce 1660 Super 6GB
-### (TODO: Your README)
+### Boids Flocking Implementation
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+![Boids flocking animation](images/boidsAnim.gif)
+
+![Boids flocking still frame](images/boidsStill.png)
+
+### Performance Analysis
+
+![FPS comparison of the three algorithms](images/comparisonPlot.png)
+
+Data for this plot was gathered from repeated runs of the three algorithms with visualization turned off.
+FPS was sampled repeatedly at 5 seconds into each run and rounded to the nearest 5 FPS (with the
+exception of the notably low-FPS runs at the maximum boid count of each algorithm).
+
+Observations of note based on the above plot:
+#### 1. Brute force is the least performant by a large margin, and scales poorly.
+ Run serially, the brute-force algorithm has quadratic complexity, since every boid must check every
+ other boid, and parallelization can only improve this so much.
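+ For a concrete sense of scale: at 10,000 boids, brute force performs on the order of
+ 10,000² = 10⁸ pairwise distance checks per step, while the uniform grid restricts each boid's
+ search to the handful of cells around it.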
+#### 2. Coherent does not outperform the scattered grid, indicating a flaw in the implementation.
+ The entire purpose of the coherent neighbor search within the uniform grid is to arrange as much
+data as possible into contiguous memory for efficient I/O on the SMs, avoiding as much as possible
+the need to transfer memory to/from slower sources; a sketch of the intended reshuffle follows.
+Fixing this in time was derailed by a pernicious bug that prevented the algorithm from working
+entirely, so it has been noted in "Areas for Improvement" below.
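+
+A minimal sketch of that reshuffle step, assuming a gather kernel and scratch buffers
+(`kernReshuffle`, `shuffledPos`, and `shuffledVel` are hypothetical names, not the ones in this
+repo):
+
+```cuda
+// Gather pos/vel into cell-sorted order so each cell's boids are contiguous.
+// sortedToOriginal[i] holds the pre-sort index of the boid now at slot i
+// (i.e. dev_particleArrayIndices after the thrust::sort_by_key).
+__global__ void kernReshuffle(int N, const int *sortedToOriginal,
+                              const glm::vec3 *pos, const glm::vec3 *vel,
+                              glm::vec3 *shuffledPos, glm::vec3 *shuffledVel) {
+  int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (i >= N) {
+    return;
+  }
+  shuffledPos[i] = pos[sortedToOriginal[i]];
+  shuffledVel[i] = vel[sortedToOriginal[i]];
+}
+```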
+#### 3. FPS decreases over time for uniform grid algorithms
+ As the simulation runs, both uniform grid algorithms get slower. This makes sense: as time goes
+ on, boids gather into denser flocks, so more neighbors fall within each boid's search distance.
+#### 4. Counterintuitively, there is a repeatable increase in performance from 80,000 boids to 160,000
+ This could suggest a possibility for optimizing parameters such as block size, as explored below.
+
+### Block Size Comparison
+
+![FPS at varying block sizes, scattered uniform grid](images/blockSizeTable.png)
+
+Using only the scattered uniform grid algorithm, timings were captured the same way as for the
+algorithm comparison chart: FPS was sampled repeatedly at 5 seconds from simulation start and rounded
+to the nearest 5 FPS. All block sizes tested show the same behavior as observation #4: simulation
+speed increases from 80,000 boids to 160,000. Further digging would be aided by finer-grained timing,
+such as measuring how much time is spent in the sorting stage and whether block size affects it; one
+possible approach is sketched below.
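+
+A sketch of that finer-grained timing using CUDA events (the event variables are new names here;
+the sort call mirrors the one in `stepSimulationScatteredGrid`):
+
+```cuda
+cudaEvent_t sortStart, sortStop;
+cudaEventCreate(&sortStart);
+cudaEventCreate(&sortStop);
+
+cudaEventRecord(sortStart);
+thrust::sort_by_key(dev_thrust_particleGridIndices,
+                    dev_thrust_particleGridIndices + numObjects,
+                    dev_thrust_particleArrayIndices);
+cudaEventRecord(sortStop);
+cudaEventSynchronize(sortStop);
+
+float sortMs = 0.0f;
+cudaEventElapsedTime(&sortMs, sortStart, sortStop);
+// sortMs now holds this step's sort time in milliseconds.
+cudaEventDestroy(sortStart);
+cudaEventDestroy(sortStop);
+```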
+
+### Areas for Improvement
+- 🔲 The code is not as tidy as I would like.
+- 🔲 As noted in the performance analysis, the coherent grid implementation runs notably slower
+than the scattered grid, which implies that memory is not being read contiguously.
+- 🔲 Finer-grained timing, e.g. how long sorting takes.
+- ✅ In the course of writing the performance analysis, additional shuffles were removed from the
+coherent grid implementation. Initially, the position/velocity arrays were kept in alignment with
+the neighboring grid cells by a shuffling step, then unshuffled for assignment and continuity
+between timesteps. This required multiple `copy` and `sortByKey` calls. Continuity and assignment
+do not require consistency between timesteps, though, only consistency with each other (see the
+sketch below). Removing these shuffles yielded the performance described above, a ~10 FPS (~33%)
+speed increase at 50,000 boids over the previous version.
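+
+Since `pos` and `vel1` only need to stay consistent with each other, the two remaining per-array
+sorts could in principle collapse into one keyed sort over a zipped range. A sketch, not the code
+in this repo (`dev_thrust_keysCopy` is a hypothetical scratch copy of the grid-index keys, needed
+because `sort_by_key` reorders its key array):
+
+```cuda
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+// One sort applies the same permutation to pos and vel1, keeping them
+// consistent with each other even though thrust's sort is not stable.
+thrust::sort_by_key(
+    dev_thrust_keysCopy, dev_thrust_keysCopy + numObjects,
+    thrust::make_zip_iterator(thrust::make_tuple(
+        thrust::device_ptr<glm::vec3>(dev_pos),
+        thrust::device_ptr<glm::vec3>(dev_vel1))));
+```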
\ No newline at end of file
diff --git a/images/blockSizeTable.png b/images/blockSizeTable.png
new file mode 100644
index 0000000..cea21a4
Binary files /dev/null and b/images/blockSizeTable.png differ
diff --git a/images/boidsAnim.gif b/images/boidsAnim.gif
new file mode 100644
index 0000000..e9f321c
Binary files /dev/null and b/images/boidsAnim.gif differ
diff --git a/images/boidsStill.png b/images/boidsStill.png
new file mode 100644
index 0000000..1c4ce27
Binary files /dev/null and b/images/boidsStill.png differ
diff --git a/images/comparisonPlot.png b/images/comparisonPlot.png
new file mode 100644
index 0000000..2494478
Binary files /dev/null and b/images/comparisonPlot.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..6d9e8c9 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -85,6 +85,10 @@ int *dev_gridCellEndIndices; // to this cell?
// TODO-2.3 - consider what additional buffers you might need to reshuffle
// the position and velocity data to be coherent within cells.
+int *dev_shuffledArrayIndices1;
+int *dev_shuffledArrayIndices2;
+thrust::device_ptr<int> dev_thrust_shuffledArrayIndices1;
+thrust::device_ptr<int> dev_thrust_shuffledArrayIndices2;
// LOOK-2.1 - Grid parameters based on simulation parameters.
// These are automatically computed for you in Boids::initSimulation
@@ -169,6 +173,15 @@ void Boids::initSimulation(int N) {
gridMinimum.z -= halfGridWidth;
// TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+ cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+ cudaMalloc((void**)&dev_shuffledArrayIndices1, N * sizeof(int));
+ cudaMalloc((void**)&dev_shuffledArrayIndices2, N * sizeof(int));
+
+ cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+ cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+
+
cudaDeviceSynchronize();
}
@@ -230,10 +243,60 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
* in the `pos` and `vel` arrays.
*/
__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
- // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
- // Rule 2: boids try to stay a distance d away from each other
- // Rule 3: boids try to match the speed of surrounding boids
- return glm::vec3(0.0f, 0.0f, 0.0f);
+
+ // rule1 vars
+ int rule1Neighbors = 0;
+ glm::vec3 CM = glm::vec3(0);
+ glm::vec3 rule1Vel;
+ // rule2 vars
+ glm::vec3 c = glm::vec3(0);
+ glm::vec3 rule2Vel;
+ // rule3 vars
+ glm::vec3 perceivedVel = glm::vec3(0);
+ glm::vec3 rule3Vel;
+
+  // spatial vector between this boid and the boid being checked
+ glm::vec3 selfToThem;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+    selfToThem = pos[i] - pos[iSelf];
+    float dist = glm::length(selfToThem);
+    // Rule 1: boids fly towards their local perceived center of mass
+    if (dist < rule1Distance) {
+      CM += pos[i];
+      rule1Neighbors++;
+    }
+    // Rule 2: boids try to stay a distance d away from each other
+    if (dist < rule2Distance) {
+      c -= selfToThem;
+    }
+    // Rule 3: boids try to match the velocity of surrounding boids
+    if (dist < rule3Distance) {
+      perceivedVel += vel[i];
+    }
+  }
+  if (rule1Neighbors > 0){
+ CM /= (float)rule1Neighbors; // divide by the number of boids
+ rule1Vel = (CM - pos[iSelf]) * rule1Scale;
+ }
+ else{
+ rule1Vel = glm::vec3(0.0f);
+ }
+ rule2Vel = c * rule2Scale;
+ rule3Vel = (perceivedVel - vel[iSelf]) / 8.0f;// * rule3Scale;
+
+ return vel[iSelf] + rule1Vel + rule2Vel + rule3Vel;
}
/**
@@ -242,9 +305,19 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
*/
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
glm::vec3 *vel1, glm::vec3 *vel2) {
+ // get index
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= N) {
+ return;
+ }
// Compute a new velocity based on pos and vel1
+ glm::vec3 newVel = computeVelocityChange(N, index, pos, vel1);
+
+  // Clamp the speed
+  if (glm::length(newVel) > maxSpeed) {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+
// Record the new velocity into vel2. Question: why NOT vel1?
+ vel2[index] = newVel;
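+  // Writing to vel2 rather than vel1 matters: other threads are still
+  // reading vel1 for their own updates this step, so updating it in place
+  // would introduce a read/write race.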
}
/**
@@ -285,10 +358,21 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
__global__ void kernComputeIndices(int N, int gridResolution,
glm::vec3 gridMin, float inverseCellWidth,
glm::vec3 *pos, int *indices, int *gridIndices) {
+
+ int threadi = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (threadi < N){
+ glm::vec3 gridPos = (pos[threadi] - gridMin) * inverseCellWidth;
+
// TODO-2.1
// - Label each boid with the index of its grid cell.
+ gridIndices[threadi] = gridIndex3Dto1D((int)gridPos.x,
+ (int)gridPos.y,
+ (int)gridPos.z,
+ gridResolution);
// - Set up a parallel array of integer indices as pointers to the actual
// boid data in pos and vel1/vel2
+ indices[threadi] = threadi;
+ }
}
// LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +390,17 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
// Identify the start point of each cell in the gridIndices array.
// This is basically a parallel unrolling of a loop that goes
// "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  if (index == 0){
+    gridCellStartIndices[particleGridIndices[0]] = 0;
+  }
+  else if (particleGridIndices[index-1] != particleGridIndices[index]){
+    gridCellStartIndices[particleGridIndices[index]] = index;
+    gridCellEndIndices[particleGridIndices[index-1]] = index - 1;
+  }
+  if (index == N - 1) {
+    // the last boid closes out the final cell
+    gridCellEndIndices[particleGridIndices[index]] = index;
+  }
+
}
__global__ void kernUpdateVelNeighborSearchScattered(
@@ -314,14 +409,103 @@ __global__ void kernUpdateVelNeighborSearchScattered(
int *gridCellStartIndices, int *gridCellEndIndices,
int *particleArrayIndices,
glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+
+ int iSelf = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (iSelf >= N){
+ return;
+ }
+
// TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
// the number of boids that need to be checked.
// - Identify the grid cell that this particle is in
// - Identify which cells may contain neighbors. This isn't always 8.
- // - For each cell, read the start/end indices in the boid pointer array.
- // - Access each boid in the cell and compute velocity change from
- // the boids rules, if this boid is within the neighborhood distance.
- // - Clamp the speed change before putting the new speed in vel2
+ // --- find neighbors ---
+ glm::vec3 gridPos = (pos[iSelf] - gridMin) * inverseCellWidth;
+  // the neighborhood center is the grid corner shared by the 8 cells
+  // closest to this boid (e.g. if the boid is at gridPos
+  // {9.9, 9.8, 9.7}, the center is {10, 10, 10} and we will check {10, 10, 10},
+  // {10, 10, 9} {10, 9, 10} {10, 9, 9} {9, 10, 10} {9, 10, 9} {9, 9, 10} {9, 9, 9})
+ glm::vec3 neighborhoodCenter = {std::round(gridPos.x),
+ std::round(gridPos.y),
+ std::round(gridPos.z)};
+ int neighborhoodIndices[8];
+ int maxNI=0;
+ int gridX, gridY, gridZ;
+  for (int x = neighborhoodCenter.x; x >= neighborhoodCenter.x - 1.0f; x--){
+    gridX = x >= 0 ? x : gridResolution - 1;    // wrap around the low edge
+    gridX = gridX < gridResolution ? gridX : 0; // wrap around the high edge
+    for (int y = neighborhoodCenter.y; y >= neighborhoodCenter.y - 1.0f; y--){
+      gridY = y >= 0 ? y : gridResolution - 1;
+      gridY = gridY < gridResolution ? gridY : 0;
+      for (int z = neighborhoodCenter.z; z >= neighborhoodCenter.z - 1.0f; z--){
+        gridZ = z >= 0 ? z : gridResolution - 1;
+        gridZ = gridZ < gridResolution ? gridZ : 0;
+        // skip empty cells (start indices were reset to -1)
+        if (gridCellStartIndices[gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution)] >= 0){
+          neighborhoodIndices[maxNI] = gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution);
+ maxNI++;
+ }
+ }
+ }
+ }
+
+ // rule1 vars
+ int rule1Neighbors = 0;
+ glm::vec3 CM = glm::vec3(0);
+ glm::vec3 rule1Vel;
+ // rule2 vars
+ glm::vec3 c = glm::vec3(0);
+ glm::vec3 rule2Vel;
+ // rule3 vars
+ glm::vec3 perceivedVel = glm::vec3(0);
+ glm::vec3 rule3Vel;
+
+  // spatial vector between this boid and the boid being checked
+ glm::vec3 selfToThem;
+
+  for (int gridi = 0; gridi < maxNI; gridi++) {
+    int start = gridCellStartIndices[neighborhoodIndices[gridi]];
+    int end = gridCellEndIndices[neighborhoodIndices[gridi]];
+    for (int sorted = start; sorted <= end; sorted++) {
+      // one level of indirection: look up the boid's actual data index
+      int i = particleArrayIndices[sorted];
+      if (i == iSelf) {
+        continue;
+      }
+      selfToThem = pos[i] - pos[iSelf];
+      float dist = glm::length(selfToThem);
+      if (dist < rule1Distance) {
+        CM += pos[i];
+        rule1Neighbors++;
+      }
+      if (dist < rule2Distance) {
+        c -= selfToThem;
+      }
+      if (dist < rule3Distance) {
+        perceivedVel += vel1[i];
+      }
+    }
+  }
+  if (rule1Neighbors > 0){
+ CM /= (float)rule1Neighbors; // divide by the number of boids
+ rule1Vel = (CM - pos[iSelf]) * rule1Scale;
+ }
+ else{
+ rule1Vel = glm::vec3(0.0f);
+ }
+ rule2Vel = c * rule2Scale;
+ rule3Vel = (perceivedVel - vel1[iSelf]) / 8.0f;// * rule3Scale;
+
+ // - Clamp the speed change before putting the new speed in vel2
+  glm::vec3 newVel = vel1[iSelf] + rule1Vel + rule2Vel + rule3Vel;
+  if (glm::length(newVel) > maxSpeed) {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+  vel2[iSelf] = newVel;
+
}
__global__ void kernUpdateVelNeighborSearchCoherent(
@@ -329,59 +513,289 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
float inverseCellWidth, float cellWidth,
int *gridCellStartIndices, int *gridCellEndIndices,
glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+
// TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
// except with one less level of indirection.
// This should expect gridCellStartIndices and gridCellEndIndices to refer
// directly to pos and vel1.
// - Identify the grid cell that this particle is in
+ int iSelf = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (iSelf >= N){
+ return;
+ }
+
// - Identify which cells may contain neighbors. This isn't always 8.
+ // --- find neighbors ---
+ glm::vec3 gridPos = (pos[iSelf] - gridMin) * inverseCellWidth;
+  // the neighborhood center is the grid corner shared by the 8 cells
+  // closest to this boid (e.g. if the boid is at gridPos
+  // {9.9, 9.8, 9.7}, the center is {10, 10, 10} and we will check {10, 10, 10},
+  // {10, 10, 9} {10, 9, 10} {10, 9, 9} {9, 10, 10} {9, 10, 9} {9, 9, 10} {9, 9, 9})
+ glm::vec3 neighborhoodCenter = {std::round(gridPos.x),
+ std::round(gridPos.y),
+ std::round(gridPos.z)};
+ int neighborhoodStarts[8];
+ int neighborhoodEnds[8];
+ int maxNI=0;
// - For each cell, read the start/end indices in the boid pointer array.
// DIFFERENCE: For best results, consider what order the cells should be
// checked in to maximize the memory benefits of reordering the boids data.
- // - Access each boid in the cell and compute velocity change from
- // the boids rules, if this boid is within the neighborhood distance.
- // - Clamp the speed change before putting the new speed in vel2
+ int gridX, gridY, gridZ;
+  for (int x = neighborhoodCenter.x; x >= neighborhoodCenter.x - 1.0f; x--){
+    gridX = x >= 0 ? x : gridResolution - 1;    // wrap around the low edge
+    gridX = gridX < gridResolution ? gridX : 0; // wrap around the high edge
+    for (int y = neighborhoodCenter.y; y >= neighborhoodCenter.y - 1.0f; y--){
+      gridY = y >= 0 ? y : gridResolution - 1;
+      gridY = gridY < gridResolution ? gridY : 0;
+      for (int z = neighborhoodCenter.z; z >= neighborhoodCenter.z - 1.0f; z--){
+        gridZ = z >= 0 ? z : gridResolution - 1;
+        gridZ = gridZ < gridResolution ? gridZ : 0;
+ if (gridCellStartIndices[gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution)] >= 0){
+ neighborhoodStarts[maxNI] = gridCellStartIndices[gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution)];
+ neighborhoodEnds[maxNI] = gridCellEndIndices[gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution)];
+ maxNI++;
+ }
+ }
+ }
+ }
+
+ // rule1 vars
+ int rule1Neighbors = 0;
+ glm::vec3 CM = glm::vec3(0);
+ glm::vec3 rule1Vel;
+ // rule2 vars
+ glm::vec3 c = glm::vec3(0);
+ glm::vec3 rule2Vel;
+ // rule3 vars
+ glm::vec3 perceivedVel = glm::vec3(0);
+ glm::vec3 rule3Vel;
+
+  // spatial vector between this boid and the boid being checked
+ glm::vec3 selfToThem;
+
+  for (int gridi = 0; gridi < maxNI; gridi++) {
+    // no indirection here: pos/vel1 are already in cell-sorted order
+    for (int i = neighborhoodStarts[gridi]; i <= neighborhoodEnds[gridi]; i++) {
+      if (i == iSelf) {
+        continue;
+      }
+      selfToThem = pos[i] - pos[iSelf];
+      float dist = glm::length(selfToThem);
+      if (dist < rule1Distance) {
+        CM += pos[i];
+        rule1Neighbors++;
+      }
+      if (dist < rule2Distance) {
+        c -= selfToThem;
+      }
+      if (dist < rule3Distance) {
+        perceivedVel += vel1[i];
+      }
+    }
+  }
+  if (rule1Neighbors > 0){
+ CM /= (float)rule1Neighbors; // divide by the number of boids
+ rule1Vel = (CM - pos[iSelf]) * rule1Scale;
+ }
+ else{
+ rule1Vel = glm::vec3(0.0f);
+ }
+ rule2Vel = c * rule2Scale;
+ rule3Vel = (perceivedVel - vel1[iSelf]) / 8.0f;// * rule3Scale;
+
+ // - Clamp the speed change before putting the new speed in vel2
+  glm::vec3 newVel = vel1[iSelf] + rule1Vel + rule2Vel + rule3Vel;
+  if (glm::length(newVel) > maxSpeed) {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+  vel2[iSelf] = newVel;
}
/**
* Step the entire N-body simulation by `dt` seconds.
*/
void Boids::stepSimulationNaive(float dt) {
- // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+ // use the kernels you wrote to step the simulation forward in time.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel1);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
// TODO-1.2 ping-pong the velocity buffers
+ glm::vec3 *tmp = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tmp;
+ cudaDeviceSynchronize();
}
void Boids::stepSimulationScatteredGrid(float dt) {
// TODO-2.1
// Uniform Grid Neighbor search using Thrust sort.
// In Parallel:
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  // cell start/end buffers hold gridCellCount entries, so reset all of them
+  dim3 fullBlocksPerCellGrid((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+                                                       dev_particleGridIndices,
+                                                       -1);
+  kernResetIntBuffer<<<fullBlocksPerCellGrid, blockSize>>>(gridCellCount,
+                                                           dev_gridCellStartIndices,
+                                                           -1);
+  kernResetIntBuffer<<<fullBlocksPerCellGrid, blockSize>>>(gridCellCount,
+                                                           dev_gridCellEndIndices,
+                                                           -1);
+
// - label each particle with its array index as well as its grid index.
// Use 2x width grids.
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ gridSideCount,
+ gridMinimum,
+ gridInverseCellWidth,
+ dev_pos,
+ dev_particleArrayIndices,
+ dev_particleGridIndices);
// - Unstable key sort using Thrust. A stable sort isn't necessary, but you
// are welcome to do a performance comparison.
+ // Wrap device vectors in thrust iterators for use with thrust.
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+ thrust::sort_by_key(dev_thrust_particleGridIndices,
+ dev_thrust_particleGridIndices + numObjects,
+ dev_thrust_particleArrayIndices);
+
// - Naively unroll the loop for finding the start and end indices of each
// cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ dev_particleGridIndices,
+ dev_gridCellStartIndices,
+ dev_gridCellEndIndices);
+ checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
// - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ gridSideCount,
+ gridMinimum,
+ gridInverseCellWidth,
+ gridCellWidth,
+ dev_gridCellStartIndices,
+ dev_gridCellEndIndices,
+ dev_particleArrayIndices,
+ dev_pos,
+ dev_vel1,
+ dev_vel2);
+ checkCUDAErrorWithLine("kernUpdateVelocityNeighborSearchScattered failed!");
+
// - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
+
// - Ping-pong buffers as needed
+ glm::vec3 *tmp = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tmp;
+
+ cudaDeviceSynchronize();
}
void Boids::stepSimulationCoherentGrid(float dt) {
// TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
// Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
// In Parallel:
- // - Label each particle with its array index as well as its grid index.
- // Use 2x width grids
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  // cell start/end buffers hold gridCellCount entries, so reset all of them
+  dim3 fullBlocksPerCellGrid((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+                                                       dev_particleGridIndices,
+                                                       -1);
+  kernResetIntBuffer<<<fullBlocksPerCellGrid, blockSize>>>(gridCellCount,
+                                                           dev_gridCellStartIndices,
+                                                           -1);
+  kernResetIntBuffer<<<fullBlocksPerCellGrid, blockSize>>>(gridCellCount,
+                                                           dev_gridCellEndIndices,
+                                                           -1);
+
+ // - label each particle with its array index as well as its grid index.
+ // Use 2x width grids.
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ gridSideCount,
+ gridMinimum,
+ gridInverseCellWidth,
+ dev_pos,
+ dev_particleArrayIndices,
+ dev_particleGridIndices);
// - Unstable key sort using Thrust. A stable sort isn't necessary, but you
// are welcome to do a performance comparison.
+ // Wrap device vectors in thrust iterators for use with thrust.
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  dev_thrust_shuffledArrayIndices1 = thrust::device_ptr<int>(dev_shuffledArrayIndices1);
+  dev_thrust_shuffledArrayIndices2 = thrust::device_ptr<int>(dev_shuffledArrayIndices2);
+
+ thrust::copy(dev_thrust_particleGridIndices,
+ dev_thrust_particleGridIndices + numObjects,
+ dev_thrust_shuffledArrayIndices1);
+ thrust::sort_by_key(dev_thrust_particleGridIndices,
+ dev_thrust_particleGridIndices + numObjects,
+ dev_thrust_particleArrayIndices);
+
// - Naively unroll the loop for finding the start and end indices of each
// cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ dev_particleGridIndices,
+ dev_gridCellStartIndices,
+ dev_gridCellEndIndices);
+ checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
// - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
// the particle data in the simulation array.
// CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+
+ thrust::copy(dev_thrust_shuffledArrayIndices1,
+ dev_thrust_shuffledArrayIndices1 + numObjects,
+ dev_thrust_shuffledArrayIndices2);
+
+ thrust::sort_by_key(dev_thrust_shuffledArrayIndices1,
+ dev_thrust_shuffledArrayIndices1 + numObjects,
+ dev_pos);
+ thrust::sort_by_key(dev_thrust_shuffledArrayIndices2,
+ dev_thrust_shuffledArrayIndices2 + numObjects,
+ dev_vel1);
+
+
// - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ gridSideCount,
+ gridMinimum,
+ gridInverseCellWidth,
+ gridCellWidth,
+ dev_gridCellStartIndices,
+ dev_gridCellEndIndices,
+ dev_pos,
+ dev_vel1,
+ dev_vel2);
+ checkCUDAErrorWithLine("kernUpdateVelocityNeighborSearchCoherent failed!");
+
+
+ cudaDeviceSynchronize();
// - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
+
// - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+ glm::vec3 *tmp = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tmp;
+
+ cudaDeviceSynchronize();
}
void Boids::endSimulation() {
@@ -390,6 +804,13 @@ void Boids::endSimulation() {
cudaFree(dev_pos);
// TODO-2.1 TODO-2.3 - Free any additional buffers here.
+ cudaFree(dev_particleArrayIndices);
+ cudaFree(dev_particleGridIndices);
+
+ cudaFree(dev_gridCellStartIndices);
+ cudaFree(dev_gridCellEndIndices);
+ cudaFree(dev_shuffledArrayIndices1);
+ cudaFree(dev_shuffledArrayIndices2);
}
void Boids::unitTest() {
@@ -398,10 +819,12 @@ void Boids::unitTest() {
// test unstable sort
int *dev_intKeys;
int *dev_intValues;
+ int *dev_intValues2;
int N = 10;
std::unique_ptr<int[]> intKeys{ new int[N] };
std::unique_ptr<int[]> intValues{ new int[N] };
+  std::unique_ptr<int[]> intValues2{ new int[N] };
intKeys[0] = 0; intValues[0] = 0;
intKeys[1] = 1; intValues[1] = 1;
@@ -420,6 +843,9 @@ void Boids::unitTest() {
cudaMalloc((void**)&dev_intValues, N * sizeof(int));
checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+ cudaMalloc((void**)&dev_intValues2, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_intValues2 failed!");
+
dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
std::cout << "before unstable sort: " << std::endl;
@@ -435,18 +861,27 @@ void Boids::unitTest() {
// Wrap device vectors in thrust iterators for use with thrust.
thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+  thrust::device_ptr<int> dev_thrust_values2(dev_intValues2);
+
+ thrust::copy(dev_thrust_values,
+ dev_thrust_values + N,
+ dev_thrust_values2);
+
// LOOK-2.1 Example for using thrust::sort_by_key
thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
// How to copy data back to the CPU side from the GPU
cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
- checkCUDAErrorWithLine("memcpy back failed!");
+ cudaMemcpy(intValues2.get(), dev_intValues2, sizeof(int) * N, cudaMemcpyDeviceToHost);
+
+ checkCUDAErrorWithLine("memcpy back failed!");
std::cout << "after unstable sort: " << std::endl;
for (int i = 0; i < N; i++) {
std::cout << " key: " << intKeys[i];
- std::cout << " value: " << intValues[i] << std::endl;
+ std::cout << " value: " << intValues[i];
+ std::cout << " value2: " << intValues2[i] << std::endl;
}
// cleanup
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..7ecd630 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
// LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
#define VISUALIZE 1
-#define UNIFORM_GRID 0
+#define UNIFORM_GRID 1
#define COHERENT_GRID 0
// LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 90001;
const float DT = 0.2f;
/**