codeplaysoftware
diff --git a/‎README.md‎
Lines changed: 55 additions & 1 deletion b/‎README.md‎
Lines changed: 55 additions & 1 deletion
diff --git a/‎src/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎src/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/camera.cpp‎
Lines changed: 30 additions & 30 deletions b/‎src/camera.cpp‎
Lines changed: 30 additions & 30 deletions
diff --git a/‎src/camera.hpp‎
Lines changed: 37 additions & 37 deletions b/‎src/camera.hpp‎
Lines changed: 37 additions & 37 deletions
diff --git a/‎src/gen.cpp‎
Lines changed: 30 additions & 30 deletions b/‎src/gen.cpp‎
Lines changed: 30 additions & 30 deletions
diff --git a/‎src/gen.hpp‎
Lines changed: 1 addition & 1 deletion b/‎src/gen.hpp‎
Lines changed: 1 addition & 1 deletion
@@ -134,10 +134,15 @@ A drag factor (`damping`) is used to regulate the velocity. At each timestep, th
 
 The `parameters` described in this section can all be adjusted via command line arguments, as follows:
 
-`./nbody_cuda numParticles simIterationsPerFrame damping dt distEps G numFrames`
+`./nbody_cuda numParticles simIterationsPerFrame damping dt distEps G numFrames gwSize calcMethod`
 
 Note that `numParticles` specifies the number of particles simulated, divided by blocksize (i.e. setting `numParticles` to 50 produces 50*256 particles). `simIterationsPerFrame` specifies how many steps of the simulation to take before rendering the next frame and `numFrames` specifies the total number of simulation steps before the program exits. For default values for all of these parameters, refer to `sim_param.cpp`.
 
+`gwSize`: This parameter allows changing the work group size from the default 64.
+
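+`calcMethod`: This string parameter selects how a particle's interaction with itself is excluded from the force calculation. With the default value, BRANCH, the self-interaction is skipped with a conditional branch (`if (i == id) continue;`). If set to PREDICATED, the contribution is instead multiplied by the arithmetic expression `(i != id)`. Refer to the [performance](#sycl-vs-cuda-performance) section for details.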
+
+
 ### Modifying Simulation Behaviour
 
 You can get quite a wide range of 'galactic' behaviours by playing with the parameters described above.
@@ -222,3 +227,52 @@ in the main loop in simulation.dp.cpp. Whereas NVCC handles this via instruction
 force += r * inv_dist_cube * (i != id);
 ```
 in both the CUDA & SYCL code, we get comparable performance between the two using our hardware set up (RTX 3060). For 5 steps of the physical simulation (1 rendered frame) with 12,800 particles, both CUDA and SYCL take ~5.05ms (RTX 3060).
+
+## Update 2024
+
+The ability to execute the nbody code without rendering simplified the process of running the code on different platforms. The results of these executions have brought to light some issues related to the runtime and compilers. As stated before, the original code was modified by substituting:
+
+```
+    // Original code
+    if (i == id) continue;
+
+    force += r * inv_dist_cube;
+```
+
+with
+
+```
+    // Modified code
+    force += r * inv_dist_cube * (i != id);
+```
+
+in order to address the 40% decrease in SYCL performance compared to the CUDA code. With this change, the performance was almost the same for both compilers on the RTX 3060.
+
+We have found that while this also holds on the A100 (CUDA 8.60294 ms vs. SYCL 7.99054 ms), it does not hold on the RTX 2060, where CUDA is heavily penalized (CUDA 10.7281 ms vs. SYCL 8.52349 ms). Even on the A100, the change lowered the CUDA performance (8.60294 ms, compared with 7.95778 ms for the original code).
+
+The code change also roughly doubled the performance on the Intel Data Center GPU Max 1100, where the mean kernel time dropped from 21.6555 ms to 10.7633 ms.
+Below are the best results from executing the code on the three different platforms.
+
+```
+[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA GeForce RTX 2060 7.5 [CUDA 12.3]
+==================== WORK GROUP SIZE 512 BRANCH ========================
+CUDA - At step 10000 kernel time is 8.48516 and mean is 8.53952 and stddev is: 0.0884324
+ DPC - At step 10000 kernel time is 8.23865 and mean is 8.30511 and stddev is: 0.0788344
+==================== WORK GROUP SIZE 512 PREDICATED ====================
+CUDA - At step 10000 kernel time is 10.7281 and mean is 10.7601 and stddev is: 0.0630959
+ DPC - At step 10000 kernel time is 8.52349 and mean is 8.5992 and stddev is: 0.078034
+
+[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
+==================== WORK GROUP SIZE 128 BRANCH ========================
+CUDA - At step 10000 kernel time is 7.95778 and mean is 7.95753 and stddev is: 0.000680384
+ DPC - At step 10000 kernel time is 10.051 and mean is 10.0506 and stddev is: 0.00181166
+==================== WORK GROUP SIZE 128 PREDICATED ====================
+CUDA - At step 10000 kernel time is 8.60294 and mean is 8.60151 and stddev is: 0.00077172
+ DPC - At step 10000 kernel time is 7.99054 and mean is 7.99109 and stddev is: 0.0041852
+
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Data Center GPU Max 1100 1.3 [1.3.26516]
+==================== WORK GROUP SIZE 32 BRANCH ========================
+At step 10000 kernel time is 21.5747 and mean is 21.6555 and stddev is: 0.0734683
+==================== WORK GROUP SIZE 32 PREDICATED ====================
+At step 10000 kernel time is 10.6649 and mean is 10.7633 and stddev is: 0.0507969
+```
@@ -48,6 +48,7 @@ target_compile_definitions(${BINARY_NAME} PRIVATE ${RENDER_FLAG} COMPILER_NAME="
 target_link_libraries(${BINARY_NAME} PRIVATE ${RENDER_LIB})
 target_compile_features(${BINARY_NAME} PRIVATE cxx_auto_type cxx_nullptr cxx_range_for)
 target_include_directories(${BINARY_NAME} PRIVATE ${CUDA_INCLUDE_DIRS})
+target_compile_options(${BINARY_NAME} PRIVATE -use_fast_math)
 
 add_custom_target(debug DEPENDS ${BINARY_NAME}_d)
 add_executable(${BINARY_NAME}_d ${SOURCE_FILES})
 
@@ -10,59 +10,59 @@ const float PI = 3.14159265358979323846;
 using namespace std;
 
 Camera::Camera() {
-   position.x = 0;
-   position.y = PI / 4;
-   position.z = 50.0;
+  position.x = 0;
+  position.y = PI / 4;
+  position.z = 50.0;
 
-   velocity    = {0.0, 0.0, 0.0};
-   look_at     = {0.0, 0.0, 0.0};
-   look_at_vel = {0.0, 0.0, 0.0};
+  velocity    = {0.0, 0.0, 0.0};
+  look_at     = {0.0, 0.0, 0.0};
+  look_at_vel = {0.0, 0.0, 0.0};
 }
 
 void Camera::step() {
-   position.x -= velocity.x;
-   position.y -= velocity.y;
-   position.z *= (1.0 - velocity.z);
-   look_at += look_at_vel;
-
-   velocity *= 0.72;  // damping
-   look_at_vel *= 0.90;
-
-   // limits
-   if (position.x < 0) position.x += 2 * PI;
-   if (position.x >= 2 * PI) position.x -= 2 * PI;
-   position.y =
-       max(-(float)PI / 2 + 0.001f, min(position.y, (float)PI / 2 - 0.001f));
+  position.x -= velocity.x;
+  position.y -= velocity.y;
+  position.z *= (1.0 - velocity.z);
+  look_at += look_at_vel;
+
+  velocity *= 0.72;  // damping
+  look_at_vel *= 0.90;
+
+  // limits
+  if (position.x < 0) position.x += 2 * PI;
+  if (position.x >= 2 * PI) position.x -= 2 * PI;
+  position.y =
+    max(-(float)PI / 2 + 0.001f, min(position.y, (float)PI / 2 - 0.001f));
 }
 
 glm::mat4 Camera::getProj(int width, int height) {
-   return glm::infinitePerspective(glm::radians(30.0f), width / (float)height,
-                                   1.f);
+  return glm::infinitePerspective(glm::radians(30.0f), width / (float)height,
+      1.f);
 }
 
 glm::vec3 getCartesianCoordinates(glm::vec3 v) {
-   return glm::vec3(cos(v.x) * cos(v.y), sin(v.x) * cos(v.y), sin(v.y)) * v.z;
+  return glm::vec3(cos(v.x) * cos(v.y), sin(v.x) * cos(v.y), sin(v.y)) * v.z;
 }
 
 glm::mat4 Camera::getView() {
-   // polar to cartesian coordinates
-   glm::vec3 view_pos = getCartesianCoordinates(position);
+  // polar to cartesian coordinates
+  glm::vec3 view_pos = getCartesianCoordinates(position);
 
-   return glm::lookAt(view_pos + look_at, look_at, glm::vec3(0, 0, 1));
+  return glm::lookAt(view_pos + look_at, look_at, glm::vec3(0, 0, 1));
 }
 
 glm::vec3 Camera::getForward() {
-   return glm::normalize(-getCartesianCoordinates(position));
+  return glm::normalize(-getCartesianCoordinates(position));
 }
 
 glm::vec3 Camera::getRight() {
-   return glm::normalize(
-       glm::cross(getCartesianCoordinates(position), glm::vec3(0, 0, 1)));
+  return glm::normalize(
+      glm::cross(getCartesianCoordinates(position), glm::vec3(0, 0, 1)));
 }
 
 glm::vec3 Camera::getUp() {
-   return glm::normalize(
-       glm::cross(getCartesianCoordinates(position), getRight()));
+  return glm::normalize(
+      glm::cross(getCartesianCoordinates(position), getRight()));
 }
 
 void Camera::addVelocity(glm::vec3 vel) { velocity += vel; }
 
@@ -6,43 +6,43 @@
 
 class Camera {
   public:
-   Camera();
-
-   /**
-    * Computes next step of camera parameters
-    * @param c camera at step n
-    * @return camera at step n+1
-    */
-   void step();
-
-   /**
-    * Computes projection matrix from camera parameters
-    * @param c camera parameters
-    * @param width viewport width
-    * @param height viewport height
-    * @return projection matrix
-    */
-   glm::mat4 getProj(int width, int height);
-
-   /**
-    * Computes view matrix from camera parameters
-    * @param c camera parameters
-    * @param view matrix
-    */
-   glm::mat4 getView();
-
-   glm::vec3 getForward();
-   glm::vec3 getRight();
-   glm::vec3 getUp();
-
-   glm::vec3 getPosition();
-
-   void addVelocity(glm::vec3 vel);
-   void addLookAtVelocity(glm::vec3 vel);
+    Camera();
+
+    /**
+     * Computes next step of camera parameters
+     * @param c camera at step n
+     * @return camera at step n+1
+     */
+    void step();
+
+    /**
+     * Computes projection matrix from camera parameters
+     * @param c camera parameters
+     * @param width viewport width
+     * @param height viewport height
+     * @return projection matrix
+     */
+    glm::mat4 getProj(int width, int height);
+
+    /**
+     * Computes view matrix from camera parameters
+     * @param c camera parameters
+     * @param view matrix
+     */
+    glm::mat4 getView();
+
+    glm::vec3 getForward();
+    glm::vec3 getRight();
+    glm::vec3 getUp();
+
+    glm::vec3 getPosition();
+
+    void addVelocity(glm::vec3 vel);
+    void addLookAtVelocity(glm::vec3 vel);
 
   private:
-   glm::vec3 position;     ///< Polar coordinates in radians
-   glm::vec3 velocity;     ///< dp/dt of polar coordinates
-   glm::vec3 look_at;      ///< Where is the camera looking at
-   glm::vec3 look_at_vel;  ///< dp/dt of lookat position
+    glm::vec3 position;     ///< Polar coordinates in radians
+    glm::vec3 velocity;     ///< dp/dt of polar coordinates
+    glm::vec3 look_at;      ///< Where is the camera looking at
+    glm::vec3 look_at_vel;  ///< dp/dt of lookat position
 };
@@ -13,39 +13,39 @@ mt19937 rng;
 uniform_real_distribution<> dis(0, 1);
 
 glm::vec4 randomParticlePos() {
-   // Random position on a 'thick disk'
-   glm::vec4 particle;
-   float t = dis(rng) * 2 * PI;
-   float s = dis(rng) * 100;
-   particle.x = cos(t) * s;
-   particle.y = sin(t) * s;
-   particle.z = dis(rng) * 4;
-
-   particle.w = 1.f;
-   return particle;
+  // Random position on a 'thick disk'
+  glm::vec4 particle;
+  float t = dis(rng) * 2 * PI;
+  float s = dis(rng) * 100;
+  particle.x = cos(t) * s;
+  particle.y = sin(t) * s;
+  particle.z = dis(rng) * 4;
+
+  particle.w = 1.f;
+  return particle;
 }
 
 glm::vec4 randomParticleVel(glm::vec4 pos) {
-   // Initial velocity is 'orbital' velocity from position
-   glm::vec3 vel = glm::cross(glm::vec3(pos), glm::vec3(0, 0, 1));
-   float orbital_vel = sqrt(2.0 * glm::length(vel));
-   vel = glm::normalize(vel) * orbital_vel;
-   return glm::vec4(vel, 0.0);
+  // Initial velocity is 'orbital' velocity from position
+  glm::vec3 vel = glm::cross(glm::vec3(pos), glm::vec3(0, 0, 1));
+  float orbital_vel = sqrt(2.0 * glm::length(vel));
+  vel = glm::normalize(vel) * orbital_vel;
+  return glm::vec4(vel, 0.0);
 }
 
 std::vector<float> genFlareTex(int tex_size) {
-   std::vector<float> pixels(tex_size * tex_size);
-   float sigma2 = tex_size / 2.0;
-   float A = 1.0;
-   for (int i = 0; i < tex_size; ++i) {
-      float i1 = i - tex_size / 2;
-      for (int j = 0; j < tex_size; ++j) {
-         float j1 = j - tex_size / 2;
-         // gamma corrected gauss
-         pixels[i * tex_size + j] = pow(
-             A * exp(-((i1 * i1) / (2 * sigma2) + (j1 * j1) / (2 * sigma2))),
-             2.2);
-      }
-   }
-   return pixels;
-}
+  std::vector<float> pixels(tex_size * tex_size);
+  float sigma2 = tex_size / 2.0;
+  float A = 1.0;
+  for (int i = 0; i < tex_size; ++i) {
+    float i1 = i - tex_size / 2;
+    for (int j = 0; j < tex_size; ++j) {
+      float j1 = j - tex_size / 2;
+      // gamma corrected gauss
+      pixels[i * tex_size + j] = pow(
+          A * exp(-((i1 * i1) / (2 * sigma2) + (j1 * j1) / (2 * sigma2))),
+          2.2);
+    }
+  }
+  return pixels;
+}
@@ -18,4 +18,4 @@ glm::vec4 randomParticlePos();
  */
 glm::vec4 randomParticleVel(glm::vec4 pos);
 
-std::vector<float> genFlareTex(int size);
+std::vector<float> genFlareTex(int size);