soran-ghaderi
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 796 additions & 695 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 796 additions & 695 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 24 additions & 12 deletions b/‎CMakeLists.txt‎
Lines changed: 24 additions & 12 deletions
diff --git a/‎README.md‎
Lines changed: 6 additions & 6 deletions b/‎README.md‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎app/CMakeLists.txt‎
Lines changed: 4 additions & 1 deletion b/‎app/CMakeLists.txt‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎app/ReduceSum.cu‎
Lines changed: 68 additions & 0 deletions b/‎app/ReduceSum.cu‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎app/cuRBLAS_app.cpp‎
Lines changed: 0 additions & 7 deletions b/‎app/cuRBLAS_app.cpp‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎app/gaussian_sketch.cu‎
Lines changed: 71 additions & 0 deletions b/‎app/gaussian_sketch.cu‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎docs/getting-started/installation.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/getting-started/installation.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/cuRBLAS/cuRBLAS.hpp‎
Lines changed: 0 additions & 16 deletions b/‎include/cuRBLAS/cuRBLAS.hpp‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎include/curblas/curblas.cuh‎
Lines changed: 11 additions & 0 deletions b/‎include/curblas/curblas.cuh‎
Lines changed: 11 additions & 0 deletions
@@ -11,15 +11,15 @@ if(CMAKE_CUDA_COMPILER)
     enable_language(CUDA)
     set(CUDA_FOUND TRUE)
     message(STATUS "CUDA found and enabled")
-    
+
     # Set CUDA architectures - this fixes the CMAKE_CUDA_ARCHITECTURES error
     # Supporting common GPU architectures: Pascal, Volta, Turing, Ampere, Ada Lovelace
     set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80;86;89;90" CACHE STRING "CUDA architectures")
-    
+
     # Define CUDA standard
     set(CMAKE_CUDA_STANDARD 17)
     set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-    
+
     # CUDA specific settings
     set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
     set(CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -50,14 +50,14 @@ if(WIN32 AND MSVC)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD")
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /MD")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /MD")
-    
+
     # Ensure consistent iterator debug levels
     # Use level 2 for Debug builds and level 0 for Release builds
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /D_ITERATOR_DEBUG_LEVEL=2")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
-    
+
     message(STATUS "Windows MSVC runtime library flags configured for consistent linking")
 endif()
 
@@ -69,6 +69,7 @@ option(BUILD_DOCS "Enable building of documentation" ON)
 if(CUDA_FOUND)
     find_package(CUDAToolkit REQUIRED)
 endif()
+find_package(Threads REQUIRED)
 
 # compile the library
 add_subdirectory(src)
@@ -81,27 +82,27 @@ add_subdirectory(app)
 include(CTest)
 if(BUILD_TESTING)
   find_package(Catch2 3 REQUIRED)
-  
+
   # Try to include Catch2 discovery functions if available
   if(TARGET Catch2::Catch2WithMain)
     # Look for the Catch2 CMake module in common locations
-    find_file(CATCH2_CMAKE_MODULE 
-      NAMES Catch2.cmake 
-      PATHS 
+    find_file(CATCH2_CMAKE_MODULE
+      NAMES Catch2.cmake
+      PATHS
         ${Catch2_DIR}
         ${Catch2_DIR}/../../../lib/cmake/Catch2
         /usr/local/lib/cmake/Catch2
         /usr/lib/cmake/Catch2
       NO_DEFAULT_PATH
     )
-    
+
     if(CATCH2_CMAKE_MODULE)
       include(${CATCH2_CMAKE_MODULE})
     else()
       message(STATUS "Catch2 discovery module not found, using basic test registration")
     endif()
   endif()
-  
+
   add_subdirectory(tests)
 endif()
 
@@ -113,7 +114,7 @@ if(BUILD_PYTHON)
   # Add Python bindings
   find_package(pybind11 REQUIRED)
   # Compile the Pybind11 module
-  pybind11_add_module(_cuRBLAS python/cuRBLAS/_cuRBLAS.cpp)
+  pybind11_add_module(_cuRBLAS python/curblas/_curblas.cpp)
   target_link_libraries(_cuRBLAS PUBLIC cuRBLAS)
 
   # Install the Python module shared library
@@ -157,6 +158,17 @@ install(
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
 )
 
+#include_directories(${CUDA_INCLUDE_DIRS})
+#target_include_directories(curblas
+#        PUBLIC
+#        ${CMAKE_CURRENT_SOURCE_DIR}/include
+#        ${CUDA_INCLUDE_DIRS}
+#)
+#target_link_libraries(curblas
+#        PUBLIC
+#        ${CUDA_LIBRARIES}
+#)
+#include_directories(/usr/local/cuda/include)
 # This prints a summary of found dependencies
 include(FeatureSummary)
 feature_summary(WHAT ALL)
@@ -87,7 +87,7 @@ Building cuRBLAS requires:
 ```bash
 # Clone the repo
 git clone https://github.com/soran-ghaderi/cuRBLAS.git
-cd cuRBLAS
+cd cuRBLAS
 
 # Create build directory
 mkdir build && cd build
@@ -124,19 +124,19 @@ pip install .
 Or install directly from PyPI (when available):
 
 ```bash
-pip install cuRBLAS
+pip install curblas
 ```
 
 ## Usage Example
 
 ```c
 #include <cuRBLAS/curblas.h>
 
-// Create cuRBLAS context
+// Create curblas context
 curblasHandle_t handle;
 curblasStatus_t status = curblasCreate(&handle);
 if (status != CURBLAS_STATUS_SUCCESS) {
-    printf("Failed to create cuRBLAS handle: %s\n", 
+    printf("Failed to create curblas handle: %s\n", 
            curblasGetStatusString(status));
     return -1;
 }
@@ -153,7 +153,7 @@ curblasSetSketchType(handle, CURBLAS_SKETCH_GAUSSIAN);
 // Get version information
 int version;
 curblasGetVersion(handle, &version);
-printf("cuRBLAS Version: %d\n", version);
+printf("curblas Version: %d\n", version);
 
 // Note: Matrix operations like curblasRgemm are declared 
 // in headers but not yet implemented
@@ -237,7 +237,7 @@ We welcome contributions! Please see our [contribution guidelines](CONTRIBUTING.
 
 ```bash
 git clone https://github.com/soran-ghaderi/cuRBLAS.git
-cd cuRBLAS
+cd cuRBLAS
 pip install -r requirements-dev.txt
 ```
 
 
@@ -1,2 +1,5 @@
-add_executable(cuRBLAS_app cuRBLAS_app.cpp)
+add_executable(cuRBLAS_app ReduceSum.cu)
 target_link_libraries(cuRBLAS_app PRIVATE cuRBLAS)
+
+add_executable(generateGaussianSketch gaussian_sketch.cu)
+target_link_libraries(generateGaussianSketch PRIVATE cuRBLAS)
@@ -0,0 +1,68 @@
+#include "curblas/curblas.cuh"
+#include "cuda_runtime.h"
+#include <iostream>
+#include <vector>
+#include <cooperative_groups.h>
+
+
+// Demo driver: fills N floats with 1.0, launches curblas::reduceSum cooperatively,
+// then prints output[0] and the first numBlocks entries of the output buffer.
+// Returns 0 on success and 1 if any CUDA runtime call fails.
+int main() {
+    int N = 1024;  // left non-const: &N is stored in the void* kernel-argument array
+    // A runtime-sized `float h_input[N]` is not standard C++, so use std::vector.
+    std::vector<float> h_input(N, 1.0f);
+
+    int blockSize = 256;
+    int numBlocks = (N + blockSize - 1) / blockSize;
+    std::vector<float> h_partialSums(numBlocks);
+
+    // Null-initialised so the cudaFree calls at the end are safe after an early failure.
+    float *d_input = nullptr;
+    float *d_output = nullptr;
+    float h_output = 0.0f;
+
+    // Prints a diagnostic for a failed CUDA call and reports whether it succeeded.
+    auto check = [](cudaError_t status, const char *what) {
+        if (status != cudaSuccess) {
+            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
+        }
+        return status == cudaSuccess;
+    };
+
+    // Allocate device memory. The output buffer keeps its original size of
+    // blockSize floats, which covers the numBlocks entries read back for N = 1024.
+    // Each step runs only if every earlier one succeeded (short-circuit &&).
+    bool ok = check(cudaMalloc((void**)&d_input, N * sizeof(float)), "cudaMalloc(d_input)");
+    ok = ok && check(cudaMalloc((void**)&d_output, blockSize * sizeof(float)), "cudaMalloc(d_output)");
+
+    // Upload the input and zero the output buffer.
+    ok = ok && check(cudaMemcpy(d_input, h_input.data(), N * sizeof(float), cudaMemcpyHostToDevice), "Input upload");
+    ok = ok && check(cudaMemset(d_output, 0, blockSize * sizeof(float)), "cudaMemset(d_output)");
+
+    // Launch the kernel; the launch status is checked directly instead of being ignored.
+    void *args[] = {&d_input, &d_output, &N};
+    ok = ok && check(cudaLaunchCooperativeKernel((void*)curblas::reduceSum, dim3(numBlocks), dim3(blockSize),
+                                                 args, blockSize * sizeof(float)), "Kernel launch");
+
+    // Copy the results back to host. These blocking copies may also report errors
+    // from the preceding asynchronous kernel launch.
+    ok = ok && check(cudaMemcpy(&h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost), "Result download");
+    ok = ok && check(cudaMemcpy(h_partialSums.data(), d_output, numBlocks * sizeof(float), cudaMemcpyDeviceToHost),
+                     "Partial-sum download");
+
+    if (ok) {
+        std::cout << "Final Sum (from GPU reduction with Cooperative Groups): " << h_output << std::endl;
+        float finalSum = 0.0f;
+        for (int i = 0; i < numBlocks; ++i) {
+            finalSum += h_partialSums[i];
+            std::cout << "Partial sum from block " << i << ": " << h_partialSums[i] << std::endl;
+        }
+        std::cout << "Sum: " << finalSum << std::endl;
+    }
+
+    // Clean up; cudaFree on a null pointer performs no operation.
+    cudaFree(d_input);
+    cudaFree(d_output);
+    return ok ? 0 : 1;
+}
@@ -0,0 +1,71 @@
+#include "curblas/curblas.cuh"
+//#include "curblas/curblas.h"
+//#include "curblas/curblas_types.h"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+
+
+
+// Writes vec to stdout as a rows x cols grid (element (r, c) is vec[r * cols + c]), one row per line.
+void printMatrix(const std::vector<float>& vec, int rows, int cols) {
+    for (int r = 0, idx = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c, ++idx)
+            std::cout << std::fixed << std::setprecision(4) << std::setw(10) << vec[idx] << " ";
+        std::cout << std::endl;
+    }
+}
+
+// Demo driver: generates a rows x cols sketch on the GPU with
+// curblas::generateGaussianSketch and copies it back to the host.
+// Returns 0 on success and -1 if any CUDA runtime call fails.
+int main() {
+    int rows = 4000;
+    int cols = 5000;
+    int totalElements = rows * cols;
+    long long seed = 112345L;
+    float scale = 1.0f;
+
+    // Fixed: the message previously ran "a" and the row count together ("a4000").
+    std::cout << "Generating a " << rows << " x " << cols << " gaussian sketch matrix." << std::endl;
+
+    std::vector<float> h_sketch(totalElements);
+    float* d_sketch = nullptr;
+
+    // Prints a diagnostic for a failed CUDA call and reports whether it succeeded.
+    auto check = [](cudaError_t status, const char* what) {
+        if (status != cudaSuccess) {
+            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
+        }
+        return status == cudaSuccess;
+    };
+
+    if (!check(cudaMalloc((void**)&d_sketch, totalElements * sizeof(float)), "cudaMalloc")) {
+        return -1;
+    }
+
+    // Launch geometry: one thread per elementsPerThread elements, rounded up.
+    // NOTE(review): assumes the kernel handles 4 elements per thread — confirm against its definition.
+    int blockSize = 256;
+    int elementsPerThread = 4; //test
+    int totalThreads = (totalElements + elementsPerThread - 1) / elementsPerThread;
+    int numBlocks = (totalThreads + blockSize - 1) / blockSize;
+
+    curblas::generateGaussianSketch<<<numBlocks, blockSize>>>(d_sketch, rows, cols, seed, scale);
+
+    // cudaGetLastError reports launch errors and cudaDeviceSynchronize reports errors raised
+    // while the kernel ran. A single blocking cudaMemcpy then brings the data back, replacing
+    // the unchecked create-stream / async-copy / synchronize / destroy sequence.
+    bool ok = check(cudaGetLastError(), "kernel launch")
+        && check(cudaDeviceSynchronize(), "kernel execution")
+        && check(cudaMemcpy(h_sketch.data(), d_sketch, totalElements * sizeof(float), cudaMemcpyDeviceToHost),
+                 "device-to-host copy");
+
+    if (ok) {
+        std::cout << "result:" << rows << 'x' << cols << std::endl;
+    }
+
+    cudaFree(d_sketch);
+    return ok ? 0 : -1;
+}
@@ -14,7 +14,7 @@ Currently cuRBLAS is in development. To install and use cuRBLAS:
 
 ```bash
 git clone https://github.com/cuRBLAS/cuRBLAS.git
-cd cuRBLAS
+cd cuRBLAS
 mkdir build && cd build
 cmake .. -DCMAKE_BUILD_TYPE=Release
 make -j$(nproc)
 
@@ -0,0 +1,11 @@
+#pragma once
+#include "cuda_runtime.h"
+
+namespace curblas {
+// Kernel declarations only. NOTE(review): the definitions are not in this header; semantics below are inferred from names — confirm.
+__global__ void reduceSum(const float* input, float* output, int n);
+
+// Presumably fills `sketch` (rows x cols floats) with scaled Gaussian samples derived from `seed` — TODO confirm.
+__global__ void generateGaussianSketch(float* sketch, int rows, int cols, long long seed, float scale);
+
+} // namespace curblas