diff --git a/PIMbench/layer-normalization/baselines/layer_norm.py b/PIMbench/layer-normalization/baselines/layer_norm.py new file mode 100644 index 00000000..256d18b5 --- /dev/null +++ b/PIMbench/layer-normalization/baselines/layer_norm.py @@ -0,0 +1,48 @@ +import argparse +import torch +import torch.nn as nn +import time + +# Function to perform layer normalization +def perform_layer_norm(input_tensor, norm_layer, device): + input_tensor = input_tensor.to(device) + norm_layer = norm_layer.to(device) + + start_time = time.time() + output = norm_layer(input_tensor) + if device.type == 'cuda': + torch.cuda.synchronize() # Wait for GPU ops to complete + end_time = time.time() + + elapsed_time = end_time - start_time + return elapsed_time + +# Main function +def main(args): + # Set device + device = torch.device('cuda' if args.cuda and torch.cuda.is_available() else 'cpu') + print(f"[INFO] Using device: {device}") + + # Input tensor: [B, C, H, W] + input_tensor = torch.randn(args.batch_size, args.input_channels, args.input_height, args.input_width) + + # LayerNorm normalized over [C, H, W] for each sample + normalized_shape = [args.input_channels, args.input_height, args.input_width] + norm_layer = nn.LayerNorm(normalized_shape, eps=args.epsilon) + + # Run layer normalization + time_taken = perform_layer_norm(input_tensor, norm_layer, device) + print(f"[INFO] Time taken for layer normalization: {time_taken * 1000:.6f} ms") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="CNN-style Layer Normalization on CPU/GPU") + + parser.add_argument("-b", "--batch_size", type=int, default=64, help="Batch size") + parser.add_argument("-c", "--input_channels", type=int, default=64, help="Number of input channels") + parser.add_argument("-r", "--input_height", type=int, default=32, help="Input height") + parser.add_argument("-w", "--input_width", type=int, default=32, help="Input width") + parser.add_argument("-eps", "--epsilon", type=float, default=1e-5, help="Epsilon for LayerNorm") + parser.add_argument("-cuda", "--cuda", action='store_true', help="Use CUDA if available") + + args = parser.parse_args() + main(args) diff --git a/PIMbench/logistic-regression/Makefile b/PIMbench/logistic-regression/Makefile new file mode 100644 index 00000000..d26331b9 --- /dev/null +++ b/PIMbench/logistic-regression/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of logistic regression +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/logistic-regression/PIM/Makefile b/PIMbench/logistic-regression/PIM/Makefile new file mode 100644 index 00000000..ac6608ac --- /dev/null +++ b/PIMbench/logistic-regression/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of logistic regression +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. 
+include ${PROJ_ROOT}/Makefile.common
+
+# make USE_OPENMP=1
+USE_OPENMP ?= 0
+ifeq ($(USE_OPENMP),1)
+	CXXFLAGS += -fopenmp
+endif
+
+EXEC := lr.out
+SRC := lr.cpp
+
+debug perf dramsim3_integ: $(EXEC)
+
+$(EXEC): $(SRC) $(DEPS)
+	$(CXX) $< $(CXXFLAGS) -o $@
+
+clean:
+	rm -rf $(EXEC) *.dSYM
diff --git a/PIMbench/logistic-regression/PIM/lr.cpp b/PIMbench/logistic-regression/PIM/lr.cpp
new file mode 100644
index 00000000..178215de
--- /dev/null
+++ b/PIMbench/logistic-regression/PIM/lr.cpp
@@ -0,0 +1,357 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <chrono>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <getopt.h>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "util.h"
+#include "libpimeval.h"
+
+using namespace std;
+
+struct Params {
+  uint64_t dataSize = 2048;
+  int epochs = 1000;
+  float learningRate = 0.01f;
+  char* configFile = nullptr;
+  char* inputFile = nullptr;
+  bool shouldVerify = false;
+};
+
+void usage() {
+  fprintf(stderr,
+          "\nUsage: ./lr.out [options]"
+          "\n"
+          "\n    -l    input size (default=2048 elements)"
+          "\n    -e    number of epochs (default=1000)"
+          "\n    -r    learning rate (default=0.01)"
+          "\n    -c    DRAMsim config file"
+          "\n    -i    input file (not implemented)"
+          "\n    -v    t = verify with host output"
+          "\n");
+}
+
+Params getInputParams(int argc, char** argv) {
+  Params p;
+  int opt;
+  // 'h' takes no argument; the remaining options all require one
+  while ((opt = getopt(argc, argv, "hl:e:r:c:i:v:")) >= 0) {
+    switch (opt) {
+      case 'h':
+        usage();
+        exit(0);
+      case 'l':
+        p.dataSize = strtoull(optarg, nullptr, 0);
+        break;
+      case 'e':
+        p.epochs = atoi(optarg);
+        break;
+      case 'r':
+        p.learningRate = atof(optarg);
+        break;
+      case 'c':
+        p.configFile = optarg;
+        break;
+      case 'i':
+        p.inputFile = optarg;
+        break;
+      case 'v':
+        p.shouldVerify = (*optarg == 't');
+        break;
+      default:
+        fprintf(stderr, "\nUnrecognized option!\n");
+        usage();
+        exit(0);
+    }
+  }
+  return p;
+}
+
+float sigmoid_exact(float z) {
+  return 1.0f / (1.0f + expf(-z));
+}
+
+void runLogisticRegressionPIM(uint64_t dataSize, int epochs, float lr, vector<float>& Xf, vector<float>& Yf, float& w, float& b) {
+  PimObjId xObj = pimAlloc(PIM_ALLOC_AUTO, dataSize, PIM_FP32);
+  if (xObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId yObj = pimAllocAssociated(xObj, PIM_FP32);
+  if (yObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId predictionObj = pimAllocAssociated(xObj, PIM_FP32);
+  if (predictionObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId w_obj = pimAllocAssociated(xObj, PIM_FP32);
+  if (w_obj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId b_obj = pimAllocAssociated(predictionObj, PIM_FP32);
+  if (b_obj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId oneVecObj = pimAllocAssociated(predictionObj, PIM_FP32);
+  if (oneVecObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId errorObj = pimAllocAssociated(xObj, PIM_FP32);
+  if (errorObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  PimStatus status = pimCopyHostToDevice(Xf.data(), xObj);
+  if (status != PIM_OK)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  status = pimCopyHostToDevice(Yf.data(), yObj);
+  if (status != PIM_OK)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  status = pimBroadcastFP(oneVecObj, 1.0f);
+  if (status != PIM_OK) {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  std::chrono::duration<double, std::milli> hostElapsedTime = std::chrono::duration<double, std::milli>::zero();
+  std::vector<float> zBuffer(dataSize);
+  for (int epoch = 0; epoch < epochs; ++epoch) {
+    float dw = 0.0f, db = 0.0f;
+
+    status = pimBroadcastFP(w_obj, w);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimMul(xObj, w_obj, predictionObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimBroadcastFP(b_obj, b);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimAdd(predictionObj, b_obj, predictionObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+    status = pimCopyDeviceToHost(predictionObj, zBuffer.data());
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    // Compute exp(-z) on the host; the remaining sigmoid steps stay on PIM
+    auto start_cpu = std::chrono::high_resolution_clock::now();
+    // #pragma omp parallel for
+    for (uint64_t i = 0; i < dataSize; ++i) {
+      zBuffer[i] = expf(-zBuffer[i]);
+    }
+    auto stop_cpu = std::chrono::high_resolution_clock::now();
+    hostElapsedTime += (stop_cpu - start_cpu);
+
+    status = pimCopyHostToDevice(zBuffer.data(), predictionObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimAdd(predictionObj, oneVecObj, predictionObj);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimDiv(oneVecObj, predictionObj, predictionObj);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimSub(predictionObj, yObj, errorObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimRedSum(errorObj, &db);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimMul(errorObj, xObj, errorObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimRedSum(errorObj, &dw);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    w -= lr * dw / dataSize;
+    b -= lr * db / dataSize;
+    std::cout << "Epoch " << epoch
+              << ": w = " << w
+              << ", b = " << b
+              << ", dw = " << dw
+              << ", db = " << db
+              << std::endl;
+  }
+
+  pimFree(xObj);
+  pimFree(yObj);
+  pimFree(predictionObj);
+  pimFree(errorObj);
+  pimFree(w_obj);
+  pimFree(b_obj);
+  pimFree(oneVecObj);
+  std::cout << "Host elapsed time: " << fixed << setprecision(3) << hostElapsedTime.count() << " ms" << std::endl;
+}
+
+void runLogisticRegressionHost(uint64_t n, int epochs, float lr, const vector<float>& X, const vector<float>& Y, float& w_host, float& b_host) {
+
+  w_host = 0.0f;
+  b_host = 0.0f;
+
+  for (int epoch = 0; epoch < epochs; ++epoch) {
+    float dw = 0.0f, db = 0.0f;
+    // #pragma omp parallel for reduction(+ : dw, db)
+    for (uint64_t i = 0; i < n; ++i) {
+      float z = w_host * X[i] + b_host;
+      float pred = sigmoid_exact(z);
+      float error = pred - Y[i];
+      dw += error * X[i];
+      db += error;
+    }
+    w_host -= lr * dw / n;
+    b_host -= lr * db / n;
+
+    std::cout << "Epoch " << epoch
+              << ": w = " << w_host
+              << ", b = " << b_host
+              << ", dw = " << dw
+              << ", db = " << db
+              << std::endl;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  Params params = getInputParams(argc, argv);
+  vector<int> X(params.dataSize), Y(params.dataSize);
+
+  if (params.inputFile == nullptr) {
+    getVector(params.dataSize, X);
+    getVector(params.dataSize, Y);
+    for (auto& y : Y) y = y % 2;
+  }
+  else {
+    std::cout << "Reading from input file is not implemented yet." << std::endl;
+    return 1;
+  }
+
+  std::vector<float> Xf(params.dataSize), Yf(params.dataSize);
+  for (uint64_t i = 0; i < params.dataSize; ++i) {
+    Xf[i] = static_cast<float>(X[i]);
+    Yf[i] = static_cast<float>(Y[i]);
+  }
+
+  if (!createDevice(params.configFile)) return 1;
+
+  float w = 0.0f, b = 0.0f;
+
+  runLogisticRegressionPIM(params.dataSize, params.epochs, params.learningRate, Xf, Yf, w, b);
+  pimShowStats();
+
+  cout << "Model: sigmoid(" << w << " * x + " << b << ")\n";
+
+  if (params.shouldVerify) {
+    float w_host, b_host;
+    auto start = chrono::high_resolution_clock::now();
+
+    runLogisticRegressionHost(params.dataSize, params.epochs, params.learningRate, Xf, Yf, w_host, b_host);
+
+    auto end = chrono::high_resolution_clock::now();
+    chrono::duration<double, milli> elapsedTime = end - start;
+
+    cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms\n";
+    cout << "Host Model: sigmoid(" << w_host << " * x + " << b_host << ")\n";
+
+    float w_diff = fabs(w - w_host);
+    float b_diff = fabs(b - b_host);
+
+    if (w_diff < 1e-2 && b_diff < 1e-2)
+      cout << "Verification PASSED.\n";
+    else
+      cout << "Verification FAILED.\n";
+  }
+
+  return 0;
+}
diff --git a/PIMbench/logistic-regression/PIM/slurm.sh b/PIMbench/logistic-regression/PIM/slurm.sh
new file mode 100755
index 00000000..b5c33d63
--- /dev/null
+++ b/PIMbench/logistic-regression/PIM/slurm.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+#SBATCH --gpus=1
+#SBATCH -n 1
+#SBATCH -t 3-00:00:00
+#SBATCH -p gpu
+#SBATCH --job-name=logistic-regression
+#SBATCH --mem=900000
+#SBATCH --output=slurm-out.txt
+#SBATCH --cpus-per-task=130
+#SBATCH --constraint=a100_80gb
+#SBATCH --mail-type=end
+#SBATCH --mail-user=yzp7fe@virginia.edu
+
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Bank_Rank1.cfg > bank_rank1.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Bank_Rank4.cfg > bank_rank4.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Bank_Rank8.cfg > bank_rank8.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank1.cfg > Fulcrum_Rank1.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank4.cfg > Fulcrum_Rank4.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank8.cfg > Fulcrum_Rank8.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank16.cfg > Fulcrum_Rank16.txt
+
+
diff --git a/PIMbench/logistic-regression/baselines/CPU/Makefile b/PIMbench/logistic-regression/baselines/CPU/Makefile
new file mode 100644
index 00000000..e8e18c44
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/CPU/Makefile
@@ -0,0 +1,25 @@
+# Compiler
+CXX := g++
+
+# Compiler flags
+CXXFLAGS := -Wall -Wextra -Werror -march=native -std=c++17 -O3 -fopenmp
+
+# Executable name
+EXEC := lr.out
+
+# Source files
+SRC_FILES := $(wildcard *.cpp)
+
+# Dependency
+DEP := ../../../../util/
+
+.PHONY: all clean
+
+all: $(EXEC)
+
+$(EXEC): $(SRC_FILES)
+	$(CXX) $(CXXFLAGS) -I$(DEP) -o $@ $^
+
+clean:
+	rm -rf $(EXEC)
+
\ No newline at end of file
diff --git a/PIMbench/logistic-regression/baselines/CPU/lr.cpp b/PIMbench/logistic-regression/baselines/CPU/lr.cpp
new file mode 100644
index 00000000..a4e2e44f
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/CPU/lr.cpp
@@ -0,0 +1,112 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <chrono>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <vector>
+#include <getopt.h>
+
+#include "utilBaselines.h"
+
+using namespace std;
+
+struct Params {
+  uint64_t dataSize = 2048;
+  int epochs = 1000;
+  float learningRate = 0.01f;
+  string inputFile = "";
+};
+
+void usage() {
+  fprintf(stderr,
+          "\nUsage: ./lr.out [options]"
+          "\n"
+          "\n    -l    input size (default=2048 elements)"
+          "\n    -e    number of epochs (default=1000)"
+          "\n    -r    learning rate (default=0.01)"
+          "\n    -i    input file (not implemented)"
+          "\n");
+}
+
+Params getInputParams(int argc, char** argv) {
+  Params p;
+  int opt;
+  // 'h' takes no argument; only the options this baseline supports are listed
+  while ((opt = getopt(argc, argv, "hl:e:r:i:")) >= 0) {
+    switch (opt) {
+      case 'h':
+        usage();
+        exit(0);
+      case 'l':
+        p.dataSize = strtoull(optarg, nullptr, 0);
+        break;
+      case 'e':
+        p.epochs = atoi(optarg);
+        break;
+      case 'r':
+        p.learningRate = atof(optarg);
+        break;
+      case 'i':
+        p.inputFile = optarg;
+        break;
+      default:
+        fprintf(stderr, "\nUnrecognized option!\n");
+        usage();
+        exit(0);
+    }
+  }
+  return p;
+}
+
+double sigmoid(double z) {
+  return 1.0 / (1.0 + exp(-z));
+}
+
+int main(int argc, char* argv[]) {
+  Params params = getInputParams(argc, argv);
+
+  if (!params.inputFile.empty()) {
+    std::cout << "Reading input from file is not yet implemented." << std::endl;
+    return 1;
+  }
+
+  uint64_t n = params.dataSize;
+  vector<int> dataPointsX, dataPointsY;
+  getVector(n, dataPointsX);
+  getVector(n, dataPointsY);
+
+  for (uint64_t i = 0; i < n; i++) {
+    dataPointsY[i] = dataPointsY[i] % 2;
+  }
+
+  cout << "Done initializing data\n";
+
+  float w = 0.0, b = 0.0;
+  auto start = chrono::high_resolution_clock::now();
+
+  for (int epoch = 0; epoch < params.epochs; ++epoch) {
+    float dw = 0.0, db = 0.0;
+
+    #pragma omp parallel for reduction(+ : dw, db)
+    for (uint64_t i = 0; i < n; i++) {
+      float z = w * dataPointsX[i] + b;
+      float pred = sigmoid(z);
+      float error = pred - dataPointsY[i];
+
+      dw += error * dataPointsX[i];
+      db += error;
+    }
+
+    w -= params.learningRate * dw / n;
+    b -= params.learningRate * db / n;
+  }
+
+  auto end = chrono::high_resolution_clock::now();
+  chrono::duration<double, milli> elapsedTime = end - start;
+
+  cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms\n";
+  cout << "Model: sigmoid(" << w << " * x + " << b << ")\n";
+
+  return 0;
+}
diff --git a/PIMbench/logistic-regression/baselines/GPU/Makefile b/PIMbench/logistic-regression/baselines/GPU/Makefile
new file mode 100644
index 00000000..d7657e9d
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/GPU/Makefile
@@ -0,0 +1,32 @@
+# Compiler settings
+NVCC := nvcc
+CUDA_DIR ?= $(shell dirname $(shell dirname $(shell which nvcc)))
+CUDA_INCLUDE := $(CUDA_DIR)/include
+CUDA_LIB_DIR := $(CUDA_DIR)/lib64
+CUDA_LIB := -lcublas -lnvidia-ml
+FOPENMP := -Xcompiler -fopenmp
+# ARCH_FLAG := -arch=sm_80
+
+# Target executable
+EXEC := lr.out
+
+# Source files
+SOURCES := lr.cu
+
+# Compiler flags
+CFLAGS := -O3
+CXXFLAGS := -std=c++17
+
+# Dependency
+DEP := ../../../../util/
+
+# Default target
+all: $(EXEC)
+
+# Rule to build the target executable
+$(EXEC): $(SOURCES)
+	$(NVCC) $(SOURCES) -DENABLE_CUDA -I$(CUDA_INCLUDE) -I$(DEP) $(ARCH_FLAG) -L$(CUDA_LIB_DIR) $(CUDA_LIB) $(FOPENMP) $(CFLAGS) $(CXXFLAGS) -o $(EXEC)
+
+# Rule to clean the project
+clean:
+	rm -f $(EXEC)
diff --git a/PIMbench/logistic-regression/baselines/GPU/lr.cu b/PIMbench/logistic-regression/baselines/GPU/lr.cu
new file mode 100644
index 00000000..bc80dea5
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/GPU/lr.cu
@@ -0,0 +1,176 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include <cuda_runtime.h>
+#include "utilBaselines.h"
+
+#define BLOCK_SIZE 256
+
+using namespace std;
+
+struct Params {
+  uint64_t dataSize = 2048;
+  int epochs = 1000;
+  float learningRate = 0.01f;
+  string inputFile = "";
+};
+
+void usage() {
+  fprintf(stderr,
+          "\nUsage: ./lr.out [options]"
+          "\n"
+          "\n    -l    input size (default=2048 elements)"
+          "\n    -e    number of epochs (default=1000)"
+          "\n    -r    learning rate (default=0.01)"
+          "\n    -i    input file (not implemented)"
+          "\n");
+}
+
+Params getInputParams(int argc, char** argv) {
+  Params p;
+  int opt;
+  // 'h' takes no argument; the remaining options all require one
+  while ((opt = getopt(argc, argv, "hl:e:r:i:")) >= 0) {
+    switch (opt) {
+      case 'h': usage(); exit(0);
+      case 'l': p.dataSize = strtoull(optarg, nullptr, 0); break;
+      case 'e': p.epochs = atoi(optarg); break;
+      case 'r': p.learningRate = atof(optarg); break;
+      case 'i': p.inputFile = optarg; break;
+      default: fprintf(stderr, "\nUnrecognized option!\n"); usage(); exit(0);
+    }
+  }
+  return p;
+}
+
+__device__ float sigmoid(float z) {
+  return 1.0f / (1.0f + expf(-z));
+}
+
+__device__ void warpReduce(volatile float* sdata, int tid) {
+  sdata[tid] += sdata[tid + 16];
+  sdata[tid] += sdata[tid + 8];
+  sdata[tid] += sdata[tid + 4];
+  sdata[tid] += sdata[tid + 2];
+  sdata[tid] += sdata[tid + 1];
+}
+
+__global__ void computeGradientsUpdate(float* w, float* b, const int* X, const int* Y, int n, float lr) {
+  __shared__ float dw_shared[BLOCK_SIZE];
+  __shared__ float db_shared[BLOCK_SIZE];
+
+  int tid = threadIdx.x;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  float dw_local = 0.0f, db_local = 0.0f;
+  if (idx < n) {
+    float z = (*w) * X[idx] + (*b);
+    float pred = sigmoid(z);
+    float error = pred - Y[idx];
+    dw_local = error * X[idx];
+    db_local = error;
+  }
+
+  dw_shared[tid] = dw_local;
+  db_shared[tid] = db_local;
+  __syncthreads();
+
+  for (int stride = blockDim.x / 2; stride >= 32; stride >>= 1) {
+    if (tid < stride) {
+      dw_shared[tid] += dw_shared[tid + stride];
+      db_shared[tid] += db_shared[tid + stride];
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    volatile float* v_dw = dw_shared;
+    volatile float* v_db = db_shared;
+    warpReduce(v_dw, tid);
+    warpReduce(v_db, tid);
+  }
+
+  if (tid == 0) {
+    atomicAdd(w, -lr * dw_shared[0] / n);
+    atomicAdd(b, -lr * db_shared[0] / n);
+  }
+}
+
+void getVector(uint64_t size, vector<int>& vec) {
+  vec.resize(size);
+  for (uint64_t i = 0; i < size; ++i)
+    vec[i] = rand() % 16;
+}
+
+int main(int argc, char* argv[]) {
+  Params params = getInputParams(argc, argv);
+  uint64_t n = params.dataSize;
+
+  vector<int> X, Y;
+  getVector(n, X);
+  getVector(n, Y);
+  for (auto& y : Y) y = y % 2;
+
+  float w_gpu = 0.0f, b_gpu = 0.0f;
+  int *d_X, *d_Y;
+  float *d_w, *d_b;
+  int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+  cudaMalloc(&d_X, n * sizeof(int));
+  cudaMalloc(&d_Y, n * sizeof(int));
+  cudaMalloc(&d_w, sizeof(float));
+  cudaMalloc(&d_b, sizeof(float));
+
+  cudaMemcpy(d_X, X.data(), n * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_Y, Y.data(), n * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_w, &w_gpu, sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, &b_gpu, sizeof(float), cudaMemcpyHostToDevice);
+
+  double gpu_time_ms;
+  auto [gpuElapsed, _, __] = measureCUDAPowerAndElapsedTime([&]() {
+    for (int epoch = 0; epoch < params.epochs; ++epoch) {
+      computeGradientsUpdate<<<numBlocks, BLOCK_SIZE>>>(d_w, d_b, d_X, d_Y, n, params.learningRate);
+    }
+    cudaDeviceSynchronize();
+  });
+  gpu_time_ms = gpuElapsed;
+
+  cudaMemcpy(&w_gpu, d_w, sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&b_gpu, d_b, sizeof(float), cudaMemcpyDeviceToHost);
+
+  cudaFree(d_X);
+  cudaFree(d_Y);
+  cudaFree(d_w);
+ cudaFree(d_b); + + cout << "[GPU] Duration: " << gpu_time_ms << " ms\n"; + cout << "[GPU] Model: sigmoid(" << w_gpu << " * x + " << b_gpu << ")\n"; + + float w_cpu = 0.0, b_cpu = 0.0; + for (int epoch = 0; epoch < params.epochs; ++epoch) { + float dw = 0.0, db = 0.0; + for (uint64_t i = 0; i < n; i++) { + float z = w_cpu * X[i] + b_cpu; + float pred = 1.0 / (1.0 + exp(-z)); + float error = pred - Y[i]; + dw += error * X[i]; + db += error; + } + w_cpu -= params.learningRate * dw / n; + b_cpu -= params.learningRate * db / n; + } + + cout << "[CPU] Model: sigmoid(" << w_cpu << " * x + " << b_cpu << ")\n"; + + if (abs(w_cpu - w_gpu) > 1e-2 || abs(b_cpu - b_gpu) > 1e-2) { + cout << "[CHECK] Mismatch between CPU and GPU results!\n"; + } else { + cout << "[CHECK] CPU and GPU results match.\n"; + } + + return 0; +} diff --git a/PIMbench/logistic-regression/baselines/GPU/out_gpu.txt b/PIMbench/logistic-regression/baselines/GPU/out_gpu.txt new file mode 100644 index 00000000..de002048 --- /dev/null +++ b/PIMbench/logistic-regression/baselines/GPU/out_gpu.txt @@ -0,0 +1,5 @@ +Power Sample Collected: 2935 +[GPU] Duration: 3162.43 ms +[GPU] Model: sigmoid(-2.28162e-05 * x + -4.27026e-05) +[CPU] Model: sigmoid(-1.55859e-05 * x + -0.000138422) +[CHECK] CPU and GPU results match. diff --git a/PIMbench/logistic-regression/baselines/GPU/slurm.sh b/PIMbench/logistic-regression/baselines/GPU/slurm.sh new file mode 100644 index 00000000..ec2b1b58 --- /dev/null +++ b/PIMbench/logistic-regression/baselines/GPU/slurm.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +#SBATCH --job-name=gpu_lr # Job name +#SBATCH --output=slurm-out.txt # Output log +#SBATCH --mail-type=END # Email notification when job ends +#SBATCH --mail-user=yzp7fe@virginia.edu # Your UVA email + +#SBATCH --partition=gpu # GPU partition +#SBATCH --gpus=2 # Request 2 GPUs +#SBATCH --constraint=a100_80gb # Specifically A100 80GB +#SBATCH --cpus-per-task=64 # Number of CPU cores +#SBATCH --mem=512G # Request 512 GB RAM +#SBATCH --time=3-00:00:00 # 3-day walltime +#SBATCH -n 1 # One task (multi-threaded) + +# Load necessary modules +module purge +module load gcc/12.1.0 +module load cuda/12.2.0 + +# Run your GPU-aware executable (must support multiple GPUs) +./lr.out -l 134217728 > out_gpu.txt \ No newline at end of file diff --git a/PIMbench/vec-add/baselines/GPU/Makefile b/PIMbench/vec-add/baselines/GPU/Makefile index fc86b6e4..77fc04c3 100644 --- a/PIMbench/vec-add/baselines/GPU/Makefile +++ b/PIMbench/vec-add/baselines/GPU/Makefile @@ -5,7 +5,7 @@ CUDA_INCLUDE := $(CUDA_DIR)/include CUDA_LIB_DIR := $(CUDA_DIR)/lib64 CUDA_LIB := -lcublas -lnvidia-ml FOPENMP := -Xcompiler -fopenmp -ARCH_FLAG := -arch=sm_80 +# ARCH_FLAG := -arch=sm_80 # Target executable EXEC := vec-add.out