diff --git a/PIMbench/layer-normalization/baselines/layer_norm.py b/PIMbench/layer-normalization/baselines/layer_norm.py new file mode 100644 index 00000000..256d18b5 --- /dev/null +++ b/PIMbench/layer-normalization/baselines/layer_norm.py @@ -0,0 +1,48 @@ +import argparse +import torch +import torch.nn as nn +import time + +# Function to perform layer normalization +def perform_layer_norm(input_tensor, norm_layer, device): + input_tensor = input_tensor.to(device) + norm_layer = norm_layer.to(device) + + start_time = time.time() + output = norm_layer(input_tensor) + if device.type == 'cuda': + torch.cuda.synchronize() # Wait for GPU ops to complete + end_time = time.time() + + elapsed_time = end_time - start_time + return elapsed_time + +# Main function +def main(args): + # Set device + device = torch.device('cuda' if args.cuda and torch.cuda.is_available() else 'cpu') + print(f"[INFO] Using device: {device}") + + # Input tensor: [B, C, H, W] + input_tensor = torch.randn(args.batch_size, args.input_channels, args.input_height, args.input_width) + + # LayerNorm normalized over [C, H, W] for each sample + normalized_shape = [args.input_channels, args.input_height, args.input_width] + norm_layer = nn.LayerNorm(normalized_shape, eps=args.epsilon) + + # Run layer normalization + time_taken = perform_layer_norm(input_tensor, norm_layer, device) + print(f"[INFO] Time taken for layer normalization: {time_taken * 1000:.6f} ms") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="CNN-style Layer Normalization on CPU/GPU") + + parser.add_argument("-b", "--batch_size", type=int, default=64, help="Batch size") + parser.add_argument("-c", "--input_channels", type=int, default=64, help="Number of input channels") + parser.add_argument("-r", "--input_height", type=int, default=32, help="Input height") + parser.add_argument("-w", "--input_width", type=int, default=32, help="Input width") + parser.add_argument("-eps", "--epsilon", type=float, default=1e-5, help="Epsilon for LayerNorm") + parser.add_argument("-cuda", "--cuda", action='store_true', help="Use CUDA if available") + + args = parser.parse_args() + main(args) diff --git a/PIMbench/logistic-regression/Makefile b/PIMbench/logistic-regression/Makefile new file mode 100644 index 00000000..d26331b9 --- /dev/null +++ b/PIMbench/logistic-regression/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of logistic regression +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/logistic-regression/PIM/Makefile b/PIMbench/logistic-regression/PIM/Makefile new file mode 100644 index 00000000..ac6608ac --- /dev/null +++ b/PIMbench/logistic-regression/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of logistic regression +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. 
+include ${PROJ_ROOT}/Makefile.common
+
+# make USE_OPENMP=1
+USE_OPENMP ?= 0
+ifeq ($(USE_OPENMP),1)
+	CXXFLAGS += -fopenmp
+endif
+
+EXEC := lr.out
+SRC := lr.cpp
+
+debug perf dramsim3_integ: $(EXEC)
+
+$(EXEC): $(SRC) $(DEPS)
+	$(CXX) $< $(CXXFLAGS) -o $@
+
+clean:
+	rm -rf $(EXEC) *.dSYM
diff --git a/PIMbench/logistic-regression/PIM/lr.cpp b/PIMbench/logistic-regression/PIM/lr.cpp
new file mode 100644
index 00000000..178215de
--- /dev/null
+++ b/PIMbench/logistic-regression/PIM/lr.cpp
@@ -0,0 +1,357 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <chrono>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <getopt.h>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "util.h"
+#include "libpimeval.h"
+
+using namespace std;
+
+struct Params {
+  uint64_t dataSize = 2048;
+  int epochs = 1000;
+  float learningRate = 0.01f;
+  char* configFile = nullptr;
+  char* inputFile = nullptr;
+  bool shouldVerify = false;
+};
+
+void usage() {
+  fprintf(stderr,
+          "\nUsage: ./lr.out [options]"
+          "\n"
+          "\n    -l    input size (default=2048 elements)"
+          "\n    -e    number of epochs (default=1000)"
+          "\n    -r    learning rate (default=0.01)"
+          "\n    -c    DRAMsim config file"
+          "\n    -i    input file (not implemented)"
+          "\n    -v    t = verify with host output"
+          "\n");
+}
+
+Params getInputParams(int argc, char** argv) {
+  Params p;
+  int opt;
+  // 'h' takes no argument; the remaining options all require one
+  while ((opt = getopt(argc, argv, "hl:e:r:c:i:v:")) >= 0) {
+    switch (opt) {
+      case 'h':
+        usage();
+        exit(0);
+      case 'l':
+        p.dataSize = strtoull(optarg, nullptr, 0);
+        break;
+      case 'e':
+        p.epochs = atoi(optarg);
+        break;
+      case 'r':
+        p.learningRate = atof(optarg);
+        break;
+      case 'c':
+        p.configFile = optarg;
+        break;
+      case 'i':
+        p.inputFile = optarg;
+        break;
+      case 'v':
+        p.shouldVerify = (*optarg == 't');
+        break;
+      default:
+        fprintf(stderr, "\nUnrecognized option!\n");
+        usage();
+        exit(0);
+    }
+  }
+  return p;
+}
+
+float sigmoid_exact(float z) {
+  return 1.0f / (1.0f + expf(-z));
+}
+
+void runLogisticRegressionPIM(uint64_t dataSize, int epochs, float lr, vector<float>& Xf, vector<float>& Yf, float& w, float& b) {
+  PimObjId xObj = pimAlloc(PIM_ALLOC_AUTO, dataSize, PIM_FP32);
+  if (xObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId yObj = pimAllocAssociated(xObj, PIM_FP32);
+  if (yObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId predictionObj = pimAllocAssociated(xObj, PIM_FP32);
+  if (predictionObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId w_obj = pimAllocAssociated(xObj, PIM_FP32);
+  if (w_obj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId b_obj = pimAllocAssociated(predictionObj, PIM_FP32);
+  if (b_obj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId oneVecObj = pimAllocAssociated(predictionObj, PIM_FP32);
+  if (oneVecObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+  PimObjId errorObj = pimAllocAssociated(xObj, PIM_FP32);
+  if (errorObj == -1)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  PimStatus status = pimCopyHostToDevice(Xf.data(), xObj);
+  if (status != PIM_OK)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  status = pimCopyHostToDevice(Yf.data(), yObj);
+  if (status != PIM_OK)
+  {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  status = pimBroadcastFP(oneVecObj, 1.0f);
+  if (status != PIM_OK) {
+    std::cout << "Abort" << std::endl;
+    return;
+  }
+
+  std::chrono::duration<double, std::milli> hostElapsedTime = std::chrono::duration<double, std::milli>::zero();
+  std::vector<float> zBuffer(dataSize);
+  for (int epoch = 0; epoch < epochs; ++epoch) {
+    float dw = 0.0f, db = 0.0f;
+
+    status = pimBroadcastFP(w_obj, w);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimMul(xObj, w_obj, predictionObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimBroadcastFP(b_obj, b);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimAdd(predictionObj, b_obj, predictionObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+    status = pimCopyDeviceToHost(predictionObj, zBuffer.data());
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    // Compute exp(-z) on the host; the remaining sigmoid steps stay on PIM
+    auto start_cpu = std::chrono::high_resolution_clock::now();
+    // #pragma omp parallel for
+    for (uint64_t i = 0; i < dataSize; ++i) {
+      zBuffer[i] = expf(-zBuffer[i]);
+    }
+    auto stop_cpu = std::chrono::high_resolution_clock::now();
+    hostElapsedTime += (stop_cpu - start_cpu);
+
+    status = pimCopyHostToDevice(zBuffer.data(), predictionObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimAdd(predictionObj, oneVecObj, predictionObj);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimDiv(oneVecObj, predictionObj, predictionObj);
+    if (status != PIM_OK) {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimSub(predictionObj, yObj, errorObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimRedSum(errorObj, &db);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimMul(errorObj, xObj, errorObj);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    status = pimRedSum(errorObj, &dw);
+    if (status != PIM_OK)
+    {
+      std::cout << "Abort" << std::endl;
+      return;
+    }
+
+    w -= lr * dw / dataSize;
+    b -= lr * db / dataSize;
+    std::cout << "Epoch " << epoch
+              << ": w = " << w
+              << ", b = " << b
+              << ", dw = " << dw
+              << ", db = " << db
+              << std::endl;
+  }
+
+  pimFree(xObj);
+  pimFree(yObj);
+  pimFree(predictionObj);
+  pimFree(errorObj);
+  pimFree(w_obj);
+  pimFree(b_obj);
+  pimFree(oneVecObj);
+  std::cout << "Host elapsed time: " << fixed << setprecision(3) << hostElapsedTime.count() << " ms" << std::endl;
+}
+
+void runLogisticRegressionHost(uint64_t n, int epochs, float lr, const vector<float>& X, const vector<float>& Y, float& w_host, float& b_host) {
+
+  w_host = 0.0f;
+  b_host = 0.0f;
+
+  for (int epoch = 0; epoch < epochs; ++epoch) {
+    float dw = 0.0f, db = 0.0f;
+    // #pragma omp parallel for reduction(+ : dw, db)
+    for (uint64_t i = 0; i < n; ++i) {
+      float z = w_host * X[i] + b_host;
+      float pred = sigmoid_exact(z);
+      float error = pred - Y[i];
+      dw += error * X[i];
+      db += error;
+    }
+    w_host -= lr * dw / n;
+    b_host -= lr * db / n;
+
+    std::cout << "Epoch " << epoch
+              << ": w = " << w_host
+              << ", b = " << b_host
+              << ", dw = " << dw
+              << ", db = " << db
+              << std::endl;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  Params params = getInputParams(argc, argv);
+  vector<int> X(params.dataSize), Y(params.dataSize);
+
+  if (params.inputFile == nullptr) {
+    getVector(params.dataSize, X);
+    getVector(params.dataSize, Y);
+    for (auto& y : Y) y = y % 2;
+  }
+  else {
+    std::cout << "Reading from input file is not implemented yet." << std::endl;
+    return 1;
+  }
+
+  std::vector<float> Xf(params.dataSize), Yf(params.dataSize);
+  for (uint64_t i = 0; i < params.dataSize; ++i) {
+    Xf[i] = static_cast<float>(X[i]);
+    Yf[i] = static_cast<float>(Y[i]);
+  }
+
+  if (!createDevice(params.configFile)) return 1;
+
+  float w = 0.0f, b = 0.0f;
+
+  runLogisticRegressionPIM(params.dataSize, params.epochs, params.learningRate, Xf, Yf, w, b);
+  pimShowStats();
+
+  cout << "Model: sigmoid(" << w << " * x + " << b << ")\n";
+
+  if (params.shouldVerify) {
+    float w_host, b_host;
+    auto start = chrono::high_resolution_clock::now();
+
+    runLogisticRegressionHost(params.dataSize, params.epochs, params.learningRate, Xf, Yf, w_host, b_host);
+
+    auto end = chrono::high_resolution_clock::now();
+    chrono::duration<double, milli> elapsedTime = end - start;
+
+    cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms\n";
+    cout << "Host Model: sigmoid(" << w_host << " * x + " << b_host << ")\n";
+
+    float w_diff = fabs(w - w_host);
+    float b_diff = fabs(b - b_host);
+
+    if (w_diff < 1e-2 && b_diff < 1e-2)
+      cout << "Verification PASSED.\n";
+    else
+      cout << "Verification FAILED.\n";
+  }
+
+  return 0;
+}
diff --git a/PIMbench/logistic-regression/PIM/slurm.sh b/PIMbench/logistic-regression/PIM/slurm.sh
new file mode 100755
index 00000000..b5c33d63
--- /dev/null
+++ b/PIMbench/logistic-regression/PIM/slurm.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+#SBATCH --gpus=1
+#SBATCH -n 1
+#SBATCH -t 3-00:00:00
+#SBATCH -p gpu
+#SBATCH --job-name=logistic-regression
+#SBATCH --mem=900000
+#SBATCH --output=slurm-out.txt
+#SBATCH --cpus-per-task=130
+#SBATCH --constraint=a100_80gb
+#SBATCH --mail-type=end
+#SBATCH --mail-user=yzp7fe@virginia.edu
+
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Bank_Rank1.cfg > bank_rank1.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Bank_Rank4.cfg > bank_rank4.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Bank_Rank8.cfg > bank_rank8.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank1.cfg > Fulcrum_Rank1.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank4.cfg > Fulcrum_Rank4.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank8.cfg > Fulcrum_Rank8.txt
+./lr.out -l 134217728 -v t -c /u/yzp7fe/PIMeval-PIMbench/configs/taco/PIMeval_Fulcrum_Rank16.cfg > Fulcrum_Rank16.txt
+
+
diff --git a/PIMbench/logistic-regression/baselines/CPU/Makefile b/PIMbench/logistic-regression/baselines/CPU/Makefile
new file mode 100644
index 00000000..e8e18c44
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/CPU/Makefile
@@ -0,0 +1,25 @@
+# Compiler
+CXX := g++
+
+# Compiler flags
+CXXFLAGS := -Wall -Wextra -Werror -march=native -std=c++17 -O3 -fopenmp
+
+# Executable name
+EXEC := lr.out
+
+# Source files
+SRC_FILES := $(wildcard *.cpp)
+
+# Dependency
+DEP := ../../../../util/
+
+.PHONY: all clean
+
+all: $(EXEC)
+
+$(EXEC): $(SRC_FILES)
+	$(CXX) $(CXXFLAGS) -I$(DEP) -o $@ $^
+
+clean:
+	rm -rf $(EXEC)
+
\ No newline at end of file
diff --git a/PIMbench/logistic-regression/baselines/CPU/lr.cpp b/PIMbench/logistic-regression/baselines/CPU/lr.cpp
new file mode 100644
index 00000000..a4e2e44f
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/CPU/lr.cpp
@@ -0,0 +1,112 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <chrono>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <vector>
+#include <getopt.h>
+
+#include "utilBaselines.h"
+
+using namespace std;
+
+struct Params {
+  uint64_t dataSize = 2048;
+  int epochs = 1000;
+  float learningRate = 0.01f;
+  string inputFile = "";
+};
+
+void usage() {
+  fprintf(stderr,
+          "\nUsage: ./lr.out [options]"
+          "\n"
+          "\n    -l    input size (default=2048 elements)"
+          "\n    -e    number of epochs (default=1000)"
+          "\n    -r    learning rate (default=0.01)"
+          "\n    -i    input file (not implemented)"
+          "\n");
+}
+
+Params getInputParams(int argc, char** argv) {
+  Params p;
+  int opt;
+  // 'h' takes no argument; only the options this baseline supports are listed
+  while ((opt = getopt(argc, argv, "hl:e:r:i:")) >= 0) {
+    switch (opt) {
+      case 'h':
+        usage();
+        exit(0);
+      case 'l':
+        p.dataSize = strtoull(optarg, nullptr, 0);
+        break;
+      case 'e':
+        p.epochs = atoi(optarg);
+        break;
+      case 'r':
+        p.learningRate = atof(optarg);
+        break;
+      case 'i':
+        p.inputFile = optarg;
+        break;
+      default:
+        fprintf(stderr, "\nUnrecognized option!\n");
+        usage();
+        exit(0);
+    }
+  }
+  return p;
+}
+
+double sigmoid(double z) {
+  return 1.0 / (1.0 + exp(-z));
+}
+
+int main(int argc, char* argv[]) {
+  Params params = getInputParams(argc, argv);
+
+  if (!params.inputFile.empty()) {
+    std::cout << "Reading input from file is not yet implemented." << std::endl;
+    return 1;
+  }
+
+  uint64_t n = params.dataSize;
+  vector<int> dataPointsX, dataPointsY;
+  getVector(n, dataPointsX);
+  getVector(n, dataPointsY);
+
+  for (uint64_t i = 0; i < n; i++) {
+    dataPointsY[i] = dataPointsY[i] % 2;
+  }
+
+  cout << "Done initializing data\n";
+
+  float w = 0.0, b = 0.0;
+  auto start = chrono::high_resolution_clock::now();
+
+  for (int epoch = 0; epoch < params.epochs; ++epoch) {
+    float dw = 0.0, db = 0.0;
+
+    #pragma omp parallel for reduction(+ : dw, db)
+    for (uint64_t i = 0; i < n; i++) {
+      float z = w * dataPointsX[i] + b;
+      float pred = sigmoid(z);
+      float error = pred - dataPointsY[i];
+
+      dw += error * dataPointsX[i];
+      db += error;
+    }
+
+    w -= params.learningRate * dw / n;
+    b -= params.learningRate * db / n;
+  }
+
+  auto end = chrono::high_resolution_clock::now();
+  chrono::duration<double, milli> elapsedTime = end - start;
+
+  cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms\n";
+  cout << "Model: sigmoid(" << w << " * x + " << b << ")\n";
+
+  return 0;
+}
diff --git a/PIMbench/logistic-regression/baselines/GPU/Makefile b/PIMbench/logistic-regression/baselines/GPU/Makefile
new file mode 100644
index 00000000..d7657e9d
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/GPU/Makefile
@@ -0,0 +1,32 @@
+# Compiler settings
+NVCC := nvcc
+CUDA_DIR ?= $(shell dirname $(shell dirname $(shell which nvcc)))
+CUDA_INCLUDE := $(CUDA_DIR)/include
+CUDA_LIB_DIR := $(CUDA_DIR)/lib64
+CUDA_LIB := -lcublas -lnvidia-ml
+FOPENMP := -Xcompiler -fopenmp
+# ARCH_FLAG := -arch=sm_80
+
+# Target executable
+EXEC := lr.out
+
+# Source files
+SOURCES := lr.cu
+
+# Compiler flags
+CFLAGS := -O3
+CXXFLAGS := -std=c++17
+
+# Dependency
+DEP := ../../../../util/
+
+# Default target
+all: $(EXEC)
+
+# Rule to build the target executable
+$(EXEC): $(SOURCES)
+	$(NVCC) $(SOURCES) -DENABLE_CUDA -I$(CUDA_INCLUDE) -I$(DEP) $(ARCH_FLAG) -L$(CUDA_LIB_DIR) $(CUDA_LIB) $(FOPENMP) $(CFLAGS) $(CXXFLAGS) -o $(EXEC)
+
+# Rule to clean the project
+clean:
+	rm -f $(EXEC)
diff --git a/PIMbench/logistic-regression/baselines/GPU/lr.cu b/PIMbench/logistic-regression/baselines/GPU/lr.cu
new file mode 100644
index 00000000..bc80dea5
--- /dev/null
+++ b/PIMbench/logistic-regression/baselines/GPU/lr.cu
@@ -0,0 +1,176 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include <cuda_runtime.h>
+#include "utilBaselines.h"
+
+#define BLOCK_SIZE 256
+
+using namespace std;
+
+struct Params {
+  uint64_t dataSize = 2048;
+  int epochs = 1000;
+  float learningRate = 0.01f;
+  string inputFile = "";
+};
+
+void usage() {
+  fprintf(stderr,
+          "\nUsage: ./lr.out [options]"
+          "\n"
+          "\n    -l    input size (default=2048 elements)"
+          "\n    -e    number of epochs (default=1000)"
+          "\n    -r    learning rate (default=0.01)"
+          "\n    -i    input file (not implemented)"
+          "\n");
+}
+
+Params getInputParams(int argc, char** argv) {
+  Params p;
+  int opt;
+  // 'h' takes no argument; the remaining options all require one
+  while ((opt = getopt(argc, argv, "hl:e:r:i:")) >= 0) {
+    switch (opt) {
+      case 'h': usage(); exit(0);
+      case 'l': p.dataSize = strtoull(optarg, nullptr, 0); break;
+      case 'e': p.epochs = atoi(optarg); break;
+      case 'r': p.learningRate = atof(optarg); break;
+      case 'i': p.inputFile = optarg; break;
+      default: fprintf(stderr, "\nUnrecognized option!\n"); usage(); exit(0);
+    }
+  }
+  return p;
+}
+
+__device__ float sigmoid(float z) {
+  return 1.0f / (1.0f + expf(-z));
+}
+
+__device__ void warpReduce(volatile float* sdata, int tid) {
+  sdata[tid] += sdata[tid + 16];
+  sdata[tid] += sdata[tid + 8];
+  sdata[tid] += sdata[tid + 4];
+  sdata[tid] += sdata[tid + 2];
+  sdata[tid] += sdata[tid + 1];
+}
+
+__global__ void computeGradientsUpdate(float* w, float* b, const int* X, const int* Y, int n, float lr) {
+  __shared__ float dw_shared[BLOCK_SIZE];
+  __shared__ float db_shared[BLOCK_SIZE];
+
+  int tid = threadIdx.x;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  float dw_local = 0.0f, db_local = 0.0f;
+  if (idx < n) {
+    float z = (*w) * X[idx] + (*b);
+    float pred = sigmoid(z);
+    float error = pred - Y[idx];
+    dw_local = error * X[idx];
+    db_local = error;
+  }
+
+  dw_shared[tid] = dw_local;
+  db_shared[tid] = db_local;
+  __syncthreads();
+
+  for (int stride = blockDim.x / 2; stride >= 32; stride >>= 1) {
+    if (tid < stride) {
+      dw_shared[tid] += dw_shared[tid + stride];
+      db_shared[tid] += db_shared[tid + stride];
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    volatile float* v_dw = dw_shared;
+    volatile float* v_db = db_shared;
+    warpReduce(v_dw, tid);
+    warpReduce(v_db, tid);
+  }
+
+  if (tid == 0) {
+    atomicAdd(w, -lr * dw_shared[0] / n);
+    atomicAdd(b, -lr * db_shared[0] / n);
+  }
+}
+
+void getVector(uint64_t size, vector<int>& vec) {
+  vec.resize(size);
+  for (uint64_t i = 0; i < size; ++i)
+    vec[i] = rand() % 16;
+}
+
+int main(int argc, char* argv[]) {
+  Params params = getInputParams(argc, argv);
+  uint64_t n = params.dataSize;
+
+  vector<int> X, Y;
+  getVector(n, X);
+  getVector(n, Y);
+  for (auto& y : Y) y = y % 2;
+
+  float w_gpu = 0.0f, b_gpu = 0.0f;
+  int *d_X, *d_Y;
+  float *d_w, *d_b;
+  int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+  cudaMalloc(&d_X, n * sizeof(int));
+  cudaMalloc(&d_Y, n * sizeof(int));
+  cudaMalloc(&d_w, sizeof(float));
+  cudaMalloc(&d_b, sizeof(float));
+
+  cudaMemcpy(d_X, X.data(), n * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_Y, Y.data(), n * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_w, &w_gpu, sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, &b_gpu, sizeof(float), cudaMemcpyHostToDevice);
+
+  double gpu_time_ms;
+  auto [gpuElapsed, _, __] = measureCUDAPowerAndElapsedTime([&]() {
+    for (int epoch = 0; epoch < params.epochs; ++epoch) {
+      computeGradientsUpdate<<<numBlocks, BLOCK_SIZE>>>(d_w, d_b, d_X, d_Y, n, params.learningRate);
+    }
+    cudaDeviceSynchronize();
+  });
+  gpu_time_ms = gpuElapsed;
+
+  cudaMemcpy(&w_gpu, d_w, sizeof(float), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&b_gpu, d_b, sizeof(float), cudaMemcpyDeviceToHost);
+
+  cudaFree(d_X);
+  cudaFree(d_Y);
+  cudaFree(d_w);
+ cudaFree(d_b); + + cout << "[GPU] Duration: " << gpu_time_ms << " ms\n"; + cout << "[GPU] Model: sigmoid(" << w_gpu << " * x + " << b_gpu << ")\n"; + + float w_cpu = 0.0, b_cpu = 0.0; + for (int epoch = 0; epoch < params.epochs; ++epoch) { + float dw = 0.0, db = 0.0; + for (uint64_t i = 0; i < n; i++) { + float z = w_cpu * X[i] + b_cpu; + float pred = 1.0 / (1.0 + exp(-z)); + float error = pred - Y[i]; + dw += error * X[i]; + db += error; + } + w_cpu -= params.learningRate * dw / n; + b_cpu -= params.learningRate * db / n; + } + + cout << "[CPU] Model: sigmoid(" << w_cpu << " * x + " << b_cpu << ")\n"; + + if (abs(w_cpu - w_gpu) > 1e-2 || abs(b_cpu - b_gpu) > 1e-2) { + cout << "[CHECK] Mismatch between CPU and GPU results!\n"; + } else { + cout << "[CHECK] CPU and GPU results match.\n"; + } + + return 0; +} diff --git a/PIMbench/logistic-regression/baselines/GPU/out_gpu.txt b/PIMbench/logistic-regression/baselines/GPU/out_gpu.txt new file mode 100644 index 00000000..de002048 --- /dev/null +++ b/PIMbench/logistic-regression/baselines/GPU/out_gpu.txt @@ -0,0 +1,5 @@ +Power Sample Collected: 2935 +[GPU] Duration: 3162.43 ms +[GPU] Model: sigmoid(-2.28162e-05 * x + -4.27026e-05) +[CPU] Model: sigmoid(-1.55859e-05 * x + -0.000138422) +[CHECK] CPU and GPU results match. diff --git a/PIMbench/logistic-regression/baselines/GPU/slurm.sh b/PIMbench/logistic-regression/baselines/GPU/slurm.sh new file mode 100644 index 00000000..ec2b1b58 --- /dev/null +++ b/PIMbench/logistic-regression/baselines/GPU/slurm.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +#SBATCH --job-name=gpu_lr # Job name +#SBATCH --output=slurm-out.txt # Output log +#SBATCH --mail-type=END # Email notification when job ends +#SBATCH --mail-user=yzp7fe@virginia.edu # Your UVA email + +#SBATCH --partition=gpu # GPU partition +#SBATCH --gpus=2 # Request 2 GPUs +#SBATCH --constraint=a100_80gb # Specifically A100 80GB +#SBATCH --cpus-per-task=64 # Number of CPU cores +#SBATCH --mem=512G # Request 512 GB RAM +#SBATCH --time=3-00:00:00 # 3-day walltime +#SBATCH -n 1 # One task (multi-threaded) + +# Load necessary modules +module purge +module load gcc/12.1.0 +module load cuda/12.2.0 + +# Run your GPU-aware executable (must support multiple GPUs) +./lr.out -l 134217728 > out_gpu.txt \ No newline at end of file diff --git a/PIMbench/vec-add/baselines/GPU/Makefile b/PIMbench/vec-add/baselines/GPU/Makefile index fc86b6e4..77fc04c3 100644 --- a/PIMbench/vec-add/baselines/GPU/Makefile +++ b/PIMbench/vec-add/baselines/GPU/Makefile @@ -5,7 +5,7 @@ CUDA_INCLUDE := $(CUDA_DIR)/include CUDA_LIB_DIR := $(CUDA_DIR)/lib64 CUDA_LIB := -lcublas -lnvidia-ml FOPENMP := -Xcompiler -fopenmp -ARCH_FLAG := -arch=sm_80 +# ARCH_FLAG := -arch=sm_80 # Target executable EXEC := vec-add.out