#include <cuda_fp16.h>

#include "cublas_v2.h"
#include "cuda_utils.h"

using namespace std;
// Postprocess kernel: NCHW float/half tensor in [0, 1] -> NHWC uint8 image,
// swapping RGB -> BGR and scaling by 255.
//
// Launch: 1D grid/block, one thread per output element; thread_count must
// equal batchSize * height * width * channel. The (2 - c_idx) term assumes
// channel == 3 (BGR mirror of the channel index).
// NOTE(review): the original header comment said ROUND, but the narrowing
// cast truncates — confirm whether roundf() was intended.
template <typename T>
__global__ void postprocess_kernel(uint8_t* output, const T* input, const int batchSize, const int height,
                                   const int width, const int channel, const int thread_count) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= thread_count)
        return;

    // Decompose the flat NHWC output index into (b, h, w, c).
    const int c_idx = index % channel;
    int idx = index / channel;
    const int w_idx = idx % width;
    idx = idx / width;
    const int h_idx = idx % height;
    const int b_idx = idx / height;

    // Gather from the NCHW input, reading channel (2 - c) for the RGB->BGR swap.
    int g_idx = b_idx * height * width * channel + (2 - c_idx) * height * width + h_idx * width + w_idx;
    float val = (float)input[g_idx];
    float tt = val * 255.f;
    // Clamp to the representable uint8 range before the narrowing cast.
    if (tt > 255)
        tt = 255;
    if (tt < 0)
        tt = 0;
    output[index] = (uint8_t)tt;
}

// Explicit instantiations so the kernel definitions are emitted in this
// translation unit for the two supported input precisions.
template __global__ void postprocess_kernel<float>(uint8_t* output, const float* input, const int batchSize,
                                                   const int height, const int width, const int channel,
                                                   const int thread_count);
template __global__ void postprocess_kernel<half>(uint8_t* output, const half* input, const int batchSize,
                                                  const int height, const int width, const int channel,
                                                  const int thread_count);
// Host-side launcher for postprocess_kernel: one thread per output element,
// enqueued asynchronously on the given stream (no synchronization here).
template <typename T>
void postprocess(uint8_t* output, const T* input, int batchSize, int height, int width, int channel,
                 cudaStream_t stream) {
    int thread_count = batchSize * height * width * channel;
    // Guard against empty (or overflowed-negative) work: a grid of
    // (0 - 1) / block + 1 would otherwise still launch one block.
    if (thread_count <= 0)
        return;

    int block = 512;
    int grid = (thread_count - 1) / block + 1;  // ceil-div

    postprocess_kernel<T><<<grid, block, 0, stream>>>(output, input, batchSize, height, width, channel, thread_count);
}

// Explicit instantiations for the two supported input precisions.
template void postprocess<float>(uint8_t* output, const float* input, int batchSize, int height, int width, int channel,
                                 cudaStream_t stream);
template void postprocess<half>(uint8_t* output, const half* input, int batchSize, int height, int width, int channel,
                                cudaStream_t stream);

#include "postprocess.hpp"

namespace nvinfer1 {

// TensorRT plugin entry point: launches the postprocess kernel that converts
// the network's float/half NCHW output into a uint8 NHWC BGR image.
// Dimensions come from the plugin's configured mPostprocess parameters.
// Returns 0 (success); the launch itself is asynchronous on `stream`.
int PostprocessPluginV2::enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
                                 cudaStream_t stream) noexcept {
    uint8_t* output = (uint8_t*)outputs[0];

    const int H = mPostprocess.H;
    const int W = mPostprocess.W;
    const int C = mPostprocess.C;

    // Dispatch on the plugin's configured precision. Any other data type
    // launches nothing and leaves the output buffer untouched.
    if (mDataType == DataType::kFLOAT) {
        const float* input = (const float*)inputs[0];
        postprocess<float>(output, input, batchSize, H, W, C, stream);
    } else if (mDataType == DataType::kHALF) {
        const half* input = (const half*)inputs[0];
        postprocess<half>(output, input, batchSize, H, W, C, stream);
    }

    return 0;
}
}  // namespace nvinfer1