Speed up image staging: use memcpy instead of host-to-host cudaMemcpyAsync; compile retinaface test at 1920x1280

Wish · Wish · commit 48742f77f86e · 2021-09-12T22:07:17.000+08:00
diff --git a/src/application/app_arcface/arcface.cpp b/src/application/app_arcface/arcface.cpp
@@ -171,7 +171,9 @@ namespace Arcface{
             float* affine_matrix_host     = (float*)cpu_workspace;
             uint8_t* image_host           = size_matrix + cpu_workspace;
 
-            checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
+            //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
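+            // speed up: copy the image into the host workspace with plain memcpy instead of a host-to-host cudaMemcpyAsync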
+            memcpy(image_host, image.data, size_image);
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i,   sizeof(job.additional.d2i), cudaMemcpyHostToHost,   stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
diff --git a/src/application/app_retinaface.cpp b/src/application/app_retinaface.cpp
@@ -106,7 +106,7 @@ int app_retinaface(){
     INFO("===================== test retinaface fp32 ==================================");
 
     string model_file;
-    if(!compile_retinaface(640, 480, model_file))
+    if(!compile_retinaface(1920, 1280, model_file))
         return 0;
 
     auto engine = RetinaFace::create_infer(model_file, 0, 0.7);
diff --git a/src/application/app_retinaface/retinaface.cpp b/src/application/app_retinaface/retinaface.cpp
@@ -241,7 +241,9 @@ namespace RetinaFace{
             float* affine_matrix_host     = (float*)cpu_workspace;
             uint8_t* image_host           = size_matrix + cpu_workspace;
 
-            checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
+            // checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
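+            // speed up: copy the image into the host workspace with plain memcpy instead of a host-to-host cudaMemcpyAsync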
+            memcpy(image_host, image.data, size_image);
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
diff --git a/src/application/app_scrfd/scrfd.cpp b/src/application/app_scrfd/scrfd.cpp
@@ -243,7 +243,9 @@ namespace Scrfd{
             float* affine_matrix_host     = (float*)cpu_workspace;
             uint8_t* image_host           = size_matrix + cpu_workspace;
 
-            checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
+            //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
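+            // speed up: copy the image into the host workspace with plain memcpy instead of a host-to-host cudaMemcpyAsync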
+            memcpy(image_host, image.data, size_image);
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
diff --git a/src/application/app_yolo/yolo.cpp b/src/application/app_yolo/yolo.cpp
@@ -237,7 +237,9 @@ namespace Yolo{
             float* affine_matrix_host     = (float*)cpu_workspace;
             uint8_t* image_host           = size_matrix + cpu_workspace;
 
-            checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
+            //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
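+            // speed up: copy the image into the host workspace with plain memcpy instead of a host-to-host cudaMemcpyAsync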
+            memcpy(image_host, image.data, size_image);
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
diff --git a/workspace/my-yolov5s-car.jpg b/workspace/my-yolov5s-car.jpg
diff --git a/workspace/test.jpg b/workspace/test.jpg