Skip to content

Commit 48742f7

Browse files
author
Wish
committed
fix memcpy, speed up
1 parent fd44e37 commit 48742f7

File tree

7 files changed

+13
-5
lines changed

7 files changed

+13
-5
lines changed

src/application/app_arcface/arcface.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,9 @@ namespace Arcface{
171171
float* affine_matrix_host = (float*)cpu_workspace;
172172
uint8_t* image_host = size_matrix + cpu_workspace;
173173

174-
checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
174+
//checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
175+
// speed up
176+
memcpy(image_host, image.data, size_image);
175177
checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
176178
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
177179
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));

src/application/app_retinaface.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,7 @@ int app_retinaface(){
106106
INFO("===================== test retinaface fp32 ==================================");
107107

108108
string model_file;
109-
if(!compile_retinaface(640, 480, model_file))
109+
if(!compile_retinaface(1920, 1280, model_file))
110110
return 0;
111111

112112
auto engine = RetinaFace::create_infer(model_file, 0, 0.7);

src/application/app_retinaface/retinaface.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,9 @@ namespace RetinaFace{
241241
float* affine_matrix_host = (float*)cpu_workspace;
242242
uint8_t* image_host = size_matrix + cpu_workspace;
243243

244-
checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
244+
// checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
245+
// speed up
246+
memcpy(image_host, image.data, size_image);
245247
checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
246248
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
247249
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));

src/application/app_scrfd/scrfd.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -243,7 +243,9 @@ namespace Scrfd{
243243
float* affine_matrix_host = (float*)cpu_workspace;
244244
uint8_t* image_host = size_matrix + cpu_workspace;
245245

246-
checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
246+
//checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
247+
// speed up
248+
memcpy(image_host, image.data, size_image);
247249
checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
248250
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
249251
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));

src/application/app_yolo/yolo.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -237,7 +237,9 @@ namespace Yolo{
237237
float* affine_matrix_host = (float*)cpu_workspace;
238238
uint8_t* image_host = size_matrix + cpu_workspace;
239239

240-
checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
240+
//checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
241+
// speed up
242+
memcpy(image_host, image.data, size_image);
241243
checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
242244
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
243245
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));

workspace/my-yolov5s-car.jpg

-476 KB
Binary file not shown.

workspace/test.jpg

188 KB
Loading

0 commit comments

Comments
 (0)