diff --git a/CMakeLists.txt b/CMakeLists.txt index 62d9cdd..6800880 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ set(CMAKE_BUILD_TYPE Release) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors") #add wrapper -set(TRT_WRAPPER_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorRTWrapper/code) +set(TRT_WRAPPER_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorRTW/code) add_subdirectory(${TRT_WRAPPER_DIR}) include_directories(${TRT_WRAPPER_DIR}/include) @@ -20,6 +20,6 @@ link_directories(${OpenCV_LIBRARIES_DIRS}) #build runYolov3 add_executable(runYolov3 main.cpp eval.cpp dataReader.cpp) -target_link_libraries(runYolov3 TrtNet ${OpenCV_LIBS}) +target_link_libraries(runYolov3 TrtNet ${OpenCV_LIBS} pthread) install(TARGETS runYolov3 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/install/) diff --git a/coco.names b/coco.names new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/image/example.png b/image/example.png new file mode 100644 index 0000000..9f5674c Binary files /dev/null and b/image/example.png differ diff --git a/include/configs.h b/include/configs.h index 420b7c5..ac55b8c 100644 --- a/include/configs.h +++ b/include/configs.h @@ -8,13 +8,19 @@ namespace Tn const std::string INPUT_PROTOTXT ="yolov3_trt.prototxt"; const std::string INPUT_CAFFEMODEL = "yolov3.caffemodel"; const std::string INPUT_IMAGE = "test.jpg"; + const std::string VIDEOFILE = "sample.mp4"; + const std::string SAVEFILE = "result"; + const std::string INPUT_STREAM = "cam"; + const std::string CLASS_NAME = "coco.name"; const std::string EVAL_LIST = ""; const std::string CALIBRATION_LIST = ""; const std::string MODE = "fp32"; const std::string OUTPUTS= "yolo-det";//layer82-conv,layer94-conv,layer106-conv + const int DISPLAY = 1; + const int SAVEIMG = 0; const int INPUT_WIDTH = 608; const int INPUT_HEIGHT = 608; - + const int CAM = 0; const int DETECT_CLASSES = 80; const float NMS_THRESH = 0.45; } diff --git a/main.cpp b/main.cpp index ac432f0..5a13123 100644 --- a/main.cpp +++ b/main.cpp @@ -6,321 +6,607 @@ #include "YoloLayer.h" #include "dataReader.h" #include "eval.h" +#include "pthread.h" using namespace std; using namespace argsParser; using namespace Tn; using namespace Yolo; + +static cv::VideoCapture cap; //静态全局变量仅对当前文件可见,其他文件不可访问 +cv::Mat frame; +cv::Mat frame_; +vector inputData; +vector inputData_; +list fileNames; +list> groundTruth; +list> outputs; +int outputCount; + + +vector ClassName; +int classNum; +int flag_exit = 0; + vector prepareImage(cv::Mat& img) { - using namespace cv; - - int c = parser::getIntValue("C"); - int h = parser::getIntValue("H"); //net h - int w = parser::getIntValue("W"); //net w - - float scale = min(float(w)/img.cols,float(h)/img.rows); - auto scaleSize = cv::Size(img.cols * scale,img.rows * scale); - - cv::Mat rgb ; - cv::cvtColor(img, rgb, CV_BGR2RGB); - cv::Mat resized; - cv::resize(rgb, resized,scaleSize,0,0,INTER_CUBIC); - - cv::Mat cropped(h, w,CV_8UC3, 127); - Rect rect((w- scaleSize.width)/2, (h-scaleSize.height)/2, scaleSize.width,scaleSize.height); - resized.copyTo(cropped(rect)); - - cv::Mat img_float; - if (c == 3) - cropped.convertTo(img_float, CV_32FC3, 1/255.0); - else - cropped.convertTo(img_float, CV_32FC1 ,1/255.0); - - //HWC TO CHW - vector input_channels(c); - cv::split(img_float, input_channels); - - vector result(h*w*c); - auto data = result.data(); - int channelLength = h * w; - for (int i = 0; i < c; ++i) { - memcpy(data,input_channels[i].data,channelLength*sizeof(float)); - data += channelLength; - } - - return result; + using namespace cv; + + int c = parser::getIntValue("C"); + int h = parser::getIntValue("H"); //net h + int w = parser::getIntValue("W"); //net w + + float scale = min(float(w) / img.cols, float(h) / img.rows); + auto scaleSize = cv::Size(img.cols * scale, img.rows * scale); + + cv::Mat rgb; + cv::cvtColor(img, rgb, CV_BGR2RGB); + cv::Mat resized; + cv::resize(rgb, resized, scaleSize, 0, 0, INTER_CUBIC); + + cv::Mat cropped(h, w, CV_8UC3, 127); + Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height); + resized.copyTo(cropped(rect)); + + cv::Mat img_float; + if (c == 3) + cropped.convertTo(img_float, CV_32FC3, 1 / 255.0); + else + cropped.convertTo(img_float, CV_32FC1, 1 / 255.0); + + //HWC TO CHW + vector input_channels(c); + cv::split(img_float, input_channels);//通道分割 + + vector result(h*w*c); + auto data = result.data(); + int channelLength = h * w; + for (int i = 0; i < c; ++i) { + memcpy(data, input_channels[i].data, channelLength * sizeof(float)); + data += channelLength; + } + + return result; } -void DoNms(vector& detections,int classes ,float nmsThresh) +void DoNms(vector& detections, int classes, float nmsThresh) { - auto t_start = chrono::high_resolution_clock::now(); - - vector> resClass; - resClass.resize(classes); - - for (const auto& item : detections) - resClass[item.classId].push_back(item); - - auto iouCompute = [](float * lbox, float* rbox) - { - float interBox[] = { - max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left - min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right - max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top - min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom - }; - - if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) - return 0.0f; - - float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); - return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); - }; - - vector result; - for (int i = 0;i right.prob; - }); - - for (unsigned int m = 0;m < dets.size() ; ++m) - { - auto& item = dets[m]; - result.push_back(item); - for(unsigned int n = m + 1;n < dets.size() ; ++n) - { - if (iouCompute(item.bbox,dets[n].bbox) > nmsThresh) - { - dets.erase(dets.begin()+n); - --n; - } - } - } - } - - //swap(detections,result); - detections = move(result); - - auto t_end = chrono::high_resolution_clock::now(); - float total = chrono::duration(t_end - t_start).count(); - cout << "Time taken for nms is " << total << " ms." << endl; + //auto t_start = chrono::high_resolution_clock::now(); + + vector> resClass; + resClass.resize(classes); + + for (const auto& item : detections) + resClass[item.classId].push_back(item); + + auto iouCompute = [](float * lbox, float* rbox) + { + float interBox[] = { + max(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left + min(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right + max(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top + min(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom + }; + + if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) + return 0.0f; + + float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]); + return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); + }; + + vector result; + for (int i = 0; i < classes; ++i) + { + auto& dets = resClass[i]; + if (dets.size() == 0) + continue; + + sort(dets.begin(), dets.end(), [=](const Detection& left, const Detection& right) { + return left.prob > right.prob; + }); + + for (unsigned int m = 0; m < dets.size(); ++m) + { + auto& item = dets[m]; + result.push_back(item); + for (unsigned int n = m + 1; n < dets.size(); ++n) + { + if (iouCompute(item.bbox, dets[n].bbox) > nmsThresh) + { + dets.erase(dets.begin() + n); + --n; + } + } + } + } + + //swap(detections,result); + detections = move(result); + + //auto t_end = chrono::high_resolution_clock::now(); + //float total = chrono::duration(t_end - t_start).count(); + //cout << "Time taken for nms is " << total << " ms." << endl; } -vector postProcessImg(cv::Mat& img,vector& detections,int classes) +vector postProcessImg(cv::Mat& img, vector& detections, int classes) { - using namespace cv; - - int h = parser::getIntValue("H"); //net h - int w = parser::getIntValue("W"); //net w - - //scale bbox to img - int width = img.cols; - int height = img.rows; - float scale = min(float(w)/width,float(h)/height); - float scaleSize[] = {width * scale,height * scale}; - - //correct box - for (auto& item : detections) - { - auto& bbox = item.bbox; - bbox[0] = (bbox[0] * w - (w - scaleSize[0])/2.f) / scaleSize[0]; - bbox[1] = (bbox[1] * h - (h - scaleSize[1])/2.f) / scaleSize[1]; - bbox[2] /= scaleSize[0]; - bbox[3] /= scaleSize[1]; - } - - //nms - float nmsThresh = parser::getFloatValue("nms"); - if(nmsThresh > 0) - DoNms(detections,classes,nmsThresh); - - vector boxes; - for(const auto& item : detections) - { - auto& b = item.bbox; - Bbox bbox = - { - item.classId, //classId - max(int((b[0]-b[2]/2.)*width),0), //left - min(int((b[0]+b[2]/2.)*width),width), //right - max(int((b[1]-b[3]/2.)*height),0), //top - min(int((b[1]+b[3]/2.)*height),height), //bot - item.prob //score - }; - boxes.push_back(bbox); - } - - return boxes; + using namespace cv; + //auto t_start = chrono::high_resolution_clock::now(); + int h = parser::getIntValue("H"); //net h + int w = parser::getIntValue("W"); //net w + + //scale bbox to img + int width = img.cols; + int height = img.rows; + float scale = min(float(w) / width, float(h) / height); + float scaleSize[] = { width * scale,height * scale }; + + //correct box + for (auto& item : detections) + { + auto& bbox = item.bbox; + bbox[0] = (bbox[0] * w - (w - scaleSize[0]) / 2.f) / scaleSize[0]; + bbox[1] = (bbox[1] * h - (h - scaleSize[1]) / 2.f) / scaleSize[1]; + bbox[2] /= scaleSize[0]; + bbox[3] /= scaleSize[1]; + } + + //nms + float nmsThresh = parser::getFloatValue("nms"); + if (nmsThresh > 0) + DoNms(detections, classes, nmsThresh); + + vector boxes; + for (const auto& item : detections) + { + auto& b = item.bbox; + Bbox bbox = + { + item.classId, //classId + max(int((b[0] - b[2] / 2.)*width),0), //left + min(int((b[0] + b[2] / 2.)*width),width), //right + max(int((b[1] - b[3] / 2.)*height),0), //top + min(int((b[1] + b[3] / 2.)*height),height), //bot + item.prob //score + }; + boxes.push_back(bbox); + } + //auto t_end = chrono::high_resolution_clock::now(); + //float total = chrono::duration(t_end - t_start).count(); + //cout << "Time taken for nms is " << total << " ms." << endl; + return boxes; } vector split(const string& str, char delim) { - stringstream ss(str); - string token; - vector container; - while (getline(ss, token, delim)) { - container.push_back(token); - } - - return container; + stringstream ss(str); + string token; + vector container; + while (getline(ss, token, delim)) { + container.push_back(token); + } + + return container; +} + + +void error(const char *s) +{ + perror(s); + assert(0); + exit(EXIT_FAILURE); +} + + +void *fetch_in_thread(void *ptr) +{ + + cap >> frame; + if (!frame.data) + { + flag_exit = 1; + return 0; + } + //cv::Mat img = cv::imread(filename); + inputData = prepareImage(frame); + + return 0; } -int main( int argc, char* argv[] ) +void *detect_in_thread(void *ptr) +{ + trtNet *net; + net = (trtNet*)ptr; + //cout << frame.flags << endl; + unique_ptr outputData(new float[outputCount]); + net->doInference(inputData_.data(), outputData.get()); + + //Get Output + auto output = outputData.get(); + + //first detect count + int count = output[0]; + //later detect result + vector result; + result.resize(count); + memcpy(result.data(), &output[1], count * sizeof(Detection)); + + auto boxes = postProcessImg(frame_, result, classNum); + //outputs.emplace_back(boxes); + + //auto bbox = *outputs.begin(); + for (const auto& item : boxes) + { + cv::rectangle(frame_, cv::Point(item.left, item.top), cv::Point(item.right, item.bot), cv::Scalar(0, 0, 255), 3, 8, 0); + cv::rectangle(frame_, cv::Point(item.left, item.top - 20), cv::Point(item.right, item.top), cv::Scalar(0, 0, 255), CV_FILLED, 8, 0); + cv::putText(frame_, ClassName[item.classId], cv::Point(item.left, item.top), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(255, 255, 255), 2); + cout << "class=" << ClassName[item.classId] << " prob=" << item.score * 100 << endl; + //cout << "left=" << item.left << " right=" << item.right << " top=" << item.top << " bot=" << item.bot << endl; + } + + return 0; +} + + +void do_video_or_cam(trtNet &net) { - parser::ADD_ARG_STRING("prototxt",Desc("input yolov3 deploy"),DefaultValue(INPUT_PROTOTXT),ValueDesc("file")); - parser::ADD_ARG_STRING("caffemodel",Desc("input yolov3 caffemodel"),DefaultValue(INPUT_CAFFEMODEL),ValueDesc("file")); - parser::ADD_ARG_INT("C",Desc("channel"),DefaultValue(to_string(INPUT_CHANNEL))); - parser::ADD_ARG_INT("H",Desc("height"),DefaultValue(to_string(INPUT_HEIGHT))); - parser::ADD_ARG_INT("W",Desc("width"),DefaultValue(to_string(INPUT_WIDTH))); - parser::ADD_ARG_STRING("calib",Desc("calibration image List"),DefaultValue(CALIBRATION_LIST),ValueDesc("file")); - parser::ADD_ARG_STRING("mode",Desc("runtime mode"),DefaultValue(MODE), ValueDesc("fp32/fp16/int8")); - parser::ADD_ARG_STRING("outputs",Desc("output nodes name"),DefaultValue(OUTPUTS)); - parser::ADD_ARG_INT("class",Desc("num of classes"),DefaultValue(to_string(DETECT_CLASSES))); - parser::ADD_ARG_FLOAT("nms",Desc("non-maximum suppression value"),DefaultValue(to_string(NMS_THRESH))); - - //input - parser::ADD_ARG_STRING("input",Desc("input image file"),DefaultValue(INPUT_IMAGE),ValueDesc("file")); - parser::ADD_ARG_STRING("evallist",Desc("eval gt list"),DefaultValue(EVAL_LIST),ValueDesc("file")); - - if(argc < 2){ - parser::printDesc(); - exit(-1); - } - - parser::parseArgs(argc,argv); - - string deployFile = parser::getStringValue("prototxt"); - string caffemodelFile = parser::getStringValue("caffemodel"); - - vector> calibData; - string calibFileList = parser::getStringValue("calib"); - string mode = parser::getStringValue("mode"); - if(calibFileList.length() > 0 && mode == "int8") - { - cout << "find calibration file,loading ..." << endl; - - ifstream file(calibFileList); - if(!file.is_open()) - { - cout << "read file list error,please check file :" << calibFileList << endl; - exit(-1); - } - - string strLine; - while( getline(file,strLine) ) - { - cv::Mat img = cv::imread(strLine); - auto data = prepareImage(img); - calibData.emplace_back(data); - } - file.close(); - } - - RUN_MODE run_mode = RUN_MODE::FLOAT32; - if(mode == "int8") - { - if(calibFileList.length() == 0) - cout << "run int8 please input calibration file, will run in fp32" << endl; - else - run_mode = RUN_MODE::INT8; - } - else if(mode == "fp16") - { - run_mode = RUN_MODE::FLOAT16; - } - - string outputNodes = parser::getStringValue("outputs"); - auto outputNames = split(outputNodes,','); - - //can load from file - string saveName = "yolov3_" + mode + ".engine"; - -//#define LOAD_FROM_ENGINE -#ifdef LOAD_FROM_ENGINE - trtNet net(saveName); -#else - trtNet net(deployFile,caffemodelFile,outputNames,calibData,run_mode); - cout << "save Engine..." << saveName < outputData(new float[outputCount]); - - string listFile = parser::getStringValue("evallist"); - list fileNames; - list> groundTruth; - - if(listFile.length() > 0) - { - std::cout << "loading from eval list " << listFile << std::endl; - tie(fileNames,groundTruth) = readObjectLabelFileList(listFile); - } - else - { - string inputFileName = parser::getStringValue("input"); - fileNames.push_back(inputFileName); - } - - list> outputs; - int classNum = parser::getIntValue("class"); - for (const auto& filename :fileNames) - { - std::cout << "process: " << filename << std::endl; - - cv::Mat img = cv::imread(filename); - vector inputData = prepareImage(img); - if (!inputData.data()) - continue; - - net.doInference(inputData.data(), outputData.get()); - - //Get Output - auto output = outputData.get(); - - //first detect count - int count = output[0]; - //later detect result - vector result; - result.resize(count); - memcpy(result.data(), &output[1], count*sizeof(Detection)); - - auto boxes = postProcessImg(img,result,classNum); - outputs.emplace_back(boxes); - } - - net.printTime(); - - if(groundTruth.size() > 0) - { - //eval map - evalMAPResult(outputs,groundTruth,classNum,0.5f); - evalMAPResult(outputs,groundTruth,classNum,0.75f); - } - - if(fileNames.size() == 1) - { - //draw on image - cv::Mat img = cv::imread(*fileNames.begin()); - auto bbox = *outputs.begin(); - for(const auto& item : bbox) - { - cv::rectangle(img,cv::Point(item.left,item.top),cv::Point(item.right,item.bot),cv::Scalar(0,0,255),3,8,0); - cout << "class=" << item.classId << " prob=" << item.score*100 << endl; - cout << "left=" << item.left << " right=" << item.right << " top=" << item.top << " bot=" << item.bot << endl; - } - cv::imwrite("result.jpg",img); - cv::imshow("result",img); - cv::waitKey(0); - } - - return 0; + if (!cap.isOpened()) { + std::cout << "Error: video-stream can't be opened! \n"; + exit(EXIT_FAILURE); + } + pthread_t fetch_thread; + pthread_t detect_thread; + + fetch_in_thread(0); + inputData_ = inputData; + frame_ = frame; + //detect_in_thread(0); + //fetch_in_thread(0); + //inputData_ = inputData; + //frame_ = frame; + + + if (parser::getIntValue("display")) + { + cv::namedWindow("result", CV_WINDOW_NORMAL); + cv::resizeWindow("result", 640, 480); + } + + + float fps = 0; + + //for (const auto& filename :fileNames) + + while (1) + { + auto t_start = std::chrono::high_resolution_clock::now(); + if (pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed"); + if (pthread_create(&detect_thread, 0, detect_in_thread, &net)) error("Thread creation failed");//创造一个线程运行网络 + + + pthread_join(fetch_thread, 0);//塞入线程 + pthread_join(detect_thread, 0); + + + + if (parser::getIntValue("display")) + { + cv::imshow("result", frame_); + if (cv::waitKey(5) == 27) + { + break; + } + } + inputData_ = inputData; + frame_ = frame; + if (flag_exit == 1) + { + break; + } + + + auto t_end = std::chrono::high_resolution_clock::now(); + float total = std::chrono::duration(t_end - t_start).count(); + float cout = 1000. / total; + fps = 0.9*fps + 0.1*cout; + std::cout << "fps is " << fps << std::endl; + + } + std::cout << "video stream close\n " << std::endl; + cv::destroyAllWindows(); + cap.release(); } + + + +void *fetch_image_in_thread(void *ptr) +{ + if (!frame.data) + { + return 0; + } + //cv::Mat img = cv::imread(filename); + inputData = prepareImage(frame); + //in_s = resize_image(in, net.w, net.h); + + return 0; +} + + +void *detect_image_in_thread(void *ptr) +{ + trtNet *net; + net = (trtNet*)ptr; + //cout << frame.flags << endl; + unique_ptr outputData(new float[outputCount]); + net->doInference(inputData_.data(), outputData.get()); + + //Get Output + auto output = outputData.get(); + + //first detect count + int count = output[0]; + //later detect result + vector result; + result.resize(count); + memcpy(result.data(), &output[1], count * sizeof(Detection)); + + auto boxes = postProcessImg(frame_, result, classNum); + outputs.emplace_back(boxes); + + //auto bbox = *outputs.begin(); + for (const auto& item : boxes) + { + cv::rectangle(frame_, cv::Point(item.left, item.top), cv::Point(item.right, item.bot), cv::Scalar(0, 0, 255), 3, 8, 0); + cv::rectangle(frame_, cv::Point(item.left, item.top - 20), cv::Point(item.right, item.top), cv::Scalar(0, 0, 255), CV_FILLED, 8, 0); + cv::putText(frame_, ClassName[item.classId], cv::Point(item.left, item.top), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(255, 255, 255), 2); + cout << "class=" << ClassName[item.classId] << " prob=" << item.score * 100 << endl; + //cout << "left=" << item.left << " right=" << item.right << " top=" << item.top << " bot=" << item.bot << endl; + } + + return 0; +} + + +void do_image(trtNet &net) +{ + pthread_t fetch_thread; + pthread_t detect_thread; + list::iterator it; + it = fileNames.begin(); + //cout << "deal image:" << *it << endl; + frame = cv::imread(*it); + vector line = split(*it, '/'); + string save_img_name = parser::getStringValue("savefile")+"/"+ *(line.end()-1); + it++; + + fetch_image_in_thread(0); + inputData_ = inputData; + frame_ = frame; + + if (parser::getIntValue("display")) + { + cv::namedWindow("result", CV_WINDOW_NORMAL); + cv::resizeWindow("result", 640, 480); + } + float fps = 0; + + while(1) + { + auto t_start = chrono::high_resolution_clock::now(); + if (it != fileNames.end()) + { + frame = cv::imread(*it); + } + + if (pthread_create(&fetch_thread, 0, fetch_image_in_thread, 0)) error("Thread creation failed"); + if (pthread_create(&detect_thread, 0, detect_image_in_thread, &net)) error("Thread creation failed");//创造一个线程运行网络 + + pthread_join(fetch_thread, 0);//塞入线程 + pthread_join(detect_thread, 0); + + if (parser::getIntValue("display")) + { + cv::imshow("result", frame_); + if (cv::waitKey(5) == 27) + { + break; + } + } + if (parser::getIntValue("saveimg")) + { + cv::imwrite(save_img_name, frame_); + } + + inputData_ = inputData; + frame_ = frame; + auto t_end = chrono::high_resolution_clock::now(); + float total = chrono::duration(t_end - t_start).count(); + float cout = 1000. / total; + fps = 0.9*fps + 0.1*cout; + std::cout << "fps is " << fps << std::endl; + if (it == fileNames.end()) break; + line = split(*it, '/'); + save_img_name = parser::getStringValue("savefile") + "/" + *(line.end() - 1); + it++; + + } + cout << "all over\n " << endl; + cv::destroyAllWindows(); + if (groundTruth.size() > 0) + { + //eval map + evalMAPResult(outputs, groundTruth, classNum, 0.5f); + evalMAPResult(outputs, groundTruth, classNum, 0.75f); + } + return ; +} + +int main(int argc, char* argv[]) +{ + parser::ADD_ARG_STRING("prototxt", Desc("input yolov3 deploy"), DefaultValue(INPUT_PROTOTXT), ValueDesc("file")); + parser::ADD_ARG_STRING("caffemodel", Desc("input yolov3 caffemodel"), DefaultValue(INPUT_CAFFEMODEL), ValueDesc("file")); + parser::ADD_ARG_INT("C", Desc("channel"), DefaultValue(to_string(INPUT_CHANNEL))); + parser::ADD_ARG_INT("H", Desc("height"), DefaultValue(to_string(INPUT_HEIGHT))); + parser::ADD_ARG_INT("W", Desc("width"), DefaultValue(to_string(INPUT_WIDTH))); + parser::ADD_ARG_STRING("calib", Desc("calibration image List"), DefaultValue(CALIBRATION_LIST), ValueDesc("file")); + parser::ADD_ARG_STRING("mode", Desc("runtime mode"), DefaultValue(MODE), ValueDesc("fp32/fp16/int8")); + parser::ADD_ARG_STRING("outputs", Desc("output nodes name"), DefaultValue(OUTPUTS)); + parser::ADD_ARG_INT("class", Desc("num of classes"), DefaultValue(to_string(DETECT_CLASSES))); + parser::ADD_ARG_FLOAT("nms", Desc("non-maximum suppression value"), DefaultValue(to_string(NMS_THRESH))); + parser::ADD_ARG_FLOAT("classname", Desc("class name"), DefaultValue(CLASS_NAME), ValueDesc("file")); + parser::ADD_ARG_INT("display", Desc("whether display video"), DefaultValue(to_string(DISPLAY))); + parser::ADD_ARG_INT("saveimg", Desc("whether save image"), DefaultValue(to_string(SAVEIMG))); + parser::ADD_ARG_STRING("inputstream", Desc("input stream"), DefaultValue(INPUT_STREAM)); + parser::ADD_ARG_INT("cam", Desc("cam"), DefaultValue(to_string(CAM))); + parser::ADD_ARG_STRING("videofile", Desc("videofile"), DefaultValue(VIDEOFILE), ValueDesc("file")); + parser::ADD_ARG_STRING("savefile", Desc("savefile"), DefaultValue(SAVEFILE)); + parser::ADD_ARG_STRING("input", Desc("input image file"), DefaultValue(INPUT_IMAGE), ValueDesc("file")); + parser::ADD_ARG_STRING("evallist", Desc("eval gt list"), DefaultValue(EVAL_LIST), ValueDesc("file")); + + if (argc < 2) { + parser::printDesc(); + return 1; + } + + parser::parseArgs(argc, argv); + + string deployFile = parser::getStringValue("prototxt"); + string caffemodelFile = parser::getStringValue("caffemodel"); + + vector> calibData; + string calibFileList = parser::getStringValue("calib"); //校验文件的名字 txt + string mode = parser::getStringValue("mode"); + if (calibFileList.length() > 0 && mode == "int8") + { + cout << "find calibration file,loading ..." << endl; + + ifstream file(calibFileList); + if (!file.is_open()) + { + cout << "read file list error,please check file :" << calibFileList << endl; + return 1; + } + + string strLine; + while (getline(file, strLine)) + { + cv::Mat img = cv::imread(strLine); + auto data = prepareImage(img); //转换为608*608的数据 + calibData.emplace_back(data); + } + file.close(); + } + + RUN_MODE run_mode = RUN_MODE::FLOAT32; + if (mode == "int8") + { + if (calibFileList.length() == 0) + cout << "run int8 please input calibration file, will run in fp32" << endl; + else + run_mode = RUN_MODE::INT8; + } + else if (mode == "fp16") + { + run_mode = RUN_MODE::FLOAT16; + } + + string outputNodes = parser::getStringValue("outputs"); + auto outputNames = split(outputNodes, ','); + + //can load from file + string saveName = "yolov3_" + mode + ".engine"; + + //#define LOAD_FROM_ENGINE + std::ifstream serialize_iutput_stream(saveName, std::ios::in | std::ios::binary); + trtNet net; + net.set_mode(run_mode); + + if (!serialize_iutput_stream) + { + net.trtNet_engine(deployFile, caffemodelFile, outputNames, calibData, run_mode); + cout << "save Engine..." << saveName << endl; + net.saveEngine(saveName); + } + else + { + net.trtNet_engine(saveName); + } + + outputCount = net.getOutputSize() / sizeof(float); + + classNum = parser::getIntValue("class"); + string classFile = parser::getStringValue("classname"); + ifstream ClassNamelist(classFile); + if (!ClassNamelist.is_open()) + { + std::cout << "read file list error,please check file :" << classFile << std::endl; + return 1; + } + string strLine; + while (getline(ClassNamelist, strLine)) + { + ClassName.push_back(strLine); + } + + ClassNamelist.close(); + + string inputstream = parser::getStringValue("inputstream"); + + if (!inputstream.compare("video")) + { + string video_file = parser::getStringValue("videofile"); + cap.open(video_file); + do_video_or_cam(net); + } + else if (!inputstream.compare("cam")) + { + int cam_index = parser::getIntValue("cam"); + cap.open(cam_index); + do_video_or_cam(net); + } + else if (!inputstream.compare("image")) + { + string listFile = parser::getStringValue("evallist"); + if (listFile.length() > 0) + { + + std::cout << "loading from eval list " << listFile << std::endl; + tie(fileNames, groundTruth) = readObjectLabelFileList(listFile); + } + else + { + string inputFileName = parser::getStringValue("input"); + if (inputFileName.find(".jpg")!= inputFileName.npos || inputFileName.find(".png") != inputFileName.npos)//如果是单张图片 + { + fileNames.push_back(inputFileName); + } + else + { + ifstream readinputfile(inputFileName); + if (!readinputfile.is_open()) + { + cout << "can't read input image list" << inputFileName << endl; + return 1; + } + string strLine; + while (getline(readinputfile, strLine)) + { + fileNames.push_back(strLine); + } + readinputfile.close(); + } + //fileNames.push_back(inputFileName); + } + //list> outputs; + do_image(net); + } + + //net.~trtNet(); + return 0; +} + +//net.~trtNet(); \ No newline at end of file diff --git a/readme.md b/readme.md index c07608e..7371adb 100644 --- a/readme.md +++ b/readme.md @@ -1,14 +1,16 @@ # TRTForYolov3 +996.icu + ## Desc tensorRT for Yolov3 ### Test Enviroments - Ubuntu 16.04 - TensorRT 5.0.2.6/4.0.1.6 - CUDA 9.2 + Ubuntu 16.04 or Jetson Nano + TensorRT 5.1/5.0.2.6/4.0.1.6 + CUDA 9.2 or CUDA 9.0 or CUDA 10.0 ### Models @@ -33,64 +35,59 @@ layer { It also needs to change the yolo configs in "YoloConfigs.h" if different kernels. -### Run Sample +# build source code -```bash -#build source code git submodule update --init --recursive mkdir build cd build && cmake .. && make && make install cd .. -#for yolov3-608 -./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --input=./test.jpg --W=608 --H=608 --class=80 -#for fp16 -./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --input=./test.jpg --W=608 --H=608 --class=80 --mode=fp16 +# what I do -#for int8 with calibration datasets -./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --input=./test.jpg --W=608 --H=608 --class=80 --mode=int8 --calib=./calib_sample.txt +1.Added multithreading -#for yolov3-416 (need to modify include/YoloConfigs for YoloKernel) -./install/runYolov3 --caffemodel=./yolov3_416.caffemodel --prototxt=./yolov3_416.prototxt --input=./test.jpg --W=416 --H=416 --class=80 -``` +2.Added tag name +3.Added video inference -### Performance -Model | GPU | Mode | Inference Time --- | -- | -- | -- -Yolov3-416 | GTX 1060 | Caffe | 54.593ms -Yolov3-416 | GTX 1060 | float32 | 23.817ms -Yolov3-416 | GTX 1060 | int8 | 11.921ms -Yolov3-608 | GTX 1060 | Caffe | 88.489ms -Yolov3-608 | GTX 1060 | float32 | 43.965ms -Yolov3-608 | GTX 1060 | int8 | 21.638ms -Yolov3-608 | GTX 1080 Ti | float32 | 19.353ms -Yolov3-608 | GTX 1080 Ti | int8 | 9.727ms -Yolov3-416 | GTX 1080 Ti | float32 | 9.677ms -Yolov3-416 | GTX 1080 Ti | int8 | 6.129ms | li +# for yolov3-608 + +## video + +./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --display=1 --inputstream=video --videofile=sample_720p.mp4 --classname=coco.names + +## cam -### Eval Result +./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --display=1 --inputstream=cam --cam=0 --classname=coco.names -run above models with appending ```--evallist=labels.txt``` +## int8 + +./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --display=1 --inputstream=cam --cam=0 --classname=coco.names --mode=int8 --calib=cal.list + +## example + + +![图片alt](https://raw.githubusercontent.com/talebolano/TensorRT-Yolov3/master/image/example.png) + +### Performance -int8 calibration data made from 200 pics selected in val2014 (see scripts dir) +Model |TYPE | GPU | Mode | Inference Time | FPS +-- | -- | -- | -- | -- |--| +Yolov3-608 | tensorRT |GTX 1060(laptop)(win10) | float32 | 58ms | 15 +Yolov3-608 | tensorRT |GTX 1060(laptop)(win10) | int8 | 33ms | 18 +Yolov3-608 | tensorRT |P40 | float32 | 20ms | 40(display) +Yolov3-608 | darknet |P40 | float32 | -- | 30(display) +Yolov3-608 | tensorRT |P40 | int8 | 13ms | 50(display) +Yolov3-416 | tensorRT |P40 | float32 | 12ms | 60(display) +Yolov3-416 | darknet |P40 | float32 | -- | 50(display) +Yolov3-416 | tensorRT |P40 | int8 | 8ms | 70(display) +Yolov3-416 | tensorRT |jeston nano | fp16 | 343ms | 2.8 +Yolov3-tiny-416 | darknet | jeston nano | fp32 | -- | 17(display) +Yolov3-tiny-416 | tensorRT | jeston nano | fp16 | 37ms | 25(display) -Model | GPU | Mode | dataset | MAP(0.50) | MAP(0.75) --- | -- | -- | -- | -- | -- -Yolov3-416 | GTX 1060 | Caffe | COCO val2014 | 81.76 | 52.05 -Yolov3-416 | GTX 1060 | float32 | COCO val2014 | 81.93 | 52.19 -Yolov3-416 | GTX 1060 | int8 | COCO val2014 | 86.41 | 57.11 -Yolov3-416 | GTX 1060 | Caffe | COCO val2014 | 80.41 | 52.33 -Yolov3-608 | GTX 1060 | float32 | COCO val2014 | 80.6 | 52.43 -Yolov3-608 | GTX 1060 | int8 | COCO val2014 | 85.35 | 56.88 | li -Notice: -+ caffe implementation is little different in yolo layer and nms, and it should be the similar result compared to tensorRT fp32. -+ Int8 mode gets better result in the val dataset, but not certainly in other test data. And exactly it is more often a little worse. -### Details About Wrapper -see link [TensorRTWrapper](https://github.com/lewes6369/tensorRTWrapper) diff --git a/tensorRTW/code/CMakeLists.txt b/tensorRTW/code/CMakeLists.txt new file mode 100644 index 0000000..d6b133c --- /dev/null +++ b/tensorRTW/code/CMakeLists.txt @@ -0,0 +1,62 @@ +cmake_minimum_required(VERSION 2.8) +project(trtNet) + +set(CMAKE_BUILD_TYPE Release) + +#include +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +#src +set(PLUGIN_SOURCES + src/EntroyCalibrator.cpp + src/UpsampleLayer.cpp + src/UpsampleLayer.cu + src/YoloLayer.cu + src/TrtNet.cpp +) + +# +# CUDA Configuration +# +find_package(CUDA REQUIRED) + +set(CUDA_VERBOSE_BUILD ON) + +# Specify the cuda host compiler to use the same compiler as cmake. +set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + +# TensorRT +find_path(TENSORRT_INCLUDE_DIR NvInfer.h + HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES include) +MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}") +find_library(TENSORRT_LIBRARY_INFER nvinfer + HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) +find_library(TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin + HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) + find_library(TENSORRT_LIBRARY_PARSER nvparsers + HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) +set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN} ${TENSORRT_LIBRARY_PARSER}) +MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}") +find_package_handle_standard_args( + TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR TENSORRT_LIBRARY) +if(NOT TENSORRT_FOUND) + message(ERROR + "Cannot find TensorRT library.") +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") # -std=gnu++11 +set(BUILD_DEPS "YES" CACHE BOOL "If YES, will install dependencies into sandbox. Automatically reset to NO after dependencies are installed.") + +# if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) +# list(APPEND CUDA_NVCC_FLAGS -std=c++11) +# endif() +list(APPEND CUDA_NVCC_FLAGS "-D_FORCE_INLINES -Xcompiler -fPIC") +CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR}) +CUDA_ADD_LIBRARY(TrtNet STATIC ${PLUGIN_SOURCES}) + +target_include_directories(TrtNet PUBLIC ${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR}) +target_link_libraries(TrtNet ${TENSORRT_LIBRARY}) \ No newline at end of file diff --git a/tensorRTW/code/include/EntroyCalibrator.h b/tensorRTW/code/include/EntroyCalibrator.h new file mode 100644 index 0000000..57346d8 --- /dev/null +++ b/tensorRTW/code/include/EntroyCalibrator.h @@ -0,0 +1,45 @@ +#ifndef _ENTROY_CALIBRATOR_H +#define _ENTROY_CALIBRATOR_H + +#include +#include +#include +#include "NvInfer.h" +#include "Utils.h" + +namespace nvinfer1 +{ + +class Int8EntropyCalibrator : public IInt8EntropyCalibrator +{ +public: + Int8EntropyCalibrator(int BatchSize,const std::vector>& data,const std::string& CalibDataName = "",bool readCache = true); + + virtual ~Int8EntropyCalibrator(); + + int getBatchSize() const override { return mBatchSize; } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) override; + + const void* readCalibrationCache(size_t& length) override; + + void writeCalibrationCache(const void* cache, size_t length) override; + +private: + std::string mCalibDataName; + std::vector> mDatas; + int mBatchSize; + + int mCurBatchIdx; + float* mCurBatchData{ nullptr }; + + size_t mInputCount; + bool mReadCache; + void* mDeviceInput{ nullptr }; + + std::vector mCalibrationCache; +}; + +} //namespace + +#endif //_ENTROY_CALIBRATOR_H diff --git a/tensorRTW/code/include/PluginFactory.h b/tensorRTW/code/include/PluginFactory.h new file mode 100644 index 0000000..9f3caea --- /dev/null +++ b/tensorRTW/code/include/PluginFactory.h @@ -0,0 +1,129 @@ +#ifndef __PLUGIN_FACTORY_H_ +#define __PLUGIN_FACTORY_H_ + +#include +#include +#include +#include "UpsampleLayer.h" +#include "YoloLayer.h" +#include "NvInferPlugin.h" +#include "NvCaffeParser.h" + +namespace Tn +{ + static constexpr float NEG_SLOPE = 0.1; + static constexpr float UPSAMPLE_SCALE = 2.0; + static constexpr int CUDA_THREAD_NUM = 512; + + // Integration for serialization. + using nvinfer1::plugin::INvPlugin; + using nvinfer1::plugin::createPReLUPlugin; + using nvinfer1::UpsampleLayerPlugin; + using nvinfer1::YoloLayerPlugin; + class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactoryExt + { + public: + inline bool isLeakyRelu(const char* layerName) + { + return std::regex_match(layerName , std::regex(R"(layer(\d*)-act)")); + } + + inline bool isUpsample(const char* layerName) + { + return std::regex_match(layerName , std::regex(R"(layer(\d*)-upsample)")); + } + + inline bool isYolo(const char* layerName) + { + return strcmp(layerName,"yolo-det") == 0; + } + + virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override + { + assert(isPlugin(layerName)); + + if(isLeakyRelu(layerName)) + { + assert(nbWeights == 0 && weights == nullptr); + mPluginLeakyRelu.emplace_back(std::unique_ptr(createPReLUPlugin(NEG_SLOPE), nvPluginDeleter)); + return mPluginLeakyRelu.back().get(); + } + else if (isUpsample(layerName)) + { + assert(nbWeights == 0 && weights == nullptr); + mPluginUpsample.emplace_back(std::unique_ptr(new UpsampleLayerPlugin(UPSAMPLE_SCALE,CUDA_THREAD_NUM))); + return mPluginUpsample.back().get(); + } + else if (isYolo(layerName)) + { + assert(nbWeights == 0 && weights == nullptr && mPluginYolo.get() == nullptr); + mPluginYolo.reset(new YoloLayerPlugin(CUDA_THREAD_NUM)); + return mPluginYolo.get(); + } + else + { + assert(0); + return nullptr; + } + } + + nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override + { + assert(isPlugin(layerName)); + + if (isLeakyRelu(layerName)) + { + mPluginLeakyRelu.emplace_back(std::unique_ptr(createPReLUPlugin(serialData, serialLength), nvPluginDeleter)); + return mPluginLeakyRelu.back().get(); + } + else if (isUpsample(layerName)) + { + mPluginUpsample.emplace_back(std::unique_ptr(new UpsampleLayerPlugin(serialData, serialLength))); + return mPluginUpsample.back().get(); + } + else if (isYolo(layerName)) + { + assert(mPluginYolo.get() == nullptr); + mPluginYolo.reset(new YoloLayerPlugin(serialData, serialLength)); + return mPluginYolo.get(); + } + else + { + assert(0); + return nullptr; + } + } + + + bool isPlugin(const char* name) override + { + return isPluginExt(name); + } + + bool isPluginExt(const char* name) override + { + //std::cout << "check plugin " << name << isYolo(name)<< std::endl; + return isLeakyRelu(name) || isUpsample(name) || isYolo(name); + } + + // The application has to destroy the plugin when it knows it's safe to do so. + void destroyPlugin() + { + for (auto& item : mPluginLeakyRelu) + item.reset(); + + for (auto& item : mPluginUpsample) + item.reset(); + + mPluginYolo.reset(); + } + + void (*nvPluginDeleter)(INvPlugin*){[](INvPlugin* ptr) { if(ptr) ptr->destroy(); }}; + + std::vector> mPluginLeakyRelu{}; + std::vector> mPluginUpsample{}; + std::unique_ptr mPluginYolo {nullptr}; + }; +} + +#endif \ No newline at end of file diff --git a/tensorRTW/code/include/TrtNet.h b/tensorRTW/code/include/TrtNet.h new file mode 100644 index 0000000..0d3c759 --- /dev/null +++ b/tensorRTW/code/include/TrtNet.h @@ -0,0 +1,116 @@ +#ifndef __TRT_NET_H_ +#define __TRT_NET_H_ + +#include +#include +#include +#include +#include +#include "NvInferPlugin.h" +#include "NvCaffeParser.h" +#include "PluginFactory.h" +#include "Utils.h" + +namespace Tn +{ + enum class RUN_MODE + { + FLOAT32 = 0, + FLOAT16 = 1, + INT8 = 2 + }; + + class trtNet + { + public: + //Load from caffe model + /* trtNet(const std::string& prototxt, const std::string& caffemodel, const std::vector& outputNodesName, + const std::vector>& calibratorData, RUN_MODE mode /*= RUN_MODE::FLOAT32) :mTrtContext(nullptr), mTrtEngine(nullptr), mTrtRunTime(nullptr), mTrtRunMode(mode), mTrtInputCount(0), mTrtIterationTime(0) {}; + */ + trtNet() :mTrtContext(nullptr), mTrtEngine(nullptr), mTrtRunTime(nullptr), mTrtInputCount(0), mTrtIterationTime(0) {}; + + + ~trtNet() + { + // Release the stream and the buffers + cudaStreamSynchronize(mTrtCudaStream); + cudaStreamDestroy(mTrtCudaStream); + for(auto& item : mTrtCudaBuffer) + cudaFree(item); + + mTrtPluginFactory.destroyPlugin(); + + if(!mTrtRunTime) + mTrtRunTime->destroy(); + if(!mTrtContext) + mTrtContext->destroy(); + if(!mTrtEngine) + mTrtEngine->destroy(); + }; + + void trtNet_engine(const std::string& prototxt, const std::string& caffeModel, const std::vector& outputNodesName, + const std::vector>& calibratorData, RUN_MODE mode = RUN_MODE::FLOAT32); + + void set_mode(RUN_MODE mode) + { + mTrtRunMode = mode; + } + //Load from engine file + void trtNet_engine(const std::string& engineFile); + + void saveEngine(std::string fileName) + { + if(mTrtEngine) + { + nvinfer1::IHostMemory* data = mTrtEngine->serialize(); + std::ofstream file; + file.open(fileName,std::ios::binary | std::ios::out); + if(!file.is_open()) + { + std::cout << "read create engine file" << fileName <<" failed" << std::endl; + return; + } + + file.write((const char*)data->data(), data->size()); + file.close(); + } + }; + + void doInference(const void* inputData, void* outputData); + + inline size_t getInputSize() { + return std::accumulate(mTrtBindBufferSize.begin(), mTrtBindBufferSize.begin() + mTrtInputCount,0); + }; + + inline size_t getOutputSize() { + return std::accumulate(mTrtBindBufferSize.begin() + mTrtInputCount, mTrtBindBufferSize.end(),0); + }; + + void printTime() + { + mTrtProfiler.printLayerTimes(mTrtIterationTime); + } + + private: + nvinfer1::ICudaEngine* loadModelAndCreateEngine(const char* deployFile, const char* modelFile,int maxBatchSize, + nvcaffeparser1::ICaffeParser* parser, nvcaffeparser1::IPluginFactory* pluginFactory, + nvinfer1::IInt8Calibrator* calibrator, nvinfer1::IHostMemory*& trtModelStream,const std::vector& outputNodesName); + + void InitEngine(); + + nvinfer1::IExecutionContext* mTrtContext; + nvinfer1::ICudaEngine* mTrtEngine; + nvinfer1::IRuntime* mTrtRunTime; + PluginFactory mTrtPluginFactory; + cudaStream_t mTrtCudaStream; + Profiler mTrtProfiler; + RUN_MODE mTrtRunMode; + + std::vector mTrtCudaBuffer; + std::vector mTrtBindBufferSize; + int mTrtInputCount; + int mTrtIterationTime; + }; +} + +#endif //__TRT_NET_H_ diff --git a/tensorRTW/code/include/UpsampleLayer.h b/tensorRTW/code/include/UpsampleLayer.h new file mode 100644 index 0000000..0b3b271 --- /dev/null +++ b/tensorRTW/code/include/UpsampleLayer.h @@ -0,0 +1,72 @@ +#ifndef _UPSAMPLE_LAYER_H +#define _UPSAMPLE_LAYER_H + +#include +#include +#include +#include +#include +#include "NvInfer.h" +#include "Utils.h" +#include + +namespace nvinfer1 +{ + class UpsampleLayerPlugin: public IPluginExt + { + public: + explicit UpsampleLayerPlugin(const float scale, const int cudaThread = 512); + // create the plugin at runtime from a byte stream + UpsampleLayerPlugin(const void* data, size_t length); + + ~UpsampleLayerPlugin(); + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + bool supportsFormat(DataType type, PluginFormat format) const override { + //std::cout << "supportsFormat=== type:" << int(type) << "format" << int(format) << std::endl; + return (type == DataType::kFLOAT || type == DataType::kHALF || type == DataType::kINT8 ) + && format == PluginFormat::kNCHW; + } + + void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override; + + int initialize() override; + + virtual void terminate() override { + }; + + virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} + + virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + virtual size_t getSerializationSize() override + { + return sizeof(nvinfer1::Dims) + sizeof(mDataType) + sizeof(mScale) + + sizeof(mOutputWidth) + sizeof(mOutputHeight) + sizeof(mThreadCount); + } + + virtual void serialize(void* buffer) override; + + template + void forwardGpu(const Dtype* input,Dtype * outputint ,int N,int C,int H ,int W); + + private: + nvinfer1::Dims mCHW; + DataType mDataType{DataType::kFLOAT}; + float mScale; + int mOutputWidth; + int mOutputHeight; + int mThreadCount; + + void* mInputBuffer {nullptr}; + void* mOutputBuffer {nullptr}; + }; +}; + +#endif diff --git a/tensorRTW/code/include/Utils.h b/tensorRTW/code/include/Utils.h new file mode 100644 index 0000000..0de663c --- /dev/null +++ b/tensorRTW/code/include/Utils.h @@ -0,0 +1,94 @@ +#ifndef __TRT_UTILS_H_ +#define __TRT_UTILS_H_ + +#include +#include +#include +#include + +#ifndef CUDA_CHECK + +#define CUDA_CHECK(callstr) \ + { \ + cudaError_t error_code = callstr; \ + if (error_code != cudaSuccess) { \ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ + assert(0); \ + } \ + } + +#endif + +namespace Tn +{ + class Profiler : public nvinfer1::IProfiler + { + public: + void printLayerTimes(int itrationsTimes) + { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) + { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); + } + private: + typedef std::pair Record; + std::vector mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) + { + auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + }; + + //Logger for TensorRT info/warning/errors + class Logger : public nvinfer1::ILogger + { + public: + + Logger(): Logger(Severity::kWARNING) {} + + Logger(Severity severity): reportableSeverity(severity) {} + + void log(Severity severity, const char* msg) override + { + // suppress messages with severity enum value greater than the reportable + if (severity > reportableSeverity) return; + + switch (severity) + { + case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; + case Severity::kERROR: std::cerr << "ERROR: "; break; + case Severity::kWARNING: std::cerr << "WARNING: "; break; + case Severity::kINFO: std::cerr << "INFO: "; break; + default: std::cerr << "UNKNOWN: "; break; + } + std::cerr << msg << std::endl; + } + + Severity reportableSeverity{Severity::kWARNING}; + }; + + template + void write(char*& buffer, const T& val) + { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); + } + + template + void read(const char*& buffer, T& val) + { + val = *reinterpret_cast(buffer); + buffer += sizeof(T); + } +} + +#endif \ No newline at end of file diff --git a/tensorRTW/code/include/YoloConfigs.h b/tensorRTW/code/include/YoloConfigs.h new file mode 100644 index 0000000..189d3a2 --- /dev/null +++ b/tensorRTW/code/include/YoloConfigs.h @@ -0,0 +1,53 @@ +#ifndef _YOLO_CONFIGS_H_ +#define _YOLO_CONFIGS_H_ + + +namespace Yolo +{ + static constexpr int CHECK_COUNT = 3; + static constexpr float IGNORE_THRESH = 0.5f; + static constexpr int CLASS_NUM = 80; + + struct YoloKernel + { + int width; + int height; + float anchors[CHECK_COUNT*2]; + }; + + //YOLO 608 + YoloKernel yolo1 = { + 19, + 19, + {116,90, 156,198, 373,326} + }; + YoloKernel yolo2 = { + 38, + 38, + {30,61, 62,45, 59,119} + }; + YoloKernel yolo3 = { + 76, + 76, + {10,13, 16,30, 33,23} + }; + + //YOLO 416 + // YoloKernel yolo1 = { + // 13, + // 13, + // {116,90, 156,198, 373,326} + // }; + // YoloKernel yolo2 = { + // 26, + // 26, + // {30,61, 62,45, 59,119} + // }; + // YoloKernel yolo3 = { + // 52, + // 52, + // {10,13, 16,30, 33,23} + // }; +} + +#endif \ No newline at end of file diff --git a/tensorRTW/code/include/YoloLayer.h b/tensorRTW/code/include/YoloLayer.h new file mode 100644 index 0000000..f90a003 --- /dev/null +++ b/tensorRTW/code/include/YoloLayer.h @@ -0,0 +1,79 @@ +#ifndef _YOLO_LAYER_H +#define _YOLO_LAYER_H + +#include +#include +#include +#include +#include +#include "NvInfer.h" +#include "Utils.h" +#include + +namespace Yolo +{ + struct YoloKernel; + + static constexpr int LOCATIONS = 4; + struct Detection{ + //x y w h + float bbox[LOCATIONS]; + //float objectness; + int classId; + float prob; + }; +} + + +namespace nvinfer1 +{ + class YoloLayerPlugin: public IPluginExt + { + public: + explicit YoloLayerPlugin(const int cudaThread = 512); + YoloLayerPlugin(const void* data, size_t length); + + ~YoloLayerPlugin(); + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + bool supportsFormat(DataType type, PluginFormat format) const override { + return type == DataType::kFLOAT && format == PluginFormat::kNCHW; + } + + void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override {}; + + int initialize() override; + + virtual void terminate() override {}; + + virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} + + virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + virtual size_t getSerializationSize() override; + + virtual void serialize(void* buffer) override; + + void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream); + + void forwardCpu(const float *const * inputs,float * output, cudaStream_t stream); + + private: + int mClassCount; + int mKernelCount; + std::vector mYoloKernel; + int mThreadCount; + + //cpu + void* mInputBuffer {nullptr}; + void* mOutputBuffer {nullptr}; + }; +}; + +#endif diff --git a/tensorRTW/code/src/EntroyCalibrator.cpp b/tensorRTW/code/src/EntroyCalibrator.cpp new file mode 100644 index 0000000..9e03a74 --- /dev/null +++ b/tensorRTW/code/src/EntroyCalibrator.cpp @@ -0,0 +1,75 @@ +#include "EntroyCalibrator.h" +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ + Int8EntropyCalibrator::Int8EntropyCalibrator(int BatchSize,const std::vector>& data, + const std::string& CalibDataName /*= ""*/,bool readCache /*= true*/) + : mCalibDataName(CalibDataName),mBatchSize(BatchSize),mReadCache(readCache) + { + mDatas.reserve(data.size()); + mDatas = data; + + mInputCount = BatchSize * data[0].size(); + mCurBatchData = new float[mInputCount]; + mCurBatchIdx = 0; + CUDA_CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); + } + + + Int8EntropyCalibrator::~Int8EntropyCalibrator() + { + CUDA_CHECK(cudaFree(mDeviceInput)); + if(mCurBatchData) + delete[] mCurBatchData; + } + + + bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], int nbBindings) + { + if (mCurBatchIdx + mBatchSize > int(mDatas.size())) + return false; + + float* ptr = mCurBatchData; + size_t imgSize = mInputCount / mBatchSize; + auto iter = mDatas.begin() + mCurBatchIdx; + + std::for_each(iter, iter + mBatchSize, [=,&ptr](std::vector& val){ + assert(imgSize == val.size()); + memcpy(ptr,val.data(),imgSize*sizeof(float)); + + ptr += imgSize; + }); + + CUDA_CHECK(cudaMemcpy(mDeviceInput, mCurBatchData, mInputCount * sizeof(float), cudaMemcpyHostToDevice)); + //std::cout << "input name " << names[0] << std::endl; + bindings[0] = mDeviceInput; + + std::cout << "load batch " << mCurBatchIdx << " to " << mCurBatchIdx + mBatchSize - 1 << std::endl; + mCurBatchIdx += mBatchSize; + return true; + } + + const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) + { + mCalibrationCache.clear(); + std::ifstream input(mCalibDataName+".calib", std::ios::binary); + input >> std::noskipws; + if (mReadCache && input.good()) + std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); + + length = mCalibrationCache.size(); + return length ? &mCalibrationCache[0] : nullptr; + } + + void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) + { + std::ofstream output(mCalibDataName+".calib", std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + +} \ No newline at end of file diff --git a/tensorRTW/code/src/TrtNet.cpp b/tensorRTW/code/src/TrtNet.cpp new file mode 100644 index 0000000..4c53a79 --- /dev/null +++ b/tensorRTW/code/src/TrtNet.cpp @@ -0,0 +1,242 @@ +#include "TrtNet.h" +#include "EntroyCalibrator.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nvinfer1; +using namespace nvcaffeparser1; +using namespace plugin; + +static Tn::Logger gLogger; + +#define RETURN_AND_LOG(ret, severity, message) \ + do \ + { \ + std::string error_message = "ssd_error_log: " + std::string(message); \ + gLogger.log(ILogger::Severity::k##severity, error_message.c_str()); \ + return (ret); \ + } while (0) + +inline void* safeCudaMalloc(size_t memSize) +{ + void* deviceMem; + CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + return deviceMem; +} + +inline int64_t volume(const nvinfer1::Dims& d) +{ + return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); +} + +inline unsigned int getElementSize(nvinfer1::DataType t) +{ + switch (t) + { + case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kINT8: return 1; + } + throw std::runtime_error("Invalid DataType."); + return 0; +} + +namespace Tn +{ + void trtNet::trtNet_engine(const std::string& prototxt,const std::string& caffemodel,const std::vector& outputNodesName, + const std::vector>& calibratorData,RUN_MODE mode /*= RUN_MODE::FLOAT32*/) + { + std::cout << "init plugin proto: " << prototxt << " caffemodel: " << caffemodel << std::endl; + auto parser = createCaffeParser(); + + const int maxBatchSize = 1; + IHostMemory* trtModelStream{nullptr}; + + Int8EntropyCalibrator * calibrator = nullptr; + if (calibratorData.size() > 0 ){ + auto endPos= prototxt.find_last_of("."); + auto beginPos= prototxt.find_last_of('/') + 1; + std::string calibratorName = prototxt.substr(beginPos,endPos - beginPos); + std::cout << "create calibrator,Named:" << calibratorName << std::endl; + calibrator = new Int8EntropyCalibrator(maxBatchSize,calibratorData,calibratorName); + } + + PluginFactory pluginFactorySerialize; + ICudaEngine* tmpEngine = loadModelAndCreateEngine(prototxt.c_str(),caffemodel.c_str(), maxBatchSize, parser, &pluginFactorySerialize, calibrator, trtModelStream,outputNodesName); + assert(tmpEngine != nullptr); + assert(trtModelStream != nullptr); + if(calibrator){ + delete calibrator; + calibrator = nullptr; + } + tmpEngine->destroy(); + pluginFactorySerialize.destroyPlugin(); + + mTrtRunTime = createInferRuntime(gLogger); + assert(mTrtRunTime != nullptr); + mTrtEngine= mTrtRunTime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), &mTrtPluginFactory); + assert(mTrtEngine != nullptr); + // Deserialize the engine. + trtModelStream->destroy(); + + InitEngine(); + } + + void trtNet::trtNet_engine(const std::string& engineFile) + { + using namespace std; + fstream file; + + file.open(engineFile,ios::binary | ios::in); + if(!file.is_open()) + { + cout << "read engine file" << engineFile <<" failed" << endl; + return; + } + file.seekg(0, ios::end); + int length = file.tellg(); + file.seekg(0, ios::beg); + std::unique_ptr data(new char[length]); + file.read(data.get(), length); + + file.close(); + + std::cout << "*** deserializing" << std::endl; + mTrtRunTime = createInferRuntime(gLogger); + assert(mTrtRunTime != nullptr); + mTrtEngine= mTrtRunTime->deserializeCudaEngine(data.get(), length, &mTrtPluginFactory); + assert(mTrtEngine != nullptr); + + InitEngine(); + } + + void trtNet::InitEngine() + { + const int maxBatchSize = 1; + mTrtContext = mTrtEngine->createExecutionContext(); + assert(mTrtContext != nullptr); + mTrtContext->setProfiler(&mTrtProfiler); + + // Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings() + int nbBindings = mTrtEngine->getNbBindings(); + + mTrtCudaBuffer.resize(nbBindings); + mTrtBindBufferSize.resize(nbBindings); + for (int i = 0; i < nbBindings; ++i) + { + Dims dims = mTrtEngine->getBindingDimensions(i); + DataType dtype = mTrtEngine->getBindingDataType(i); + int64_t totalSize = volume(dims) * maxBatchSize * getElementSize(dtype); + mTrtBindBufferSize[i] = totalSize; + mTrtCudaBuffer[i] = safeCudaMalloc(totalSize); + if(mTrtEngine->bindingIsInput(i)) + mTrtInputCount++; + } + + CUDA_CHECK(cudaStreamCreate(&mTrtCudaStream)); + } + + + nvinfer1::ICudaEngine* trtNet::loadModelAndCreateEngine(const char* deployFile, const char* modelFile,int maxBatchSize, + ICaffeParser* parser, nvcaffeparser1::IPluginFactory* pluginFactory, + IInt8Calibrator* calibrator, IHostMemory*& trtModelStream,const std::vector& outputNodesName) + { + // Create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // Parse the model to populate the network, then set the outputs. + INetworkDefinition* network = builder->createNetwork(); + parser->setPluginFactory(pluginFactory); + + std::cout << "Begin parsing model..." << std::endl; + const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile,modelFile, *network, nvinfer1::DataType::kFLOAT); + if (!blobNameToTensor) + RETURN_AND_LOG(nullptr, ERROR, "Fail to parse"); + std::cout << "End parsing model..." << std::endl; + + // specify which tensors are outputs + for (auto& name : outputNodesName) + { + auto output = blobNameToTensor->find(name.c_str()); + assert(output!=nullptr); + if (output == nullptr) + std::cout << "can not find output named " << name << std::endl; + + network->markOutput(*output); + } + + // Build the engine. + builder->setMaxBatchSize(maxBatchSize); + builder->setMaxWorkspaceSize(1 << 30);// 1G + if (mTrtRunMode == RUN_MODE::INT8) + { + std::cout <<"setInt8Mode"<platformHasFastInt8()) + std::cout << "Notice: the platform do not has fast for int8" << std::endl; + builder->setInt8Mode(true); + builder->setInt8Calibrator(calibrator); + } + else if (mTrtRunMode == RUN_MODE::FLOAT16) + { + std::cout <<"setFp16Mode"<platformHasFastFp16()) + std::cout << "Notice: the platform do not has fast for fp16" << std::endl; + builder->setFp16Mode(true); + } + + std::cout << "Begin building engine..." << std::endl; + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (!engine) + RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine"); + std::cout << "End building engine..." << std::endl; + + // We don't need the network any more, and we can destroy the parser. + network->destroy(); + parser->destroy(); + + // Serialize the engine, then close everything down. + trtModelStream = engine->serialize(); + + builder->destroy(); + shutdownProtobufLibrary(); + return engine; + } + + void trtNet::doInference(const void* inputData, void* outputData) + { + static const int batchSize = 1; + assert(mTrtInputCount == 1); + + // DMA the input to the GPU, execute the batch asynchronously, and DMA it back: + int inputIndex = 0; + CUDA_CHECK(cudaMemcpyAsync(mTrtCudaBuffer[inputIndex], inputData, mTrtBindBufferSize[inputIndex], cudaMemcpyHostToDevice, mTrtCudaStream)); + auto t_start = std::chrono::high_resolution_clock::now(); + mTrtContext->execute(batchSize, &mTrtCudaBuffer[inputIndex]); + auto t_end = std::chrono::high_resolution_clock::now(); + float total = std::chrono::duration(t_end - t_start).count(); + + std::cout << "Time taken for inference is " << total << " ms." << std::endl; + + for (size_t bindingIdx = mTrtInputCount; bindingIdx < mTrtBindBufferSize.size(); ++bindingIdx) + { + auto size = mTrtBindBufferSize[bindingIdx]; + CUDA_CHECK(cudaMemcpyAsync(outputData, mTrtCudaBuffer[bindingIdx], size, cudaMemcpyDeviceToHost, mTrtCudaStream)); + outputData = (char *)outputData + size; + } + + mTrtIterationTime ++ ; + } +} \ No newline at end of file diff --git a/tensorRTW/code/src/UpsampleLayer.cpp b/tensorRTW/code/src/UpsampleLayer.cpp new file mode 100644 index 0000000..cbcf424 --- /dev/null +++ b/tensorRTW/code/src/UpsampleLayer.cpp @@ -0,0 +1,78 @@ +#include "UpsampleLayer.h" + +namespace nvinfer1 +{ + UpsampleLayerPlugin::UpsampleLayerPlugin(const float scale, const int cudaThread /*= 512*/) + : mScale(scale),mThreadCount(cudaThread) + { + } + + UpsampleLayerPlugin::~UpsampleLayerPlugin() + { + + } + + // create the plugin at runtime from a byte stream + UpsampleLayerPlugin::UpsampleLayerPlugin(const void* data, size_t length) + { + using namespace Tn; + const char *d = reinterpret_cast(data), *a = d; + read(d, mCHW); + read(d, mDataType); + read(d, mScale); + read(d, mOutputWidth); + read(d, mOutputHeight); + read(d, mThreadCount); + + //std::cout << "read:" << a << " " << mOutputWidth<< " " <(buffer), *a = d; + write(d, mCHW); + write(d, mDataType); + write(d, mScale); + write(d, mOutputWidth); + write(d, mOutputHeight); + write(d, mThreadCount); + + //std::cout << "write:" << a << " " << mOutputHeight<< " " < + __global__ void upscale(const Dtype *input, Dtype *output, + int no_elements, int scale_factor, int d1, int d2, int d3) { + int ii = threadIdx.x + blockDim.x * blockIdx.x; + if (ii >= no_elements) return; + int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); + output[ii]=input[ipidx]; + } + + template + void UpsampleLayerPlugin::forwardGpu(const Dtype* input,Dtype * output, + int N,int C,int H ,int W) { + + int numElem = N*C*H*W; + upscale<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>>(input,output, numElem, mScale, C, H, W); + } + + size_t type2size(DataType dataType) { + size_t _size = 0; + switch (dataType) + { + case DataType::kFLOAT: _size = sizeof(float);break; + case DataType::kHALF: _size = sizeof(__half);break; + case DataType::kINT8: _size = sizeof(u_int8_t);break; + default:std::cerr << "error data type" << std::endl; + } + return _size; + } + + int UpsampleLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) + { + assert(batchSize == 1); + const int channels = mCHW.d[0]; + const int64_t in_height = mCHW.d[1]; + const int64_t in_width = mCHW.d[2]; + const int64_t out_height = mOutputHeight; + const int64_t out_width = mOutputWidth; + int totalElems = batchSize * in_height * in_width * channels; + + // Handle no-op resizes efficiently. + if (out_height == in_height && out_width == in_width) { + CUDA_CHECK(cudaMemcpyAsync(outputs[0], inputs[0], totalElems * type2size(mDataType), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return 0; + } + //CUDA_CHECK(cudaStreamSynchronize(stream)); + + switch (mDataType) + { + case DataType::kFLOAT : + forwardGpu((const float *)inputs[0],(float *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth); + break; + case DataType::kHALF: + forwardGpu<__half>((const __half *)inputs[0],(__half *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth); + break; + case DataType::kINT8: + forwardGpu((const u_int8_t *)inputs[0],(u_int8_t *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth); + break; + default: + std::cerr << "error data type" << std::endl; + } + return 0; + }; +} \ No newline at end of file diff --git a/tensorRTW/code/src/YoloLayer.cu b/tensorRTW/code/src/YoloLayer.cu new file mode 100644 index 0000000..061a5dd --- /dev/null +++ b/tensorRTW/code/src/YoloLayer.cu @@ -0,0 +1,253 @@ +#include "YoloConfigs.h" +#include "YoloLayer.h" + +using namespace Yolo; + +namespace nvinfer1 +{ + YoloLayerPlugin::YoloLayerPlugin(const int cudaThread /*= 512*/):mThreadCount(cudaThread) + { + mClassCount = CLASS_NUM; + mYoloKernel.clear(); + mYoloKernel.push_back(yolo1); + mYoloKernel.push_back(yolo2); + mYoloKernel.push_back(yolo3); + + mKernelCount = mYoloKernel.size(); + } + + YoloLayerPlugin::~YoloLayerPlugin() + { + if(mInputBuffer) + CUDA_CHECK(cudaFreeHost(mInputBuffer)); + + if(mOutputBuffer) + CUDA_CHECK(cudaFreeHost(mOutputBuffer)); + } + + // create the plugin at runtime from a byte stream + YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) + { + using namespace Tn; + const char *d = reinterpret_cast(data), *a = d; + read(d, mClassCount); + read(d, mThreadCount); + read(d, mKernelCount); + mYoloKernel.resize(mKernelCount); + auto kernelSize = mKernelCount*sizeof(YoloKernel); + memcpy(mYoloKernel.data(),d,kernelSize); + d += kernelSize; + + assert(d == a + length); + } + + void YoloLayerPlugin::serialize(void* buffer) + { + using namespace Tn; + char* d = static_cast(buffer), *a = d; + write(d, mClassCount); + write(d, mThreadCount); + write(d, mKernelCount); + auto kernelSize = mKernelCount*sizeof(YoloKernel); + memcpy(d,mYoloKernel.data(),kernelSize); + d += kernelSize; + + assert(d == a + getSerializationSize()); + } + + size_t YoloLayerPlugin::getSerializationSize() + { + return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); + } + + int YoloLayerPlugin::initialize() + { + int totalCount = 0; + for(const auto& yolo : mYoloKernel) + totalCount += (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT; + CUDA_CHECK(cudaHostAlloc(&mInputBuffer, totalCount * sizeof(float), cudaHostAllocDefault)); + + totalCount = 0;//detection count + for(const auto& yolo : mYoloKernel) + totalCount += yolo.width*yolo.height * CHECK_COUNT; + CUDA_CHECK(cudaHostAlloc(&mOutputBuffer, sizeof(float) + totalCount * sizeof(Detection), cudaHostAllocDefault)); + return 0; + } + + Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) + { + //output the result to channel + int totalCount = 0; + for(const auto& yolo : mYoloKernel) + totalCount += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float); + + return Dims3(totalCount + 1, 1, 1); + } + + void YoloLayerPlugin::forwardCpu(const float*const * inputs, float* outputs, cudaStream_t stream) + { + auto Logist = [=](float data){ + return 1./(1. + exp(-data)); + }; + + CUDA_CHECK(cudaStreamSynchronize(stream)); + int i = 0; + float* inputData = (float *)mInputBuffer; + for(const auto& yolo : mYoloKernel) + { + int size = (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT; + CUDA_CHECK(cudaMemcpyAsync(inputData, inputs[i], size * sizeof(float), cudaMemcpyDeviceToHost, stream)); + inputData += size; + ++ i; + } + + inputData = (float *)mInputBuffer; + std::vector result; + for (const auto& yolo : mYoloKernel) + { + int stride = yolo.width*yolo.height; + for (int j = 0;j < stride ;++j) + { + for (int k = 0;k < CHECK_COUNT; ++k ) + { + int beginIdx = (LOCATIONS + 1 + mClassCount)* stride *k + j; + int objIndex = beginIdx + LOCATIONS*stride; + + //check obj + float objProb = Logist(inputData[objIndex]); + if(objProb <= IGNORE_THRESH) + continue; + + //classes + int classId = -1; + float maxProb = IGNORE_THRESH; + for (int c = 0;c< mClassCount;++c){ + float cProb = Logist(inputData[beginIdx + (5 + c) * stride]) * objProb; + if(cProb > maxProb){ + maxProb = cProb; + classId = c; + } + } + + if(classId >= 0) { + Detection det; + int row = j / yolo.width; + int cols = j % yolo.width; + + //Location + det.bbox[0] = (cols + Logist(inputData[beginIdx]))/ yolo.width; + det.bbox[1] = (row + Logist(inputData[beginIdx+stride]))/ yolo.height; + det.bbox[2] = exp(inputData[beginIdx+2*stride]) * yolo.anchors[2*k]; + det.bbox[3] = exp(inputData[beginIdx+3*stride]) * yolo.anchors[2*k + 1]; + det.classId = classId; + det.prob = maxProb; + //det.objectness = objProb; + + result.emplace_back(det); + } + } + } + + inputData += (LOCATIONS + 1 + mClassCount) * stride * CHECK_COUNT; + } + + + int detCount =result.size(); + auto data = (float *)mOutputBuffer; + //copy count; + data[0] = (float)detCount; + //std::cout << "detCount"<< detCount << std::endl; + data++; + //copy result + memcpy(data,result.data(),result.size()*sizeof(Detection)); + + //(count + det result) + CUDA_CHECK(cudaMemcpyAsync(outputs, mOutputBuffer, sizeof(float) + result.size()*sizeof(Detection), cudaMemcpyHostToDevice, stream)); + }; + + __device__ float Logist(float data){ return 1./(1. + exp(-data)); }; + + __global__ void CalDetection(const float *input, float *output,int noElements, + int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes) { + + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx >= noElements) return; + + int stride = yoloWidth*yoloHeight; + + for (int k = 0;k < CHECK_COUNT; ++k ) + { + int beginIdx = (LOCATIONS + 1 + classes)* stride *k + idx; + int objIndex = beginIdx + LOCATIONS*stride; + + //check objectness + float objProb = Logist(input[objIndex]); + if(objProb <= IGNORE_THRESH) + continue; + + int row = idx / yoloWidth; + int cols = idx % yoloWidth; + + //classes + int classId = -1; + float maxProb = IGNORE_THRESH; + for (int c = 0;c maxProb){ + maxProb = cProb; + classId = c; + } + } + + if(classId >= 0) { + int resCount = (int)atomicAdd(output,1); + char* data = (char * )output + sizeof(float) + resCount*sizeof(Detection); + Detection* det = (Detection*)(data); + + //Location + det->bbox[0] = (cols + Logist(input[beginIdx]))/ yoloWidth; + det->bbox[1] = (row + Logist(input[beginIdx+stride]))/ yoloHeight; + det->bbox[2] = exp(input[beginIdx+2*stride]) * anchors[2*k]; + det->bbox[3] = exp(input[beginIdx+3*stride]) * anchors[2*k + 1]; + det->classId = classId; + det->prob = maxProb; + } + } + } + + void YoloLayerPlugin::forwardGpu(const float *const * inputs,float * output,cudaStream_t stream) { + int numElem; + void* devAnchor; + size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; + CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen)); + + //first detect count init 0 + CUDA_CHECK(cudaMemset(output, 0, sizeof(float))); + for (unsigned int i = 0;i< mYoloKernel.size();++i) + { + const auto& yolo = mYoloKernel[i]; + numElem = yolo.width*yolo.height; + + //copy anchor to device + CUDA_CHECK(cudaMemcpy(devAnchor,yolo.anchors,AnchorLen,cudaMemcpyHostToDevice)); + + CalDetection<<< (yolo.width*yolo.height + mThreadCount - 1) / mThreadCount, mThreadCount>>> + (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount); + } + CUDA_CHECK(cudaFree(devAnchor)); + } + + + int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) + { + assert(batchSize == 1); + + //GPU + forwardGpu((const float *const *)inputs,(float *)outputs[0],stream); + + //CPU + //forwardCpu((const float *const *)inputs,(float *)outputs[0],stream); + return 0; + }; + +} diff --git a/tensorRTWrapper b/tensorRTWrapper deleted file mode 160000 index cbfbdc5..0000000 --- a/tensorRTWrapper +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cbfbdc5bcf27d917c8ea9b23f46acc620ebe70f0