diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62d9cdd..6800880 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ set(CMAKE_BUILD_TYPE Release)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors")
 
 #add wrapper
-set(TRT_WRAPPER_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorRTWrapper/code)
+set(TRT_WRAPPER_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorRTW/code)
 add_subdirectory(${TRT_WRAPPER_DIR})
 include_directories(${TRT_WRAPPER_DIR}/include)
 
@@ -20,6 +20,6 @@ link_directories(${OpenCV_LIBRARIES_DIRS})
 
 #build runYolov3
 add_executable(runYolov3 main.cpp eval.cpp dataReader.cpp)
-target_link_libraries(runYolov3 TrtNet ${OpenCV_LIBS})
+target_link_libraries(runYolov3 TrtNet ${OpenCV_LIBS} pthread)
 
 install(TARGETS runYolov3 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/install/)
diff --git a/coco.names b/coco.names
new file mode 100644
index 0000000..ca76c80
--- /dev/null
+++ b/coco.names
@@ -0,0 +1,80 @@
+person
+bicycle
+car
+motorbike
+aeroplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+backpack
+umbrella
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+sofa
+pottedplant
+bed
+diningtable
+toilet
+tvmonitor
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
diff --git a/image/example.png b/image/example.png
new file mode 100644
index 0000000..9f5674c
Binary files /dev/null and b/image/example.png differ
diff --git a/include/configs.h b/include/configs.h
index 420b7c5..ac55b8c 100644
--- a/include/configs.h
+++ b/include/configs.h
@@ -8,13 +8,19 @@ namespace Tn
     const std::string INPUT_PROTOTXT ="yolov3_trt.prototxt";
     const std::string INPUT_CAFFEMODEL = "yolov3.caffemodel";
     const std::string INPUT_IMAGE = "test.jpg";
+	const std::string VIDEOFILE = "sample.mp4";
+	const std::string SAVEFILE = "result";
+	const std::string INPUT_STREAM = "cam";
+	const std::string CLASS_NAME = "coco.name";
     const std::string EVAL_LIST = "";
     const std::string CALIBRATION_LIST = "";
     const std::string MODE = "fp32";
     const std::string OUTPUTS= "yolo-det";//layer82-conv,layer94-conv,layer106-conv
+	const int DISPLAY = 1;
+	const int SAVEIMG = 0;
     const int INPUT_WIDTH = 608;
     const int INPUT_HEIGHT = 608;
-
+	const int CAM = 0;
     const int DETECT_CLASSES = 80;
     const float NMS_THRESH = 0.45;
 }
diff --git a/main.cpp b/main.cpp
index ac432f0..5a13123 100644
--- a/main.cpp
+++ b/main.cpp
@@ -6,321 +6,607 @@
 #include "YoloLayer.h"
 #include "dataReader.h"
 #include "eval.h"
+#include "pthread.h"
 
 using namespace std;
 using namespace argsParser;
 using namespace Tn;
 using namespace Yolo;
 
+
+static cv::VideoCapture cap;  //静态全局变量仅对当前文件可见，其他文件不可访问
+cv::Mat frame;
+cv::Mat frame_;
+vector<float> inputData;
+vector<float> inputData_;
+list<string> fileNames;
+list<vector<Bbox>> groundTruth;
+list<vector<Bbox>> outputs;
+int outputCount;
+
+
+vector<string> ClassName;
+int classNum;
+int flag_exit = 0;
+
 vector<float> prepareImage(cv::Mat& img)
 {
-    using namespace cv;
-
-    int c = parser::getIntValue("C");
-    int h = parser::getIntValue("H");   //net h
-    int w = parser::getIntValue("W");   //net w
-
-    float scale = min(float(w)/img.cols,float(h)/img.rows);
-    auto scaleSize = cv::Size(img.cols * scale,img.rows * scale);
-
-    cv::Mat rgb ;
-    cv::cvtColor(img, rgb, CV_BGR2RGB);
-    cv::Mat resized;
-    cv::resize(rgb, resized,scaleSize,0,0,INTER_CUBIC);
-
-    cv::Mat cropped(h, w,CV_8UC3, 127);
-    Rect rect((w- scaleSize.width)/2, (h-scaleSize.height)/2, scaleSize.width,scaleSize.height); 
-    resized.copyTo(cropped(rect));
-
-    cv::Mat img_float;
-    if (c == 3)
-        cropped.convertTo(img_float, CV_32FC3, 1/255.0);
-    else
-        cropped.convertTo(img_float, CV_32FC1 ,1/255.0);
-
-    //HWC TO CHW
-    vector<Mat> input_channels(c);
-    cv::split(img_float, input_channels);
-
-    vector<float> result(h*w*c);
-    auto data = result.data();
-    int channelLength = h * w;
-    for (int i = 0; i < c; ++i) {
-        memcpy(data,input_channels[i].data,channelLength*sizeof(float));
-        data += channelLength;
-    }
-
-    return result;
+	using namespace cv;
+
+	int c = parser::getIntValue("C");
+	int h = parser::getIntValue("H");   //net h
+	int w = parser::getIntValue("W");   //net w
+
+	float scale = min(float(w) / img.cols, float(h) / img.rows);
+	auto scaleSize = cv::Size(img.cols * scale, img.rows * scale);
+
+	cv::Mat rgb;
+	cv::cvtColor(img, rgb, CV_BGR2RGB);
+	cv::Mat resized;
+	cv::resize(rgb, resized, scaleSize, 0, 0, INTER_CUBIC);
+
+	cv::Mat cropped(h, w, CV_8UC3, 127);
+	Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
+	resized.copyTo(cropped(rect));
+
+	cv::Mat img_float;
+	if (c == 3)
+		cropped.convertTo(img_float, CV_32FC3, 1 / 255.0);
+	else
+		cropped.convertTo(img_float, CV_32FC1, 1 / 255.0);
+
+	//HWC TO CHW
+	vector<Mat> input_channels(c);
+	cv::split(img_float, input_channels);//通道分割
+
+	vector<float> result(h*w*c);
+	auto data = result.data();
+	int channelLength = h * w;
+	for (int i = 0; i < c; ++i) {
+		memcpy(data, input_channels[i].data, channelLength * sizeof(float));
+		data += channelLength;
+	}
+
+	return result;
 }
 
-void DoNms(vector<Detection>& detections,int classes ,float nmsThresh)
+void DoNms(vector<Detection>& detections, int classes, float nmsThresh)
 {
-    auto t_start = chrono::high_resolution_clock::now();
-
-    vector<vector<Detection>> resClass;
-    resClass.resize(classes);
-
-    for (const auto& item : detections)
-        resClass[item.classId].push_back(item);
-
-    auto iouCompute = [](float * lbox, float* rbox)
-    {
-        float interBox[] = {
-            max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
-            min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
-            max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
-            min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
-        };
-        
-        if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
-            return 0.0f;
-
-        float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
-        return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS);
-    };
-
-    vector<Detection> result;
-    for (int i = 0;i<classes;++i)
-    {
-        auto& dets =resClass[i]; 
-        if(dets.size() == 0)
-            continue;
-
-        sort(dets.begin(),dets.end(),[=](const Detection& left,const Detection& right){
-            return left.prob > right.prob;
-        });
-
-        for (unsigned int m = 0;m < dets.size() ; ++m)
-        {
-            auto& item = dets[m];
-            result.push_back(item);
-            for(unsigned int n = m + 1;n < dets.size() ; ++n)
-            {
-                if (iouCompute(item.bbox,dets[n].bbox) > nmsThresh)
-                {
-                    dets.erase(dets.begin()+n);
-                    --n;
-                }
-            }
-        }
-    }
-
-    //swap(detections,result);
-    detections = move(result);
-
-    auto t_end = chrono::high_resolution_clock::now();
-    float total = chrono::duration<float, milli>(t_end - t_start).count();
-    cout << "Time taken for nms is " << total << " ms." << endl;
+	//auto t_start = chrono::high_resolution_clock::now();
+
+	vector<vector<Detection>> resClass;
+	resClass.resize(classes);
+
+	for (const auto& item : detections)
+		resClass[item.classId].push_back(item);
+
+	auto iouCompute = [](float * lbox, float* rbox)
+	{
+		float interBox[] = {
+			max(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
+			min(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
+			max(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
+			min(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
+		};
+
+		if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
+			return 0.0f;
+
+		float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
+		return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
+	};
+
+	vector<Detection> result;
+	for (int i = 0; i < classes; ++i)
+	{
+		auto& dets = resClass[i];
+		if (dets.size() == 0)
+			continue;
+
+		sort(dets.begin(), dets.end(), [=](const Detection& left, const Detection& right) {
+			return left.prob > right.prob;
+		});
+
+		for (unsigned int m = 0; m < dets.size(); ++m)
+		{
+			auto& item = dets[m];
+			result.push_back(item);
+			for (unsigned int n = m + 1; n < dets.size(); ++n)
+			{
+				if (iouCompute(item.bbox, dets[n].bbox) > nmsThresh)
+				{
+					dets.erase(dets.begin() + n);
+					--n;
+				}
+			}
+		}
+	}
+
+	//swap(detections,result);
+	detections = move(result);
+
+	//auto t_end = chrono::high_resolution_clock::now();
+	//float total = chrono::duration<float, milli>(t_end - t_start).count();
+	//cout << "Time taken for nms is " << total << " ms." << endl;
 }
 
 
-vector<Bbox> postProcessImg(cv::Mat& img,vector<Detection>& detections,int classes)
+vector<Bbox> postProcessImg(cv::Mat& img, vector<Detection>& detections, int classes)
 {
-    using namespace cv;
-
-    int h = parser::getIntValue("H");   //net h
-    int w = parser::getIntValue("W");   //net w
-
-    //scale bbox to img
-    int width = img.cols;
-    int height = img.rows;
-    float scale = min(float(w)/width,float(h)/height);
-    float scaleSize[] = {width * scale,height * scale};
-
-    //correct box
-    for (auto& item : detections)
-    {
-        auto& bbox = item.bbox;
-        bbox[0] = (bbox[0] * w - (w - scaleSize[0])/2.f) / scaleSize[0];
-        bbox[1] = (bbox[1] * h - (h - scaleSize[1])/2.f) / scaleSize[1];
-        bbox[2] /= scaleSize[0];
-        bbox[3] /= scaleSize[1];
-    }
-    
-    //nms
-    float nmsThresh = parser::getFloatValue("nms");
-    if(nmsThresh > 0) 
-        DoNms(detections,classes,nmsThresh);
-
-    vector<Bbox> boxes;
-    for(const auto& item : detections)
-    {
-        auto& b = item.bbox;
-        Bbox bbox = 
-        { 
-            item.classId,   //classId
-            max(int((b[0]-b[2]/2.)*width),0), //left
-            min(int((b[0]+b[2]/2.)*width),width), //right
-            max(int((b[1]-b[3]/2.)*height),0), //top
-            min(int((b[1]+b[3]/2.)*height),height), //bot
-            item.prob       //score
-        };
-        boxes.push_back(bbox);
-    }
-
-    return boxes;
+	using namespace cv;
+	//auto t_start = chrono::high_resolution_clock::now();
+	int h = parser::getIntValue("H");   //net h
+	int w = parser::getIntValue("W");   //net w
+
+	//scale bbox to img
+	int width = img.cols;
+	int height = img.rows;
+	float scale = min(float(w) / width, float(h) / height);
+	float scaleSize[] = { width * scale,height * scale };
+
+	//correct box
+	for (auto& item : detections)
+	{
+		auto& bbox = item.bbox;
+		bbox[0] = (bbox[0] * w - (w - scaleSize[0]) / 2.f) / scaleSize[0];
+		bbox[1] = (bbox[1] * h - (h - scaleSize[1]) / 2.f) / scaleSize[1];
+		bbox[2] /= scaleSize[0];
+		bbox[3] /= scaleSize[1];
+	}
+
+	//nms
+	float nmsThresh = parser::getFloatValue("nms");
+	if (nmsThresh > 0)
+		DoNms(detections, classes, nmsThresh);
+
+	vector<Bbox> boxes;
+	for (const auto& item : detections)
+	{
+		auto& b = item.bbox;
+		Bbox bbox =
+		{
+			item.classId,   //classId
+			max(int((b[0] - b[2] / 2.)*width),0), //left
+			min(int((b[0] + b[2] / 2.)*width),width), //right
+			max(int((b[1] - b[3] / 2.)*height),0), //top
+			min(int((b[1] + b[3] / 2.)*height),height), //bot
+			item.prob       //score
+		};
+		boxes.push_back(bbox);
+	}
+	//auto t_end = chrono::high_resolution_clock::now();
+	//float total = chrono::duration<float, milli>(t_end - t_start).count();
+	//cout << "Time taken for nms is " << total << " ms." << endl;
+	return boxes;
 }
 
 vector<string> split(const string& str, char delim)
 {
-    stringstream ss(str);
-    string token;
-    vector<string> container;
-    while (getline(ss, token, delim)) {
-        container.push_back(token);
-    }
-
-    return container;
+	stringstream ss(str);
+	string token;
+	vector<string> container;
+	while (getline(ss, token, delim)) {
+		container.push_back(token);
+	}
+
+	return container;
+}
+
+
+void error(const char *s)
+{
+	perror(s);
+	assert(0);
+	exit(EXIT_FAILURE);
+}
+
+
+void *fetch_in_thread(void *ptr)
+{
+
+	cap >> frame;
+	if (!frame.data)
+	{
+		flag_exit = 1;
+		return 0;
+	}
+	//cv::Mat img = cv::imread(filename);
+	inputData = prepareImage(frame);
+
+	return 0;
 }
 
-int main( int argc, char* argv[] )
+void *detect_in_thread(void *ptr)
+{
+	trtNet *net;
+	net = (trtNet*)ptr;
+	//cout << frame.flags << endl;
+	unique_ptr<float[]> outputData(new float[outputCount]);
+	net->doInference(inputData_.data(), outputData.get());
+
+	//Get Output    
+	auto output = outputData.get();
+
+	//first detect count
+	int count = output[0];
+	//later detect result
+	vector<Detection> result;
+	result.resize(count);
+	memcpy(result.data(), &output[1], count * sizeof(Detection));
+
+	auto boxes = postProcessImg(frame_, result, classNum);
+	//outputs.emplace_back(boxes);
+
+	//auto bbox = *outputs.begin();
+	for (const auto& item : boxes)
+	{
+		cv::rectangle(frame_, cv::Point(item.left, item.top), cv::Point(item.right, item.bot), cv::Scalar(0, 0, 255), 3, 8, 0);
+		cv::rectangle(frame_, cv::Point(item.left, item.top - 20), cv::Point(item.right, item.top), cv::Scalar(0, 0, 255), CV_FILLED, 8, 0);
+		cv::putText(frame_, ClassName[item.classId], cv::Point(item.left, item.top), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(255, 255, 255), 2);
+		cout << "class=" << ClassName[item.classId] << " prob=" << item.score * 100 << endl;
+		//cout << "left=" << item.left << " right=" << item.right << " top=" << item.top << " bot=" << item.bot << endl;
+	}
+
+	return 0;
+}
+
+
+void do_video_or_cam(trtNet &net)
 {
-    parser::ADD_ARG_STRING("prototxt",Desc("input yolov3 deploy"),DefaultValue(INPUT_PROTOTXT),ValueDesc("file"));
-    parser::ADD_ARG_STRING("caffemodel",Desc("input yolov3 caffemodel"),DefaultValue(INPUT_CAFFEMODEL),ValueDesc("file"));
-    parser::ADD_ARG_INT("C",Desc("channel"),DefaultValue(to_string(INPUT_CHANNEL)));
-    parser::ADD_ARG_INT("H",Desc("height"),DefaultValue(to_string(INPUT_HEIGHT)));
-    parser::ADD_ARG_INT("W",Desc("width"),DefaultValue(to_string(INPUT_WIDTH)));
-    parser::ADD_ARG_STRING("calib",Desc("calibration image List"),DefaultValue(CALIBRATION_LIST),ValueDesc("file"));
-    parser::ADD_ARG_STRING("mode",Desc("runtime mode"),DefaultValue(MODE), ValueDesc("fp32/fp16/int8"));
-    parser::ADD_ARG_STRING("outputs",Desc("output nodes name"),DefaultValue(OUTPUTS));
-    parser::ADD_ARG_INT("class",Desc("num of classes"),DefaultValue(to_string(DETECT_CLASSES)));
-    parser::ADD_ARG_FLOAT("nms",Desc("non-maximum suppression value"),DefaultValue(to_string(NMS_THRESH)));
-
-    //input
-    parser::ADD_ARG_STRING("input",Desc("input image file"),DefaultValue(INPUT_IMAGE),ValueDesc("file"));
-    parser::ADD_ARG_STRING("evallist",Desc("eval gt list"),DefaultValue(EVAL_LIST),ValueDesc("file"));
-
-    if(argc < 2){
-        parser::printDesc();
-        exit(-1);
-    }
-
-    parser::parseArgs(argc,argv);
-
-    string deployFile = parser::getStringValue("prototxt");
-    string caffemodelFile = parser::getStringValue("caffemodel");
-
-    vector<vector<float>> calibData;
-    string calibFileList = parser::getStringValue("calib");
-    string mode = parser::getStringValue("mode");
-    if(calibFileList.length() > 0 && mode == "int8")
-    {   
-        cout << "find calibration file,loading ..." << endl;
-      
-        ifstream file(calibFileList);  
-        if(!file.is_open())
-        {
-            cout << "read file list error,please check file :" << calibFileList << endl;
-            exit(-1);
-        }
-
-        string strLine;  
-        while( getline(file,strLine) )                               
-        { 
-            cv::Mat img = cv::imread(strLine);
-            auto data = prepareImage(img);
-            calibData.emplace_back(data);
-        } 
-        file.close();
-    }
-
-    RUN_MODE run_mode = RUN_MODE::FLOAT32;
-    if(mode == "int8")
-    {
-        if(calibFileList.length() == 0)
-            cout << "run int8 please input calibration file, will run in fp32" << endl;
-        else
-            run_mode = RUN_MODE::INT8;
-    }
-    else if(mode == "fp16")
-    {
-        run_mode = RUN_MODE::FLOAT16;
-    }
-    
-    string outputNodes = parser::getStringValue("outputs");
-    auto outputNames = split(outputNodes,',');
-    
-    //can load from file
-    string saveName = "yolov3_" + mode + ".engine";
-
-//#define LOAD_FROM_ENGINE
-#ifdef LOAD_FROM_ENGINE    
-    trtNet net(saveName);
-#else
-    trtNet net(deployFile,caffemodelFile,outputNames,calibData,run_mode);
-    cout << "save Engine..." << saveName <<endl;
-    net.saveEngine(saveName);
-#endif
-
-    int outputCount = net.getOutputSize()/sizeof(float);
-    unique_ptr<float[]> outputData(new float[outputCount]);
-
-    string listFile = parser::getStringValue("evallist");
-    list<string> fileNames;
-    list<vector<Bbox>> groundTruth;
-
-    if(listFile.length() > 0)
-    {
-        std::cout << "loading from eval list " << listFile << std::endl; 
-        tie(fileNames,groundTruth) = readObjectLabelFileList(listFile);
-    }
-    else
-    {
-        string inputFileName = parser::getStringValue("input");
-        fileNames.push_back(inputFileName);
-    }
-
-    list<vector<Bbox>> outputs;
-    int classNum = parser::getIntValue("class");
-    for (const auto& filename :fileNames)
-    {
-        std::cout << "process: " << filename << std::endl;
-
-        cv::Mat img = cv::imread(filename);
-        vector<float> inputData = prepareImage(img);
-        if (!inputData.data())
-            continue;
-
-        net.doInference(inputData.data(), outputData.get());
-
-        //Get Output    
-        auto output = outputData.get();
-
-        //first detect count
-        int count = output[0];
-        //later detect result
-        vector<Detection> result;
-        result.resize(count);
-        memcpy(result.data(), &output[1], count*sizeof(Detection));
-
-        auto boxes = postProcessImg(img,result,classNum);
-        outputs.emplace_back(boxes);
-    }
-    
-    net.printTime();        
-
-    if(groundTruth.size() > 0)
-    {
-        //eval map
-        evalMAPResult(outputs,groundTruth,classNum,0.5f);
-        evalMAPResult(outputs,groundTruth,classNum,0.75f);
-    }
-
-    if(fileNames.size() == 1)
-    {
-        //draw on image
-        cv::Mat img = cv::imread(*fileNames.begin());
-        auto bbox = *outputs.begin();
-        for(const auto& item : bbox)
-        {
-            cv::rectangle(img,cv::Point(item.left,item.top),cv::Point(item.right,item.bot),cv::Scalar(0,0,255),3,8,0);
-            cout << "class=" << item.classId << " prob=" << item.score*100 << endl;
-            cout << "left=" << item.left << " right=" << item.right << " top=" << item.top << " bot=" << item.bot << endl;
-        }
-        cv::imwrite("result.jpg",img);
-        cv::imshow("result",img);
-        cv::waitKey(0);
-    }
-
-    return 0;
+	if (!cap.isOpened()) {
+		std::cout << "Error: video-stream can't be opened! \n";
+		exit(EXIT_FAILURE);
+	}
+	pthread_t fetch_thread;
+	pthread_t detect_thread;
+
+	fetch_in_thread(0);
+	inputData_ = inputData;
+	frame_ = frame;
+	//detect_in_thread(0);
+	//fetch_in_thread(0);
+	//inputData_ = inputData;
+	//frame_ = frame;
+
+
+	if (parser::getIntValue("display"))
+	{
+		cv::namedWindow("result", CV_WINDOW_NORMAL);
+		cv::resizeWindow("result", 640, 480);
+	}
+
+
+	float fps = 0;
+
+	//for (const auto& filename :fileNames)
+
+	while (1)
+	{
+		auto t_start = std::chrono::high_resolution_clock::now();
+		if (pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
+		if (pthread_create(&detect_thread, 0, detect_in_thread, &net)) error("Thread creation failed");//创造一个线程运行网络
+
+
+		pthread_join(fetch_thread, 0);//塞入线程
+		pthread_join(detect_thread, 0);
+
+
+
+		if (parser::getIntValue("display"))
+		{
+			cv::imshow("result", frame_);
+			if (cv::waitKey(5) == 27)
+			{
+				break;
+			}
+		}
+		inputData_ = inputData;
+		frame_ = frame;
+		if (flag_exit == 1)
+		{
+			break;
+		}
+
+
+		auto t_end = std::chrono::high_resolution_clock::now();
+		float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
+		float cout = 1000. / total;
+		fps = 0.9*fps + 0.1*cout;
+		std::cout << "fps is " << fps << std::endl;
+
+	}
+	std::cout << "video stream close\n " << std::endl;
+	cv::destroyAllWindows();
+	cap.release();
 }
+
+
+
+void *fetch_image_in_thread(void *ptr)
+{
+	if (!frame.data)
+	{
+		return 0;
+	}
+	//cv::Mat img = cv::imread(filename);
+	inputData = prepareImage(frame);
+	//in_s = resize_image(in, net.w, net.h);
+
+	return 0;
+}
+
+
+void *detect_image_in_thread(void *ptr)
+{
+	trtNet *net;
+	net = (trtNet*)ptr;
+	//cout << frame.flags << endl;
+	unique_ptr<float[]> outputData(new float[outputCount]);
+	net->doInference(inputData_.data(), outputData.get());
+
+	//Get Output    
+	auto output = outputData.get();
+
+	//first detect count
+	int count = output[0];
+	//later detect result
+	vector<Detection> result;
+	result.resize(count);
+	memcpy(result.data(), &output[1], count * sizeof(Detection));
+
+	auto boxes = postProcessImg(frame_, result, classNum);
+	outputs.emplace_back(boxes);
+
+	//auto bbox = *outputs.begin();
+	for (const auto& item : boxes)
+	{
+		cv::rectangle(frame_, cv::Point(item.left, item.top), cv::Point(item.right, item.bot), cv::Scalar(0, 0, 255), 3, 8, 0);
+		cv::rectangle(frame_, cv::Point(item.left, item.top - 20), cv::Point(item.right, item.top), cv::Scalar(0, 0, 255), CV_FILLED, 8, 0);
+		cv::putText(frame_, ClassName[item.classId], cv::Point(item.left, item.top), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(255, 255, 255), 2);
+		cout << "class=" << ClassName[item.classId] << " prob=" << item.score * 100 << endl;
+		//cout << "left=" << item.left << " right=" << item.right << " top=" << item.top << " bot=" << item.bot << endl;
+	}
+
+	return 0;
+}
+
+
+void do_image(trtNet &net)
+{
+	pthread_t fetch_thread;
+	pthread_t detect_thread;
+	list<string>::iterator it;
+	it = fileNames.begin();
+	//cout << "deal image:" << *it << endl;
+	frame = cv::imread(*it);
+	vector<string> line = split(*it, '/');
+	string save_img_name = parser::getStringValue("savefile")+"/"+ *(line.end()-1);
+	it++;
+
+	fetch_image_in_thread(0);
+	inputData_ = inputData;
+	frame_ = frame;
+
+	if (parser::getIntValue("display"))
+	{
+		cv::namedWindow("result", CV_WINDOW_NORMAL);
+		cv::resizeWindow("result", 640, 480);
+	}
+	float fps = 0;
+
+	while(1)
+	{
+		auto t_start = chrono::high_resolution_clock::now();
+		if (it != fileNames.end())
+		{
+			frame = cv::imread(*it);
+		}
+
+		if (pthread_create(&fetch_thread, 0, fetch_image_in_thread, 0)) error("Thread creation failed");
+		if (pthread_create(&detect_thread, 0, detect_image_in_thread, &net)) error("Thread creation failed");//创造一个线程运行网络
+
+		pthread_join(fetch_thread, 0);//塞入线程
+		pthread_join(detect_thread, 0);
+
+		if (parser::getIntValue("display"))
+		{
+			cv::imshow("result", frame_);
+			if (cv::waitKey(5) == 27)
+			{
+				break;
+			}
+		}
+		if (parser::getIntValue("saveimg"))
+		{
+			cv::imwrite(save_img_name, frame_);
+		}
+
+		inputData_ = inputData;
+		frame_ = frame;
+		auto t_end = chrono::high_resolution_clock::now();
+		float total = chrono::duration<float, std::milli>(t_end - t_start).count();
+		float cout = 1000. / total;
+		fps = 0.9*fps + 0.1*cout;
+		std::cout << "fps is " << fps << std::endl;
+		if (it == fileNames.end()) break;
+		line = split(*it, '/');
+		save_img_name = parser::getStringValue("savefile") + "/" + *(line.end() - 1);
+		it++;
+		
+	}
+	cout << "all over\n " << endl;
+	cv::destroyAllWindows();
+	if (groundTruth.size() > 0)
+	{
+		//eval map
+		evalMAPResult(outputs, groundTruth, classNum, 0.5f);
+		evalMAPResult(outputs, groundTruth, classNum, 0.75f);
+	}
+	return ;
+}
+
+int main(int argc, char* argv[])
+{
+	parser::ADD_ARG_STRING("prototxt", Desc("input yolov3 deploy"), DefaultValue(INPUT_PROTOTXT), ValueDesc("file"));
+	parser::ADD_ARG_STRING("caffemodel", Desc("input yolov3 caffemodel"), DefaultValue(INPUT_CAFFEMODEL), ValueDesc("file"));
+	parser::ADD_ARG_INT("C", Desc("channel"), DefaultValue(to_string(INPUT_CHANNEL)));
+	parser::ADD_ARG_INT("H", Desc("height"), DefaultValue(to_string(INPUT_HEIGHT)));
+	parser::ADD_ARG_INT("W", Desc("width"), DefaultValue(to_string(INPUT_WIDTH)));
+	parser::ADD_ARG_STRING("calib", Desc("calibration image List"), DefaultValue(CALIBRATION_LIST), ValueDesc("file"));
+	parser::ADD_ARG_STRING("mode", Desc("runtime mode"), DefaultValue(MODE), ValueDesc("fp32/fp16/int8"));
+	parser::ADD_ARG_STRING("outputs", Desc("output nodes name"), DefaultValue(OUTPUTS));
+	parser::ADD_ARG_INT("class", Desc("num of classes"), DefaultValue(to_string(DETECT_CLASSES)));
+	parser::ADD_ARG_FLOAT("nms", Desc("non-maximum suppression value"), DefaultValue(to_string(NMS_THRESH)));
+	parser::ADD_ARG_FLOAT("classname", Desc("class name"), DefaultValue(CLASS_NAME), ValueDesc("file"));
+	parser::ADD_ARG_INT("display", Desc("whether display video"), DefaultValue(to_string(DISPLAY)));
+	parser::ADD_ARG_INT("saveimg", Desc("whether save image"), DefaultValue(to_string(SAVEIMG)));
+	parser::ADD_ARG_STRING("inputstream", Desc("input stream"), DefaultValue(INPUT_STREAM));
+	parser::ADD_ARG_INT("cam", Desc("cam"), DefaultValue(to_string(CAM)));
+	parser::ADD_ARG_STRING("videofile", Desc("videofile"), DefaultValue(VIDEOFILE), ValueDesc("file"));
+	parser::ADD_ARG_STRING("savefile", Desc("savefile"), DefaultValue(SAVEFILE));
+	parser::ADD_ARG_STRING("input", Desc("input image file"), DefaultValue(INPUT_IMAGE), ValueDesc("file"));
+	parser::ADD_ARG_STRING("evallist", Desc("eval gt list"), DefaultValue(EVAL_LIST), ValueDesc("file"));
+
+	if (argc < 2) {
+		parser::printDesc();
+		return 1;
+	}
+
+	parser::parseArgs(argc, argv);
+
+	string deployFile = parser::getStringValue("prototxt");
+	string caffemodelFile = parser::getStringValue("caffemodel");
+
+	vector<vector<float>> calibData;
+	string calibFileList = parser::getStringValue("calib");  //校验文件的名字 txt
+	string mode = parser::getStringValue("mode");
+	if (calibFileList.length() > 0 && mode == "int8")
+	{
+		cout << "find calibration file,loading ..." << endl;
+
+		ifstream file(calibFileList);
+		if (!file.is_open())
+		{
+			cout << "read file list error,please check file :" << calibFileList << endl;
+			return 1;
+		}
+
+		string strLine;
+		while (getline(file, strLine))
+		{
+			cv::Mat img = cv::imread(strLine);
+			auto data = prepareImage(img);  //转换为608*608的数据
+			calibData.emplace_back(data);
+		}
+		file.close();
+	}
+
+	RUN_MODE run_mode = RUN_MODE::FLOAT32;
+	if (mode == "int8")
+	{
+		if (calibFileList.length() == 0)
+			cout << "run int8 please input calibration file, will run in fp32" << endl;
+		else
+			run_mode = RUN_MODE::INT8;
+	}
+	else if (mode == "fp16")
+	{
+		run_mode = RUN_MODE::FLOAT16;
+	}
+
+	string outputNodes = parser::getStringValue("outputs");
+	auto outputNames = split(outputNodes, ',');
+
+	//can load from file
+	string saveName = "yolov3_" + mode + ".engine";
+
+	//#define LOAD_FROM_ENGINE
+	std::ifstream serialize_iutput_stream(saveName, std::ios::in | std::ios::binary);
+	trtNet net;
+	net.set_mode(run_mode);
+
+	if (!serialize_iutput_stream)
+	{
+		net.trtNet_engine(deployFile, caffemodelFile, outputNames, calibData, run_mode);
+		cout << "save Engine..." << saveName << endl;
+		net.saveEngine(saveName);
+	}
+	else
+	{
+		net.trtNet_engine(saveName);
+	}
+
+	outputCount = net.getOutputSize() / sizeof(float);
+
+	classNum = parser::getIntValue("class");
+	string classFile = parser::getStringValue("classname");
+	ifstream ClassNamelist(classFile);
+	if (!ClassNamelist.is_open())
+	{
+		std::cout << "read file list error,please check file :" << classFile << std::endl;
+		return 1;
+	}
+	string strLine;
+	while (getline(ClassNamelist, strLine))
+	{
+		ClassName.push_back(strLine);
+	}
+
+	ClassNamelist.close();
+
+	string inputstream = parser::getStringValue("inputstream");
+
+	if (!inputstream.compare("video"))
+	{
+		string video_file = parser::getStringValue("videofile");
+		cap.open(video_file);
+		do_video_or_cam(net);
+	}
+	else if (!inputstream.compare("cam"))
+	{
+		int cam_index = parser::getIntValue("cam");
+		cap.open(cam_index);
+		do_video_or_cam(net);
+	}
+	else if (!inputstream.compare("image"))
+	{
+		string listFile = parser::getStringValue("evallist");
+		if (listFile.length() > 0)
+		{
+
+			std::cout << "loading from eval list " << listFile << std::endl;
+			tie(fileNames, groundTruth) = readObjectLabelFileList(listFile);
+		}
+		else
+		{
+			string inputFileName = parser::getStringValue("input");
+			if (inputFileName.find(".jpg")!= inputFileName.npos || inputFileName.find(".png") != inputFileName.npos)//如果是单张图片
+			{
+				fileNames.push_back(inputFileName);
+			}
+			else
+			{
+				ifstream readinputfile(inputFileName);
+				if (!readinputfile.is_open())
+				{
+					cout << "can't read input image list" << inputFileName << endl;
+					return 1;
+				}
+				string strLine;
+				while (getline(readinputfile, strLine))
+				{
+					fileNames.push_back(strLine);
+				}
+				readinputfile.close();
+			}
+			//fileNames.push_back(inputFileName);
+		}
+		//list<vector<Bbox>> outputs;
+		do_image(net);
+	}
+
+	//net.~trtNet();
+	return 0;
+}
+
+//net.~trtNet();
\ No newline at end of file
diff --git a/readme.md b/readme.md
index c07608e..7371adb 100644
--- a/readme.md
+++ b/readme.md
@@ -1,14 +1,16 @@
 # TRTForYolov3
 
+<a href="https://996.icu"><img src="https://img.shields.io/badge/link-996.icu-red.svg" alt="996.icu" /></a>
+
 ## Desc
 
     tensorRT for Yolov3
 
 ### Test Enviroments
 
-    Ubuntu  16.04
-    TensorRT 5.0.2.6/4.0.1.6
-    CUDA 9.2
+    Ubuntu  16.04  or Jetson Nano
+    TensorRT 5.1/5.0.2.6/4.0.1.6
+    CUDA 9.2 or CUDA 9.0 or CUDA 10.0
 
 ### Models
 
@@ -33,64 +35,59 @@ layer {
 
 It also needs to change the yolo configs in "YoloConfigs.h" if different kernels.
 
-### Run Sample
+# build source code
 
-```bash
-#build source code
 git submodule update --init --recursive
 mkdir build
 cd build && cmake .. && make && make install
 cd ..
 
-#for yolov3-608
-./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --input=./test.jpg --W=608 --H=608 --class=80
 
-#for fp16
-./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --input=./test.jpg --W=608 --H=608 --class=80 --mode=fp16
+# what I do
 
-#for int8 with calibration datasets
-./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --input=./test.jpg --W=608 --H=608 --class=80 --mode=int8 --calib=./calib_sample.txt
+1.Added multithreading
 
-#for yolov3-416 (need to modify include/YoloConfigs for YoloKernel)
-./install/runYolov3 --caffemodel=./yolov3_416.caffemodel --prototxt=./yolov3_416.prototxt --input=./test.jpg --W=416 --H=416 --class=80
-```
+2.Added tag name
 
+3.Added video inference
 
-### Performance
 
-Model | GPU | Mode | Inference Time
--- | -- | -- | -- 
-Yolov3-416 |  GTX 1060 | Caffe | 54.593ms
-Yolov3-416 |  GTX 1060 | float32 | 23.817ms
-Yolov3-416 |  GTX 1060 | int8 | 11.921ms
-Yolov3-608 |  GTX 1060 | Caffe | 88.489ms
-Yolov3-608 | GTX 1060 | float32 | 43.965ms
-Yolov3-608 |  GTX 1060 | int8 | 21.638ms
-Yolov3-608 | GTX 1080 Ti | float32 | 19.353ms
-Yolov3-608 | GTX 1080 Ti | int8 | 9.727ms
-Yolov3-416 |  GTX 1080 Ti | float32 | 9.677ms
-Yolov3-416 |  GTX 1080 Ti | int8 | 6.129ms  | li
+# for yolov3-608
+
+## video
+
+./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --display=1 --inputstream=video --videofile=sample_720p.mp4 --classname=coco.names
+
+## cam
 
-### Eval Result
+./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --display=1 --inputstream=cam --cam=0 --classname=coco.names
 
-run above models with appending ```--evallist=labels.txt```
+## int8
+
+./install/runYolov3 --caffemodel=./yolov3_608.caffemodel --prototxt=./yolov3_608.prototxt --display=1 --inputstream=cam --cam=0 --classname=coco.names --mode=int8 --calib=cal.list
+
+## example
+
+
+![图片alt](https://raw.githubusercontent.com/talebolano/TensorRT-Yolov3/master/image/example.png)
+
+### Performance
 
-int8 calibration data made from 200 pics selected in val2014 (see scripts dir)
+Model |TYPE | GPU | Mode | Inference Time | FPS
+-- | -- | -- | -- | -- |--|
+Yolov3-608 | tensorRT |GTX 1060(laptop)(win10) | float32 | 58ms | 15
+Yolov3-608 | tensorRT |GTX 1060(laptop)(win10) | int8 | 33ms | 18
+Yolov3-608 | tensorRT |P40 | float32 | 20ms | 40(display)
+Yolov3-608 | darknet |P40 | float32 | -- | 30(display)
+Yolov3-608 | tensorRT |P40 | int8 | 13ms | 50(display)
+Yolov3-416 | tensorRT |P40 | float32 | 12ms | 60(display)
+Yolov3-416 | darknet |P40 | float32 | -- | 50(display)
+Yolov3-416 | tensorRT |P40 | int8 | 8ms | 70(display)
+Yolov3-416 | tensorRT |jeston nano | fp16 | 343ms | 2.8
+Yolov3-tiny-416 | darknet | jeston nano | fp32 | -- | 17(display)
+Yolov3-tiny-416 | tensorRT | jeston nano | fp16 | 37ms | 25(display)
 
-Model | GPU | Mode | dataset | MAP(0.50) | MAP(0.75)
--- | -- | -- | -- | -- | --
-Yolov3-416 | GTX 1060 | Caffe | COCO val2014 | 81.76 | 52.05
-Yolov3-416 | GTX 1060 | float32 | COCO val2014 | 81.93 | 52.19
-Yolov3-416 | GTX 1060 | int8 | COCO val2014 | 86.41 | 57.11
-Yolov3-416 | GTX 1060 | Caffe | COCO val2014 | 80.41 | 52.33
-Yolov3-608 | GTX 1060 | float32 | COCO val2014 |  80.6 | 52.43
-Yolov3-608 | GTX 1060 | int8 | COCO val2014 |  85.35 | 56.88 | li
 
 
-Notice: 
-+ caffe implementation is little different in yolo layer and nms, and it should be the similar result compared to tensorRT fp32. 
-+ Int8 mode gets better result in the val dataset, but not certainly in other test data. And exactly it is more often a little worse.
 
-### Details About Wrapper
 
-see link [TensorRTWrapper](https://github.com/lewes6369/tensorRTWrapper)
diff --git a/tensorRTW/code/CMakeLists.txt b/tensorRTW/code/CMakeLists.txt
new file mode 100644
index 0000000..d6b133c
--- /dev/null
+++ b/tensorRTW/code/CMakeLists.txt
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 2.8)
+project(trtNet)
+
+set(CMAKE_BUILD_TYPE Release)
+
+#include
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+#src
+set(PLUGIN_SOURCES
+  src/EntroyCalibrator.cpp
+  src/UpsampleLayer.cpp
+  src/UpsampleLayer.cu
+  src/YoloLayer.cu
+  src/TrtNet.cpp
+)
+
+#
+# CUDA Configuration
+#
+find_package(CUDA REQUIRED)
+
+set(CUDA_VERBOSE_BUILD ON)
+
+# Specify the cuda host compiler to use the same compiler as cmake.
+set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+
+# TensorRT
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+  HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES include)
+MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
+find_library(TENSORRT_LIBRARY_INFER nvinfer
+  HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64 lib/x64)
+find_library(TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin
+  HINTS  ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64 lib/x64)
+  find_library(TENSORRT_LIBRARY_PARSER nvparsers
+  HINTS  ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64 lib/x64)
+set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN} ${TENSORRT_LIBRARY_PARSER})
+MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
+find_package_handle_standard_args(
+  TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR TENSORRT_LIBRARY)
+if(NOT TENSORRT_FOUND)
+  message(ERROR
+    "Cannot find TensorRT library.")
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")	# -std=gnu++11
+set(BUILD_DEPS "YES" CACHE BOOL "If YES, will install dependencies into sandbox.  Automatically reset to NO after dependencies are installed.")
+
+# if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
+#   list(APPEND CUDA_NVCC_FLAGS -std=c++11)
+# endif()
+list(APPEND CUDA_NVCC_FLAGS "-D_FORCE_INLINES -Xcompiler -fPIC")
+CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
+CUDA_ADD_LIBRARY(TrtNet STATIC ${PLUGIN_SOURCES})
+
+target_include_directories(TrtNet PUBLIC ${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
+target_link_libraries(TrtNet ${TENSORRT_LIBRARY})
\ No newline at end of file
diff --git a/tensorRTW/code/include/EntroyCalibrator.h b/tensorRTW/code/include/EntroyCalibrator.h
new file mode 100644
index 0000000..57346d8
--- /dev/null
+++ b/tensorRTW/code/include/EntroyCalibrator.h
@@ -0,0 +1,45 @@
+#ifndef _ENTROY_CALIBRATOR_H
+#define _ENTROY_CALIBRATOR_H
+
+#include <cudnn.h>
+#include <string>
+#include <vector>
+#include "NvInfer.h"
+#include "Utils.h"
+
+namespace nvinfer1
+{
+
+class Int8EntropyCalibrator : public IInt8EntropyCalibrator
+{
+public:
+	Int8EntropyCalibrator(int BatchSize,const std::vector<std::vector<float>>& data,const std::string& CalibDataName = "",bool readCache = true);
+
+	virtual ~Int8EntropyCalibrator();
+
+	int getBatchSize() const override { return mBatchSize; }
+
+	bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
+
+	const void* readCalibrationCache(size_t& length) override;
+
+	void writeCalibrationCache(const void* cache, size_t length) override;
+
+private:
+	std::string mCalibDataName;
+	std::vector<std::vector<float>> mDatas;
+	int mBatchSize;
+
+	int mCurBatchIdx;
+	float* mCurBatchData{ nullptr };
+	
+	size_t mInputCount;
+	bool mReadCache;
+	void* mDeviceInput{ nullptr };
+
+	std::vector<char> mCalibrationCache;
+};
+
+}	//namespace
+
+#endif //_ENTROY_CALIBRATOR_H
diff --git a/tensorRTW/code/include/PluginFactory.h b/tensorRTW/code/include/PluginFactory.h
new file mode 100644
index 0000000..9f3caea
--- /dev/null
+++ b/tensorRTW/code/include/PluginFactory.h
@@ -0,0 +1,129 @@
+#ifndef __PLUGIN_FACTORY_H_
+#define __PLUGIN_FACTORY_H_
+
+#include <vector>
+#include <memory>
+#include <regex>
+#include "UpsampleLayer.h"
+#include "YoloLayer.h"
+#include "NvInferPlugin.h"
+#include "NvCaffeParser.h"
+
+namespace Tn
+{
+    static constexpr float NEG_SLOPE = 0.1;
+    static constexpr float UPSAMPLE_SCALE = 2.0;
+    static constexpr int CUDA_THREAD_NUM = 512;
+
+    // Integration for serialization.
+    using nvinfer1::plugin::INvPlugin;
+    using nvinfer1::plugin::createPReLUPlugin;
+    using nvinfer1::UpsampleLayerPlugin;
+    using nvinfer1::YoloLayerPlugin;
+    class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactoryExt
+    {
+        public:
+            inline bool isLeakyRelu(const char* layerName)
+            {
+                return std::regex_match(layerName , std::regex(R"(layer(\d*)-act)"));
+            }
+
+            inline bool isUpsample(const char* layerName)
+            {
+                return std::regex_match(layerName , std::regex(R"(layer(\d*)-upsample)"));
+            }
+
+            inline bool isYolo(const char* layerName)
+            {
+                return strcmp(layerName,"yolo-det") == 0;
+            }
+
+            virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override
+            {
+                assert(isPlugin(layerName));
+
+                if(isLeakyRelu(layerName))
+                {
+                    assert(nbWeights == 0 && weights == nullptr);
+                    mPluginLeakyRelu.emplace_back(std::unique_ptr<INvPlugin, void(*)(INvPlugin*)>(createPReLUPlugin(NEG_SLOPE), nvPluginDeleter));
+                    return mPluginLeakyRelu.back().get();
+                }
+                else if (isUpsample(layerName))
+                {
+                    assert(nbWeights == 0 && weights == nullptr);
+                    mPluginUpsample.emplace_back(std::unique_ptr<UpsampleLayerPlugin>(new UpsampleLayerPlugin(UPSAMPLE_SCALE,CUDA_THREAD_NUM)));
+                    return mPluginUpsample.back().get();
+                }
+                else if (isYolo(layerName))
+                {
+                    assert(nbWeights == 0 && weights == nullptr && mPluginYolo.get() ==  nullptr);
+                    mPluginYolo.reset(new YoloLayerPlugin(CUDA_THREAD_NUM));
+                    return mPluginYolo.get();
+                }
+                else
+                {
+                assert(0);
+                return nullptr;
+                }
+            }
+
+        nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
+        {
+            assert(isPlugin(layerName));
+
+            if (isLeakyRelu(layerName))
+            {
+                mPluginLeakyRelu.emplace_back(std::unique_ptr<INvPlugin, void (*)(INvPlugin*)>(createPReLUPlugin(serialData, serialLength), nvPluginDeleter));
+                return mPluginLeakyRelu.back().get();
+            }
+            else if (isUpsample(layerName))
+            {
+                mPluginUpsample.emplace_back(std::unique_ptr<UpsampleLayerPlugin>(new UpsampleLayerPlugin(serialData, serialLength)));
+                return mPluginUpsample.back().get();
+            }
+            else if (isYolo(layerName))
+            {
+                assert(mPluginYolo.get() ==  nullptr);
+                mPluginYolo.reset(new YoloLayerPlugin(serialData, serialLength));
+                return mPluginYolo.get();
+            }
+            else
+            {
+                assert(0);
+                return nullptr;
+            }
+        }
+
+
+        bool isPlugin(const char* name) override
+        {
+            return isPluginExt(name);
+        }
+
+        bool isPluginExt(const char* name) override
+        {
+            //std::cout << "check plugin " << name  << isYolo(name)<< std::endl;
+            return isLeakyRelu(name) || isUpsample(name) || isYolo(name); 
+        }
+
+        // The application has to destroy the plugin when it knows it's safe to do so.
+        void destroyPlugin()
+        {
+            for (auto& item : mPluginLeakyRelu)
+                item.reset();
+
+            for (auto& item : mPluginUpsample)
+                item.reset();
+
+            mPluginYolo.reset();
+        }
+
+        void (*nvPluginDeleter)(INvPlugin*){[](INvPlugin* ptr) { if(ptr) ptr->destroy(); }};
+
+        std::vector<std::unique_ptr<INvPlugin,void (*)(INvPlugin*)>> mPluginLeakyRelu{};
+        std::vector<std::unique_ptr<UpsampleLayerPlugin>> mPluginUpsample{};
+        std::unique_ptr<YoloLayerPlugin> mPluginYolo {nullptr};
+    };
+}
+
+#endif
\ No newline at end of file
diff --git a/tensorRTW/code/include/TrtNet.h b/tensorRTW/code/include/TrtNet.h
new file mode 100644
index 0000000..0d3c759
--- /dev/null
+++ b/tensorRTW/code/include/TrtNet.h
@@ -0,0 +1,116 @@
+﻿#ifndef __TRT_NET_H_
+#define __TRT_NET_H_
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <fstream>
+#include <numeric>
+#include "NvInferPlugin.h"
+#include "NvCaffeParser.h"
+#include "PluginFactory.h"
+#include "Utils.h"
+
+namespace Tn
+{
+    enum class RUN_MODE
+    {
+        FLOAT32 = 0,
+        FLOAT16 = 1,    
+        INT8 = 2
+    };
+
+    class trtNet 
+    {
+        public:
+            //Load from caffe model
+			/* trtNet(const std::string& prototxt, const std::string& caffemodel, const std::vector<std::string>& outputNodesName,
+				const std::vector<std::vector<float>>& calibratorData, RUN_MODE mode /*= RUN_MODE::FLOAT32) :mTrtContext(nullptr), mTrtEngine(nullptr), mTrtRunTime(nullptr), mTrtRunMode(mode), mTrtInputCount(0), mTrtIterationTime(0) {};
+			 */
+			trtNet() :mTrtContext(nullptr), mTrtEngine(nullptr), mTrtRunTime(nullptr), mTrtInputCount(0), mTrtIterationTime(0) {};
+          
+
+            ~trtNet()
+            {
+                // Release the stream and the buffers
+                cudaStreamSynchronize(mTrtCudaStream);
+                cudaStreamDestroy(mTrtCudaStream);
+                for(auto& item : mTrtCudaBuffer)
+                    cudaFree(item);
+
+                mTrtPluginFactory.destroyPlugin();
+
+                if(!mTrtRunTime)
+                    mTrtRunTime->destroy();
+                if(!mTrtContext)
+                    mTrtContext->destroy();
+                if(!mTrtEngine)
+                    mTrtEngine->destroy();
+            };
+
+			void trtNet_engine(const std::string& prototxt, const std::string& caffeModel, const std::vector<std::string>& outputNodesName,
+				const std::vector<std::vector<float>>& calibratorData, RUN_MODE mode = RUN_MODE::FLOAT32);
+
+			void set_mode(RUN_MODE mode)
+			{
+				mTrtRunMode = mode;
+			}
+			//Load from engine file
+			void trtNet_engine(const std::string& engineFile);
+
+            void saveEngine(std::string fileName)
+            {
+                if(mTrtEngine)
+                {
+                    nvinfer1::IHostMemory* data = mTrtEngine->serialize();
+                    std::ofstream file;
+                    file.open(fileName,std::ios::binary | std::ios::out);
+                    if(!file.is_open())
+                    {
+                        std::cout << "read create engine file" << fileName <<" failed" << std::endl;
+                        return;
+                    }
+
+                    file.write((const char*)data->data(), data->size());
+                    file.close();
+                }
+            };
+
+            void doInference(const void* inputData, void* outputData);
+            
+            inline size_t getInputSize() {
+                return std::accumulate(mTrtBindBufferSize.begin(), mTrtBindBufferSize.begin() + mTrtInputCount,0);
+            };
+
+            inline size_t getOutputSize() {
+                return std::accumulate(mTrtBindBufferSize.begin() + mTrtInputCount, mTrtBindBufferSize.end(),0);
+            };
+            
+            void printTime()
+            {
+                mTrtProfiler.printLayerTimes(mTrtIterationTime);
+            }
+
+        private:
+                nvinfer1::ICudaEngine* loadModelAndCreateEngine(const char* deployFile, const char* modelFile,int maxBatchSize,
+                                        nvcaffeparser1::ICaffeParser* parser, nvcaffeparser1::IPluginFactory* pluginFactory,
+                                        nvinfer1::IInt8Calibrator* calibrator, nvinfer1::IHostMemory*& trtModelStream,const std::vector<std::string>& outputNodesName);
+
+                void InitEngine();
+
+                nvinfer1::IExecutionContext* mTrtContext;
+                nvinfer1::ICudaEngine* mTrtEngine;
+                nvinfer1::IRuntime* mTrtRunTime;
+                PluginFactory mTrtPluginFactory;    
+                cudaStream_t mTrtCudaStream;
+                Profiler mTrtProfiler;
+                RUN_MODE mTrtRunMode;
+
+                std::vector<void*> mTrtCudaBuffer;
+                std::vector<int64_t> mTrtBindBufferSize;
+                int mTrtInputCount;
+                int mTrtIterationTime;
+    };
+}
+
+#endif //__TRT_NET_H_
diff --git a/tensorRTW/code/include/UpsampleLayer.h b/tensorRTW/code/include/UpsampleLayer.h
new file mode 100644
index 0000000..0b3b271
--- /dev/null
+++ b/tensorRTW/code/include/UpsampleLayer.h
@@ -0,0 +1,72 @@
+#ifndef _UPSAMPLE_LAYER_H
+#define _UPSAMPLE_LAYER_H
+
+#include <assert.h>
+#include <cmath>
+#include <string.h>
+#include <cudnn.h>
+#include <cublas_v2.h>
+#include "NvInfer.h"
+#include "Utils.h"
+#include <iostream>
+
+namespace nvinfer1
+{
+    class UpsampleLayerPlugin: public IPluginExt
+    {
+    public:
+        explicit UpsampleLayerPlugin(const float scale, const int cudaThread = 512);
+        // create the plugin at runtime from a byte stream
+        UpsampleLayerPlugin(const void* data, size_t length);
+
+        ~UpsampleLayerPlugin();
+
+        int getNbOutputs() const override
+        {
+            return 1;
+        }
+
+        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
+
+        bool supportsFormat(DataType type, PluginFormat format) const override { 
+            //std::cout << "supportsFormat=== type:"  << int(type) << "format" << int(format) << std::endl;
+            return (type == DataType::kFLOAT || type == DataType::kHALF || type == DataType::kINT8 ) 
+            && format == PluginFormat::kNCHW; 
+        }
+
+        void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override;
+
+        int initialize() override;
+
+        virtual void terminate() override {
+        };
+
+        virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}
+
+        virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
+
+        virtual size_t getSerializationSize() override
+        {
+            return sizeof(nvinfer1::Dims) + sizeof(mDataType) + sizeof(mScale)
+            + sizeof(mOutputWidth) + sizeof(mOutputHeight) + sizeof(mThreadCount);
+        }
+
+        virtual void serialize(void* buffer) override;
+
+        template <typename Dtype>
+        void forwardGpu(const Dtype* input,Dtype * outputint ,int N,int C,int H ,int W);
+
+    private:
+        nvinfer1::Dims mCHW;
+        DataType mDataType{DataType::kFLOAT};
+        float mScale;
+        int mOutputWidth;
+        int mOutputHeight;
+        int mThreadCount;
+        
+        void* mInputBuffer  {nullptr}; 
+        void* mOutputBuffer {nullptr}; 
+    };
+};
+
+#endif 
diff --git a/tensorRTW/code/include/Utils.h b/tensorRTW/code/include/Utils.h
new file mode 100644
index 0000000..0de663c
--- /dev/null
+++ b/tensorRTW/code/include/Utils.h
@@ -0,0 +1,94 @@
+#ifndef __TRT_UTILS_H_
+#define __TRT_UTILS_H_
+
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <cudnn.h>
+
+#ifndef CUDA_CHECK
+
+#define CUDA_CHECK(callstr)                                                                    \
+    {                                                                                          \
+        cudaError_t error_code = callstr;                                                      \
+        if (error_code != cudaSuccess) {                                                       \
+            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
+            assert(0);                                                                         \
+        }                                                                                      \
+    }
+
+#endif
+
+namespace Tn
+{
+    class Profiler : public nvinfer1::IProfiler
+    {
+    public:
+        void printLayerTimes(int itrationsTimes)
+        {
+            float totalTime = 0;
+            for (size_t i = 0; i < mProfile.size(); i++)
+            {
+                printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes);
+                totalTime += mProfile[i].second;
+            }
+            printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes);
+        }
+    private:
+        typedef std::pair<std::string, float> Record;
+        std::vector<Record> mProfile;
+
+        virtual void reportLayerTime(const char* layerName, float ms)
+        {
+            auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
+            if (record == mProfile.end())
+                mProfile.push_back(std::make_pair(layerName, ms));
+            else
+                record->second += ms;
+        }
+    };
+
+    //Logger for TensorRT info/warning/errors
+    class Logger : public nvinfer1::ILogger
+    {
+    public:
+
+        Logger(): Logger(Severity::kWARNING) {}
+
+        Logger(Severity severity): reportableSeverity(severity) {}
+
+        void log(Severity severity, const char* msg) override
+        {
+            // suppress messages with severity enum value greater than the reportable
+            if (severity > reportableSeverity) return;
+
+            switch (severity)
+            {
+                case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
+                case Severity::kERROR: std::cerr << "ERROR: "; break;
+                case Severity::kWARNING: std::cerr << "WARNING: "; break;
+                case Severity::kINFO: std::cerr << "INFO: "; break;
+                default: std::cerr << "UNKNOWN: "; break;
+            }
+            std::cerr << msg << std::endl;
+        }
+
+        Severity reportableSeverity{Severity::kWARNING};
+    };
+
+    template<typename T> 
+    void write(char*& buffer, const T& val)
+    {
+        *reinterpret_cast<T*>(buffer) = val;
+        buffer += sizeof(T);
+    }
+
+    template<typename T> 
+    void read(const char*& buffer, T& val)
+    {
+        val = *reinterpret_cast<const T*>(buffer);
+        buffer += sizeof(T);
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/tensorRTW/code/include/YoloConfigs.h b/tensorRTW/code/include/YoloConfigs.h
new file mode 100644
index 0000000..189d3a2
--- /dev/null
+++ b/tensorRTW/code/include/YoloConfigs.h
@@ -0,0 +1,53 @@
+#ifndef _YOLO_CONFIGS_H_
+#define _YOLO_CONFIGS_H_
+
+
+namespace Yolo
+{
+    static constexpr int CHECK_COUNT = 3;
+    static constexpr float IGNORE_THRESH = 0.5f;
+    static constexpr int CLASS_NUM = 80;
+
+    struct YoloKernel
+    {
+        int width;
+        int height;
+        float anchors[CHECK_COUNT*2];
+    };
+
+    //YOLO 608
+    YoloKernel yolo1 = {
+        19,
+        19,
+        {116,90,  156,198,  373,326}
+    };
+    YoloKernel yolo2 = {
+        38,
+        38,
+        {30,61,  62,45,  59,119}
+    };
+    YoloKernel yolo3 = {
+        76,
+        76,
+        {10,13,  16,30,  33,23}
+    };
+
+    //YOLO 416
+    // YoloKernel yolo1 = {
+    //     13,
+    //     13,
+    //     {116,90,  156,198,  373,326}
+    // };
+    // YoloKernel yolo2 = {
+    //     26,
+    //     26,
+    //     {30,61,  62,45,  59,119}
+    // };
+    // YoloKernel yolo3 = {
+    //     52,
+    //     52,
+    //     {10,13,  16,30,  33,23}
+    // };
+}
+
+#endif
\ No newline at end of file
diff --git a/tensorRTW/code/include/YoloLayer.h b/tensorRTW/code/include/YoloLayer.h
new file mode 100644
index 0000000..f90a003
--- /dev/null
+++ b/tensorRTW/code/include/YoloLayer.h
@@ -0,0 +1,79 @@
+#ifndef _YOLO_LAYER_H
+#define _YOLO_LAYER_H
+
+#include <assert.h>
+#include <cmath>
+#include <string.h>
+#include <cudnn.h>
+#include <cublas_v2.h>
+#include "NvInfer.h"
+#include "Utils.h"
+#include <iostream>
+
+namespace Yolo
+{
+    struct YoloKernel;
+
+    static constexpr int LOCATIONS = 4;
+    struct Detection{
+        //x y w h
+        float bbox[LOCATIONS];
+        //float objectness;
+        int classId;
+        float prob;
+    };
+}
+
+
+namespace nvinfer1
+{
+    class YoloLayerPlugin: public IPluginExt
+    {
+    public:
+        explicit YoloLayerPlugin(const int cudaThread = 512);
+        YoloLayerPlugin(const void* data, size_t length);
+
+        ~YoloLayerPlugin();
+
+        int getNbOutputs() const override
+        {
+            return 1;
+        }
+
+        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
+
+        bool supportsFormat(DataType type, PluginFormat format) const override { 
+            return type == DataType::kFLOAT && format == PluginFormat::kNCHW; 
+        }
+
+        void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override {};
+
+        int initialize() override;
+
+        virtual void terminate() override {};
+
+        virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}
+
+        virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
+
+        virtual size_t getSerializationSize() override;
+
+        virtual void serialize(void* buffer) override;
+
+        void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream);
+
+        void forwardCpu(const float *const * inputs,float * output, cudaStream_t stream);
+
+    private:
+        int mClassCount;
+        int mKernelCount;
+        std::vector<Yolo::YoloKernel> mYoloKernel;
+        int mThreadCount;
+
+        //cpu
+        void* mInputBuffer  {nullptr}; 
+        void* mOutputBuffer {nullptr}; 
+    };
+};
+
+#endif 
diff --git a/tensorRTW/code/src/EntroyCalibrator.cpp b/tensorRTW/code/src/EntroyCalibrator.cpp
new file mode 100644
index 0000000..9e03a74
--- /dev/null
+++ b/tensorRTW/code/src/EntroyCalibrator.cpp
@@ -0,0 +1,75 @@
+#include "EntroyCalibrator.h"
+#include <fstream>
+#include <iterator>
+#include <cassert>
+#include <string.h>
+#include <algorithm>
+
+namespace nvinfer1
+{
+    Int8EntropyCalibrator::Int8EntropyCalibrator(int BatchSize,const std::vector<std::vector<float>>& data,
+                                            const std::string& CalibDataName /*= ""*/,bool readCache /*= true*/)
+        : mCalibDataName(CalibDataName),mBatchSize(BatchSize),mReadCache(readCache)
+    {     
+        mDatas.reserve(data.size());
+        mDatas = data;
+
+        mInputCount =  BatchSize * data[0].size();
+        mCurBatchData = new float[mInputCount];
+        mCurBatchIdx = 0;
+        CUDA_CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
+    }
+
+
+    Int8EntropyCalibrator::~Int8EntropyCalibrator()
+    {
+        CUDA_CHECK(cudaFree(mDeviceInput));
+        if(mCurBatchData)
+            delete[] mCurBatchData;
+    }
+
+
+    bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], int nbBindings)
+    {
+        if (mCurBatchIdx + mBatchSize > int(mDatas.size())) 
+             return false;
+
+        float* ptr = mCurBatchData;
+        size_t imgSize = mInputCount / mBatchSize;
+        auto iter = mDatas.begin() + mCurBatchIdx;
+
+        std::for_each(iter, iter + mBatchSize, [=,&ptr](std::vector<float>& val){
+            assert(imgSize == val.size());
+            memcpy(ptr,val.data(),imgSize*sizeof(float));
+            
+            ptr += imgSize;
+        });
+
+        CUDA_CHECK(cudaMemcpy(mDeviceInput, mCurBatchData, mInputCount * sizeof(float), cudaMemcpyHostToDevice));
+        //std::cout << "input name " << names[0] << std::endl;
+        bindings[0] = mDeviceInput;
+
+        std::cout << "load batch " << mCurBatchIdx << " to " << mCurBatchIdx + mBatchSize - 1 << std::endl;        
+        mCurBatchIdx += mBatchSize;
+        return true;
+    }
+
+    const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length)
+    {
+        mCalibrationCache.clear();
+        std::ifstream input(mCalibDataName+".calib", std::ios::binary);
+        input >> std::noskipws;
+        if (mReadCache && input.good())
+            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
+
+        length = mCalibrationCache.size();
+        return length ? &mCalibrationCache[0] : nullptr;
+    }
+
+    void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length)
+    {
+        std::ofstream output(mCalibDataName+".calib", std::ios::binary);
+        output.write(reinterpret_cast<const char*>(cache), length);
+    }
+
+}
\ No newline at end of file
diff --git a/tensorRTW/code/src/TrtNet.cpp b/tensorRTW/code/src/TrtNet.cpp
new file mode 100644
index 0000000..4c53a79
--- /dev/null
+++ b/tensorRTW/code/src/TrtNet.cpp
@@ -0,0 +1,242 @@
+﻿#include "TrtNet.h"
+#include "EntroyCalibrator.h"
+#include <cassert>
+#include <chrono>
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <iostream>
+#include <sstream>
+#include <string.h>
+#include <time.h>
+#include <unordered_map>
+
+using namespace nvinfer1;
+using namespace nvcaffeparser1;
+using namespace plugin;
+
+static Tn::Logger gLogger;
+
+#define RETURN_AND_LOG(ret, severity, message)                                 \
+    do                                                                         \
+    {                                                                          \
+        std::string error_message = "ssd_error_log: " + std::string(message); \
+        gLogger.log(ILogger::Severity::k##severity, error_message.c_str());    \
+        return (ret);                                                          \
+    } while (0)
+
+inline void* safeCudaMalloc(size_t memSize)
+{
+    void* deviceMem;
+    CUDA_CHECK(cudaMalloc(&deviceMem, memSize));
+    if (deviceMem == nullptr)
+    {
+        std::cerr << "Out of memory" << std::endl;
+        exit(1);
+    }
+    return deviceMem;
+}
+
+inline int64_t volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
+}
+
+inline unsigned int getElementSize(nvinfer1::DataType t)
+{
+    switch (t)
+    {
+        case nvinfer1::DataType::kINT32: return 4;
+        case nvinfer1::DataType::kFLOAT: return 4;
+        case nvinfer1::DataType::kHALF: return 2;
+        case nvinfer1::DataType::kINT8: return 1;
+    }
+    throw std::runtime_error("Invalid DataType.");
+    return 0;
+}
+
+namespace Tn
+{
+    void trtNet::trtNet_engine(const std::string& prototxt,const std::string& caffemodel,const std::vector<std::string>& outputNodesName,
+                    const std::vector<std::vector<float>>& calibratorData,RUN_MODE mode /*= RUN_MODE::FLOAT32*/)
+    {
+        std::cout << "init plugin proto: " << prototxt << " caffemodel: " << caffemodel << std::endl;
+        auto parser = createCaffeParser();
+
+        const int maxBatchSize = 1;
+        IHostMemory* trtModelStream{nullptr};
+
+        Int8EntropyCalibrator * calibrator = nullptr;
+        if (calibratorData.size() > 0 ){
+            auto endPos= prototxt.find_last_of(".");
+	        auto beginPos= prototxt.find_last_of('/') + 1;
+            std::string calibratorName = prototxt.substr(beginPos,endPos - beginPos);
+            std::cout << "create calibrator,Named:" << calibratorName << std::endl;
+            calibrator = new Int8EntropyCalibrator(maxBatchSize,calibratorData,calibratorName);
+        }
+
+        PluginFactory pluginFactorySerialize;
+        ICudaEngine* tmpEngine = loadModelAndCreateEngine(prototxt.c_str(),caffemodel.c_str(), maxBatchSize, parser, &pluginFactorySerialize, calibrator, trtModelStream,outputNodesName);
+        assert(tmpEngine != nullptr);
+        assert(trtModelStream != nullptr);
+        if(calibrator){
+            delete calibrator;
+            calibrator = nullptr;
+        }
+        tmpEngine->destroy();
+        pluginFactorySerialize.destroyPlugin();
+
+        mTrtRunTime = createInferRuntime(gLogger);     
+        assert(mTrtRunTime != nullptr);
+        mTrtEngine= mTrtRunTime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), &mTrtPluginFactory);
+        assert(mTrtEngine != nullptr);
+        // Deserialize the engine.
+        trtModelStream->destroy();
+
+        InitEngine();
+    }
+
+	void trtNet::trtNet_engine(const std::string& engineFile)
+    {
+        using namespace std;
+        fstream file;
+        
+        file.open(engineFile,ios::binary | ios::in);
+        if(!file.is_open())
+        {
+            cout << "read engine file" << engineFile <<" failed" << endl;
+            return;
+        }
+        file.seekg(0, ios::end); 
+        int length = file.tellg();         
+        file.seekg(0, ios::beg); 
+        std::unique_ptr<char[]> data(new char[length]);
+        file.read(data.get(), length);
+
+        file.close();
+
+        std::cout << "*** deserializing" << std::endl;
+        mTrtRunTime = createInferRuntime(gLogger);
+        assert(mTrtRunTime != nullptr);
+        mTrtEngine= mTrtRunTime->deserializeCudaEngine(data.get(), length, &mTrtPluginFactory);
+        assert(mTrtEngine != nullptr);
+
+        InitEngine();
+    }
+
+    void trtNet::InitEngine()
+    {
+        const int maxBatchSize = 1;
+        mTrtContext = mTrtEngine->createExecutionContext();
+        assert(mTrtContext != nullptr);
+        mTrtContext->setProfiler(&mTrtProfiler);
+
+        // Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings()
+        int nbBindings = mTrtEngine->getNbBindings();
+
+        mTrtCudaBuffer.resize(nbBindings);
+        mTrtBindBufferSize.resize(nbBindings);
+        for (int i = 0; i < nbBindings; ++i)
+        {
+            Dims dims = mTrtEngine->getBindingDimensions(i);
+            DataType dtype = mTrtEngine->getBindingDataType(i);
+            int64_t totalSize = volume(dims) * maxBatchSize * getElementSize(dtype);
+            mTrtBindBufferSize[i] = totalSize;
+            mTrtCudaBuffer[i] = safeCudaMalloc(totalSize);
+            if(mTrtEngine->bindingIsInput(i))
+                mTrtInputCount++;
+        }
+
+        CUDA_CHECK(cudaStreamCreate(&mTrtCudaStream));
+    }
+
+
+    nvinfer1::ICudaEngine* trtNet::loadModelAndCreateEngine(const char* deployFile, const char* modelFile,int maxBatchSize,
+                                        ICaffeParser* parser, nvcaffeparser1::IPluginFactory* pluginFactory,
+                                        IInt8Calibrator* calibrator, IHostMemory*& trtModelStream,const std::vector<std::string>& outputNodesName)
+    {
+        // Create the builder
+        IBuilder* builder = createInferBuilder(gLogger);
+
+        // Parse the model to populate the network, then set the outputs.
+        INetworkDefinition* network = builder->createNetwork();
+        parser->setPluginFactory(pluginFactory);
+
+        std::cout << "Begin parsing model..." << std::endl;
+        const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile,modelFile, *network, nvinfer1::DataType::kFLOAT);
+        if (!blobNameToTensor)
+            RETURN_AND_LOG(nullptr, ERROR, "Fail to parse");
+        std::cout << "End parsing model..." << std::endl;
+
+        // specify which tensors are outputs
+        for (auto& name : outputNodesName)
+        {
+            auto output = blobNameToTensor->find(name.c_str());
+            assert(output!=nullptr);
+            if (output == nullptr)
+                std::cout << "can not find output named " << name << std::endl;
+
+            network->markOutput(*output);
+        }
+
+        // Build the engine.
+        builder->setMaxBatchSize(maxBatchSize);
+        builder->setMaxWorkspaceSize(1 << 30);// 1G
+        if (mTrtRunMode == RUN_MODE::INT8)
+        {
+             std::cout <<"setInt8Mode"<<std::endl;
+            if (!builder->platformHasFastInt8())
+                std::cout << "Notice: the platform do not has fast for int8" << std::endl;
+            builder->setInt8Mode(true);
+            builder->setInt8Calibrator(calibrator);
+        }
+        else if (mTrtRunMode == RUN_MODE::FLOAT16)
+        {
+            std::cout <<"setFp16Mode"<<std::endl;
+            if (!builder->platformHasFastFp16())
+                std::cout << "Notice: the platform do not has fast for fp16" << std::endl;
+            builder->setFp16Mode(true);
+        }
+
+        std::cout << "Begin building engine..." << std::endl;
+        ICudaEngine* engine = builder->buildCudaEngine(*network);
+        if (!engine)
+            RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine");
+        std::cout << "End building engine..." << std::endl;
+
+        // We don't need the network any more, and we can destroy the parser.
+        network->destroy();
+        parser->destroy();
+
+        // Serialize the engine, then close everything down.
+        trtModelStream = engine->serialize();
+
+        builder->destroy();
+        shutdownProtobufLibrary();
+        return engine;
+    }
+
+    void trtNet::doInference(const void* inputData, void* outputData)
+    {
+        static const int batchSize = 1;
+        assert(mTrtInputCount == 1);
+
+        // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
+        int inputIndex = 0;
+        CUDA_CHECK(cudaMemcpyAsync(mTrtCudaBuffer[inputIndex], inputData, mTrtBindBufferSize[inputIndex], cudaMemcpyHostToDevice, mTrtCudaStream));
+        auto t_start = std::chrono::high_resolution_clock::now();
+        mTrtContext->execute(batchSize, &mTrtCudaBuffer[inputIndex]);
+        auto t_end = std::chrono::high_resolution_clock::now();
+        float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
+
+        std::cout << "Time taken for inference is " << total << " ms." << std::endl;
+
+        for (size_t bindingIdx = mTrtInputCount; bindingIdx < mTrtBindBufferSize.size(); ++bindingIdx)
+        {
+            auto size = mTrtBindBufferSize[bindingIdx];
+            CUDA_CHECK(cudaMemcpyAsync(outputData, mTrtCudaBuffer[bindingIdx], size, cudaMemcpyDeviceToHost, mTrtCudaStream));
+            outputData = (char *)outputData + size;
+        }
+
+        mTrtIterationTime ++ ;
+    }
+}
\ No newline at end of file
diff --git a/tensorRTW/code/src/UpsampleLayer.cpp b/tensorRTW/code/src/UpsampleLayer.cpp
new file mode 100644
index 0000000..cbcf424
--- /dev/null
+++ b/tensorRTW/code/src/UpsampleLayer.cpp
@@ -0,0 +1,78 @@
+#include "UpsampleLayer.h"
+
+namespace nvinfer1
+{
+  UpsampleLayerPlugin::UpsampleLayerPlugin(const float scale, const int cudaThread /*= 512*/)
+  : mScale(scale),mThreadCount(cudaThread)
+  {
+  }
+  
+  UpsampleLayerPlugin::~UpsampleLayerPlugin()
+  {
+  
+  }
+  
+  // create the plugin at runtime from a byte stream
+  UpsampleLayerPlugin::UpsampleLayerPlugin(const void* data, size_t length)
+  {
+      using namespace Tn;
+      const char *d = reinterpret_cast<const char *>(data), *a = d;
+      read(d, mCHW);
+      read(d, mDataType);
+      read(d, mScale);
+      read(d, mOutputWidth);
+      read(d, mOutputHeight);
+      read(d, mThreadCount);
+  
+      //std::cout << "read:" << a << " " << mOutputWidth<< " " <<mOutputHeight<<std::endl;
+      assert(d == a + length);
+  }
+  
+  void UpsampleLayerPlugin::serialize(void* buffer)
+  {
+      using namespace Tn;
+      char* d = static_cast<char*>(buffer), *a = d;
+      write(d, mCHW);
+      write(d, mDataType);
+      write(d, mScale);
+      write(d, mOutputWidth);
+      write(d, mOutputHeight);
+      write(d, mThreadCount);
+  
+      //std::cout << "write:" << a << " " << mOutputHeight<< " " <<mOutputWidth<<std::endl;
+      assert(d == a + getSerializationSize());
+  }
+  
+  int UpsampleLayerPlugin::initialize()
+  {
+      int inputHeight = mCHW.d[1];
+      int inputWidth = mCHW.d[2];
+      
+      mOutputHeight = inputHeight * mScale;
+      mOutputWidth = inputWidth * mScale;
+  
+      return 0;
+  }
+  
+  void UpsampleLayerPlugin::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize)
+  {
+      //std::cout << "type " << int(type) << "format " << (int)format <<std::endl;
+      assert((type == DataType::kFLOAT || type == DataType::kHALF || type == DataType::kINT8) && format == PluginFormat::kNCHW);
+      mDataType = type;
+      
+      //std::cout << "configureWithFormat:" <<inputDims[0].d[0]<< " " <<inputDims[0].d[1] << " "<<inputDims[0].d[2] <<std::endl;
+  }
+  
+  //it is called prior to any call to initialize().
+  Dims UpsampleLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
+  {
+      //std::cout <<"Input:" << inputs[0].d[0] << " "<<inputs[0].d[1]<< " "<<inputs[0].d[2]<<std::endl;
+      //std::cout <<"nbInputDims : "<<nbInputDims<< " input:" << inputs[0].nbDims << std::endl;
+  
+      mCHW = inputs[0];
+      mOutputHeight = inputs[0].d[1]* mScale;
+      mOutputWidth = inputs[0].d[2]* mScale;
+      //std::cout << "ouputDims:" << mCHW.d[0] << " " << mOutputHeight << " " << mOutputWidth << std::endl;
+      return Dims3(mCHW.d[0], mOutputHeight, mOutputWidth);
+  }
+}
diff --git a/tensorRTW/code/src/UpsampleLayer.cu b/tensorRTW/code/src/UpsampleLayer.cu
new file mode 100644
index 0000000..e1bae71
--- /dev/null
+++ b/tensorRTW/code/src/UpsampleLayer.cu
@@ -0,0 +1,84 @@
+#include "UpsampleLayer.h"
+
+namespace nvinfer1
+{
+  __device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) {
+    int x, y, z, w;
+    w = ii % d3;
+    ii = ii/d3;
+    z = ii % d2;
+    ii = ii/d2;
+    y = ii % d1;
+    ii = ii/d1;
+    x = ii;
+    w = w/scale_factor;
+    z = z/scale_factor;
+    d2 /= scale_factor;
+    d3 /= scale_factor;
+    return (((x*d1+y)*d2)+z)*d3+w;
+  }
+
+  template <typename Dtype>
+  __global__ void upscale(const Dtype *input, Dtype *output,
+          int no_elements, int scale_factor, int d1, int d2, int d3) {
+    int ii = threadIdx.x + blockDim.x * blockIdx.x;
+    if (ii >= no_elements) return;
+    int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
+    output[ii]=input[ipidx];
+  }
+
+  template <typename Dtype>
+  void UpsampleLayerPlugin::forwardGpu(const Dtype* input,Dtype * output,
+      int N,int C,int H ,int W) {
+
+    int numElem = N*C*H*W;
+    upscale<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>>(input,output, numElem, mScale, C, H, W);
+  }
+  
+  size_t type2size(DataType dataType) { 
+    size_t _size = 0;
+    switch (dataType)
+    {
+        case DataType::kFLOAT: _size = sizeof(float);break;
+        case DataType::kHALF: _size = sizeof(__half);break;
+        case DataType::kINT8: _size = sizeof(u_int8_t);break;
+        default:std::cerr << "error data type" << std::endl;
+    }
+    return _size;
+  }
+
+  int UpsampleLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
+  {
+      assert(batchSize == 1);
+      const int channels = mCHW.d[0];
+      const int64_t in_height = mCHW.d[1];
+      const int64_t in_width = mCHW.d[2];
+      const int64_t out_height = mOutputHeight;
+      const int64_t out_width = mOutputWidth;
+      int totalElems = batchSize * in_height * in_width * channels;
+      
+      // Handle no-op resizes efficiently.
+      if (out_height == in_height && out_width == in_width) {
+          CUDA_CHECK(cudaMemcpyAsync(outputs[0], inputs[0], totalElems * type2size(mDataType), cudaMemcpyDeviceToDevice, stream));
+          CUDA_CHECK(cudaStreamSynchronize(stream));
+          return 0;
+      }
+      //CUDA_CHECK(cudaStreamSynchronize(stream));
+      
+       switch (mDataType)
+       {
+           case DataType::kFLOAT :
+              forwardGpu<float>((const float *)inputs[0],(float *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth);
+               break;
+           case DataType::kHALF:
+               forwardGpu<__half>((const __half *)inputs[0],(__half *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth);
+               break;
+           case DataType::kINT8:
+               forwardGpu<u_int8_t>((const u_int8_t *)inputs[0],(u_int8_t *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth);
+              break;
+           default:
+               std::cerr << "error data type" << std::endl;
+       }
+      return 0;    
+  };
+}
\ No newline at end of file
diff --git a/tensorRTW/code/src/YoloLayer.cu b/tensorRTW/code/src/YoloLayer.cu
new file mode 100644
index 0000000..061a5dd
--- /dev/null
+++ b/tensorRTW/code/src/YoloLayer.cu
@@ -0,0 +1,253 @@
+#include "YoloConfigs.h"
+#include "YoloLayer.h"
+
+using namespace Yolo;
+
+namespace nvinfer1
+{
+    YoloLayerPlugin::YoloLayerPlugin(const int cudaThread /*= 512*/):mThreadCount(cudaThread)
+    {
+        mClassCount = CLASS_NUM;
+        mYoloKernel.clear();
+        mYoloKernel.push_back(yolo1);
+        mYoloKernel.push_back(yolo2);
+        mYoloKernel.push_back(yolo3);
+
+        mKernelCount = mYoloKernel.size();
+    }
+    
+    YoloLayerPlugin::~YoloLayerPlugin()
+    {
+        if(mInputBuffer)
+            CUDA_CHECK(cudaFreeHost(mInputBuffer));
+
+        if(mOutputBuffer)
+            CUDA_CHECK(cudaFreeHost(mOutputBuffer));
+    }
+    
+    // create the plugin at runtime from a byte stream
+    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
+    {
+        using namespace Tn;
+        const char *d = reinterpret_cast<const char *>(data), *a = d;
+        read(d, mClassCount);
+        read(d, mThreadCount);
+        read(d, mKernelCount);
+        mYoloKernel.resize(mKernelCount);
+        auto kernelSize = mKernelCount*sizeof(YoloKernel);
+        memcpy(mYoloKernel.data(),d,kernelSize);
+        d += kernelSize;
+
+        assert(d == a + length);
+    }
+
+    void YoloLayerPlugin::serialize(void* buffer)
+    {
+        using namespace Tn;
+        char* d = static_cast<char*>(buffer), *a = d;
+        write(d, mClassCount);
+        write(d, mThreadCount);
+        write(d, mKernelCount);
+        auto kernelSize = mKernelCount*sizeof(YoloKernel);
+        memcpy(d,mYoloKernel.data(),kernelSize);
+        d += kernelSize;
+
+        assert(d == a + getSerializationSize());
+    }
+    
+    size_t YoloLayerPlugin::getSerializationSize()
+    {  
+        return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size();
+    }
+
+    int YoloLayerPlugin::initialize()
+    { 
+            int totalCount = 0;
+            for(const auto& yolo : mYoloKernel)
+                totalCount += (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT;
+            CUDA_CHECK(cudaHostAlloc(&mInputBuffer, totalCount * sizeof(float), cudaHostAllocDefault));
+
+            totalCount = 0;//detection count
+            for(const auto& yolo : mYoloKernel)
+                totalCount += yolo.width*yolo.height * CHECK_COUNT;
+            CUDA_CHECK(cudaHostAlloc(&mOutputBuffer, sizeof(float) + totalCount * sizeof(Detection), cudaHostAllocDefault));
+            return 0;
+    }
+    
+    Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
+    {
+            //output the result to channel
+            int totalCount = 0;
+            for(const auto& yolo : mYoloKernel)
+                totalCount += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float);
+
+            return Dims3(totalCount + 1, 1, 1);
+    }
+
+    void YoloLayerPlugin::forwardCpu(const float*const * inputs, float* outputs, cudaStream_t stream)
+    {
+            auto Logist = [=](float data){
+                return 1./(1. + exp(-data));
+            };
+
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+            int i = 0;
+            float* inputData = (float *)mInputBuffer; 
+            for(const auto& yolo : mYoloKernel)
+            {
+                int size = (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT;
+                CUDA_CHECK(cudaMemcpyAsync(inputData, inputs[i], size * sizeof(float), cudaMemcpyDeviceToHost, stream));
+                inputData += size;
+                ++ i;
+            }
+
+            inputData = (float *)mInputBuffer;
+            std::vector <Detection> result;
+            for (const auto& yolo : mYoloKernel)
+            {
+                int stride = yolo.width*yolo.height;
+                for (int j = 0;j < stride ;++j)
+                {
+                    for (int k = 0;k < CHECK_COUNT; ++k )
+                    {
+                        int beginIdx = (LOCATIONS + 1 + mClassCount)* stride *k + j;
+                        int objIndex = beginIdx + LOCATIONS*stride;
+                        
+                        //check obj
+                        float objProb = Logist(inputData[objIndex]);   
+                        if(objProb <= IGNORE_THRESH)
+                            continue;
+
+                        //classes
+                        int classId = -1;
+                        float maxProb = IGNORE_THRESH;
+                        for (int c = 0;c< mClassCount;++c){
+                            float cProb =  Logist(inputData[beginIdx + (5 + c) * stride]) * objProb;
+                            if(cProb > maxProb){
+                                maxProb = cProb;
+                                classId = c;
+                            }
+                        }
+            
+                        if(classId >= 0) {
+                            Detection det;
+                            int row = j / yolo.width;
+                            int cols = j % yolo.width;
+    
+                            //Location
+                            det.bbox[0] = (cols + Logist(inputData[beginIdx]))/ yolo.width;
+                            det.bbox[1] = (row + Logist(inputData[beginIdx+stride]))/ yolo.height;
+                            det.bbox[2] = exp(inputData[beginIdx+2*stride]) * yolo.anchors[2*k];
+                            det.bbox[3] = exp(inputData[beginIdx+3*stride]) * yolo.anchors[2*k + 1];
+                            det.classId = classId;
+                            det.prob = maxProb;
+                            //det.objectness = objProb;
+
+                            result.emplace_back(det);
+                        }
+                    }
+                }
+
+                inputData += (LOCATIONS + 1 + mClassCount) * stride * CHECK_COUNT;
+            }
+
+            
+            int detCount =result.size();
+            auto data = (float *)mOutputBuffer;
+            //copy count;
+            data[0] = (float)detCount;
+            //std::cout << "detCount"<< detCount << std::endl;
+            data++;
+            //copy result
+            memcpy(data,result.data(),result.size()*sizeof(Detection));
+
+            //(count + det result)
+            CUDA_CHECK(cudaMemcpyAsync(outputs, mOutputBuffer, sizeof(float) + result.size()*sizeof(Detection), cudaMemcpyHostToDevice, stream));
+    };
+
+    __device__ float Logist(float data){ return 1./(1. + exp(-data)); };
+
+    __global__ void CalDetection(const float *input, float *output,int noElements, 
+            int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes) {
+ 
+        int idx = threadIdx.x + blockDim.x * blockIdx.x;
+        if (idx >= noElements) return;
+
+        int stride = yoloWidth*yoloHeight;
+
+        for (int k = 0;k < CHECK_COUNT; ++k )
+        {
+            int beginIdx = (LOCATIONS + 1 + classes)* stride *k + idx;
+            int objIndex = beginIdx + LOCATIONS*stride;
+            
+            //check objectness
+            float objProb = Logist(input[objIndex]);   
+            if(objProb <= IGNORE_THRESH)
+                continue;
+
+            int row = idx / yoloWidth;
+            int cols = idx % yoloWidth;
+            
+            //classes
+            int classId = -1;
+            float maxProb = IGNORE_THRESH;
+            for (int c = 0;c<classes;++c){
+                float cProb =  Logist(input[beginIdx + (5 + c) * stride]) * objProb;
+                if(cProb > maxProb){
+                    maxProb = cProb;
+                    classId = c;
+                }
+            }
+
+            if(classId >= 0) {
+                int resCount = (int)atomicAdd(output,1);
+                char* data = (char * )output + sizeof(float) + resCount*sizeof(Detection);
+                Detection* det =  (Detection*)(data);
+
+                //Location
+                det->bbox[0] = (cols + Logist(input[beginIdx]))/ yoloWidth;
+                det->bbox[1] = (row + Logist(input[beginIdx+stride]))/ yoloHeight;
+                det->bbox[2] = exp(input[beginIdx+2*stride]) * anchors[2*k];
+                det->bbox[3] = exp(input[beginIdx+3*stride]) * anchors[2*k + 1];
+                det->classId = classId;
+                det->prob = maxProb;
+            }
+        }
+    }
+   
+    void YoloLayerPlugin::forwardGpu(const float *const * inputs,float * output,cudaStream_t stream) {
+        int numElem;
+        void* devAnchor;
+        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
+        CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));
+
+        //first detect count init 0
+        CUDA_CHECK(cudaMemset(output, 0, sizeof(float)));
+        for (unsigned int i = 0;i< mYoloKernel.size();++i)
+        {
+            const auto& yolo = mYoloKernel[i];
+            numElem = yolo.width*yolo.height;
+
+            //copy anchor to device
+	        CUDA_CHECK(cudaMemcpy(devAnchor,yolo.anchors,AnchorLen,cudaMemcpyHostToDevice));
+
+            CalDetection<<< (yolo.width*yolo.height + mThreadCount - 1) / mThreadCount, mThreadCount>>>
+                    (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount);
+        }
+        CUDA_CHECK(cudaFree(devAnchor));
+    }
+
+
+    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
+    {
+        assert(batchSize == 1);
+        
+        //GPU
+        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream);
+
+        //CPU
+        //forwardCpu((const float *const *)inputs,(float *)outputs[0],stream);
+        return 0;
+    };
+
+}
diff --git a/tensorRTWrapper b/tensorRTWrapper
deleted file mode 160000
index cbfbdc5..0000000
--- a/tensorRTWrapper
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit cbfbdc5bcf27d917c8ea9b23f46acc620ebe70f0