|
| 1 | +#include <algorithm> |
| 2 | +#include <fstream> // NOLINT(readability/streams) |
| 3 | +#include <map> |
| 4 | +#include <string> |
| 5 | +#include <utility> |
| 6 | +#include <vector> |
| 7 | + |
| 8 | +#include "boost/filesystem.hpp" |
| 9 | +#include "boost/foreach.hpp" |
| 10 | + |
| 11 | +#include "caffe/layers/ssd_decoder_layer.hpp" |
| 12 | + |
| 13 | +namespace caffe { |
| 14 | + |
| 15 | +template <typename Dtype> |
| 16 | +void SSDDecoderLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, |
| 17 | + const vector<Blob<Dtype>*>& top) { |
| 18 | + const DetectionOutputParameter& detection_output_param = |
| 19 | + this->layer_param_.detection_output_param(); |
| 20 | + CHECK(detection_output_param.has_num_classes()) << "Must specify num_classes"; |
| 21 | + objectness_score_ = detection_output_param.objectness_score(); |
| 22 | + num_classes_ = detection_output_param.num_classes(); |
| 23 | + share_location_ = detection_output_param.share_location(); |
| 24 | + num_loc_classes_ = share_location_ ? 1 : num_classes_; |
| 25 | + background_label_id_ = detection_output_param.background_label_id(); |
| 26 | + code_type_ = detection_output_param.code_type(); |
| 27 | + variance_encoded_in_target_ = |
| 28 | + detection_output_param.variance_encoded_in_target(); |
| 29 | + keep_top_k_ = detection_output_param.keep_top_k(); |
| 30 | + confidence_threshold_ = detection_output_param.has_confidence_threshold() ? |
| 31 | + detection_output_param.confidence_threshold() : -FLT_MAX; |
| 32 | + // Parameters used in nms. |
| 33 | + nms_threshold_ = detection_output_param.nms_param().nms_threshold(); |
| 34 | + CHECK_GE(nms_threshold_, 0.) << "nms_threshold must be non negative."; |
| 35 | + eta_ = detection_output_param.nms_param().eta(); |
| 36 | + CHECK_GT(eta_, 0.); |
| 37 | + CHECK_LE(eta_, 1.); |
| 38 | + top_k_ = -1; |
| 39 | + if (detection_output_param.nms_param().has_top_k()) { |
| 40 | + top_k_ = detection_output_param.nms_param().top_k(); |
| 41 | + } |
| 42 | + const SaveOutputParameter& save_output_param = |
| 43 | + detection_output_param.save_output_param(); |
| 44 | + output_directory_ = save_output_param.output_directory(); |
| 45 | + if (!output_directory_.empty()) { |
| 46 | + if (boost::filesystem::is_directory(output_directory_)) { |
| 47 | + // boost::filesystem::remove_all(output_directory_); |
| 48 | + } |
| 49 | + if (!boost::filesystem::create_directories(output_directory_)) { |
| 50 | + LOG(WARNING) << "Failed to create directory: " << output_directory_; |
| 51 | + } |
| 52 | + } |
| 53 | + output_name_prefix_ = save_output_param.output_name_prefix(); |
| 54 | + need_save_ = output_directory_ == "" ? false : true; |
| 55 | + output_format_ = save_output_param.output_format(); |
| 56 | + if (save_output_param.has_label_map_file()) { |
| 57 | + string label_map_file = save_output_param.label_map_file(); |
| 58 | + if (label_map_file.empty()) { |
| 59 | + // Ignore saving if there is no label_map_file provided. |
| 60 | + LOG(WARNING) << "Provide label_map_file if output results to files."; |
| 61 | + need_save_ = false; |
| 62 | + } else { |
| 63 | + LabelMap label_map; |
| 64 | + CHECK(ReadProtoFromTextFile(label_map_file, &label_map)) |
| 65 | + << "Failed to read label map file: " << label_map_file; |
| 66 | + CHECK(MapLabelToName(label_map, true, &label_to_name_)) |
| 67 | + << "Failed to convert label to name."; |
| 68 | + CHECK(MapLabelToDisplayName(label_map, true, &label_to_display_name_)) |
| 69 | + << "Failed to convert label to display name."; |
| 70 | + } |
| 71 | + } else { |
| 72 | + need_save_ = false; |
| 73 | + } |
| 74 | + if (save_output_param.has_name_size_file()) { |
| 75 | + string name_size_file = save_output_param.name_size_file(); |
| 76 | + if (name_size_file.empty()) { |
| 77 | + // Ignore saving if there is no name_size_file provided. |
| 78 | + LOG(WARNING) << "Provide name_size_file if output results to files."; |
| 79 | + need_save_ = false; |
| 80 | + } else { |
| 81 | + std::ifstream infile(name_size_file.c_str()); |
| 82 | + CHECK(infile.good()) |
| 83 | + << "Failed to open name size file: " << name_size_file; |
| 84 | + // The file is in the following format: |
| 85 | + // name height width |
| 86 | + // ... |
| 87 | + string name; |
| 88 | + int height, width; |
| 89 | + while (infile >> name >> height >> width) { |
| 90 | + names_.push_back(name); |
| 91 | + sizes_.push_back(std::make_pair(height, width)); |
| 92 | + } |
| 93 | + infile.close(); |
| 94 | + if (save_output_param.has_num_test_image()) { |
| 95 | + num_test_image_ = save_output_param.num_test_image(); |
| 96 | + } else { |
| 97 | + num_test_image_ = names_.size(); |
| 98 | + } |
| 99 | + CHECK_LE(num_test_image_, names_.size()); |
| 100 | + } |
| 101 | + } else { |
| 102 | + need_save_ = false; |
| 103 | + } |
| 104 | + has_resize_ = save_output_param.has_resize_param(); |
| 105 | + if (has_resize_) { |
| 106 | + resize_param_ = save_output_param.resize_param(); |
| 107 | + } |
| 108 | + name_count_ = 0; |
| 109 | + visualize_ = detection_output_param.visualize(); |
| 110 | + if (visualize_) { |
| 111 | + visualize_threshold_ = 0.6; |
| 112 | + if (detection_output_param.has_visualize_threshold()) { |
| 113 | + visualize_threshold_ = detection_output_param.visualize_threshold(); |
| 114 | + } |
| 115 | + data_transformer_.reset( |
| 116 | + new DataTransformer<Dtype>(this->layer_param_.transform_param(), |
| 117 | + this->phase_)); |
| 118 | + data_transformer_->InitRand(); |
| 119 | + save_file_ = detection_output_param.save_file(); |
| 120 | + } |
| 121 | + bbox_preds_.ReshapeLike(*(bottom[0])); |
| 122 | + if (!share_location_) { |
| 123 | + bbox_permute_.ReshapeLike(*(bottom[0])); |
| 124 | + } |
| 125 | + conf_permute_.ReshapeLike(*(bottom[1])); |
| 126 | +} |
| 127 | + |
| 128 | +template <typename Dtype> |
| 129 | +void SSDDecoderLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, |
| 130 | + const vector<Blob<Dtype>*>& top) { |
| 131 | + |
| 132 | + if (bbox_preds_.num() != bottom[0]->num() || |
| 133 | + bbox_preds_.count(1) != bottom[0]->count(1)) { |
| 134 | + bbox_preds_.ReshapeLike(*(bottom[0])); |
| 135 | + } |
| 136 | + if (!share_location_ && (bbox_permute_.num() != bottom[0]->num() || |
| 137 | + bbox_permute_.count(1) != bottom[0]->count(1))) { |
| 138 | + bbox_permute_.ReshapeLike(*(bottom[0])); |
| 139 | + } |
| 140 | + |
| 141 | + num_priors_ = bottom[1]->height() / 4; |
| 142 | + CHECK_EQ(num_priors_ * num_loc_classes_ * 4, bottom[0]->channels()) |
| 143 | + << "Number of priors must match number of location predictions."; |
| 144 | + // num() and channels() are 1. |
| 145 | + vector<int> top_shape(2, 1); |
| 146 | + top_shape.push_back(num_priors_); |
| 147 | + // Each row is a 4 dimension vector, which stores |
| 148 | + // [xmin, ymin, xmax, ymax] |
| 149 | + top_shape.push_back(4); |
| 150 | + top[0]->Reshape(top_shape); |
| 151 | +} |
| 152 | + |
| 153 | +template <typename Dtype> |
| 154 | +void SSDDecoderLayer<Dtype>::Forward_cpu( |
| 155 | + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { |
| 156 | + const Dtype* loc_data = bottom[0]->cpu_data(); |
| 157 | + const Dtype* prior_data = bottom[1]->cpu_data(); |
| 158 | + const int num = bottom[0]->num(); |
| 159 | + |
| 160 | + // Retrieve all location predictions. |
| 161 | + vector<LabelBBox> all_loc_preds; |
| 162 | + GetLocPredictions(loc_data, num, num_priors_, num_loc_classes_, |
| 163 | + share_location_, &all_loc_preds); |
| 164 | + |
| 165 | + // Retrieve all prior bboxes. It is same within a batch since we assume all |
| 166 | + // images in a batch are of same dimension. |
| 167 | + vector<NormalizedBBox> prior_bboxes; |
| 168 | + vector<vector<float> > prior_variances; |
| 169 | + GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances); |
| 170 | + |
| 171 | + // Decode all loc predictions to bboxes. |
| 172 | + vector<LabelBBox> all_decode_bboxes; |
| 173 | + const bool clip_bbox = false; |
| 174 | + |
| 175 | +DecodeBBoxesAll(all_loc_preds, prior_bboxes, prior_variances, num, |
| 176 | + share_location_, num_loc_classes_, background_label_id_, |
| 177 | + code_type_, variance_encoded_in_target_, clip_bbox, |
| 178 | + &all_decode_bboxes); |
| 179 | + |
| 180 | + Dtype* top_data = top[0]->mutable_cpu_data(); |
| 181 | + |
| 182 | + int count = 0; |
| 183 | + //boost::filesystem::path output_directory(output_directory_); |
| 184 | + for (int i = 0; i < num; ++i) { |
| 185 | + const LabelBBox& decode_bboxes = all_decode_bboxes[i]; |
| 186 | + |
| 187 | + int loc_label = -1; //share_location_ ? -1 : label; |
| 188 | + if (decode_bboxes.find(loc_label) == decode_bboxes.end()) { |
| 189 | + // Something bad happened if there are no predictions for current label. |
| 190 | + LOG(FATAL) << "Could not find location predictions for " << loc_label; |
| 191 | + continue; |
| 192 | + } |
| 193 | + const vector<NormalizedBBox>& bboxes = |
| 194 | + decode_bboxes.find(loc_label)->second; |
| 195 | + |
| 196 | + for (int j = 0; j < num_priors_; ++j) { |
| 197 | + const NormalizedBBox& bbox = bboxes[j]; |
| 198 | + top_data[count * 4] = bbox.xmin(); |
| 199 | + top_data[count * 4 + 1] = bbox.ymin(); |
| 200 | + top_data[count * 4 + 2] = bbox.xmax(); |
| 201 | + top_data[count * 4 + 3] = bbox.ymax(); |
| 202 | + ++count; |
| 203 | + } |
| 204 | + } |
| 205 | +} |
| 206 | + |
| 207 | +INSTANTIATE_CLASS(SSDDecoderLayer); |
| 208 | +REGISTER_LAYER_CLASS(SSDDecoder); |
| 209 | + |
| 210 | +} // namespace caffe |
0 commit comments