|
| 1 | +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
#include "paddle/fluid/operators/reader/ctr_reader.h"

#include <gzstream.h>

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
| 28 | + |
| 29 | +namespace paddle { |
| 30 | +namespace operators { |
| 31 | +namespace reader { |
| 32 | + |
// Splits `s` at every occurrence of `delimiter` and appends the pieces, in
// order, to `*output` (existing elements of `*output` are kept).
//
// Empty pieces are preserved, so a string without any delimiter -- including
// the empty string -- contributes exactly one element.
static inline void string_split(const std::string& s, const char delimiter,
                                std::vector<std::string>* output) {
  size_t token_begin = 0;
  for (;;) {
    const size_t token_end = s.find_first_of(delimiter, token_begin);
    if (token_end == std::string::npos) {
      // Final piece: everything from token_begin up to the end of the string.
      output->emplace_back(s.substr(token_begin));
      return;
    }
    output->emplace_back(s.substr(token_begin, token_end - token_begin));
    token_begin = token_end + 1;
  }
}
| 47 | + |
| 48 | +static inline void parse_line( |
| 49 | + const std::string& line, |
| 50 | + const std::unordered_map<std::string, size_t>& slot_to_index, |
| 51 | + int64_t* label, |
| 52 | + std::unordered_map<std::string, std::vector<int64_t>>* slot_to_data) { |
| 53 | + std::vector<std::string> ret; |
| 54 | + string_split(line, ' ', &ret); |
| 55 | + *label = std::stoi(ret[2]) > 0; |
| 56 | + |
| 57 | + for (size_t i = 3; i < ret.size(); ++i) { |
| 58 | + const std::string& item = ret[i]; |
| 59 | + std::vector<std::string> feasign_and_slot; |
| 60 | + string_split(item, ':', &feasign_and_slot); |
| 61 | + if (feasign_and_slot.size() == 2 && |
| 62 | + slot_to_index.find(feasign_and_slot[1]) != slot_to_index.end()) { |
| 63 | + int64_t feasign = std::strtoll(feasign_and_slot[0].c_str(), NULL, 10); |
| 64 | + (*slot_to_data)[feasign_and_slot[1]].push_back(feasign); |
| 65 | + } |
| 66 | + } |
| 67 | + |
| 68 | + // NOTE:: if the slot has no value, then fill [0] as it's data. |
| 69 | + for (auto& item : slot_to_index) { |
| 70 | + if (slot_to_data->find(item.first) == slot_to_data->end()) { |
| 71 | + (*slot_to_data)[item.first].push_back(0); |
| 72 | + } |
| 73 | + } |
| 74 | +} |
| 75 | + |
// Abstract source of text lines. Implementations report through HasNext()
// whether another line is available and hand it out through NextLine().
class Reader {
 public:
  // Virtual so that implementations can be destroyed through a Reader pointer.
  virtual ~Reader() = default;

  // Tells the caller whether another line can be fetched.
  virtual bool HasNext() = 0;

  // Stores the next line into *line.
  virtual void NextLine(std::string* line) = 0;
};
| 82 | + |
| 83 | +class GzipReader : public Reader { |
| 84 | + public: |
| 85 | + explicit GzipReader(const std::string& file_name) |
| 86 | + : gzstream_(file_name.c_str()) {} |
| 87 | + |
| 88 | + ~GzipReader() {} |
| 89 | + |
| 90 | + bool HasNext() override { return gzstream_.peek() != EOF; } |
| 91 | + |
| 92 | + void NextLine(std::string* line) override { std::getline(gzstream_, *line); } |
| 93 | + |
| 94 | + private: |
| 95 | + igzstream gzstream_; |
| 96 | +}; |
| 97 | + |
| 98 | +class MultiGzipReader : public Reader { |
| 99 | + public: |
| 100 | + explicit MultiGzipReader(const std::vector<std::string>& file_list) { |
| 101 | + for (auto& file : file_list) { |
| 102 | + readers_.emplace_back(std::make_shared<GzipReader>(file)); |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + bool HasNext() override { |
| 107 | + if (current_reader_index_ >= readers_.size()) { |
| 108 | + return false; |
| 109 | + } |
| 110 | + if (!readers_[current_reader_index_]->HasNext()) { |
| 111 | + current_reader_index_++; |
| 112 | + return HasNext(); |
| 113 | + } |
| 114 | + return true; |
| 115 | + } |
| 116 | + |
| 117 | + void NextLine(std::string* line) override { |
| 118 | + readers_[current_reader_index_]->NextLine(line); |
| 119 | + } |
| 120 | + |
| 121 | + private: |
| 122 | + std::vector<std::shared_ptr<GzipReader>> readers_; |
| 123 | + size_t current_reader_index_ = 0; |
| 124 | +}; |
| 125 | + |
| 126 | +void MonitorThread(std::vector<ReaderThreadStatus>* thread_status, |
| 127 | + std::shared_ptr<LoDTensorBlockingQueue> queue) { |
| 128 | + VLOG(30) << "monitor thread in"; |
| 129 | + bool reader_thread_is_running = true; |
| 130 | + while (reader_thread_is_running) { |
| 131 | + VLOG(30) << "reader_thread_is_running"; |
| 132 | + reader_thread_is_running = false; |
| 133 | + for (size_t i = 0; i < (*thread_status).size(); ++i) { |
| 134 | + if ((*thread_status)[i] == Running) { |
| 135 | + VLOG(30) << "reader is running!"; |
| 136 | + reader_thread_is_running = true; |
| 137 | + } |
| 138 | + } |
| 139 | + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); |
| 140 | + } |
| 141 | + VLOG(30) << "all reader thread is stopped, push empty data into queue"; |
| 142 | + queue->Push({}); |
| 143 | + VLOG(30) << "monitor thread exited"; |
| 144 | +} |
| 145 | + |
| 146 | +void ReadThread(const std::vector<std::string>& file_list, |
| 147 | + const std::vector<std::string>& slots, int batch_size, |
| 148 | + int thread_id, std::vector<ReaderThreadStatus>* thread_status, |
| 149 | + std::shared_ptr<LoDTensorBlockingQueue> queue) { |
| 150 | + VLOG(30) << "[" << thread_id << "]" |
| 151 | + << " reader thread start! thread_id = " << thread_id; |
| 152 | + for (auto& file : file_list) { |
| 153 | + VLOG(30) << "[" << thread_id << "]" |
| 154 | + << " file " << file; |
| 155 | + } |
| 156 | + (*thread_status)[thread_id] = Running; |
| 157 | + VLOG(30) << "set status to running"; |
| 158 | + |
| 159 | + std::unordered_map<std::string, size_t> slot_to_index; |
| 160 | + for (size_t i = 0; i < slots.size(); ++i) { |
| 161 | + slot_to_index[slots[i]] = i; |
| 162 | + } |
| 163 | + |
| 164 | + std::string line; |
| 165 | + |
| 166 | + std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data; |
| 167 | + std::vector<int64_t> batch_label; |
| 168 | + |
| 169 | + MultiGzipReader reader(file_list); |
| 170 | + |
| 171 | + VLOG(30) << "reader inited"; |
| 172 | + |
| 173 | + while (reader.HasNext()) { |
| 174 | + batch_data.clear(); |
| 175 | + batch_data.reserve(batch_size); |
| 176 | + |
| 177 | + batch_label.clear(); |
| 178 | + batch_label.reserve(batch_size); |
| 179 | + |
| 180 | + // read batch_size data |
| 181 | + for (int i = 0; i < batch_size; ++i) { |
| 182 | + if (reader.HasNext()) { |
| 183 | + reader.NextLine(&line); |
| 184 | + std::unordered_map<std::string, std::vector<int64_t>> slot_to_data; |
| 185 | + int64_t label; |
| 186 | + parse_line(line, slot_to_index, &label, &slot_to_data); |
| 187 | + batch_data.push_back(slot_to_data); |
| 188 | + batch_label.push_back(label); |
| 189 | + } else { |
| 190 | + break; |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + std::vector<framework::LoDTensor> lod_datas; |
| 195 | + |
| 196 | + // first insert tensor for each slots |
| 197 | + for (auto& slot : slots) { |
| 198 | + std::vector<size_t> lod_data{0}; |
| 199 | + std::vector<int64_t> batch_feasign; |
| 200 | + |
| 201 | + for (size_t i = 0; i < batch_data.size(); ++i) { |
| 202 | + auto& feasign = batch_data[i][slot]; |
| 203 | + lod_data.push_back(lod_data.back() + feasign.size()); |
| 204 | + batch_feasign.insert(batch_feasign.end(), feasign.begin(), |
| 205 | + feasign.end()); |
| 206 | + } |
| 207 | + |
| 208 | + framework::LoDTensor lod_tensor; |
| 209 | + framework::LoD lod{lod_data}; |
| 210 | + lod_tensor.set_lod(lod); |
| 211 | + int64_t* tensor_data = lod_tensor.mutable_data<int64_t>( |
| 212 | + framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}), |
| 213 | + platform::CPUPlace()); |
| 214 | + memcpy(tensor_data, batch_feasign.data(), |
| 215 | + batch_feasign.size() * sizeof(int64_t)); |
| 216 | + lod_datas.push_back(lod_tensor); |
| 217 | + } |
| 218 | + |
| 219 | + // insert label tensor |
| 220 | + framework::LoDTensor label_tensor; |
| 221 | + auto* label_tensor_data = label_tensor.mutable_data<int64_t>( |
| 222 | + framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}), |
| 223 | + platform::CPUPlace()); |
| 224 | + memcpy(label_tensor_data, batch_label.data(), |
| 225 | + batch_label.size() * sizeof(int64_t)); |
| 226 | + lod_datas.push_back(label_tensor); |
| 227 | + |
| 228 | + queue->Push(lod_datas); |
| 229 | + VLOG(40) << "push one data, queue_size=" << queue->Size(); |
| 230 | + } |
| 231 | + |
| 232 | + (*thread_status)[thread_id] = Stopped; |
| 233 | + VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; |
| 234 | +} |
| 235 | + |
| 236 | +} // namespace reader |
| 237 | +} // namespace operators |
| 238 | +} // namespace paddle |
0 commit comments