@@ -46,32 +46,47 @@ static inline void string_split(const std::string& s, const char delimiter,
46
46
}
47
47
48
48
static inline void parse_line (
49
- const std::string& line, const std::vector<std::string>& slots,
49
+ const std::string& line,
50
+ const std::unordered_map<std::string, size_t >& slot_to_index,
50
51
int64_t * label,
51
- std::unordered_map<std::string, std::vector<int64_t >>* slots_to_data ) {
52
+ std::unordered_map<std::string, std::vector<int64_t >>* slot_to_data ) {
52
53
std::vector<std::string> ret;
53
54
string_split (line, ' ' , &ret);
54
55
*label = std::stoi (ret[2 ]) > 0 ;
55
56
56
57
for (size_t i = 3 ; i < ret.size (); ++i) {
57
58
const std::string& item = ret[i];
58
- std::vector<std::string> slot_and_feasign;
59
- string_split (item, ' :' , &slot_and_feasign);
60
- if (slot_and_feasign.size () == 2 ) {
61
- const std::string& slot = slot_and_feasign[1 ];
62
- int64_t feasign = std::strtoll (slot_and_feasign[0 ].c_str (), NULL , 10 );
63
- (*slots_to_data)[slot_and_feasign[1 ]].push_back (feasign);
59
+ std::vector<std::string> feasign_and_slot;
60
+ string_split (item, ' :' , &feasign_and_slot);
61
+ auto & slot = feasign_and_slot[1 ];
62
+ if (feasign_and_slot.size () == 2 &&
63
+ slot_to_index.find (slot) != slot_to_index.end ()) {
64
+ const std::string& slot = feasign_and_slot[1 ];
65
+ int64_t feasign = std::strtoll (feasign_and_slot[0 ].c_str (), NULL , 10 );
66
+ (*slot_to_data)[feasign_and_slot[1 ]].push_back (feasign);
64
67
}
65
68
}
66
69
67
70
// NOTE:: if the slot has no value, then fill [0] as it's data.
68
- for (auto & slot : slots ) {
69
- if (slots_to_data ->find (slot ) == slots_to_data ->end ()) {
70
- (*slots_to_data)[slot ].push_back (0 );
71
+ for (auto & item : slot_to_index ) {
72
+ if (slot_to_data ->find (item. first ) == slot_to_data ->end ()) {
73
+ (*slot_to_data)[item. first ].push_back (0 );
71
74
}
72
75
}
73
76
}
74
77
78
// Debug helper: dumps every entry of *map to std::cout. Each entry is written
// as its key, an arrow, and a bracketed list of its values, with a separator
// after every value. Entries are emitted in the container's iteration order.
static void print_map(
    std::unordered_map<std::string, std::vector<int64_t>>* map) {
  for (const auto& entry : *map) {
    std::cout << entry.first << " -> " << " [";
    for (const int64_t value : entry.second) {
      std::cout << value << " ";
    }
    std::cout << " ]\n ";
  }
}
89
+
75
90
class Reader {
76
91
public:
77
92
virtual ~Reader () {}
@@ -126,7 +141,14 @@ void ReadThread(const std::vector<std::string>& file_list,
126
141
const std::vector<std::string>& slots, int batch_size,
127
142
int thread_id, std::vector<ReaderThreadStatus>* thread_status,
128
143
std::shared_ptr<LoDTensorBlockingQueue> queue) {
144
+ VLOG (3 ) << " reader thread start! thread_id = " << thread_id;
129
145
(*thread_status)[thread_id] = Running;
146
+ VLOG (3 ) << " set status to running" ;
147
+
148
+ std::unordered_map<std::string, size_t > slot_to_index;
149
+ for (size_t i = 0 ; i < slots.size (); ++i) {
150
+ slot_to_index[slots[i]] = i;
151
+ }
130
152
131
153
std::string line;
132
154
@@ -135,21 +157,29 @@ void ReadThread(const std::vector<std::string>& file_list,
135
157
136
158
MultiGzipReader reader (file_list);
137
159
160
+ VLOG (3 ) << " reader inited" ;
161
+
138
162
while (reader.HasNext ()) {
139
- // read all files
163
+ batch_data.clear ();
164
+ batch_label.clear ();
165
+
166
+ // read batch_size data
140
167
for (int i = 0 ; i < batch_size; ++i) {
141
168
if (reader.HasNext ()) {
142
169
reader.NextLine (&line);
143
- std::unordered_map<std::string, std::vector<int64_t >> slots_to_data ;
170
+ std::unordered_map<std::string, std::vector<int64_t >> slot_to_data ;
144
171
int64_t label;
145
- parse_line (line, slots , &label, &slots_to_data );
146
- batch_data.push_back (slots_to_data );
172
+ parse_line (line, slot_to_index , &label, &slot_to_data );
173
+ batch_data.push_back (slot_to_data );
147
174
batch_label.push_back (label);
148
175
} else {
149
176
break ;
150
177
}
151
178
}
152
179
180
+ VLOG (3 ) << " read one batch, batch_size = " << batch_data.size ();
181
+ print_map (&batch_data[0 ]);
182
+
153
183
std::vector<framework::LoDTensor> lod_datas;
154
184
155
185
// first insert tensor for each slots
@@ -159,9 +189,9 @@ void ReadThread(const std::vector<std::string>& file_list,
159
189
160
190
for (size_t i = 0 ; i < batch_data.size (); ++i) {
161
191
auto & feasign = batch_data[i][slot];
162
-
163
192
lod_data.push_back (lod_data.back () + feasign.size ());
164
- batch_feasign.insert (feasign.end (), feasign.begin (), feasign.end ());
193
+ batch_feasign.insert (batch_feasign.end (), feasign.begin (),
194
+ feasign.end ());
165
195
}
166
196
167
197
framework::LoDTensor lod_tensor;
@@ -174,6 +204,8 @@ void ReadThread(const std::vector<std::string>& file_list,
174
204
lod_datas.push_back (lod_tensor);
175
205
}
176
206
207
+ VLOG (3 ) << " convert data to tensor" ;
208
+
177
209
// insert label tensor
178
210
framework::LoDTensor label_tensor;
179
211
int64_t * label_tensor_data = label_tensor.mutable_data <int64_t >(
@@ -182,10 +214,12 @@ void ReadThread(const std::vector<std::string>& file_list,
182
214
memcpy (label_tensor_data, batch_label.data (), batch_label.size ());
183
215
lod_datas.push_back (label_tensor);
184
216
217
+ VLOG (3 ) << " push one data" ;
185
218
queue->Push (lod_datas);
186
219
}
187
220
188
221
(*thread_status)[thread_id] = Stopped;
222
+ VLOG (3 ) << " thread " << thread_id << " exited" ;
189
223
}
190
224
191
225
} // namespace reader
0 commit comments