Skip to content

Commit 0f3ece7

Browse files
committed
use gzstream
1 parent a1e0f5a commit 0f3ece7

File tree

3 files changed

+17
-35
lines changed

3 files changed

+17
-35
lines changed

paddle/fluid/operators/reader/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ function(reader_library TARGET_NAME)
1616
endfunction()
1717

1818
cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
19-
cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost)
19+
cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost gzstream)
2020
reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
2121
reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader)
2222
reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)

paddle/fluid/operators/reader/ctr_reader.cc

Lines changed: 16 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414

1515
#include "paddle/fluid/operators/reader/ctr_reader.h"
1616

17+
#include <gzstream.h>
18+
1719
#include <cstdlib>
1820
#include <fstream>
1921
#include <iostream>
@@ -24,10 +26,6 @@
2426
#include <algorithm>
2527
#include <random>
2628

27-
#include <boost/iostreams/copy.hpp>
28-
#include <boost/iostreams/filter/gzip.hpp>
29-
#include <boost/iostreams/filtering_streambuf.hpp>
30-
3129
namespace paddle {
3230
namespace operators {
3331
namespace reader {
@@ -75,23 +73,19 @@ static inline void parse_line(
7573

7674
class GzipReader {
7775
public:
78-
explicit GzipReader(const std::string& file_name) : instream_(&inbuf_) {
79-
file_ = std::ifstream(file_name, std::ios_base::in | std::ios_base::binary);
80-
inbuf_.push(boost::iostreams::gzip_decompressor());
81-
inbuf_.push(file_);
82-
// Convert streambuf to istream
83-
}
76+
explicit GzipReader(const std::string& file_name)
77+
: gzstream_(file_name.c_str()) {}
8478

85-
~GzipReader() { file_.close(); }
79+
~GzipReader() {}
8680

87-
bool HasNext() { return instream_.peek() != EOF; }
81+
bool HasNext() { return gzstream_.peek() != EOF; }
8882

89-
void NextLine(std::string& line) { std::getline(instream_, line); } // NOLINT
83+
void NextLine(std::string* line) { // NOLINT
84+
std::getline(gzstream_, line);
85+
}
9086

9187
private:
92-
boost::iostreams::filtering_streambuf<boost::iostreams::input> inbuf_;
93-
std::ifstream file_;
94-
std::istream instream_;
88+
igzstream gzstream_;
9589
};
9690

9791
class MultiGzipReader {
@@ -113,36 +107,28 @@ class MultiGzipReader {
113107
return true;
114108
}
115109

116-
void NextLine(std::string& line) { // NOLINT
117-
readers_[current_reader_index_]->NextLine(line);
110+
void NextLine(std::string* line) {
111+
readers_[current_reader_index_]->NextLine(*line);
118112
}
119113

120114
private:
121115
std::vector<std::shared_ptr<GzipReader>> readers_;
122116
size_t current_reader_index_ = 0;
123117
};
124118

125-
// void CTRReader::ReadThread(
126-
// const std::vector<std::string> &file_list,
127-
// const std::vector<std::string>& slots,
128-
// int batch_size,
129-
// std::shared_ptr<LoDTensorBlockingQueue>& queue) {}
130-
131119
void CTRReader::ReadThread(const std::vector<std::string>& file_list,
132120
const std::vector<std::string>& slots,
133121
int batch_size,
134122
std::shared_ptr<LoDTensorBlockingQueue>* queue) {
135123
std::string line;
136124

137125
// read all files
138-
std::vector<std::string> all_lines;
139126
MultiGzipReader reader(file_list);
127+
reader.NextLine(&line);
140128

141-
for (int j = 0; j < all_lines.size(); ++j) {
142-
std::unordered_map<std::string, std::vector<int64_t>> slots_to_data;
143-
int64_t label;
144-
parse_line(all_lines[j], slots, &label, &slots_to_data);
145-
}
129+
std::unordered_map<std::string, std::vector<int64_t>> slots_to_data;
130+
int64_t label;
131+
parse_line(line, slots, &label, &slots_to_data);
146132
}
147133

148134
} // namespace reader

paddle/fluid/operators/reader/ctr_reader.h

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,6 @@
2222
#include <unordered_map>
2323
#include <vector>
2424

25-
#include <boost/iostreams/copy.hpp>
26-
#include <boost/iostreams/filter/gzip.hpp>
27-
#include <boost/iostreams/filtering_streambuf.hpp>
28-
2925
#include "paddle/fluid/framework/reader.h"
3026
#include "paddle/fluid/framework/threadpool.h"
3127
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"

0 commit comments

Comments
 (0)