Skip to content

Commit 82b149c

Browse files
authored
Merge pull request #8780 from reyoung/feature/recordio
Feature/recordio
2 parents f608bb2 + 9dc6958 commit 82b149c

File tree

11 files changed

+516
-0
lines changed

11 files changed

+516
-0
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ include(external/eigen) # download eigen3
144144
include(external/pybind11) # download pybind11
145145
include(external/cares)
146146
include(external/grpc)
147+
include(external/snappy) # download snappy
148+
include(external/snappystream)
147149

148150
include(cudnn) # set cudnn libraries, must before configure
149151
include(cupti)

cmake/external/snappy.cmake

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
# Builds Google snappy as an external project and exposes it as the imported
# static library target `snappy`. Nothing is defined for mobile inference
# builds.
if(MOBILE_INFERENCE)
    return()
endif()

include(ExternalProject)

# NOTE: snappy is needed when linking with recordio

# Where the sources are checked out / built, and where the result is installed.
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR
    "${SNAPPY_INSTALL_DIR}/include/"
    CACHE PATH "snappy include directory." FORCE)

# Declare the imported library first; its file only exists after
# `extern_snappy` has been built and installed.
add_library(snappy STATIC IMPORTED GLOBAL)
set_target_properties(snappy PROPERTIES
    IMPORTED_LOCATION "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
include_directories(${SNAPPY_INCLUDE_DIR})

# NOTE(review): BUILD_COMMAND / INSTALL_COMMAND invoke `make` directly with a
# fixed -j8 -- presumably this assumes a Makefile generator; confirm.
ExternalProject_Add(
    extern_snappy
    GIT_REPOSITORY  "https://github.com/google/snappy"
    GIT_TAG         "1.1.7"
    PREFIX          ${SNAPPY_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS
        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
        -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
        -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
        -DBUILD_TESTING=OFF
        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
        ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS
        -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    BUILD_COMMAND   make -j8
    INSTALL_COMMAND make install
)

# Anything that links `snappy` must wait for the external build to finish.
add_dependencies(snappy extern_snappy)

cmake/external/snappystream.cmake

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
# Builds hoxnox/snappystream as an external project and exposes it as the
# imported static library target `snappystream`. Nothing is defined for mobile
# inference builds.
IF(MOBILE_INFERENCE)
    return()
ENDIF()

include (ExternalProject)

# NOTE: snappystream is needed when linking with recordio. It is configured
# against the snappy install tree (SNAPPY_ROOT below), so the `snappy` target
# must already be defined when this file is included.

SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)

ExternalProject_Add(
    extern_snappystream
    GIT_REPOSITORY  "https://github.com/hoxnox/snappystream.git"
    GIT_TAG         "0.2.8"
    PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
    UPDATE_COMMAND  ""
    # The install prefix/libdir given here must name the snappystream tree,
    # matching CMAKE_CACHE_ARGS below and the IMPORTED_LOCATION of the
    # `snappystream` target; only SNAPPY_ROOT refers to the snappy tree.
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPYSTREAM_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR=${SNAPPYSTREAM_INSTALL_DIR}/lib
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS
                    -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
                    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    # NOTE(review): `make` is invoked directly with a fixed -j8 -- presumably
    # this assumes a Makefile generator; confirm.
    BUILD_COMMAND   make -j8
    INSTALL_COMMAND make install
    # snappy has to be built and installed before snappystream is configured.
    DEPENDS snappy
)

add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
             "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")

include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
# Anything that links `snappystream` must wait for the external build.
add_dependencies(snappystream extern_snappystream)

paddle/fluid/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ add_subdirectory(operators)
55
add_subdirectory(pybind)
66
add_subdirectory(inference)
77
add_subdirectory(string)
8+
add_subdirectory(recordio)

paddle/fluid/recordio/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Internal libraries of the recordio component, each paired with its unit test.

# Chunk header.
cc_library(header
    SRCS header.cc)
cc_test(header_test
    SRCS header_test.cc
    DEPS header)

# Chunk payload handling; depends on snappy/snappystream and zlib.
cc_library(chunk
    SRCS chunk.cc
    DEPS snappystream snappy header zlib)
cc_test(chunk_test
    SRCS chunk_test.cc
    DEPS chunk)

# Aggregate target: declared with dependencies only, no sources of its own.
cc_library(recordio
    DEPS chunk header)

paddle/fluid/recordio/chunk.cc

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/fluid/recordio/chunk.h"
16+
17+
#include <memory>
18+
#include <sstream>
19+
#include "paddle/fluid/platform/enforce.h"
20+
#include "snappystream.hpp"
21+
#include "zlib.h"
22+
23+
namespace paddle {
24+
namespace recordio {
25+
// Size of the stack buffer used when draining a stream block by block.
constexpr size_t kMaxBufSize = 1024;

// Reads `in` in blocks of at most kMaxBufSize bytes and passes every non-empty
// block to `callback(const char* data, size_t size)`.
//
// limit > 0  : at most `limit` bytes are consumed.
// limit <= 0 : the stream is consumed until no more data can be read.
//
// Blocks are fetched with read()/gcount() rather than readsome(): the number
// of bytes readsome() reports is implementation-defined and may be zero for
// file-backed streams even when data remains, which would silently truncate
// the result.
//
// If the stream was good on entry and reading ran into end-of-file, the
// eofbit/failbit raised by that final short read are cleared again so the
// caller can keep using (e.g. seekg on) the stream. badbit is never cleared.
template <typename Callback>
static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) {
  char buf[kMaxBufSize];
  const bool was_good = in.good();
  const bool limited = limit > 0;
  size_t remaining = limited ? static_cast<size_t>(limit) : 0;

  while (!limited || remaining > 0) {
    const size_t want =
        limited ? std::min(remaining, kMaxBufSize) : kMaxBufSize;
    in.read(buf, static_cast<std::streamsize>(want));
    const std::streamsize got = in.gcount();
    if (got <= 0) {
      break;
    }
    callback(buf, static_cast<size_t>(got));
    if (limited) {
      remaining -= static_cast<size_t>(got);
    }
    if (static_cast<size_t>(got) < want) {
      break;  // short read: the stream has no more data
    }
  }

  if (was_good && in.eof()) {
    in.clear(in.rdstate() & ~(std::ios::eofbit | std::ios::failbit));
  }
}
45+
46+
static void PipeStream(std::istream& in, std::ostream& os) {
47+
ReadStreamByBuf(
48+
in, -1, [&os](const char* buf, size_t len) { os.write(buf, len); });
49+
}
50+
static uint32_t Crc32Stream(std::istream& in, int limit = -1) {
51+
auto crc = crc32(0, nullptr, 0);
52+
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
53+
crc = crc32(crc, reinterpret_cast<const Bytef*>(buf), len);
54+
});
55+
return crc;
56+
}
57+
58+
bool Chunk::Write(std::ostream& os, Compressor ct) const {
59+
// NOTE(dzhwinter): don't check records.numBytes instead, because
60+
// empty records are allowed.
61+
if (records_.empty()) {
62+
return false;
63+
}
64+
std::stringstream sout;
65+
std::unique_ptr<std::ostream> compressed_stream;
66+
switch (ct) {
67+
case Compressor::kNoCompress:
68+
break;
69+
case Compressor::kSnappy:
70+
compressed_stream.reset(new snappy::oSnappyStream(sout));
71+
break;
72+
default:
73+
PADDLE_THROW("Not implemented");
74+
}
75+
76+
std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout;
77+
78+
for (auto& record : records_) {
79+
size_t sz = record.size();
80+
buf_stream.write(reinterpret_cast<const char*>(&sz), sizeof(uint32_t))
81+
.write(record.data(), record.size());
82+
}
83+
84+
if (compressed_stream) {
85+
compressed_stream.reset();
86+
}
87+
88+
auto end_pos = sout.tellg();
89+
sout.seekg(0, std::ios::beg);
90+
uint32_t len = static_cast<uint32_t>(end_pos - sout.tellg());
91+
uint32_t crc = Crc32Stream(sout);
92+
sout.seekg(0, std::ios::beg);
93+
94+
Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
95+
hdr.Write(os);
96+
PipeStream(sout, os);
97+
return true;
98+
}
99+
100+
void Chunk::Parse(std::istream& sin) {
101+
Header hdr;
102+
hdr.Parse(sin);
103+
auto beg_pos = sin.tellg();
104+
auto crc = Crc32Stream(sin, hdr.CompressSize());
105+
PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
106+
107+
Clear();
108+
109+
sin.seekg(beg_pos, std::ios::beg);
110+
std::unique_ptr<std::istream> compressed_stream;
111+
switch (hdr.CompressType()) {
112+
case Compressor::kNoCompress:
113+
break;
114+
case Compressor::kSnappy:
115+
compressed_stream.reset(new snappy::iSnappyStream(sin));
116+
break;
117+
default:
118+
PADDLE_THROW("Not implemented");
119+
}
120+
121+
std::istream& stream = compressed_stream ? *compressed_stream : sin;
122+
123+
for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
124+
uint32_t rec_len;
125+
stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
126+
std::string buf;
127+
buf.resize(rec_len);
128+
stream.read(&buf[0], rec_len);
129+
Add(buf);
130+
}
131+
}
132+
133+
} // namespace recordio
134+
} // namespace paddle

paddle/fluid/recordio/chunk.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
#include <string>
17+
#include <vector>
18+
19+
#include "paddle/fluid/platform/macros.h"
20+
#include "paddle/fluid/recordio/header.h"
21+
22+
namespace paddle {
23+
namespace recordio {
24+
25+
// A Chunk contains the Header and optionally compressed records.
26+
class Chunk {
27+
public:
28+
Chunk() : num_bytes_(0) {}
29+
void Add(std::string buf) {
30+
records_.push_back(buf);
31+
num_bytes_ += buf.size();
32+
}
33+
// dump the chunk into w, and clears the chunk and makes it ready for
34+
// the next add invocation.
35+
bool Write(std::ostream& fo, Compressor ct) const;
36+
void Clear() {
37+
records_.clear();
38+
num_bytes_ = 0;
39+
}
40+
void Parse(std::istream& sin);
41+
size_t NumBytes() { return num_bytes_; }
42+
const std::string& Record(int i) const { return records_[i]; }
43+
44+
private:
45+
std::vector<std::string> records_;
46+
// sum of record lengths in bytes.
47+
size_t num_bytes_;
48+
DISABLE_COPY_AND_ASSIGN(Chunk);
49+
};
50+
51+
// NOTE(review): only this declaration is visible; the chunk.cc shown with this
// header contains no definition of CompressData -- confirm it is implemented
// elsewhere. Presumably compresses `in_length` bytes of `in` into `out` using
// `ct` and returns the number of bytes produced; verify before relying on it.
size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
52+
53+
// NOTE(review): only this declaration is visible; the chunk.cc shown with this
// header contains no definition of DeflateData -- confirm it is implemented
// elsewhere. The direction implied by the name (compress vs. decompress) and
// the required size of `out` cannot be determined from here.
void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
54+
55+
} // namespace recordio
56+
} // namespace paddle

paddle/fluid/recordio/chunk_test.cc

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/fluid/recordio/chunk.h"
16+
17+
#include <sstream>
18+
19+
#include "gtest/gtest.h"
20+
21+
using namespace paddle::recordio;
22+
23+
// Round-trips an uncompressed chunk through a stringstream and checks both the
// total size and the content of every record.
TEST(Chunk, SaveLoad) {
  Chunk ch;
  // The explicit lengths deliberately include each literal's trailing '\0',
  // so the two records are 6 and 4 bytes long.
  ch.Add(std::string("12345", 6));
  ch.Add(std::string("123", 4));
  std::stringstream ss;
  ASSERT_TRUE(ch.Write(ss, Compressor::kNoCompress));
  ch.Clear();
  ch.Parse(ss);
  ASSERT_EQ(ch.NumBytes(), 10U);
  // A matching byte total alone would not catch corrupted or swapped records.
  ASSERT_EQ(ch.Record(0), std::string("12345", 6));
  ASSERT_EQ(ch.Record(1), std::string("123", 4));
}
33+
34+
// Writes the same chunk with and without snappy compression, checks that the
// compressed form is not larger, and round-trips the compressed form.
TEST(Chunk, Compressor) {
  Chunk ch;
  // The explicit lengths deliberately include each literal's trailing '\0'.
  ch.Add(std::string("12345", 6));
  ch.Add(std::string("123", 4));
  ch.Add(std::string("123", 4));
  ch.Add(std::string("123", 4));
  std::stringstream ss;
  ASSERT_TRUE(ch.Write(ss, Compressor::kSnappy));
  std::stringstream ss2;
  ASSERT_TRUE(ch.Write(ss2, Compressor::kNoCompress));
  ASSERT_LE(ss.tellp(), ss2.tellp());  // Compress should contain less data;

  ch.Clear();
  ch.Parse(ss);
  // NumBytes() returns size_t, so compare against an unsigned literal.
  ASSERT_EQ(ch.NumBytes(), 18U);
  ASSERT_EQ(ch.Record(0), std::string("12345", 6));
  ASSERT_EQ(ch.Record(3), std::string("123", 4));
}

0 commit comments

Comments
 (0)