Skip to content

Commit 7c67c6d

Browse files
authored
Merge pull request #29 from Enmk/string_column_performance_fix
ColumnString and ColumnFixedString performance fix
2 parents (8a255e8 + 367958f); commit 7c67c6d

File tree

7 files changed: 395 additions (+395) and 57 deletions (-57)

7 files changed: 395 additions (+395) and 57 deletions (-57)

clickhouse/base/wire_format.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ class WireFormat {
2323

2424
static void WriteBytes(CodedOutputStream* output, const void* buf, size_t len);
2525

26-
static void WriteString(CodedOutputStream* output, const std::string& value);
26+
static void WriteString(CodedOutputStream* output, std::string_view value);
2727

2828
static void WriteUInt64(CodedOutputStream* output, const uint64_t value);
2929
};
@@ -85,7 +85,7 @@ inline void WireFormat::WriteBytes(
8585

8686
inline void WireFormat::WriteString(
8787
CodedOutputStream* output,
88-
const std::string& value)
88+
std::string_view value)
8989
{
9090
output->WriteVarint64(value.size());
9191
output->WriteRaw(value.data(), value.size());

clickhouse/columns/ip6.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ std::string ColumnIPv6::AsString (size_t n) const{
4343
char buf[INET6_ADDRSTRLEN];
4444
const char* ip_str = inet_ntop(AF_INET6, addr.data(), buf, INET6_ADDRSTRLEN);
4545
if (ip_str == nullptr) {
46-
throw std::runtime_error("invalid IPv6 format: " + addr);
46+
throw std::runtime_error("invalid IPv6 format: " + std::string(addr));
4747
}
4848
return ip_str;
4949
}

clickhouse/columns/string.cpp

Lines changed: 154 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,26 @@
33

44
#include "../base/wire_format.h"
55

6+
namespace
7+
{
8+
const size_t DEFAULT_BLOCK_SIZE = 4096;
9+
10+
template <typename Container>
11+
size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len = -1)
12+
{
13+
size_t result = 0;
14+
if (begin < strings.size()) {
15+
len = std::min(len, strings.size() - begin);
16+
17+
for (size_t i = begin; i < begin + len; ++i)
18+
result += strings[i].size();
19+
}
20+
21+
return result;
22+
}
23+
24+
}
25+
626
namespace clickhouse {
727

828
ColumnFixedString::ColumnFixedString(size_t n)
@@ -11,21 +31,29 @@ ColumnFixedString::ColumnFixedString(size_t n)
1131
{
1232
}
1333

14-
void ColumnFixedString::Append(const std::string& str) {
15-
data_.push_back(str);
16-
data_.back().resize(string_size_);
34+
void ColumnFixedString::Append(std::string_view str) {
35+
if (data_.capacity() < str.size())
36+
{
37+
// round up to the next block size
38+
const auto new_size = (((data_.size() + string_size_) / DEFAULT_BLOCK_SIZE) + 1) * DEFAULT_BLOCK_SIZE;
39+
data_.reserve(new_size);
40+
}
41+
42+
data_.insert(data_.size(), str);
1743
}
1844

1945
void ColumnFixedString::Clear() {
2046
data_.clear();
2147
}
2248

23-
const std::string& ColumnFixedString::At(size_t n) const {
24-
return data_.at(n);
49+
std::string_view ColumnFixedString::At(size_t n) const {
50+
const auto pos = n * string_size_;
51+
return std::string_view(&data_.at(pos), string_size_);
2552
}
2653

27-
const std::string& ColumnFixedString::operator [] (size_t n) const {
28-
return data_[n];
54+
std::string_view ColumnFixedString::operator [](size_t n) const {
55+
const auto pos = n * string_size_;
56+
return std::string_view(&data_[pos], string_size_);
2957
}
3058

3159
size_t ColumnFixedString::FixedSize() const
@@ -42,104 +70,187 @@ void ColumnFixedString::Append(ColumnRef column) {
4270
}
4371

4472
bool ColumnFixedString::Load(CodedInputStream* input, size_t rows) {
45-
data_.reserve(data_.size() + rows);
46-
47-
for (size_t i = 0; i < rows; ++i) {
48-
std::string s;
49-
s.resize(string_size_);
50-
51-
if (!WireFormat::ReadBytes(input, &s[0], s.size())) {
52-
return false;
53-
}
54-
55-
data_.push_back(std::move(s));
73+
data_.resize(string_size_ * rows);
74+
if (!WireFormat::ReadBytes(input, &data_[0], data_.size())) {
75+
return false;
5676
}
5777

5878
return true;
5979
}
6080

6181
void ColumnFixedString::Save(CodedOutputStream* output) {
62-
for (size_t i = 0; i < data_.size(); ++i) {
63-
WireFormat::WriteBytes(output, data_[i].data(), string_size_);
64-
}
82+
WireFormat::WriteBytes(output, data_.data(), data_.size());
6583
}
6684

6785
size_t ColumnFixedString::Size() const {
68-
return data_.size();
86+
return data_.size() / string_size_;
6987
}
7088

7189
ColumnRef ColumnFixedString::Slice(size_t begin, size_t len) {
7290
auto result = std::make_shared<ColumnFixedString>(string_size_);
7391

74-
if (begin < data_.size()) {
75-
result->data_ = SliceVector(data_, begin, len);
92+
if (begin < Size()) {
93+
const auto b = begin * string_size_;
94+
const auto l = len * string_size_;
95+
result->data_ = data_.substr(b, std::min(data_.size() - b, l));
7696
}
7797

7898
return result;
7999
}
80100

101+
struct ColumnString::Block
102+
{
103+
using CharT = typename std::string::value_type;
104+
105+
explicit Block(size_t starting_capacity)
106+
: size(0),
107+
capacity(starting_capacity),
108+
data_(new CharT[capacity])
109+
{}
110+
111+
inline auto GetAvailble() const
112+
{
113+
return capacity - size;
114+
}
115+
116+
std::string_view AppendUnsafe(std::string_view str)
117+
{
118+
const auto pos = &data_[size];
119+
120+
memcpy(pos, str.data(), str.size());
121+
size += str.size();
122+
123+
return std::string_view(pos, str.size());
124+
}
125+
126+
auto GetCurrentWritePos()
127+
{
128+
return &data_[size];
129+
}
130+
131+
std::string_view ConsumeTailAsStringViewUnsafe(size_t len)
132+
{
133+
const auto start = &data_[size];
134+
size += len;
135+
return std::string_view(start, len);
136+
}
137+
138+
size_t size;
139+
const size_t capacity;
140+
std::unique_ptr<CharT[]> data_;
141+
};
81142

82143
ColumnString::ColumnString()
83144
: Column(Type::CreateString())
84145
{
85146
}
86147

87-
ColumnString::ColumnString(const std::vector<std::string>& data)
148+
ColumnString::ColumnString(const std::vector<std::string> & data)
88149
: Column(Type::CreateString())
89-
, data_(data)
90150
{
151+
items_.reserve(data.size());
152+
blocks_.emplace_back(ComputeTotalSize(data));
153+
154+
for (const auto & s : data)
155+
{
156+
AppendUnsafe(s);
157+
}
91158
}
92159

93-
void ColumnString::Append(const std::string& str) {
94-
data_.push_back(str);
160+
ColumnString::~ColumnString()
161+
{}
162+
163+
void ColumnString::Append(std::string_view str) {
164+
if (blocks_.size() == 0 || blocks_.back().GetAvailble() < str.length())
165+
{
166+
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size()));
167+
}
168+
169+
items_.emplace_back(blocks_.back().AppendUnsafe(str));
170+
}
171+
172+
void ColumnString::AppendUnsafe(std::string_view str)
173+
{
174+
items_.emplace_back(blocks_.back().AppendUnsafe(str));
95175
}
96176

97177
void ColumnString::Clear() {
98-
data_.clear();
178+
items_.clear();
179+
blocks_.clear();
99180
}
100181

101-
const std::string& ColumnString::At(size_t n) const {
102-
return data_.at(n);
182+
std::string_view ColumnString::At(size_t n) const {
183+
return items_.at(n);
103184
}
104185

105-
const std::string& ColumnString::operator [] (size_t n) const {
106-
return data_[n];
186+
std::string_view ColumnString::operator [] (size_t n) const {
187+
return items_[n];
107188
}
108189

109190
void ColumnString::Append(ColumnRef column) {
110191
if (auto col = column->As<ColumnString>()) {
111-
data_.insert(data_.end(), col->data_.begin(), col->data_.end());
192+
const auto total_size = ComputeTotalSize(col->items_);
193+
194+
// TODO: fill up existing block with some items and then add a new one for the rest of items
195+
if (blocks_.size() == 0 || blocks_.back().GetAvailble() < total_size)
196+
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, total_size));
197+
items_.reserve(items_.size() + col->Size());
198+
199+
for (size_t i = 0; i < column->Size(); ++i) {
200+
this->AppendUnsafe((*col)[i]);
201+
}
112202
}
113203
}
114204

115205
bool ColumnString::Load(CodedInputStream* input, size_t rows) {
116-
data_.reserve(data_.size() + rows);
206+
items_.clear();
207+
blocks_.clear();
208+
209+
items_.reserve(rows);
210+
Block * block = nullptr;
117211

212+
// TODO(performance): unroll a loop to a first row (to get rid of `blocks_.size() == 0` check) and the rest.
118213
for (size_t i = 0; i < rows; ++i) {
119-
std::string s;
214+
uint64_t len;
215+
if (!WireFormat::ReadUInt64(input, &len))
216+
return false;
217+
218+
if (blocks_.size() == 0 || len > block->GetAvailble())
219+
block = &blocks_.emplace_back(std::max<size_t>(DEFAULT_BLOCK_SIZE, len));
120220

121-
if (!WireFormat::ReadString(input, &s)) {
221+
if (!WireFormat::ReadBytes(input, block->GetCurrentWritePos(), len))
122222
return false;
123-
}
124223

125-
data_.push_back(std::move(s));
224+
items_.emplace_back(block->ConsumeTailAsStringViewUnsafe(len));
126225
}
127226

128227
return true;
129228
}
130229

131230
void ColumnString::Save(CodedOutputStream* output) {
132-
for (auto si = data_.begin(); si != data_.end(); ++si) {
133-
WireFormat::WriteString(output, *si);
231+
for (const auto & item : items_) {
232+
WireFormat::WriteString(output, item);
134233
}
135234
}
136235

137236
size_t ColumnString::Size() const {
138-
return data_.size();
237+
return items_.size();
139238
}
140239

141240
ColumnRef ColumnString::Slice(size_t begin, size_t len) {
142-
return std::make_shared<ColumnString>(SliceVector(data_, begin, len));
241+
auto result = std::make_shared<ColumnString>();
242+
243+
if (begin < items_.size()) {
244+
len = std::min(len, items_.size() - begin);
245+
246+
result->blocks_.emplace_back(ComputeTotalSize(items_, begin, len));
247+
for (size_t i = begin; i < begin + len; ++i)
248+
{
249+
result->Append(items_[i]);
250+
}
251+
}
252+
253+
return result;
143254
}
144255

145256
}

clickhouse/columns/string.h

Lines changed: 22 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,11 @@
22

33
#include "column.h"
44

5+
#include <string>
6+
#include <string_view>
7+
#include <utility>
8+
#include <vector>
9+
510
namespace clickhouse {
611

712
/**
@@ -12,13 +17,13 @@ class ColumnFixedString : public Column {
1217
explicit ColumnFixedString(size_t n);
1318

1419
/// Appends one element to the column.
15-
void Append(const std::string& str);
20+
void Append(std::string_view str);
1621

1722
/// Returns element at given row number.
18-
const std::string& At(size_t n) const;
23+
std::string_view At(size_t n) const;
1924

2025
/// Returns element at given row number.
21-
const std::string& operator [] (size_t n) const;
26+
std::string_view operator [] (size_t n) const;
2227

2328
/// Returns the max size of the fixed string
2429
size_t FixedSize() const;
@@ -44,7 +49,7 @@ class ColumnFixedString : public Column {
4449

4550
private:
4651
const size_t string_size_;
47-
std::vector<std::string> data_;
52+
std::string data_;
4853
};
4954

5055
/**
@@ -53,16 +58,18 @@ class ColumnFixedString : public Column {
5358
class ColumnString : public Column {
5459
public:
5560
ColumnString();
56-
explicit ColumnString(const std::vector<std::string>& data);
61+
~ColumnString();
62+
63+
explicit ColumnString(const std::vector<std::string> & data);
5764

5865
/// Appends one element to the column.
59-
void Append(const std::string& str);
66+
void Append(std::string_view str);
6067

6168
/// Returns element at given row number.
62-
const std::string& At(size_t n) const;
69+
std::string_view At(size_t n) const;
6370

6471
/// Returns element at given row number.
65-
const std::string& operator [] (size_t n) const;
72+
std::string_view operator [] (size_t n) const;
6673

6774
public:
6875
/// Appends content of given column to the end of current one.
@@ -84,7 +91,13 @@ class ColumnString : public Column {
8491
ColumnRef Slice(size_t begin, size_t len) override;
8592

8693
private:
87-
std::vector<std::string> data_;
94+
void AppendUnsafe(std::string_view);
95+
96+
private:
97+
struct Block;
98+
99+
std::vector<std::string_view> items_;
100+
std::vector<Block> blocks_;
88101
};
89102

90103
}

0 commit comments

Comments
 (0)