33
44#include " ../base/wire_format.h"
55
namespace
{

/// Minimum capacity, in bytes, used in this file when sizing newly allocated
/// string storage (capacities are rounded up to / floored at this value).
constexpr size_t DEFAULT_BLOCK_SIZE = 4096;

/// Sums the sizes of the elements strings[begin .. begin + len).
///
/// @param strings  Indexable container whose elements expose size().
/// @param begin    Index of the first element to count.
/// @param len      Maximum number of elements to count; it is clamped to the
///                 number of elements available after `begin`. The default
///                 (the largest size_t) means "up to the end".
/// @return Total of element sizes, or 0 when `begin` is past the last element.
template <typename Container>
size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len = static_cast<size_t>(-1))
{
    size_t result = 0;
    if (begin < strings.size()) {
        // Clamping first guarantees that begin + len cannot overflow below.
        len = std::min(len, strings.size() - begin);

        for (size_t i = begin; i < begin + len; ++i)
            result += strings[i].size();
    }

    return result;
}

}
25+
626namespace clickhouse {
727
828ColumnFixedString::ColumnFixedString (size_t n)
@@ -11,21 +31,29 @@ ColumnFixedString::ColumnFixedString(size_t n)
1131{
1232}
1333
14- void ColumnFixedString::Append (const std::string& str) {
15- data_.push_back (str);
16- data_.back ().resize (string_size_);
34+ void ColumnFixedString::Append (std::string_view str) {
35+ if (data_.capacity () < str.size ())
36+ {
37+ // round up to the next block size
38+ const auto new_size = (((data_.size () + string_size_) / DEFAULT_BLOCK_SIZE) + 1 ) * DEFAULT_BLOCK_SIZE;
39+ data_.reserve (new_size);
40+ }
41+
42+ data_.insert (data_.size (), str);
1743}
1844
1945void ColumnFixedString::Clear () {
2046 data_.clear ();
2147}
2248
23- const std::string& ColumnFixedString::At (size_t n) const {
24- return data_.at (n);
49+ std::string_view ColumnFixedString::At (size_t n) const {
50+ const auto pos = n * string_size_;
51+ return std::string_view (&data_.at (pos), string_size_);
2552}
2653
27- const std::string& ColumnFixedString::operator [] (size_t n) const {
28- return data_[n];
54+ std::string_view ColumnFixedString::operator [](size_t n) const {
55+ const auto pos = n * string_size_;
56+ return std::string_view (&data_[pos], string_size_);
2957}
3058
3159size_t ColumnFixedString::FixedSize () const
@@ -42,104 +70,187 @@ void ColumnFixedString::Append(ColumnRef column) {
4270}
4371
4472bool ColumnFixedString::Load (CodedInputStream* input, size_t rows) {
45- data_.reserve (data_.size () + rows);
46-
47- for (size_t i = 0 ; i < rows; ++i) {
48- std::string s;
49- s.resize (string_size_);
50-
51- if (!WireFormat::ReadBytes (input, &s[0 ], s.size ())) {
52- return false ;
53- }
54-
55- data_.push_back (std::move (s));
73+ data_.resize (string_size_ * rows);
74+ if (!WireFormat::ReadBytes (input, &data_[0 ], data_.size ())) {
75+ return false ;
5676 }
5777
5878 return true ;
5979}
6080
6181void ColumnFixedString::Save (CodedOutputStream* output) {
62- for (size_t i = 0 ; i < data_.size (); ++i) {
63- WireFormat::WriteBytes (output, data_[i].data (), string_size_);
64- }
82+ WireFormat::WriteBytes (output, data_.data (), data_.size ());
6583}
6684
6785size_t ColumnFixedString::Size () const {
68- return data_.size ();
86+ return data_.size () / string_size_ ;
6987}
7088
7189ColumnRef ColumnFixedString::Slice (size_t begin, size_t len) {
7290 auto result = std::make_shared<ColumnFixedString>(string_size_);
7391
74- if (begin < data_.size ()) {
75- result->data_ = SliceVector (data_, begin, len);
92+ if (begin < Size ()) {
93+ const auto b = begin * string_size_;
94+ const auto l = len * string_size_;
95+ result->data_ = data_.substr (b, std::min (data_.size () - b, l));
7696 }
7797
7898 return result;
7999}
80100
101+ struct ColumnString ::Block
102+ {
103+ using CharT = typename std::string::value_type;
104+
105+ explicit Block (size_t starting_capacity)
106+ : size(0 ),
107+ capacity(starting_capacity),
108+ data_(new CharT[capacity])
109+ {}
110+
111+ inline auto GetAvailble () const
112+ {
113+ return capacity - size;
114+ }
115+
116+ std::string_view AppendUnsafe (std::string_view str)
117+ {
118+ const auto pos = &data_[size];
119+
120+ memcpy (pos, str.data (), str.size ());
121+ size += str.size ();
122+
123+ return std::string_view (pos, str.size ());
124+ }
125+
126+ auto GetCurrentWritePos ()
127+ {
128+ return &data_[size];
129+ }
130+
131+ std::string_view ConsumeTailAsStringViewUnsafe (size_t len)
132+ {
133+ const auto start = &data_[size];
134+ size += len;
135+ return std::string_view (start, len);
136+ }
137+
138+ size_t size;
139+ const size_t capacity;
140+ std::unique_ptr<CharT[]> data_;
141+ };
81142
82143ColumnString::ColumnString ()
83144 : Column(Type::CreateString())
84145{
85146}
86147
87- ColumnString::ColumnString (const std::vector<std::string>& data)
148+ ColumnString::ColumnString (const std::vector<std::string> & data)
88149 : Column(Type::CreateString())
89- , data_(data)
90150{
151+ items_.reserve (data.size ());
152+ blocks_.emplace_back (ComputeTotalSize (data));
153+
154+ for (const auto & s : data)
155+ {
156+ AppendUnsafe (s);
157+ }
91158}
92159
93- void ColumnString::Append (const std::string& str) {
94- data_.push_back (str);
160+ ColumnString::~ColumnString ()
161+ {}
162+
163+ void ColumnString::Append (std::string_view str) {
164+ if (blocks_.size () == 0 || blocks_.back ().GetAvailble () < str.length ())
165+ {
166+ blocks_.emplace_back (std::max (DEFAULT_BLOCK_SIZE, str.size ()));
167+ }
168+
169+ items_.emplace_back (blocks_.back ().AppendUnsafe (str));
170+ }
171+
172+ void ColumnString::AppendUnsafe (std::string_view str)
173+ {
174+ items_.emplace_back (blocks_.back ().AppendUnsafe (str));
95175}
96176
97177void ColumnString::Clear () {
98- data_.clear ();
178+ items_.clear ();
179+ blocks_.clear ();
99180}
100181
101- const std::string& ColumnString::At (size_t n) const {
102- return data_ .at (n);
182+ std::string_view ColumnString::At (size_t n) const {
183+ return items_ .at (n);
103184}
104185
105- const std::string& ColumnString::operator [] (size_t n) const {
106- return data_ [n];
186+ std::string_view ColumnString::operator [] (size_t n) const {
187+ return items_ [n];
107188}
108189
109190void ColumnString::Append (ColumnRef column) {
110191 if (auto col = column->As <ColumnString>()) {
111- data_.insert (data_.end (), col->data_ .begin (), col->data_ .end ());
192+ const auto total_size = ComputeTotalSize (col->items_ );
193+
194+ // TODO: fill up existing block with some items and then add a new one for the rest of items
195+ if (blocks_.size () == 0 || blocks_.back ().GetAvailble () < total_size)
196+ blocks_.emplace_back (std::max (DEFAULT_BLOCK_SIZE, total_size));
197+ items_.reserve (items_.size () + col->Size ());
198+
199+ for (size_t i = 0 ; i < column->Size (); ++i) {
200+ this ->AppendUnsafe ((*col)[i]);
201+ }
112202 }
113203}
114204
115205bool ColumnString::Load (CodedInputStream* input, size_t rows) {
116- data_.reserve (data_.size () + rows);
206+ items_.clear ();
207+ blocks_.clear ();
208+
209+ items_.reserve (rows);
210+ Block * block = nullptr ;
117211
212+ // TODO(performance): unroll a loop to a first row (to get rid of `blocks_.size() == 0` check) and the rest.
118213 for (size_t i = 0 ; i < rows; ++i) {
119- std::string s;
214+ uint64_t len;
215+ if (!WireFormat::ReadUInt64 (input, &len))
216+ return false ;
217+
218+ if (blocks_.size () == 0 || len > block->GetAvailble ())
219+ block = &blocks_.emplace_back (std::max<size_t >(DEFAULT_BLOCK_SIZE, len));
120220
121- if (!WireFormat::ReadString (input, &s)) {
221+ if (!WireFormat::ReadBytes (input, block-> GetCurrentWritePos (), len))
122222 return false ;
123- }
124223
125- data_. push_back ( std::move (s ));
224+ items_. emplace_back (block-> ConsumeTailAsStringViewUnsafe (len ));
126225 }
127226
128227 return true ;
129228}
130229
131230void ColumnString::Save (CodedOutputStream* output) {
132- for (auto si = data_. begin (); si != data_. end (); ++si ) {
133- WireFormat::WriteString (output, *si );
231+ for (const auto & item : items_ ) {
232+ WireFormat::WriteString (output, item );
134233 }
135234}
136235
137236size_t ColumnString::Size () const {
138- return data_ .size ();
237+ return items_ .size ();
139238}
140239
141240ColumnRef ColumnString::Slice (size_t begin, size_t len) {
142- return std::make_shared<ColumnString>(SliceVector (data_, begin, len));
241+ auto result = std::make_shared<ColumnString>();
242+
243+ if (begin < items_.size ()) {
244+ len = std::min (len, items_.size () - begin);
245+
246+ result->blocks_ .emplace_back (ComputeTotalSize (items_, begin, len));
247+ for (size_t i = begin; i < begin + len; ++i)
248+ {
249+ result->Append (items_[i]);
250+ }
251+ }
252+
253+ return result;
143254}
144255
145256}
0 commit comments