Skip to content

Commit 9367683

Browse files
authored
feat: support encode and decode string for Spark UnsafeRow format (#119)
* Support encode and decode string for Spark UnsafeRow format
1 parent e958670 commit 9367683

File tree

7 files changed

+235
-48
lines changed

7 files changed

+235
-48
lines changed

include/codec/list_iterator_codec.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ class StringColumnImpl : public ColumnImpl<StringRef> {
125125
int32_t addr_space = v1::GetAddrSpace(row.size(row_idx_));
126126
StringRef value;
127127
const char *buffer;
128-
v1::GetStrFieldUnsafe(row.buf(row_idx_), str_field_offset_,
128+
v1::GetStrFieldUnsafe(row.buf(row_idx_), col_idx_, str_field_offset_,
129129
next_str_field_offset_, str_start_offset_,
130130
addr_space, &buffer, &(value.size_));
131131
value.data_ = buffer;
@@ -142,7 +142,7 @@ class StringColumnImpl : public ColumnImpl<StringRef> {
142142
int32_t addr_space = v1::GetAddrSpace(row.size(row_idx_));
143143
StringRef value;
144144
const char *buffer;
145-
v1::GetStrFieldUnsafe(buf, str_field_offset_,
145+
v1::GetStrFieldUnsafe(buf, col_idx_, str_field_offset_,
146146
next_str_field_offset_, str_start_offset_,
147147
addr_space, &buffer, &(value.size_));
148148
value.data_ = buffer;

include/codec/type_codec.h

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "base/fe_hash.h"
2525
#include "base/mem_pool.h"
2626
#include "glog/logging.h"
27+
2728
namespace hybridse {
2829
namespace codec {
2930
static const uint32_t SEED = 0xe17a1465;
@@ -217,23 +218,8 @@ static constexpr uint8_t SIZE_LENGTH = 4;
217218
static constexpr uint8_t HEADER_LENGTH = VERSION_LENGTH + SIZE_LENGTH;
218219

219220
// calc the total row size with primary_size, str field count and str_size
220-
inline uint32_t CalcTotalLength(uint32_t primary_size, uint32_t str_field_cnt,
221-
uint32_t str_size, uint32_t* str_addr_space) {
222-
uint32_t total_size = primary_size + str_size;
223-
if (total_size + str_field_cnt <= UINT8_MAX) {
224-
*str_addr_space = 1;
225-
return total_size + str_field_cnt;
226-
} else if (total_size + str_field_cnt * 2 <= UINT16_MAX) {
227-
*str_addr_space = 2;
228-
return total_size + str_field_cnt * 2;
229-
} else if (total_size + str_field_cnt * 3 <= 1 << 24) {
230-
*str_addr_space = 3;
231-
return total_size + str_field_cnt * 3;
232-
} else {
233-
*str_addr_space = 4;
234-
return total_size + str_field_cnt * 4;
235-
}
236-
}
221+
uint32_t CalcTotalLength(uint32_t primary_size, uint32_t str_field_cnt,
222+
uint32_t str_size, uint32_t* str_addr_space);
237223

238224
inline void AppendNullBit(int8_t* buf_ptr, uint32_t col_idx, int8_t is_null) {
239225
int8_t* ptr = buf_ptr + HEADER_LENGTH + (col_idx >> 3);
@@ -432,7 +418,8 @@ inline double GetDoubleField(const int8_t* row, uint32_t idx, uint32_t offset,
432418
}
433419

434420
// native get string field method
435-
int32_t GetStrFieldUnsafe(const int8_t* row, uint32_t str_field_offset,
421+
int32_t GetStrFieldUnsafe(const int8_t* row, uint32_t col_idx,
422+
uint32_t str_field_offset,
436423
uint32_t next_str_field_offset,
437424
uint32_t str_start_offset, uint32_t addr_space,
438425
const char** data, uint32_t* size);

src/codec/fe_row_codec.cc

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@ namespace codec {
2727

2828
const uint32_t BitMapSize(uint32_t size) {
2929
if (FLAGS_enable_spark_unsaferow_format) {
30-
return 8;
30+
// For UnsafeRow opt, the nullbit set increases by 8 bytes
31+
return ((size >> 6) + !!(size&0x7f)) * 8;
3132
} else {
32-
return ((size) >> 3) + !!((size)&0x07);
33+
return (size >> 3) + !!(size&0x07);
3334
}
3435
}
3536

@@ -448,7 +449,7 @@ std::string RowView::GetStringUnsafe(uint32_t idx) {
448449
}
449450
const char* val;
450451
uint32_t length;
451-
v1::GetStrFieldUnsafe(row_, field_offset, next_str_field_offset,
452+
v1::GetStrFieldUnsafe(row_, idx, field_offset, next_str_field_offset,
452453
str_field_start_offset_, str_addr_length_, &val,
453454
&length);
454455
return std::string(val, length);
@@ -845,7 +846,7 @@ int32_t RowView::GetValue(const int8_t* row, uint32_t idx, const char** val,
845846
if (offset_vec_.at(idx) < string_field_cnt_ - 1) {
846847
next_str_field_offset = field_offset + 1;
847848
}
848-
return v1::GetStrFieldUnsafe(row, field_offset, next_str_field_offset,
849+
return v1::GetStrFieldUnsafe(row, idx, field_offset, next_str_field_offset,
849850
str_field_start_offset_, GetAddrLength(size),
850851
val, length);
851852
}
@@ -871,7 +872,7 @@ int32_t RowView::GetString(uint32_t idx, const char** val, uint32_t* length) {
871872
if (offset_vec_.at(idx) < string_field_cnt_ - 1) {
872873
next_str_field_offset = field_offset + 1;
873874
}
874-
return v1::GetStrFieldUnsafe(row_, field_offset, next_str_field_offset,
875+
return v1::GetStrFieldUnsafe(row_, idx, field_offset, next_str_field_offset,
875876
str_field_start_offset_, str_addr_length_, val,
876877
length);
877878
}
@@ -941,8 +942,15 @@ bool RowFormat::GetStringColumnInfo(size_t idx, StringColInfo* res) const {
941942
DLOG(INFO) << "get string with offset " << offset << " next offset "
942943
<< next_offset << " str_field_start_offset "
943944
<< str_field_start_offset_ << " for col " << base_col_info.name;
944-
*res = StringColInfo(base_col_info.name, ty, col_idx, offset, next_offset,
945-
str_field_start_offset_);
945+
946+
if (FLAGS_enable_spark_unsaferow_format) {
947+
// Notice that we pass the nullbitmap size as str_field_start_offset
948+
*res = StringColInfo(base_col_info.name, ty, col_idx, offset, next_offset,
949+
BitMapSize(schema_->size()));
950+
} else {
951+
*res = StringColInfo(base_col_info.name, ty, col_idx, offset, next_offset,
952+
str_field_start_offset_);
953+
}
946954
return true;
947955
}
948956

src/codec/fe_row_codec_test.cc

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include <vector>
2020
#include "gtest/gtest.h"
2121

22+
DECLARE_bool(enable_spark_unsaferow_format);
23+
2224
namespace hybridse {
2325
namespace codec {
2426

@@ -649,6 +651,61 @@ TEST_F(CodecTest, RowFormatOffsetLongHeaderTest) {
649651
ASSERT_EQ(50u, str_info.str_start_offset);
650652
}
651653
}
654+
TEST_F(CodecTest, SparkUnsaferowBitMapSizeTest) {
655+
FLAGS_enable_spark_unsaferow_format = false;
656+
ASSERT_EQ(BitMapSize(3), 1);
657+
ASSERT_EQ(BitMapSize(8), 1);
658+
ASSERT_EQ(BitMapSize(9), 2);
659+
ASSERT_EQ(BitMapSize(20), 3);
660+
ASSERT_EQ(BitMapSize(65), 9);
661+
662+
FLAGS_enable_spark_unsaferow_format = true;
663+
ASSERT_EQ(BitMapSize(3), 8);
664+
ASSERT_EQ(BitMapSize(8), 8);
665+
ASSERT_EQ(BitMapSize(9), 8);
666+
ASSERT_EQ(BitMapSize(20), 8);
667+
ASSERT_EQ(BitMapSize(65), 16);
668+
}
669+
TEST_F(CodecTest, SparkUnsaferowRowFormatTest) {
670+
FLAGS_enable_spark_unsaferow_format = true;
671+
672+
std::vector<int> num_vec = {10, 20, 50, 100, 1000};
673+
for (auto col_num : num_vec) {
674+
::hybridse::type::TableDef def;
675+
for (int i = 0; i < col_num; i++) {
676+
::hybridse::type::ColumnDef* col = def.add_columns();
677+
col->set_name("col" + std::to_string(i));
678+
if (i % 3 == 0) {
679+
col->set_type(::hybridse::type::kVarchar);
680+
} else if (i % 3 == 1) {
681+
col->set_type(::hybridse::type::kInt64);
682+
} else if (i % 3 == 2) {
683+
col->set_type(::hybridse::type::kDouble);
684+
}
685+
}
686+
687+
RowFormat decoder(&def.columns());
688+
for (int i = 0; i < col_num; i++) {
689+
if (i % 3 == 0) {
690+
const codec::ColInfo* info = decoder.GetColumnInfo(i);
691+
ASSERT_TRUE(info != nullptr);
692+
ASSERT_EQ(::hybridse::type::kVarchar, info->type);
693+
694+
codec::StringColInfo str_info;
695+
ASSERT_TRUE(decoder.GetStringColumnInfo(i, &str_info));
696+
} else if (i % 3 == 1) {
697+
const codec::ColInfo* info = decoder.GetColumnInfo(i);
698+
ASSERT_TRUE(info != nullptr);
699+
ASSERT_EQ(::hybridse::type::kInt64, info->type);
700+
} else if (i % 3 == 2) {
701+
const codec::ColInfo* info = decoder.GetColumnInfo(i);
702+
ASSERT_TRUE(info != nullptr);
703+
ASSERT_EQ(::hybridse::type::kDouble, info->type);
704+
}
705+
}
706+
}
707+
}
708+
652709
} // namespace codec
653710
} // namespace hybridse
654711

src/codec/type_codec.cc

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,41 @@
2424
#include "glog/logging.h"
2525
#include "proto/fe_type.pb.h"
2626

27+
DECLARE_bool(enable_spark_unsaferow_format);
28+
2729
namespace hybridse {
2830
namespace codec {
2931
namespace v1 {
3032

3133
using hybridse::codec::ListV;
3234
using hybridse::codec::Row;
3335

36+
uint32_t CalcTotalLength(uint32_t primary_size, uint32_t str_field_cnt,
37+
uint32_t str_size, uint32_t* str_addr_space) {
38+
uint32_t total_size = primary_size + str_size;
39+
40+
// Support Spark UnsafeRow format where string field will take up 8 bytes
41+
if (FLAGS_enable_spark_unsaferow_format) {
42+
// Make sure each string column takes up 8 bytes
43+
*str_addr_space = 8;
44+
return total_size + str_field_cnt * 8;
45+
}
46+
47+
if (total_size + str_field_cnt <= UINT8_MAX) {
48+
*str_addr_space = 1;
49+
return total_size + str_field_cnt;
50+
} else if (total_size + str_field_cnt * 2 <= UINT16_MAX) {
51+
*str_addr_space = 2;
52+
return total_size + str_field_cnt * 2;
53+
} else if (total_size + str_field_cnt * 3 <= 1 << 24) {
54+
*str_addr_space = 3;
55+
return total_size + str_field_cnt * 3;
56+
} else {
57+
*str_addr_space = 4;
58+
return total_size + str_field_cnt * 4;
59+
}
60+
}
61+
3462
int32_t GetStrField(const int8_t* row, uint32_t idx, uint32_t str_field_offset,
3563
uint32_t next_str_field_offset, uint32_t str_start_offset,
3664
uint32_t addr_space, const char** data, uint32_t* size,
@@ -42,16 +70,33 @@ int32_t GetStrField(const int8_t* row, uint32_t idx, uint32_t str_field_offset,
4270
return 0;
4371
} else {
4472
*is_null = false;
45-
return GetStrFieldUnsafe(row, str_field_offset, next_str_field_offset,
73+
return GetStrFieldUnsafe(row, idx, str_field_offset, next_str_field_offset,
4674
str_start_offset, addr_space, data, size);
4775
}
4876
}
4977

50-
int32_t GetStrFieldUnsafe(const int8_t* row, uint32_t field_offset,
78+
int32_t GetStrFieldUnsafe(const int8_t* row, uint32_t col_idx,
79+
uint32_t field_offset,
5180
uint32_t next_str_field_offset,
5281
uint32_t str_start_offset, uint32_t addr_space,
5382
const char** data, uint32_t* size) {
5483
if (row == NULL || data == NULL || size == NULL) return -1;
84+
85+
// Support Spark UnsafeRow format
86+
if (FLAGS_enable_spark_unsaferow_format) {
87+
// For UnsafeRow opt, str_start_offset is the nullbitmap size
88+
const uint32_t bitmap_size = str_start_offset;
89+
const int8_t* row_with_col_offset = row + HEADER_LENGTH + bitmap_size + col_idx * 8;
90+
91+
// For Spark UnsafeRow, the first 32 bits is for length and the last
92+
// 32 bits is for offset.
93+
*size = *(reinterpret_cast<const uint32_t*>(row_with_col_offset));
94+
uint32_t str_value_offset = *(reinterpret_cast<const uint32_t*>(row_with_col_offset + 4)) + HEADER_LENGTH;
95+
*data = reinterpret_cast<const char*>(row + str_value_offset);
96+
97+
return 0;
98+
}
99+
55100
const int8_t* row_with_offset = row + str_start_offset;
56101
uint32_t str_offset = 0;
57102
uint32_t next_str_offset = 0;
@@ -143,6 +188,24 @@ int32_t AppendString(int8_t* buf_ptr, uint32_t buf_size, uint32_t col_idx,
143188
int8_t* val, uint32_t size, int8_t is_null,
144189
uint32_t str_start_offset, uint32_t str_field_offset,
145190
uint32_t str_addr_space, uint32_t str_body_offset) {
191+
192+
if (FLAGS_enable_spark_unsaferow_format) {
193+
// TODO(chenjing): Refactor to support multiple codec instead of reusing the variable
194+
// For UnsafeRow opt, str_start_offset is the nullbitmap size
195+
const uint32_t bitmap_size = str_start_offset;
196+
const uint32_t str_col_offset = HEADER_LENGTH + bitmap_size + col_idx * 8;
197+
198+
*(reinterpret_cast<uint32_t*>(buf_ptr + str_col_offset)) = size; // set size
199+
// Notice that the offset in UnsafeRow should start without HybridSE header
200+
*(reinterpret_cast<uint32_t*>(buf_ptr + str_col_offset + 4)) = str_body_offset - HEADER_LENGTH; // set offset
201+
202+
if (size != 0) {
203+
memcpy(reinterpret_cast<char*>(buf_ptr + str_body_offset), val, size);
204+
}
205+
206+
return str_body_offset + size;
207+
}
208+
146209
if (is_null) {
147210
AppendNullBit(buf_ptr, col_idx, true);
148211
size_t str_addr_length = GetAddrLength(buf_size);

src/codegen/buf_ir_builder.cc

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#include "codegen/timestamp_ir_builder.h"
2626
#include "glog/logging.h"
2727

28+
DECLARE_bool(enable_spark_unsaferow_format);
29+
2830
namespace hybridse {
2931
namespace codegen {
3032

@@ -258,22 +260,33 @@ BufNativeEncoderIRBuilder::BufNativeEncoderIRBuilder(
258260
block_(block) {
259261
str_field_start_offset_ = codec::GetStartOffset(schema_->size());
260262
for (int32_t idx = 0; idx < schema_->size(); idx++) {
261-
const ::hybridse::type::ColumnDef& column = schema_->Get(idx);
262-
if (column.type() == ::hybridse::type::kVarchar) {
263-
offset_vec_.push_back(str_field_cnt_);
264-
str_field_cnt_++;
263+
// Support Spark UnsafeRow format where all fields will take up 8 bytes
264+
if (FLAGS_enable_spark_unsaferow_format) {
265+
offset_vec_.push_back(str_field_start_offset_);
266+
str_field_start_offset_ += 8;
267+
const ::hybridse::type::ColumnDef& column = schema_->Get(idx);
268+
if (column.type() == ::hybridse::type::kVarchar) {
269+
str_field_cnt_++;
270+
}
265271
} else {
266-
auto TYPE_SIZE_MAP = codec::GetTypeSizeMap();
267-
auto it = TYPE_SIZE_MAP.find(column.type());
268-
if (it == TYPE_SIZE_MAP.end()) {
269-
LOG(WARNING) << ::hybridse::type::Type_Name(column.type())
270-
<< " is not supported";
272+
const ::hybridse::type::ColumnDef& column = schema_->Get(idx);
273+
if (column.type() == ::hybridse::type::kVarchar) {
274+
offset_vec_.push_back(str_field_cnt_);
275+
str_field_cnt_++;
271276
} else {
272-
offset_vec_.push_back(str_field_start_offset_);
273-
DLOG(INFO) << "idx " << idx << " offset "
274-
<< str_field_start_offset_;
275-
str_field_start_offset_ += it->second;
277+
auto TYPE_SIZE_MAP = codec::GetTypeSizeMap();
278+
auto it = TYPE_SIZE_MAP.find(column.type());
279+
if (it == TYPE_SIZE_MAP.end()) {
280+
LOG(WARNING) << ::hybridse::type::Type_Name(column.type())
281+
<< " is not supported";
282+
} else {
283+
offset_vec_.push_back(str_field_start_offset_);
284+
DLOG(INFO) << "idx " << idx << " offset "
285+
<< str_field_start_offset_;
286+
str_field_start_offset_ += it->second;
287+
}
276288
}
289+
277290
}
278291
}
279292
}
@@ -499,12 +512,23 @@ bool BufNativeEncoderIRBuilder::AppendString(
499512
size_ty, // str_field_offset
500513
size_ty, // str_addr_space
501514
size_ty); // str_body_offset
502-
*output = builder.CreateCall(
503-
callee,
504-
::llvm::ArrayRef<::llvm::Value*>{
505-
i8_ptr, buf_size, val_field_idx, data_ptr, fe_str_size, is_null,
506-
builder.getInt32(str_field_start_offset_),
507-
builder.getInt32(str_field_idx), str_addr_space, str_body_offset});
515+
516+
if (FLAGS_enable_spark_unsaferow_format) {
517+
*output = builder.CreateCall(
518+
callee,
519+
::llvm::ArrayRef<::llvm::Value*>{
520+
i8_ptr, buf_size, val_field_idx, data_ptr, fe_str_size, is_null,
521+
// Notice that we pass nullbitmap size as str_field_start_offset
522+
builder.getInt32(codec::BitMapSize(schema_->size())),
523+
builder.getInt32(str_field_idx), str_addr_space, str_body_offset});
524+
} else {
525+
*output = builder.CreateCall(
526+
callee,
527+
::llvm::ArrayRef<::llvm::Value*>{
528+
i8_ptr, buf_size, val_field_idx, data_ptr, fe_str_size, is_null,
529+
builder.getInt32(str_field_start_offset_),
530+
builder.getInt32(str_field_idx), str_addr_space, str_body_offset});
531+
}
508532
return true;
509533
}
510534

0 commit comments

Comments
 (0)