2424#include " glog/logging.h"
2525#include " proto/fe_type.pb.h"
2626
27+ DECLARE_bool (enable_spark_unsaferow_format);
28+
2729namespace hybridse {
2830namespace codec {
2931namespace v1 {
3032
3133using hybridse::codec::ListV;
3234using hybridse::codec::Row;
3335
36+ uint32_t CalcTotalLength (uint32_t primary_size, uint32_t str_field_cnt,
37+ uint32_t str_size, uint32_t * str_addr_space) {
38+ uint32_t total_size = primary_size + str_size;
39+
40+ // Support Spark UnsafeRow format where string field will take up 8 bytes
41+ if (FLAGS_enable_spark_unsaferow_format) {
42+ // Make sure each string column takes up 8 bytes
43+ *str_addr_space = 8 ;
44+ return total_size + str_field_cnt * 8 ;
45+ }
46+
47+ if (total_size + str_field_cnt <= UINT8_MAX) {
48+ *str_addr_space = 1 ;
49+ return total_size + str_field_cnt;
50+ } else if (total_size + str_field_cnt * 2 <= UINT16_MAX) {
51+ *str_addr_space = 2 ;
52+ return total_size + str_field_cnt * 2 ;
53+ } else if (total_size + str_field_cnt * 3 <= 1 << 24 ) {
54+ *str_addr_space = 3 ;
55+ return total_size + str_field_cnt * 3 ;
56+ } else {
57+ *str_addr_space = 4 ;
58+ return total_size + str_field_cnt * 4 ;
59+ }
60+ }
61+
3462int32_t GetStrField (const int8_t * row, uint32_t idx, uint32_t str_field_offset,
3563 uint32_t next_str_field_offset, uint32_t str_start_offset,
3664 uint32_t addr_space, const char ** data, uint32_t * size,
@@ -42,16 +70,33 @@ int32_t GetStrField(const int8_t* row, uint32_t idx, uint32_t str_field_offset,
4270 return 0 ;
4371 } else {
4472 *is_null = false ;
45- return GetStrFieldUnsafe (row, str_field_offset, next_str_field_offset,
73+ return GetStrFieldUnsafe (row, idx, str_field_offset, next_str_field_offset,
4674 str_start_offset, addr_space, data, size);
4775 }
4876}
4977
50- int32_t GetStrFieldUnsafe (const int8_t * row, uint32_t field_offset,
78+ int32_t GetStrFieldUnsafe (const int8_t * row, uint32_t col_idx,
79+ uint32_t field_offset,
5180 uint32_t next_str_field_offset,
5281 uint32_t str_start_offset, uint32_t addr_space,
5382 const char ** data, uint32_t * size) {
5483 if (row == NULL || data == NULL || size == NULL ) return -1 ;
84+
85+ // Support Spark UnsafeRow format
86+ if (FLAGS_enable_spark_unsaferow_format) {
87+ // For UnsafeRow opt, str_start_offset is the nullbitmap size
88+ const uint32_t bitmap_size = str_start_offset;
89+ const int8_t * row_with_col_offset = row + HEADER_LENGTH + bitmap_size + col_idx * 8 ;
90+
91+ // For Spark UnsafeRow, the first 32 bits is for length and the last
92+ // 32 bits is for offset.
93+ *size = *(reinterpret_cast <const uint32_t *>(row_with_col_offset));
94+ uint32_t str_value_offset = *(reinterpret_cast <const uint32_t *>(row_with_col_offset + 4 )) + HEADER_LENGTH;
95+ *data = reinterpret_cast <const char *>(row + str_value_offset);
96+
97+ return 0 ;
98+ }
99+
55100 const int8_t * row_with_offset = row + str_start_offset;
56101 uint32_t str_offset = 0 ;
57102 uint32_t next_str_offset = 0 ;
@@ -143,6 +188,24 @@ int32_t AppendString(int8_t* buf_ptr, uint32_t buf_size, uint32_t col_idx,
143188 int8_t * val, uint32_t size, int8_t is_null,
144189 uint32_t str_start_offset, uint32_t str_field_offset,
145190 uint32_t str_addr_space, uint32_t str_body_offset) {
191+
192+ if (FLAGS_enable_spark_unsaferow_format) {
193+ // TODO(chenjing): Refactor to support multiple codec instead of reusing the variable
194+ // For UnsafeRow opt, str_start_offset is the nullbitmap size
195+ const uint32_t bitmap_size = str_start_offset;
196+ const uint32_t str_col_offset = HEADER_LENGTH + bitmap_size + col_idx * 8 ;
197+
198+ *(reinterpret_cast <uint32_t *>(buf_ptr + str_col_offset)) = size; // set size
199+ // Notice that the offset in UnsafeRow should start without HybridSE header
200+ *(reinterpret_cast <uint32_t *>(buf_ptr + str_col_offset + 4 )) = str_body_offset - HEADER_LENGTH; // set offset
201+
202+ if (size != 0 ) {
203+ memcpy (reinterpret_cast <char *>(buf_ptr + str_body_offset), val, size);
204+ }
205+
206+ return str_body_offset + size;
207+ }
208+
146209 if (is_null) {
147210 AppendNullBit (buf_ptr, col_idx, true );
148211 size_t str_addr_length = GetAddrLength (buf_size);
0 commit comments