@@ -323,21 +323,28 @@ void checkStringColumnData(ArrowStorage& storage,
323
323
size_t end_row = std::min (row_count, start_row + fragment_size);
324
324
size_t frag_rows = end_row - start_row;
325
325
size_t chunk_size = 0 ;
326
+ bool has_nulls = false ;
326
327
for (size_t i = start_row; i < end_row; ++i) {
327
- chunk_size += vals[i].size ();
328
+ if (vals[i] != " <NULL>" ) {
329
+ chunk_size += vals[i].size ();
330
+ } else {
331
+ has_nulls = true ;
332
+ }
328
333
}
329
334
checkChunkMeta (chunk_meta_map.at (col_id),
330
335
storage.getColumnInfo (TEST_DB_ID, table_id, col_id)->type ,
331
336
frag_rows,
332
337
chunk_size,
333
- false );
338
+ has_nulls );
334
339
std::vector<int8_t > expected_data (chunk_size);
335
340
std::vector<uint32_t > expected_offset (frag_rows + 1 );
336
341
uint32_t data_offset = 0 ;
337
342
for (size_t i = start_row; i < end_row; ++i) {
338
343
expected_offset[i - start_row] = data_offset;
339
- memcpy (expected_data.data () + data_offset, vals[i].data (), vals[i].size ());
340
- data_offset += vals[i].size ();
344
+ if (vals[i] != " <NULL>" ) {
345
+ memcpy (expected_data.data () + data_offset, vals[i].data (), vals[i].size ());
346
+ data_offset += vals[i].size ();
347
+ }
341
348
}
342
349
expected_offset.back () = data_offset;
343
350
checkFetchedData (storage, table_id, col_id, frag_idx + 1 , expected_offset, {2 });
@@ -361,17 +368,28 @@ void checkStringDictColumnData(ArrowStorage& storage,
361
368
auto & dict = *storage.getDictMetadata (getDictId (col_info->type ))->stringDict ;
362
369
363
370
std::vector<IndexType> expected_ids (frag_rows);
371
+ bool has_nulls = false ;
372
+ IndexType min = std::numeric_limits<IndexType>::max ();
373
+ IndexType max = std::numeric_limits<IndexType>::min ();
364
374
for (size_t i = start_row; i < end_row; ++i) {
365
- expected_ids[i - start_row] = static_cast <IndexType>(dict.getIdOfString (expected[i]));
375
+ if (expected[i] == " <NULL>" ) {
376
+ expected_ids[i - start_row] = inline_int_null_value<IndexType>();
377
+ has_nulls = true ;
378
+ } else {
379
+ expected_ids[i - start_row] =
380
+ static_cast <IndexType>(dict.getIdOfString (expected[i]));
381
+ min = std::min (min, expected_ids[i - start_row]);
382
+ max = std::max (max, expected_ids[i - start_row]);
383
+ }
366
384
}
367
385
368
386
checkChunkMeta (chunk_meta_map.at (col_id),
369
387
col_info->type ,
370
388
frag_rows,
371
389
frag_rows * sizeof (IndexType),
372
- false ,
373
- * std::min_element (expected_ids. begin (), expected_ids. end ()) ,
374
- * std::max_element (expected_ids. begin (), expected_ids. end ()) );
390
+ has_nulls ,
391
+ min ,
392
+ max );
375
393
376
394
checkFetchedData (storage, table_id, col_id, frag_idx + 1 , expected_ids);
377
395
}
@@ -1754,6 +1772,109 @@ TEST_F(ArrowStorageTest, ImportParquet) {
1754
1772
std::vector<double >({1.1 , 2.2 , 3.3 , 4.4 , 5.5 }));
1755
1773
}
1756
1774
1775
+ namespace {
1776
+
1777
+ template <typename INDEX_TYPE,
1778
+ bool NULL_INDICES = false ,
1779
+ bool NULL_VALUES = false ,
1780
+ bool TARGET_DICT = true >
1781
+ void TestImportArrowDict (ConfigPtr config) {
1782
+ ArrowStorage storage (TEST_SCHEMA_ID, " test" , TEST_DB_ID, config);
1783
+ auto tinfo = storage.createTable (
1784
+ " table1" ,
1785
+ {{" col1" ,
1786
+ TARGET_DICT ? static_cast <const hdk::ir::Type*>(ctx.extDict (ctx.text (), 0 ))
1787
+ : static_cast <const hdk::ir::Type*>(ctx.text ())}});
1788
+
1789
+ using IndexArrowType = typename arrow::CTypeTraits<INDEX_TYPE>::ArrowType;
1790
+ using IndexBuilder = typename arrow::TypeTraits<IndexArrowType>::BuilderType;
1791
+
1792
+ std::vector<std::shared_ptr<arrow::Array>> arrays;
1793
+ IndexBuilder index_builder;
1794
+ ARROW_THROW_NOT_OK (index_builder.Append (0 ));
1795
+ ARROW_THROW_NOT_OK (index_builder.Append (1 ));
1796
+ ARROW_THROW_NOT_OK (index_builder.Append (0 ));
1797
+ ARROW_THROW_NOT_OK (index_builder.Append (1 ));
1798
+ arrow::StringBuilder value_builder;
1799
+ ARROW_THROW_NOT_OK (value_builder.Append (" str1" ));
1800
+ ARROW_THROW_NOT_OK (value_builder.Append (" str2" ));
1801
+ arrays.push_back (arrow::DictionaryArray::FromArrays (index_builder.Finish ().ValueOrDie (),
1802
+ value_builder.Finish ().ValueOrDie ())
1803
+ .ValueOrDie ());
1804
+ ARROW_THROW_NOT_OK (index_builder.Append (0 ));
1805
+ ARROW_THROW_NOT_OK (index_builder.Append (1 ));
1806
+ if (NULL_INDICES) {
1807
+ ARROW_THROW_NOT_OK (index_builder.AppendNull ());
1808
+ } else {
1809
+ ARROW_THROW_NOT_OK (index_builder.Append (1 ));
1810
+ }
1811
+ ARROW_THROW_NOT_OK (index_builder.Append (2 ));
1812
+ ARROW_THROW_NOT_OK (index_builder.Append (2 ));
1813
+ if (NULL_VALUES) {
1814
+ ARROW_THROW_NOT_OK (value_builder.AppendNull ());
1815
+ } else {
1816
+ ARROW_THROW_NOT_OK (value_builder.Append (" str2" ));
1817
+ }
1818
+ ARROW_THROW_NOT_OK (value_builder.Append (" str3" ));
1819
+ ARROW_THROW_NOT_OK (value_builder.Append (" str4" ));
1820
+ arrays.push_back (arrow::DictionaryArray::FromArrays (index_builder.Finish ().ValueOrDie (),
1821
+ value_builder.Finish ().ValueOrDie ())
1822
+ .ValueOrDie ());
1823
+ auto chunked_arr = std::make_shared<arrow::ChunkedArray>(arrays);
1824
+
1825
+ arrow::SchemaBuilder schema_builder;
1826
+ ARROW_THROW_NOT_OK (schema_builder.AddField (
1827
+ std::make_shared<arrow::Field>(" col1" , chunked_arr->type ())));
1828
+ auto schema = schema_builder.Finish ().ValueOrDie ();
1829
+ auto at = arrow::Table::Make (schema, {chunked_arr});
1830
+
1831
+ storage.appendArrowTable (at, " table1" );
1832
+
1833
+ checkData (storage,
1834
+ tinfo->table_id ,
1835
+ 9 ,
1836
+ 32'000'000 ,
1837
+ std::vector<std::string>({" str1" s,
1838
+ " str2" s,
1839
+ " str1" s,
1840
+ " str2" s,
1841
+ NULL_VALUES ? " <NULL>" s : " str2" s,
1842
+ " str3" s,
1843
+ NULL_INDICES ? " <NULL>" s : " str3" s,
1844
+ " str4" s,
1845
+ " str4" s}));
1846
+ }
1847
+
1848
+ } // namespace
1849
+
1850
+ TEST_F (ArrowStorageTest, ImportArrowDict8) {
1851
+ TestImportArrowDict<int8_t >(config_);
1852
+ }
1853
+
1854
+ TEST_F (ArrowStorageTest, ImportArrowDict16) {
1855
+ TestImportArrowDict<int16_t >(config_);
1856
+ }
1857
+
1858
+ TEST_F (ArrowStorageTest, ImportArrowDict32) {
1859
+ TestImportArrowDict<int32_t >(config_);
1860
+ }
1861
+
1862
+ TEST_F (ArrowStorageTest, ImportArrowDict64) {
1863
+ TestImportArrowDict<int64_t >(config_);
1864
+ }
1865
+
1866
+ TEST_F (ArrowStorageTest, ImportArrowDict_Null_Indices) {
1867
+ TestImportArrowDict<int32_t , true , false >(config_);
1868
+ }
1869
+
1870
+ TEST_F (ArrowStorageTest, ImportArrowDict_Null_Values) {
1871
+ TestImportArrowDict<int32_t , false , true >(config_);
1872
+ }
1873
+
1874
+ TEST_F (ArrowStorageTest, ImportArrowDictToPlainString) {
1875
+ TestImportArrowDict<int32_t , true , true , false >(config_);
1876
+ }
1877
+
1757
1878
int main (int argc, char ** argv) {
1758
1879
TestHelpers::init_logger_stderr_only (argc, argv);
1759
1880
testing::InitGoogleTest (&argc, argv);
0 commit comments