@@ -319,171 +319,67 @@ carquet_status_t carquet_page_writer_add_values(
319319 & writer -> rep_levels_buffer );
320320 }
321321
322- /* Encode values using PLAIN encoding
323- * When there are nulls (def_levels provided), we must only encode
324- * the non-null values, skipping positions where def_level < max_def_level
322+ /* Encode values using PLAIN encoding.
323+ *
324+ * The values array uses sparse encoding: it contains only non-null values
325+ * (packed at the front), with num_non_null entries. The def_levels array
326+ * has num_values entries (one per logical row) indicating which rows are
327+ * null vs present.
325328 */
326329 carquet_status_t status = CARQUET_OK ;
327- bool has_nulls = def_levels && writer -> max_def_level > 0 && num_non_null < num_values ;
328330
329331 switch (writer -> type ) {
330332 case CARQUET_PHYSICAL_BOOLEAN : {
331333 const uint8_t * bools = (const uint8_t * )values ;
332- if (has_nulls ) {
333- uint8_t * non_null_bools = malloc (num_non_null );
334- if (!non_null_bools ) return CARQUET_ERROR_OUT_OF_MEMORY ;
335- int64_t j = 0 ;
336- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
337- if (def_levels [i ] == writer -> max_def_level ) {
338- non_null_bools [j ++ ] = bools [i ];
339- }
340- }
341- status = carquet_encode_plain_boolean (non_null_bools , num_non_null ,
342- & writer -> values_buffer );
343- free (non_null_bools );
344- } else {
345- status = carquet_encode_plain_boolean (bools , num_non_null ,
346- & writer -> values_buffer );
347- }
334+ status = carquet_encode_plain_boolean (bools , num_non_null ,
335+ & writer -> values_buffer );
348336 break ;
349337 }
350338
351339 case CARQUET_PHYSICAL_INT32 : {
352340 const int32_t * ints = (const int32_t * )values ;
353- if (has_nulls ) {
354- int32_t * non_null_ints = malloc (num_non_null * sizeof (int32_t ));
355- if (!non_null_ints ) return CARQUET_ERROR_OUT_OF_MEMORY ;
356- int64_t j = 0 ;
357- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
358- if (def_levels [i ] == writer -> max_def_level ) {
359- non_null_ints [j ++ ] = ints [i ];
360- }
361- }
362- status = carquet_encode_plain_int32 (non_null_ints , num_non_null ,
363- & writer -> values_buffer );
364- update_statistics_i32 (writer , non_null_ints , num_non_null );
365- free (non_null_ints );
366- } else {
367- status = carquet_encode_plain_int32 (ints , num_non_null ,
368- & writer -> values_buffer );
369- update_statistics_i32 (writer , ints , num_non_null );
370- }
341+ status = carquet_encode_plain_int32 (ints , num_non_null ,
342+ & writer -> values_buffer );
343+ update_statistics_i32 (writer , ints , num_non_null );
371344 break ;
372345 }
373346
374347 case CARQUET_PHYSICAL_INT64 : {
375348 const int64_t * ints = (const int64_t * )values ;
376- if (has_nulls ) {
377- int64_t * non_null_ints = malloc (num_non_null * sizeof (int64_t ));
378- if (!non_null_ints ) return CARQUET_ERROR_OUT_OF_MEMORY ;
379- int64_t j = 0 ;
380- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
381- if (def_levels [i ] == writer -> max_def_level ) {
382- non_null_ints [j ++ ] = ints [i ];
383- }
384- }
385- status = carquet_encode_plain_int64 (non_null_ints , num_non_null ,
386- & writer -> values_buffer );
387- update_statistics_i64 (writer , non_null_ints , num_non_null );
388- free (non_null_ints );
389- } else {
390- status = carquet_encode_plain_int64 (ints , num_non_null ,
391- & writer -> values_buffer );
392- update_statistics_i64 (writer , ints , num_non_null );
393- }
349+ status = carquet_encode_plain_int64 (ints , num_non_null ,
350+ & writer -> values_buffer );
351+ update_statistics_i64 (writer , ints , num_non_null );
394352 break ;
395353 }
396354
397355 case CARQUET_PHYSICAL_FLOAT : {
398356 const float * floats = (const float * )values ;
399- if (has_nulls ) {
400- float * non_null_floats = malloc (num_non_null * sizeof (float ));
401- if (!non_null_floats ) return CARQUET_ERROR_OUT_OF_MEMORY ;
402- int64_t j = 0 ;
403- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
404- if (def_levels [i ] == writer -> max_def_level ) {
405- non_null_floats [j ++ ] = floats [i ];
406- }
407- }
408- status = carquet_encode_plain_float (non_null_floats , num_non_null ,
409- & writer -> values_buffer );
410- update_statistics_float (writer , non_null_floats , num_non_null );
411- free (non_null_floats );
412- } else {
413- status = carquet_encode_plain_float (floats , num_non_null ,
414- & writer -> values_buffer );
415- update_statistics_float (writer , floats , num_non_null );
416- }
357+ status = carquet_encode_plain_float (floats , num_non_null ,
358+ & writer -> values_buffer );
359+ update_statistics_float (writer , floats , num_non_null );
417360 break ;
418361 }
419362
420363 case CARQUET_PHYSICAL_DOUBLE : {
421364 const double * doubles = (const double * )values ;
422- if (has_nulls ) {
423- double * non_null_doubles = malloc (num_non_null * sizeof (double ));
424- if (!non_null_doubles ) return CARQUET_ERROR_OUT_OF_MEMORY ;
425- int64_t j = 0 ;
426- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
427- if (def_levels [i ] == writer -> max_def_level ) {
428- non_null_doubles [j ++ ] = doubles [i ];
429- }
430- }
431- status = carquet_encode_plain_double (non_null_doubles , num_non_null ,
432- & writer -> values_buffer );
433- update_statistics_double (writer , non_null_doubles , num_non_null );
434- free (non_null_doubles );
435- } else {
436- status = carquet_encode_plain_double (doubles , num_non_null ,
437- & writer -> values_buffer );
438- update_statistics_double (writer , doubles , num_non_null );
439- }
365+ status = carquet_encode_plain_double (doubles , num_non_null ,
366+ & writer -> values_buffer );
367+ update_statistics_double (writer , doubles , num_non_null );
440368 break ;
441369 }
442370
443371 case CARQUET_PHYSICAL_BYTE_ARRAY : {
444372 const carquet_byte_array_t * arrays = (const carquet_byte_array_t * )values ;
445- if (has_nulls ) {
446- carquet_byte_array_t * non_null_arrays = malloc (num_non_null * sizeof (carquet_byte_array_t ));
447- if (!non_null_arrays ) return CARQUET_ERROR_OUT_OF_MEMORY ;
448- int64_t j = 0 ;
449- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
450- if (def_levels [i ] == writer -> max_def_level ) {
451- non_null_arrays [j ++ ] = arrays [i ];
452- }
453- }
454- status = carquet_encode_plain_byte_array (non_null_arrays , num_non_null ,
455- & writer -> values_buffer );
456- free (non_null_arrays );
457- } else {
458- status = carquet_encode_plain_byte_array (arrays , num_non_null ,
459- & writer -> values_buffer );
460- }
373+ status = carquet_encode_plain_byte_array (arrays , num_non_null ,
374+ & writer -> values_buffer );
461375 break ;
462376 }
463377
464378 case CARQUET_PHYSICAL_FIXED_LEN_BYTE_ARRAY : {
465379 const uint8_t * fixed = (const uint8_t * )values ;
466- if (has_nulls ) {
467- uint8_t * non_null_fixed = malloc (num_non_null * writer -> type_length );
468- if (!non_null_fixed ) return CARQUET_ERROR_OUT_OF_MEMORY ;
469- int64_t j = 0 ;
470- for (int64_t i = 0 ; i < num_values && j < num_non_null ; i ++ ) {
471- if (def_levels [i ] == writer -> max_def_level ) {
472- memcpy (non_null_fixed + j * writer -> type_length ,
473- fixed + i * writer -> type_length ,
474- writer -> type_length );
475- j ++ ;
476- }
477- }
478- status = carquet_encode_plain_fixed_byte_array (non_null_fixed , num_non_null ,
479- writer -> type_length ,
480- & writer -> values_buffer );
481- free (non_null_fixed );
482- } else {
483- status = carquet_encode_plain_fixed_byte_array (fixed , num_non_null ,
484- writer -> type_length ,
485- & writer -> values_buffer );
486- }
380+ status = carquet_encode_plain_fixed_byte_array (fixed , num_non_null ,
381+ writer -> type_length ,
382+ & writer -> values_buffer );
487383 break ;
488384 }
489385
0 commit comments