@@ -20,90 +20,135 @@ namespace sparrow_ipc
2020// }
2121// }
2222
23- std::vector<std:: uint8_t > lz4_compress (std::span< const std:: uint8_t > data)
23+ namespace
2424 {
25- const std::int64_t uncompressed_size = data.size ();
26- const size_t max_compressed_size = LZ4F_compressFrameBound (uncompressed_size, nullptr );
27- std::vector<std::uint8_t > compressed_data (max_compressed_size);
28- const size_t compressed_size = LZ4F_compressFrame (compressed_data.data (), max_compressed_size, data.data (), uncompressed_size, nullptr );
29- if (LZ4F_isError (compressed_size))
25+ std::vector<std::uint8_t > lz4_compress (std::span<const std::uint8_t > data)
3026 {
31- throw std::runtime_error (" Failed to compress data with LZ4 frame format" );
27+ const std::int64_t uncompressed_size = data.size ();
28+ const size_t max_compressed_size = LZ4F_compressFrameBound (uncompressed_size, nullptr );
29+ std::vector<std::uint8_t > compressed_data (max_compressed_size);
30+ const size_t compressed_size = LZ4F_compressFrame (compressed_data.data (), max_compressed_size, data.data (), uncompressed_size, nullptr );
31+ if (LZ4F_isError (compressed_size))
32+ {
33+ throw std::runtime_error (" Failed to compress data with LZ4 frame format" );
34+ }
35+ compressed_data.resize (compressed_size);
36+ return compressed_data;
3237 }
33- compressed_data.resize (compressed_size);
34- return compressed_data;
35- }
3638
37- std::vector<std::uint8_t > lz4_decompress (std::span<const std::uint8_t > data)
38- {
39- if (data.size () < 8 )
39+ std::vector<std::uint8_t > lz4_decompress (std::span<const std::uint8_t > data, const std::int64_t decompressed_size)
4040 {
41- throw std::runtime_error (" Invalid compressed data: missing decompressed size" );
41+ std::vector<std::uint8_t > decompressed_data (decompressed_size);
42+ LZ4F_dctx* dctx = nullptr ;
43+ LZ4F_createDecompressionContext (&dctx, LZ4F_VERSION);
44+ size_t compressed_size_in_out = data.size ();
45+ size_t decompressed_size_in_out = decompressed_size;
46+ size_t result = LZ4F_decompress (dctx, decompressed_data.data (), &decompressed_size_in_out, data.data (), &compressed_size_in_out, nullptr );
47+ if (LZ4F_isError (result) || (decompressed_size_in_out != (size_t )decompressed_size))
48+ {
49+ throw std::runtime_error (" Failed to decompress data with LZ4 frame format" );
50+ }
51+ LZ4F_freeDecompressionContext (dctx);
52+ return decompressed_data;
4253 }
43- const std::int64_t decompressed_size = *reinterpret_cast <const std::int64_t *>(data.data ());
44- const auto compressed_data = data.subspan (8 );
4554
46- if (decompressed_size == -1 )
55+ // TODO These functions could be moved to serialize_utils and deserialize_utils if preferred
56+ // as they are handling the header size
57+ std::vector<std::uint8_t > uncompressed_data_with_header (std::span<const std::uint8_t > data)
4758 {
48- // TODO think of avoiding copy here
49- return {compressed_data.begin (), compressed_data.end ()};
59+ std::vector<std::uint8_t > result;
60+ result.reserve (CompressionHeaderSize + data.size ());
61+ const std::int64_t header = -1 ;
62+ result.insert (result.end (), reinterpret_cast <const uint8_t *>(&header), reinterpret_cast <const uint8_t *>(&header) + sizeof (header));
63+ result.insert (result.end (), data.begin (), data.end ());
64+ return result;
5065 }
5166
52- std::vector<std::uint8_t > decompressed_data (decompressed_size);
53- LZ4F_dctx* dctx = nullptr ;
54- LZ4F_createDecompressionContext (&dctx, LZ4F_VERSION);
55- size_t compressed_size_in_out = compressed_data.size ();
56- size_t decompressed_size_in_out = decompressed_size;
57- size_t result = LZ4F_decompress (dctx, decompressed_data.data (), &decompressed_size_in_out, compressed_data.data (), &compressed_size_in_out, nullptr );
58- if (LZ4F_isError (result) || (decompressed_size_in_out != (size_t )decompressed_size))
67+ std::vector<std::uint8_t > lz4_compress_with_header (std::span<const std::uint8_t > data)
5968 {
60- throw std::runtime_error (" Failed to decompress data with LZ4 frame format" );
69+ const std::int64_t original_size = data.size ();
70+ auto compressed_body = lz4_compress (data);
71+
72+ if (compressed_body.size () >= static_cast <size_t >(original_size))
73+ {
74+ return uncompressed_data_with_header (data);
75+ }
76+
77+ std::vector<std::uint8_t > result;
78+ result.reserve (CompressionHeaderSize + compressed_body.size ());
79+ result.insert (result.end (), reinterpret_cast <const uint8_t *>(&original_size), reinterpret_cast <const uint8_t *>(&original_size) + sizeof (original_size));
80+ result.insert (result.end (), compressed_body.begin (), compressed_body.end ());
81+ return result;
82+ }
83+
84+ std::variant<std::vector<std::uint8_t >, std::span<const std::uint8_t >> lz4_decompress_with_header (std::span<const std::uint8_t > data)
85+ {
86+ if (data.size () < CompressionHeaderSize)
87+ {
88+ throw std::runtime_error (" Invalid compressed data: missing decompressed size" );
89+ }
90+ const std::int64_t decompressed_size = *reinterpret_cast <const std::int64_t *>(data.data ());
91+ const auto compressed_data = data.subspan (CompressionHeaderSize);
92+
93+ if (decompressed_size == -1 )
94+ {
95+ return compressed_data;
96+ }
97+
98+ return lz4_decompress (compressed_data, decompressed_size);
99+ }
100+
101+ std::span<const uint8_t > get_body_from_uncompressed_data (std::span<const uint8_t > data)
102+ {
103+ if (data.size () < CompressionHeaderSize)
104+ {
105+ throw std::runtime_error (" Invalid data: missing header" );
106+ }
107+ return data.subspan (CompressionHeaderSize);
61108 }
62- LZ4F_freeDecompressionContext (dctx);
63- return decompressed_data;
64109 }
65110
66111 std::vector<std::uint8_t > compress (const org::apache::arrow::flatbuf::CompressionType compression_type, std::span<const std::uint8_t > data)
67112 {
68- if (data.empty ())
69- {
70- return {};
71- }
72113 switch (compression_type)
73114 {
74115 case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME:
75116 {
76- return lz4_compress (data);
117+ return lz4_compress_with_header (data);
77118 }
78119 case org::apache::arrow::flatbuf::CompressionType::ZSTD:
79120 {
80121 throw std::runtime_error (" Compression using zstd is not supported yet." );
81122 }
82123 default :
83- // TODO think of avoiding copy here
84- return {data.begin (), data.end ()};
124+ return uncompressed_data_with_header (data);
85125 }
86126 }
87127
88- std::vector<std::uint8_t > decompress (const org::apache::arrow::flatbuf::CompressionType compression_type, std::span<const std::uint8_t > data)
128+ std::variant<std:: vector<std::uint8_t >, std::span< const std:: uint8_t > > decompress (const org::apache::arrow::flatbuf::CompressionType compression_type, std::span<const std::uint8_t > data)
89129 {
130+ // Handle empty input: an empty span is a valid representation for an empty buffer
131+ // (e.g., a validity bitmap for a column with no nulls) and should decompress to an empty output.
132+ // TODO if we don't call this fct anymore on validity buffers, remove this empty data handling
90133 if (data.empty ())
91134 {
92135 return {};
93136 }
137+
94138 switch (compression_type)
95139 {
96140 case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME:
97141 {
98- return lz4_decompress (data);
142+ return lz4_decompress_with_header (data);
99143 }
100144 case org::apache::arrow::flatbuf::CompressionType::ZSTD:
101145 {
102146 throw std::runtime_error (" Decompression using zstd is not supported yet." );
103147 }
104148 default :
105- // TODO think of avoiding copy here
106- return {data.begin (), data.end ()};
149+ {
150+ return get_body_from_uncompressed_data (data);
151+ }
107152 }
108153 }
109154}
0 commit comments