Skip to content

Commit c48a5c8

Browse files
committed
wip
1 parent 840a6c3 commit c48a5c8

File tree

12 files changed

+619
-1034
lines changed

12 files changed

+619
-1034
lines changed

include/sparrow_ipc/deserialize.hpp

Lines changed: 0 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -6,42 +6,10 @@
66

77
#include <sparrow/record_batch.hpp>
88

9-
#include "Message_generated.h"
10-
#include "Schema_generated.h"
119
#include "sparrow_ipc/config/config.hpp"
12-
#include "sparrow_ipc/encapsulated_message.hpp"
13-
#include "sparrow_ipc/metadata.hpp"
1410

1511
namespace sparrow_ipc
1612
{
17-
/**
18-
* @brief Deserializes arrays from an Apache Arrow RecordBatch using the provided schema.
19-
*
20-
* This function processes each field in the schema and deserializes the corresponding
21-
* data from the RecordBatch into sparrow::array objects. It handles various Arrow data
22-
* types including primitive types (bool, integers, floating point), binary data, and
23-
* string data with their respective size variants.
24-
*
25-
* @param record_batch The Apache Arrow FlatBuffer RecordBatch containing the serialized data
26-
* @param schema The Apache Arrow FlatBuffer Schema defining the structure and types of the data
27-
* @param encapsulated_message The message containing the binary data buffers
28-
* @param field_metadata Metadata for each field
29-
*
30-
* @return std::vector<sparrow::array> A vector of deserialized arrays, one for each field in the schema
31-
*
32-
* @throws std::runtime_error If an unsupported data type, integer bit width, or floating point precision
33-
* is encountered
34-
*
35-
* The function maintains a buffer index that is incremented as it processes each field
36-
* to correctly map data buffers to their corresponding arrays.
37-
*/
38-
[[nodiscard]] SPARROW_IPC_API std::vector<sparrow::array> get_arrays_from_record_batch(
39-
const org::apache::arrow::flatbuf::RecordBatch& record_batch,
40-
const org::apache::arrow::flatbuf::Schema& schema,
41-
const encapsulated_message& encapsulated_message,
42-
const std::vector<std::optional<std::vector<sparrow::metadata_pair>>>& field_metadata
43-
);
44-
4513
/**
4614
* @brief Deserializes an Arrow IPC stream from binary data into a vector of record batches.
4715
*

include/sparrow_ipc/serialize.hpp

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@
33
#include <ranges>
44

55
#include <sparrow/record_batch.hpp>
6-
7-
#include "Message_generated.h"
86
#include "sparrow_ipc/any_output_stream.hpp"
97
#include "sparrow_ipc/compression.hpp"
108
#include "sparrow_ipc/config/config.hpp"

include/sparrow_ipc/serialize_utils.hpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55

66
#include <sparrow/record_batch.hpp>
77

8-
#include "Message_generated.h"
98
#include "sparrow_ipc/any_output_stream.hpp"
109
#include "sparrow_ipc/compression.hpp"
1110
#include "sparrow_ipc/config/config.hpp"

integration_tests/CMakeLists.txt

Lines changed: 58 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,44 @@
11
cmake_minimum_required(VERSION 3.28)
22

3+
# Create the integration_tools library
4+
add_library(integration_tools src/integration_tools.cpp)
5+
6+
target_link_libraries(integration_tools
7+
PUBLIC
8+
sparrow-ipc
9+
sparrow::sparrow
10+
sparrow::json_reader
11+
)
12+
13+
target_include_directories(integration_tools
14+
PUBLIC
15+
${CMAKE_CURRENT_SOURCE_DIR}/include
16+
PRIVATE
17+
${CMAKE_SOURCE_DIR}/include
18+
${CMAKE_BINARY_DIR}/generated
19+
)
20+
21+
set_target_properties(integration_tools
22+
PROPERTIES
23+
CXX_STANDARD 20
24+
CXX_STANDARD_REQUIRED ON
25+
CXX_EXTENSIONS OFF
26+
FOLDER "integration_tests"
27+
)
28+
29+
add_dependencies(integration_tools generate_flatbuffers_headers)
30+
331
# Create executable for arrow_file_to_stream integration test
432
add_executable(arrow_file_to_stream arrow_file_to_stream.cpp)
533

634
target_link_libraries(arrow_file_to_stream
735
PRIVATE
8-
sparrow-ipc
9-
sparrow::sparrow
10-
sparrow::json_reader
36+
integration_tools
37+
)
38+
39+
target_include_directories(arrow_file_to_stream
40+
PRIVATE
41+
${CMAKE_CURRENT_SOURCE_DIR}/include
1142
)
1243

1344
set_target_properties(arrow_file_to_stream
@@ -23,21 +54,19 @@ set_target_properties(arrow_file_to_stream
2354
INSTALL_RPATH "$ORIGIN"
2455
)
2556

26-
target_include_directories(arrow_file_to_stream
27-
PRIVATE
28-
${CMAKE_SOURCE_DIR}/include
29-
${CMAKE_BINARY_DIR}/generated
30-
)
31-
3257
add_dependencies(arrow_file_to_stream generate_flatbuffers_headers)
3358

3459
# Create executable for arrow_stream_to_file integration test
3560
add_executable(arrow_stream_to_file arrow_stream_to_file.cpp)
3661

3762
target_link_libraries(arrow_stream_to_file
3863
PRIVATE
39-
sparrow-ipc
40-
sparrow::sparrow
64+
integration_tools
65+
)
66+
67+
target_include_directories(arrow_stream_to_file
68+
PRIVATE
69+
${CMAKE_CURRENT_SOURCE_DIR}/include
4170
)
4271

4372
set_target_properties(arrow_stream_to_file
@@ -53,22 +82,19 @@ set_target_properties(arrow_stream_to_file
5382
INSTALL_RPATH "$ORIGIN"
5483
)
5584

56-
target_include_directories(arrow_stream_to_file
57-
PRIVATE
58-
${CMAKE_SOURCE_DIR}/include
59-
${CMAKE_BINARY_DIR}/generated
60-
)
61-
6285
add_dependencies(arrow_stream_to_file generate_flatbuffers_headers)
6386

6487
# Create executable for arrow_json_to_file integration test
6588
add_executable(arrow_json_to_file arrow_json_to_file.cpp)
6689

6790
target_link_libraries(arrow_json_to_file
6891
PRIVATE
69-
sparrow-ipc
70-
sparrow::sparrow
71-
sparrow::json_reader
92+
integration_tools
93+
)
94+
95+
target_include_directories(arrow_json_to_file
96+
PRIVATE
97+
${CMAKE_CURRENT_SOURCE_DIR}/include
7298
)
7399

74100
set_target_properties(arrow_json_to_file
@@ -84,22 +110,19 @@ set_target_properties(arrow_json_to_file
84110
INSTALL_RPATH "$ORIGIN"
85111
)
86112

87-
target_include_directories(arrow_json_to_file
88-
PRIVATE
89-
${CMAKE_SOURCE_DIR}/include
90-
${CMAKE_BINARY_DIR}/generated
91-
)
92-
93113
add_dependencies(arrow_json_to_file generate_flatbuffers_headers)
94114

95115
# Create executable for arrow_validate integration test
96116
add_executable(arrow_validate arrow_validate.cpp)
97117

98118
target_link_libraries(arrow_validate
99119
PRIVATE
100-
sparrow-ipc
101-
sparrow::sparrow
102-
sparrow::json_reader
120+
integration_tools
121+
)
122+
123+
target_include_directories(arrow_validate
124+
PRIVATE
125+
${CMAKE_CURRENT_SOURCE_DIR}/include
103126
)
104127

105128
set_target_properties(arrow_validate
@@ -115,46 +138,25 @@ set_target_properties(arrow_validate
115138
INSTALL_RPATH "$ORIGIN"
116139
)
117140

118-
target_include_directories(arrow_validate
119-
PRIVATE
120-
${CMAKE_SOURCE_DIR}/include
121-
${CMAKE_BINARY_DIR}/generated
122-
)
123-
124141
add_dependencies(arrow_validate generate_flatbuffers_headers)
125142

126143
# Create test executable for integration tools
127144
add_executable(test_integration_tools main.cpp test_integration_tools.cpp)
128145

129146
target_link_libraries(test_integration_tools
130147
PRIVATE
131-
sparrow-ipc
132-
sparrow::sparrow
133-
sparrow::json_reader
134-
doctest::doctest
135-
arrow-testing-data
136-
)
137-
138-
target_compile_definitions(test_integration_tools
139-
PRIVATE
140-
INTEGRATION_TOOLS_DIR="${CMAKE_CURRENT_BINARY_DIR}"
141-
)
142-
143-
set_target_properties(test_integration_tools
144-
PROPERTIES
145-
CXX_STANDARD 20
146-
CXX_STANDARD_REQUIRED ON
147-
CXX_EXTENSIONS OFF
148+
integration_tools
149+
doctest::doctest
150+
arrow-testing-data
148151
)
149152

150153
target_include_directories(test_integration_tools
151154
PRIVATE
152-
${CMAKE_SOURCE_DIR}/include
153-
${CMAKE_BINARY_DIR}/generated
155+
${CMAKE_CURRENT_SOURCE_DIR}/include
156+
${CMAKE_SOURCE_DIR}/include
157+
${CMAKE_BINARY_DIR}/generated
154158
)
155159

156-
add_dependencies(test_integration_tools generate_flatbuffers_headers arrow_file_to_stream arrow_stream_to_file arrow_json_to_file arrow_validate)
157-
158160
# Register with CTest
159161
enable_testing()
160162
add_test(NAME integration_tools_test COMMAND test_integration_tools)

integration_tests/arrow_file_to_stream.cpp

Lines changed: 3 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,9 @@
11
#include <cstdlib>
22
#include <filesystem>
3-
#include <fstream>
43
#include <iostream>
54
#include <vector>
65

7-
#include <nlohmann/json.hpp>
8-
#include <sparrow/record_batch.hpp>
9-
10-
#include "sparrow/json_reader/json_parser.hpp"
11-
12-
#include <sparrow_ipc/memory_output_stream.hpp>
13-
#include <sparrow_ipc/stream_file_serializer.hpp>
6+
#include "integration_tools.hpp"
147

158
/**
169
* @brief Reads a JSON file containing record batches and outputs the serialized Arrow IPC stream to stdout.
@@ -39,68 +32,8 @@ int main(int argc, char* argv[])
3932

4033
try
4134
{
42-
// Check if the JSON file exists
43-
if (!std::filesystem::exists(json_path))
44-
{
45-
std::cerr << "Error: File not found: " << json_path << "\n";
46-
return EXIT_FAILURE;
47-
}
48-
49-
// Open and parse the JSON file
50-
std::ifstream json_file(json_path);
51-
if (!json_file.is_open())
52-
{
53-
std::cerr << "Error: Could not open file: " << json_path << "\n";
54-
return EXIT_FAILURE;
55-
}
56-
57-
nlohmann::json json_data;
58-
try
59-
{
60-
json_data = nlohmann::json::parse(json_file);
61-
}
62-
catch (const nlohmann::json::parse_error& e)
63-
{
64-
std::cerr << "Error: Failed to parse JSON file: " << e.what() << "\n";
65-
return EXIT_FAILURE;
66-
}
67-
json_file.close();
68-
69-
// Get the number of batches
70-
if (!json_data.contains("batches") || !json_data["batches"].is_array())
71-
{
72-
std::cerr << "Error: JSON file does not contain a 'batches' array.\n";
73-
return EXIT_FAILURE;
74-
}
75-
76-
const size_t num_batches = json_data["batches"].size();
77-
78-
// Parse all record batches from JSON
79-
std::vector<sparrow::record_batch> record_batches;
80-
record_batches.reserve(num_batches);
81-
82-
for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx)
83-
{
84-
try
85-
{
86-
record_batches.emplace_back(
87-
sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx)
88-
);
89-
}
90-
catch (const std::exception& e)
91-
{
92-
std::cerr << "Error: Failed to build record batch " << batch_idx << ": " << e.what()
93-
<< "\n";
94-
return EXIT_FAILURE;
95-
}
96-
}
97-
98-
// Serialize record batches to Arrow IPC stream format
99-
std::vector<uint8_t> stream_data;
100-
sparrow_ipc::memory_output_stream stream(stream_data);
101-
sparrow_ipc::stream_file_serializer serializer(stream);
102-
serializer << record_batches;
103-
serializer.end();
35+
// Convert JSON file to stream using the library
36+
std::vector<uint8_t> stream_data = integration_tools::json_file_to_stream(json_path);
10437

10538
// Write the binary stream to stdout
10639
std::cout.write(reinterpret_cast<const char*>(stream_data.data()), stream_data.size());

0 commit comments

Comments
 (0)