@@ -23,121 +23,138 @@ namespace sp = sparrow;
2323std::random_device rd;
2424std::mt19937 gen (rd());
2525
26- /* *
27- * Helper function to create a record batch with the same schema but random values
28- * All batches have: int32 column, float column, bool column, and string column
29- */
30- sp::record_batch create_random_record_batch (size_t num_rows)
26+ namespace utils
3127{
32- // Helper lambda to generate a vector with random values
33- auto generate_vector = [num_rows](auto generator) {
34- using T = decltype (generator ());
35- std::vector<T> values (num_rows);
36- std::generate (values.begin (), values.end (), generator);
37- return values;
38- };
39-
40- // Create integer column with random values
41- std::uniform_int_distribution<int32_t > int_dist (0 , 1000 );
42- auto int_array = sp::primitive_array<int32_t >(
43- generate_vector ([&]() { return int_dist (gen); })
44- );
45-
46- // Create float column with random values
47- std::uniform_real_distribution<float > float_dist (-100 .0f , 100 .0f );
48- auto float_array = sp::primitive_array<float >(
49- generate_vector ([&]() { return float_dist (gen); })
50- );
51-
52- // Create boolean column with random values
53- std::uniform_int_distribution<int > bool_dist (0 , 1 );
54- auto bool_array = sp::primitive_array<bool >(
55- generate_vector ([&]() { return static_cast <bool >(bool_dist (gen)); })
56- );
57-
58- // Create string column with random values
59- const std::vector<std::string> sample_strings =
60- {" alpha" , " beta" , " gamma" , " delta" , " epsilon" , " zeta" , " eta" , " theta" , " iota" , " kappa" };
61- std::uniform_int_distribution<size_t > str_dist (0 , sample_strings.size () - 1 );
62- size_t counter = 0 ;
63- auto string_array = sp::string_array (
64- generate_vector ([&]() { return sample_strings[str_dist (gen)] + " _" + std::to_string (counter++); })
65- );
66-
67- // Create record batch with named columns (same schema for all batches)
68- return sp::record_batch (
69- {{" id" , sp::array (std::move (int_array))},
70- {" value" , sp::array (std::move (float_array))},
71- {" flag" , sp::array (std::move (bool_array))},
72- {" name" , sp::array (std::move (string_array))}}
73- );
74- }
75-
76- /* *
77- * Verify that two sets of record batches are identical
78- * Returns true if all batches match, false otherwise
79- */
80- bool verify_batches_match (
81- const std::vector<sp::record_batch>& original_batches,
82- const std::vector<sp::record_batch>& deserialized_batches
83- )
84- {
85- if (original_batches.size () != deserialized_batches.size ())
28+ /* *
29+ * Helper function to create a record batch with the same schema but random values
30+ * All batches have: int32 column, float column, bool column, and string column
31+ */
32+ sp::record_batch create_random_record_batch (size_t num_rows)
8633 {
87- std::cerr << " ERROR: Batch count mismatch! Original: " << original_batches.size ()
88- << " , Deserialized: " << deserialized_batches.size () << " \n " ;
89- return false ;
90- }
34+ // Helper lambda to generate a vector with random values
35+ auto generate_vector = [num_rows](auto generator)
36+ {
37+ using T = decltype (generator ());
38+ std::vector<T> values (num_rows);
39+ std::generate (values.begin (), values.end (), generator);
40+ return values;
41+ };
42+
43+ // Create integer column with random values
44+ std::uniform_int_distribution<int32_t > int_dist (0 , 1000 );
45+ auto int_array = sp::primitive_array<int32_t >(generate_vector (
46+ [&]()
47+ {
48+ return int_dist (gen);
49+ }
50+ ));
9151
92- bool all_match = true ;
93- for (size_t batch_idx = 0 ; batch_idx < original_batches.size (); ++batch_idx)
94- {
95- const auto & original = original_batches[batch_idx];
96- const auto & deserialized = deserialized_batches[batch_idx];
52+ // Create float column with random values
53+ std::uniform_real_distribution<float > float_dist (-100 .0f , 100 .0f );
54+ auto float_array = sp::primitive_array<float >(generate_vector (
55+ [&]()
56+ {
57+ return float_dist (gen);
58+ }
59+ ));
9760
98- // Check basic structure
99- if (original.nb_columns () != deserialized.nb_columns () || original.nb_rows () != deserialized.nb_rows ())
100- {
101- std::cerr << " ERROR: Batch " << batch_idx << " structure mismatch!\n " ;
102- all_match = false ;
103- continue ;
104- }
61+ // Create boolean column with random values
62+ std::uniform_int_distribution<int > bool_dist (0 , 1 );
63+ auto bool_array = sp::primitive_array<bool >(generate_vector (
64+ [&]()
65+ {
66+ return static_cast <bool >(bool_dist (gen));
67+ }
68+ ));
69+
70+ // Create string column with random values
71+ const std::vector<std::string> sample_strings =
72+ {" alpha" , " beta" , " gamma" , " delta" , " epsilon" , " zeta" , " eta" , " theta" , " iota" , " kappa" };
73+ std::uniform_int_distribution<size_t > str_dist (0 , sample_strings.size () - 1 );
74+ size_t counter = 0 ;
75+ auto string_array = sp::string_array (generate_vector (
76+ [&]()
77+ {
78+ return sample_strings[str_dist (gen)] + " _" + std::to_string (counter++);
79+ }
80+ ));
81+
82+ // Create record batch with named columns (same schema for all batches)
83+ return sp::record_batch (
84+ {{" id" , sp::array (std::move (int_array))},
85+ {" value" , sp::array (std::move (float_array))},
86+ {" flag" , sp::array (std::move (bool_array))},
87+ {" name" , sp::array (std::move (string_array))}}
88+ );
89+ }
10590
106- // Check column names
107- if (!std::ranges::equal (original.names (), deserialized.names ()))
91+ /* *
92+ * Verify that two sets of record batches are identical
93+ * Returns true if all batches match, false otherwise
94+ */
95+ bool verify_batches_match (
96+ const std::vector<sp::record_batch>& original_batches,
97+ const std::vector<sp::record_batch>& deserialized_batches
98+ )
99+ {
100+ if (original_batches.size () != deserialized_batches.size ())
108101 {
109- std::cerr << " WARNING: Batch " << batch_idx << " column names mismatch!\n " ;
102+ std::cerr << " ERROR: Batch count mismatch! Original: " << original_batches.size ()
103+ << " , Deserialized: " << deserialized_batches.size () << " \n " ;
104+ return false ;
110105 }
111106
112- // Check column data
113- for (size_t col_idx = 0 ; col_idx < original. nb_columns (); ++col_idx )
107+ bool all_match = true ;
108+ for (size_t batch_idx = 0 ; batch_idx < original_batches. size (); ++batch_idx )
114109 {
115- const auto & orig_col = original. get_column (col_idx) ;
116- const auto & deser_col = deserialized. get_column (col_idx) ;
110+ const auto & original = original_batches[batch_idx] ;
111+ const auto & deserialized = deserialized_batches[batch_idx] ;
117112
118- if (orig_col.data_type () != deser_col.data_type ())
113+ // Check basic structure
114+ if (original.nb_columns () != deserialized.nb_columns ()
115+ || original.nb_rows () != deserialized.nb_rows ())
119116 {
120- std::cerr << " ERROR: Batch " << batch_idx << " , column " << col_idx << " type mismatch!\n " ;
117+ std::cerr << " ERROR: Batch " << batch_idx << " structure mismatch!\n " ;
121118 all_match = false ;
122119 continue ;
123120 }
124121
125- // Check values
126- for (size_t row_idx = 0 ; row_idx < orig_col.size (); ++row_idx)
122+ // Check column names
123+ if (!std::ranges::equal (original.names (), deserialized.names ()))
124+ {
125+ std::cerr << " WARNING: Batch " << batch_idx << " column names mismatch!\n " ;
126+ }
127+
128+ // Check column data
129+ for (size_t col_idx = 0 ; col_idx < original.nb_columns (); ++col_idx)
127130 {
128- if (orig_col[row_idx] != deser_col[row_idx])
131+ const auto & orig_col = original.get_column (col_idx);
132+ const auto & deser_col = deserialized.get_column (col_idx);
133+
134+ if (orig_col.data_type () != deser_col.data_type ())
129135 {
130- std::cerr << " ERROR: Batch " << batch_idx << " , column " << col_idx << " , row " << row_idx
131- << " value mismatch!\n " ;
132- std::cerr << " Original: " << orig_col[row_idx]
133- << " , Deserialized: " << deser_col[row_idx] << " \n " ;
136+ std::cerr << " ERROR: Batch " << batch_idx << " , column " << col_idx << " type mismatch!\n " ;
134137 all_match = false ;
138+ continue ;
139+ }
140+
141+ // Check values
142+ for (size_t row_idx = 0 ; row_idx < orig_col.size (); ++row_idx)
143+ {
144+ if (orig_col[row_idx] != deser_col[row_idx])
145+ {
146+ std::cerr << " ERROR: Batch " << batch_idx << " , column " << col_idx << " , row "
147+ << row_idx << " value mismatch!\n " ;
148+ std::cerr << " Original: " << orig_col[row_idx]
149+ << " , Deserialized: " << deser_col[row_idx] << " \n " ;
150+ all_match = false ;
151+ }
135152 }
136153 }
137154 }
138- }
139155
140- return all_match;
156+ return all_match;
157+ }
141158}
142159
143160/* *
@@ -153,7 +170,7 @@ std::vector<sp::record_batch> create_record_batches(size_t num_batches, size_t r
153170
154171 for (size_t i = 0 ; i < num_batches; ++i)
155172 {
156- batches.push_back (create_random_record_batch (rows_per_batch));
173+ batches.push_back (utils:: create_random_record_batch (rows_per_batch));
157174 }
158175
159176 std::cout << " Created " << batches.size () << " record batches\n " ;
@@ -241,7 +258,7 @@ void demonstrate_serialization_methods(
241258bool verify_schema_consistency (const std::vector<sp::record_batch>& batches)
242259{
243260 std::cout << " \n 7. Verifying schema consistency across all batches...\n " ;
244-
261+
245262 if (batches.empty ())
246263 {
247264 std::cout << " No batches to verify\n " ;
@@ -296,7 +313,8 @@ void read_and_display_test_file()
296313{
297314 std::cout << " \n 8. Reading a primitive stream file from test resources...\n " ;
298315
299- const std::filesystem::path primitive_stream_file = tests_resources_files_path / " generated_primitive.stream" ;
316+ const std::filesystem::path primitive_stream_file = tests_resources_files_path
317+ / " generated_primitive.stream" ;
300318
301319 if (std::filesystem::exists (primitive_stream_file))
302320 {
@@ -361,7 +379,7 @@ int main()
361379 // Step 4: Verify that original and deserialized data match
362380 std::cout << " \n 4. Verifying data integrity...\n " ;
363381
364- if (verify_batches_match (original_batches, deserialized_batches))
382+ if (utils:: verify_batches_match (original_batches, deserialized_batches))
365383 {
366384 std::cout << " ✓ All data matches perfectly!\n " ;
367385 }
0 commit comments