Skip to content

Commit 7f9caa5

Browse files
committed
move to utils
1 parent eee3533 commit 7f9caa5

File tree

1 file changed

+115
-97
lines changed

1 file changed

+115
-97
lines changed

examples/write_and_read_streams.cpp

Lines changed: 115 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -23,121 +23,138 @@ namespace sp = sparrow;
2323
std::random_device rd;
2424
std::mt19937 gen(rd());
2525

26-
/**
27-
* Helper function to create a record batch with the same schema but random values
28-
* All batches have: int32 column, float column, bool column, and string column
29-
*/
30-
sp::record_batch create_random_record_batch(size_t num_rows)
26+
namespace utils
3127
{
32-
// Helper lambda to generate a vector with random values
33-
auto generate_vector = [num_rows](auto generator) {
34-
using T = decltype(generator());
35-
std::vector<T> values(num_rows);
36-
std::generate(values.begin(), values.end(), generator);
37-
return values;
38-
};
39-
40-
// Create integer column with random values
41-
std::uniform_int_distribution<int32_t> int_dist(0, 1000);
42-
auto int_array = sp::primitive_array<int32_t>(
43-
generate_vector([&]() { return int_dist(gen); })
44-
);
45-
46-
// Create float column with random values
47-
std::uniform_real_distribution<float> float_dist(-100.0f, 100.0f);
48-
auto float_array = sp::primitive_array<float>(
49-
generate_vector([&]() { return float_dist(gen); })
50-
);
51-
52-
// Create boolean column with random values
53-
std::uniform_int_distribution<int> bool_dist(0, 1);
54-
auto bool_array = sp::primitive_array<bool>(
55-
generate_vector([&]() { return static_cast<bool>(bool_dist(gen)); })
56-
);
57-
58-
// Create string column with random values
59-
const std::vector<std::string> sample_strings =
60-
{"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa"};
61-
std::uniform_int_distribution<size_t> str_dist(0, sample_strings.size() - 1);
62-
size_t counter = 0;
63-
auto string_array = sp::string_array(
64-
generate_vector([&]() { return sample_strings[str_dist(gen)] + "_" + std::to_string(counter++); })
65-
);
66-
67-
// Create record batch with named columns (same schema for all batches)
68-
return sp::record_batch(
69-
{{"id", sp::array(std::move(int_array))},
70-
{"value", sp::array(std::move(float_array))},
71-
{"flag", sp::array(std::move(bool_array))},
72-
{"name", sp::array(std::move(string_array))}}
73-
);
74-
}
75-
76-
/**
77-
* Verify that two sets of record batches are identical
78-
* Returns true if all batches match, false otherwise
79-
*/
80-
bool verify_batches_match(
81-
const std::vector<sp::record_batch>& original_batches,
82-
const std::vector<sp::record_batch>& deserialized_batches
83-
)
84-
{
85-
if (original_batches.size() != deserialized_batches.size())
28+
/**
29+
* Helper function to create a record batch with the same schema but random values
30+
* All batches have: int32 column, float column, bool column, and string column
31+
*/
32+
sp::record_batch create_random_record_batch(size_t num_rows)
8633
{
87-
std::cerr << "ERROR: Batch count mismatch! Original: " << original_batches.size()
88-
<< ", Deserialized: " << deserialized_batches.size() << "\n";
89-
return false;
90-
}
34+
// Helper lambda to generate a vector with random values
35+
auto generate_vector = [num_rows](auto generator)
36+
{
37+
using T = decltype(generator());
38+
std::vector<T> values(num_rows);
39+
std::generate(values.begin(), values.end(), generator);
40+
return values;
41+
};
42+
43+
// Create integer column with random values
44+
std::uniform_int_distribution<int32_t> int_dist(0, 1000);
45+
auto int_array = sp::primitive_array<int32_t>(generate_vector(
46+
[&]()
47+
{
48+
return int_dist(gen);
49+
}
50+
));
9151

92-
bool all_match = true;
93-
for (size_t batch_idx = 0; batch_idx < original_batches.size(); ++batch_idx)
94-
{
95-
const auto& original = original_batches[batch_idx];
96-
const auto& deserialized = deserialized_batches[batch_idx];
52+
// Create float column with random values
53+
std::uniform_real_distribution<float> float_dist(-100.0f, 100.0f);
54+
auto float_array = sp::primitive_array<float>(generate_vector(
55+
[&]()
56+
{
57+
return float_dist(gen);
58+
}
59+
));
9760

98-
// Check basic structure
99-
if (original.nb_columns() != deserialized.nb_columns() || original.nb_rows() != deserialized.nb_rows())
100-
{
101-
std::cerr << "ERROR: Batch " << batch_idx << " structure mismatch!\n";
102-
all_match = false;
103-
continue;
104-
}
61+
// Create boolean column with random values
62+
std::uniform_int_distribution<int> bool_dist(0, 1);
63+
auto bool_array = sp::primitive_array<bool>(generate_vector(
64+
[&]()
65+
{
66+
return static_cast<bool>(bool_dist(gen));
67+
}
68+
));
69+
70+
// Create string column with random values
71+
const std::vector<std::string> sample_strings =
72+
{"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa"};
73+
std::uniform_int_distribution<size_t> str_dist(0, sample_strings.size() - 1);
74+
size_t counter = 0;
75+
auto string_array = sp::string_array(generate_vector(
76+
[&]()
77+
{
78+
return sample_strings[str_dist(gen)] + "_" + std::to_string(counter++);
79+
}
80+
));
81+
82+
// Create record batch with named columns (same schema for all batches)
83+
return sp::record_batch(
84+
{{"id", sp::array(std::move(int_array))},
85+
{"value", sp::array(std::move(float_array))},
86+
{"flag", sp::array(std::move(bool_array))},
87+
{"name", sp::array(std::move(string_array))}}
88+
);
89+
}
10590

106-
// Check column names
107-
if (!std::ranges::equal(original.names(), deserialized.names()))
91+
/**
92+
* Verify that two sets of record batches are identical
93+
* Returns true if all batches match, false otherwise
94+
*/
95+
bool verify_batches_match(
96+
const std::vector<sp::record_batch>& original_batches,
97+
const std::vector<sp::record_batch>& deserialized_batches
98+
)
99+
{
100+
if (original_batches.size() != deserialized_batches.size())
108101
{
109-
std::cerr << "WARNING: Batch " << batch_idx << " column names mismatch!\n";
102+
std::cerr << "ERROR: Batch count mismatch! Original: " << original_batches.size()
103+
<< ", Deserialized: " << deserialized_batches.size() << "\n";
104+
return false;
110105
}
111106

112-
// Check column data
113-
for (size_t col_idx = 0; col_idx < original.nb_columns(); ++col_idx)
107+
bool all_match = true;
108+
for (size_t batch_idx = 0; batch_idx < original_batches.size(); ++batch_idx)
114109
{
115-
const auto& orig_col = original.get_column(col_idx);
116-
const auto& deser_col = deserialized.get_column(col_idx);
110+
const auto& original = original_batches[batch_idx];
111+
const auto& deserialized = deserialized_batches[batch_idx];
117112

118-
if (orig_col.data_type() != deser_col.data_type())
113+
// Check basic structure
114+
if (original.nb_columns() != deserialized.nb_columns()
115+
|| original.nb_rows() != deserialized.nb_rows())
119116
{
120-
std::cerr << "ERROR: Batch " << batch_idx << ", column " << col_idx << " type mismatch!\n";
117+
std::cerr << "ERROR: Batch " << batch_idx << " structure mismatch!\n";
121118
all_match = false;
122119
continue;
123120
}
124121

125-
// Check values
126-
for (size_t row_idx = 0; row_idx < orig_col.size(); ++row_idx)
122+
// Check column names
123+
if (!std::ranges::equal(original.names(), deserialized.names()))
124+
{
125+
std::cerr << "WARNING: Batch " << batch_idx << " column names mismatch!\n";
126+
}
127+
128+
// Check column data
129+
for (size_t col_idx = 0; col_idx < original.nb_columns(); ++col_idx)
127130
{
128-
if (orig_col[row_idx] != deser_col[row_idx])
131+
const auto& orig_col = original.get_column(col_idx);
132+
const auto& deser_col = deserialized.get_column(col_idx);
133+
134+
if (orig_col.data_type() != deser_col.data_type())
129135
{
130-
std::cerr << "ERROR: Batch " << batch_idx << ", column " << col_idx << ", row " << row_idx
131-
<< " value mismatch!\n";
132-
std::cerr << " Original: " << orig_col[row_idx]
133-
<< ", Deserialized: " << deser_col[row_idx] << "\n";
136+
std::cerr << "ERROR: Batch " << batch_idx << ", column " << col_idx << " type mismatch!\n";
134137
all_match = false;
138+
continue;
139+
}
140+
141+
// Check values
142+
for (size_t row_idx = 0; row_idx < orig_col.size(); ++row_idx)
143+
{
144+
if (orig_col[row_idx] != deser_col[row_idx])
145+
{
146+
std::cerr << "ERROR: Batch " << batch_idx << ", column " << col_idx << ", row "
147+
<< row_idx << " value mismatch!\n";
148+
std::cerr << " Original: " << orig_col[row_idx]
149+
<< ", Deserialized: " << deser_col[row_idx] << "\n";
150+
all_match = false;
151+
}
135152
}
136153
}
137154
}
138-
}
139155

140-
return all_match;
156+
return all_match;
157+
}
141158
}
142159

143160
/**
@@ -153,7 +170,7 @@ std::vector<sp::record_batch> create_record_batches(size_t num_batches, size_t r
153170

154171
for (size_t i = 0; i < num_batches; ++i)
155172
{
156-
batches.push_back(create_random_record_batch(rows_per_batch));
173+
batches.push_back(utils::create_random_record_batch(rows_per_batch));
157174
}
158175

159176
std::cout << " Created " << batches.size() << " record batches\n";
@@ -241,7 +258,7 @@ void demonstrate_serialization_methods(
241258
bool verify_schema_consistency(const std::vector<sp::record_batch>& batches)
242259
{
243260
std::cout << "\n7. Verifying schema consistency across all batches...\n";
244-
261+
245262
if (batches.empty())
246263
{
247264
std::cout << " No batches to verify\n";
@@ -296,7 +313,8 @@ void read_and_display_test_file()
296313
{
297314
std::cout << "\n8. Reading a primitive stream file from test resources...\n";
298315

299-
const std::filesystem::path primitive_stream_file = tests_resources_files_path / "generated_primitive.stream";
316+
const std::filesystem::path primitive_stream_file = tests_resources_files_path
317+
/ "generated_primitive.stream";
300318

301319
if (std::filesystem::exists(primitive_stream_file))
302320
{
@@ -361,7 +379,7 @@ int main()
361379
// Step 4: Verify that original and deserialized data match
362380
std::cout << "\n4. Verifying data integrity...\n";
363381

364-
if (verify_batches_match(original_batches, deserialized_batches))
382+
if (utils::verify_batches_match(original_batches, deserialized_batches))
365383
{
366384
std::cout << " ✓ All data matches perfectly!\n";
367385
}

0 commit comments

Comments
 (0)