Skip to content

Commit 7acd347

Browse files
authored
Apply formatting rules (#2649)
#### Reference Issues/PRs

Monday ref: 10048929527

#### What does this implement or fix?

#### Any other comments?

#### Checklist

<details>
<summary> Checklist for code changes... </summary>

- [ ] Have you updated the relevant docstrings, documentation and copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes?

</details>

<!--
Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at:
- ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md
- ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing
-->
1 parent ffe58bc commit 7acd347

File tree

729 files changed

+40579
-36971
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

729 files changed

+40579
-36971
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,14 +144,12 @@ jobs:
144144
- name: Lint Python
145145
if: always()
146146
run: |
147-
python3 build_tooling/format.py --check --type python \
148-
|| true # formatting not enforced yet
147+
python3 build_tooling/format.py --check --type python
149148
150149
- name: Lint C++
151150
if: always()
152151
run: |
153-
python3 build_tooling/format.py --check --type cpp \
154-
|| true # formatting not enforced yet
152+
python3 build_tooling/format.py --check --type cpp
155153
156154
common_config:
157155
needs: [cibw_docker_image]

cpp/arcticdb/arrow/array_from_block.hpp

Lines changed: 54 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
*
33
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
44
*
5-
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
5+
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
6+
* will be governed by the Apache License, version 2.0.
67
*/
78
#pragma once
89

@@ -13,77 +14,78 @@
1314

1415
namespace arcticdb {
1516

16-
inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(size_t offset, const Column& column, size_t bitmap_size) {
17-
if(column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
18-
auto &bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
19-
return sparrow::validity_bitmap{reinterpret_cast<uint8_t *>(bitmap_buffer.block(0)->release()), bitmap_size};
17+
inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(
18+
size_t offset, const Column& column, size_t bitmap_size
19+
) {
20+
if (column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
21+
auto& bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
22+
return sparrow::validity_bitmap{reinterpret_cast<uint8_t*>(bitmap_buffer.block(0)->release()), bitmap_size};
2023
} else {
2124
return std::nullopt;
2225
}
2326
}
2427

25-
template <typename T>
28+
template<typename T>
2629
sparrow::primitive_array<T> create_primitive_array(
27-
T* data_ptr,
28-
size_t data_size,
29-
std::optional<sparrow::validity_bitmap>&& validity_bitmap) {
30+
T* data_ptr, size_t data_size, std::optional<sparrow::validity_bitmap>&& validity_bitmap
31+
) {
3032
sparrow::u8_buffer<T> buffer(data_ptr, data_size);
31-
if(validity_bitmap) {
33+
if (validity_bitmap) {
3234
return sparrow::primitive_array<T>{std::move(buffer), data_size, std::move(*validity_bitmap)};
3335
} else {
3436
return sparrow::primitive_array<T>{std::move(buffer), data_size};
3537
}
3638
}
3739

38-
template <>
40+
template<>
3941
inline sparrow::primitive_array<bool> create_primitive_array(
40-
bool* data_ptr,
41-
size_t data_size,
42-
std::optional<sparrow::validity_bitmap>&& validity_bitmap) {
42+
bool* data_ptr, size_t data_size, std::optional<sparrow::validity_bitmap>&& validity_bitmap
43+
) {
4344
// We need special handling for bools because arrow uses dense bool representation (i.e. 8 bools per byte)
4445
// Our internal representation is not dense. We use sparrow's `make_data_buffer` utility, but if needed, we can use
4546
// our own.
4647
auto buffer = sparrow::details::primitive_data_access<bool>::make_data_buffer(std::span{data_ptr, data_size});
47-
if(validity_bitmap) {
48+
if (validity_bitmap) {
4849
return sparrow::primitive_array<bool>{std::move(buffer), data_size, std::move(*validity_bitmap)};
4950
} else {
5051
return sparrow::primitive_array<bool>{std::move(buffer), data_size};
5152
}
5253
}
5354

54-
template <typename T>
55+
template<typename T>
5556
sparrow::timestamp_without_timezone_nanoseconds_array create_timestamp_array(
56-
T* data_ptr,
57-
size_t data_size,
58-
std::optional<sparrow::validity_bitmap>&& validity_bitmap) {
57+
T* data_ptr, size_t data_size, std::optional<sparrow::validity_bitmap>&& validity_bitmap
58+
) {
5959
static_assert(sizeof(T) == sizeof(sparrow::zoned_time_without_timezone_nanoseconds));
6060
// We default to using timestamps without timezones. If the normalization metadata contains a timezone it will be
6161
// applied during normalization in python layer.
6262
sparrow::u8_buffer<sparrow::zoned_time_without_timezone_nanoseconds> buffer(
63-
reinterpret_cast<sparrow::zoned_time_without_timezone_nanoseconds*>(data_ptr), data_size);
64-
if(validity_bitmap) {
65-
return sparrow::timestamp_without_timezone_nanoseconds_array{std::move(buffer), data_size, std::move(*validity_bitmap)};
63+
reinterpret_cast<sparrow::zoned_time_without_timezone_nanoseconds*>(data_ptr), data_size
64+
);
65+
if (validity_bitmap) {
66+
return sparrow::timestamp_without_timezone_nanoseconds_array{
67+
std::move(buffer), data_size, std::move(*validity_bitmap)
68+
};
6669
} else {
6770
return sparrow::timestamp_without_timezone_nanoseconds_array{std::move(buffer), data_size};
6871
}
6972
}
7073

71-
template <typename T>
74+
template<typename T>
7275
sparrow::dictionary_encoded_array<T> create_dict_array(
73-
sparrow::array&& dict_values_array,
74-
sparrow::u8_buffer<T>&& dict_keys_buffer,
75-
std::optional<sparrow::validity_bitmap>&& validity_bitmap
76-
) {
77-
if(validity_bitmap) {
76+
sparrow::array&& dict_values_array, sparrow::u8_buffer<T>&& dict_keys_buffer,
77+
std::optional<sparrow::validity_bitmap>&& validity_bitmap
78+
) {
79+
if (validity_bitmap) {
7880
return sparrow::dictionary_encoded_array<T>{
79-
typename sparrow::dictionary_encoded_array<T>::keys_buffer_type(std::move(dict_keys_buffer)),
80-
std::move(dict_values_array),
81-
std::move(*validity_bitmap)
81+
typename sparrow::dictionary_encoded_array<T>::keys_buffer_type(std::move(dict_keys_buffer)),
82+
std::move(dict_values_array),
83+
std::move(*validity_bitmap)
8284
};
8385
} else {
8486
return sparrow::dictionary_encoded_array<T>{
85-
typename sparrow::dictionary_encoded_array<T>::keys_buffer_type(std::move(dict_keys_buffer)),
86-
std::move(dict_values_array),
87+
typename sparrow::dictionary_encoded_array<T>::keys_buffer_type(std::move(dict_keys_buffer)),
88+
std::move(dict_values_array),
8789
};
8890
}
8991
}
@@ -102,12 +104,11 @@ inline sparrow::big_string_array minimal_strings_dict() {
102104
return {std::move(strings_buffer), std::move(offsets_buffer)};
103105
}
104106

105-
template <typename TagType>
107+
template<typename TagType>
106108
sparrow::array string_dict_from_block(
107-
TypedBlockData<TagType>& block,
108-
const Column& column,
109-
std::string_view name,
110-
std::optional<sparrow::validity_bitmap>&& maybe_bitmap) {
109+
TypedBlockData<TagType>& block, const Column& column, std::string_view name,
110+
std::optional<sparrow::validity_bitmap>&& maybe_bitmap
111+
) {
111112
const auto offset = block.offset();
112113
// We use 64-bit offsets and 32-bit keys because we use a layout where each row-segment has its own arrow array.
113114
// By default, the row-segments are 100k rows, so number of rows wouldn't exceed 32-bit ints.
@@ -119,45 +120,47 @@ sparrow::array string_dict_from_block(
119120
// We use `int32_t` dictionary keys because pyarrow doesn't work with unsigned dictionary keys:
120121
// https://github.com/pola-rs/polars/issues/10977
121122
const auto block_size = block.row_count();
122-
sparrow::u8_buffer<int32_t> dict_keys_buffer{reinterpret_cast<int32_t *>(block.release()), block_size};
123+
sparrow::u8_buffer<int32_t> dict_keys_buffer{reinterpret_cast<int32_t*>(block.release()), block_size};
123124

124125
const bool has_offset_buffer = column.has_extra_buffer(offset, ExtraBufferType::OFFSET);
125126
const bool has_string_buffer = column.has_extra_buffer(offset, ExtraBufferType::STRING);
126127
auto dict_values_array = [&]() -> sparrow::big_string_array {
127128
if (has_offset_buffer && has_string_buffer) {
128129
auto& string_offsets = column.get_extra_buffer(offset, ExtraBufferType::OFFSET);
129130
const auto offset_buffer_value_count = string_offsets.block(0)->bytes() / sizeof(int64_t);
130-
sparrow::u8_buffer<int64_t> offsets_buffer(reinterpret_cast<int64_t *>(string_offsets.block(0)->release()), offset_buffer_value_count);
131+
sparrow::u8_buffer<int64_t> offsets_buffer(
132+
reinterpret_cast<int64_t*>(string_offsets.block(0)->release()), offset_buffer_value_count
133+
);
131134
auto& strings = column.get_extra_buffer(offset, ExtraBufferType::STRING);
132135
const auto strings_buffer_size = strings.block(0)->bytes();
133-
sparrow::u8_buffer<char> strings_buffer(reinterpret_cast<char *>(strings.block(0)->release()), strings_buffer_size);
136+
sparrow::u8_buffer<char> strings_buffer(
137+
reinterpret_cast<char*>(strings.block(0)->release()), strings_buffer_size
138+
);
134139
return {std::move(strings_buffer), std::move(offsets_buffer)};
135140
} else if (!has_offset_buffer && !has_string_buffer) {
136141
return minimal_strings_dict();
137142
} else {
138-
util::raise_rte("Arrow output string creation expected either both or neither of OFFSET and STRING buffers to be present");
143+
util::raise_rte("Arrow output string creation expected either both or neither of OFFSET and STRING buffers "
144+
"to be present");
139145
}
140146
}();
141147

142148
auto dict_encoded = create_dict_array<int32_t>(
143-
sparrow::array{std::move(dict_values_array)},
144-
std::move(dict_keys_buffer),
145-
std::move(maybe_bitmap)
149+
sparrow::array{std::move(dict_values_array)}, std::move(dict_keys_buffer), std::move(maybe_bitmap)
146150
);
147151

148152
sparrow::array arr{std::move(dict_encoded)};
149153
arr.set_name(name);
150154
return arr;
151155
}
152156

153-
template <typename TagType>
157+
template<typename TagType>
154158
sparrow::array arrow_array_from_block(
155-
TypedBlockData<TagType>& block,
156-
std::string_view name,
157-
std::optional<sparrow::validity_bitmap>&& maybe_bitmap) {
159+
TypedBlockData<TagType>& block, std::string_view name, std::optional<sparrow::validity_bitmap>&& maybe_bitmap
160+
) {
158161
using DataTagType = typename TagType::DataTypeTag;
159162
using RawType = typename DataTagType::raw_type;
160-
auto *data_ptr = block.release();
163+
auto* data_ptr = block.release();
161164
const auto data_size = block.row_count();
162165
auto arr = [&]() {
163166
if constexpr (is_time_type(TagType::DataTypeTag::data_type)) {
@@ -172,4 +175,4 @@ sparrow::array arrow_array_from_block(
172175
return arr;
173176
}
174177

175-
}
178+
} // namespace arcticdb

cpp/arcticdb/arrow/arrow_handlers.cpp

Lines changed: 49 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
*
33
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
44
*
5-
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
5+
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
6+
* will be governed by the Apache License, version 2.0.
67
*/
78
#include <arrow/arrow_handlers.hpp>
89
#include <arcticdb/codec/encoding_sizes.hpp>
@@ -14,46 +15,41 @@
1415
namespace arcticdb {
1516

1617
void ArrowStringHandler::handle_type(
17-
const uint8_t *&data,
18-
Column& dest_column,
19-
const EncodedFieldImpl &field,
20-
const ColumnMapping& m,
21-
const DecodePathData& shared_data,
22-
std::any& handler_data,
23-
EncodingVersion encoding_version,
24-
const std::shared_ptr<StringPool>& string_pool) {
18+
const uint8_t*& data, Column& dest_column, const EncodedFieldImpl& field, const ColumnMapping& m,
19+
const DecodePathData& shared_data, std::any& handler_data, EncodingVersion encoding_version,
20+
const std::shared_ptr<StringPool>& string_pool
21+
) {
2522
ARCTICDB_SAMPLE(ArrowHandleString, 0)
2623
util::check(field.has_ndarray(), "String handler expected array");
2724
schema::check<ErrorCode::E_UNSUPPORTED_COLUMN_TYPE>(
2825
m.source_type_desc_.data_type() == DataType::UTF_DYNAMIC64,
2926
"Cannot read column '{}' into Arrow output format as it is of unsupported type {} (only {} is supported)",
30-
m.frame_field_descriptor_.name(), m.source_type_desc_.data_type(), DataType::UTF_DYNAMIC64);
27+
m.frame_field_descriptor_.name(),
28+
m.source_type_desc_.data_type(),
29+
DataType::UTF_DYNAMIC64
30+
);
3131
ARCTICDB_DEBUG(log::version(), "String handler got encoded field: {}", field.DebugString());
32-
const auto &ndarray = field.ndarray();
32+
const auto& ndarray = field.ndarray();
3333
const auto bytes = encoding_sizes::data_uncompressed_size(ndarray);
3434

35-
Column decoded_data{m.source_type_desc_, bytes / get_type_size(m.source_type_desc_.data_type()),
36-
AllocationType::DYNAMIC, Sparsity::PERMITTED};
37-
35+
Column decoded_data{
36+
m.source_type_desc_,
37+
bytes / get_type_size(m.source_type_desc_.data_type()),
38+
AllocationType::DYNAMIC,
39+
Sparsity::PERMITTED
40+
};
3841

39-
data += decode_field(m.source_type_desc_, field, data, decoded_data, decoded_data.opt_sparse_map(), encoding_version);
42+
data += decode_field(
43+
m.source_type_desc_, field, data, decoded_data, decoded_data.opt_sparse_map(), encoding_version
44+
);
4045

41-
convert_type(
42-
decoded_data,
43-
dest_column,
44-
m,
45-
shared_data,
46-
handler_data,
47-
string_pool);
46+
convert_type(decoded_data, dest_column, m, shared_data, handler_data, string_pool);
4847
}
4948

5049
void ArrowStringHandler::convert_type(
51-
const Column& source_column,
52-
Column& dest_column,
53-
const ColumnMapping& mapping,
54-
const DecodePathData&,
55-
std::any&,
56-
const std::shared_ptr<StringPool>& string_pool) const {
50+
const Column& source_column, Column& dest_column, const ColumnMapping& mapping, const DecodePathData&,
51+
std::any&, const std::shared_ptr<StringPool>& string_pool
52+
) const {
5753
using ArcticStringColumnTag = ScalarTagType<DataTypeTag<DataType::UTF_DYNAMIC64>>;
5854
auto input_data = source_column.data();
5955
struct DictEntry {
@@ -69,7 +65,9 @@ void ArrowStringHandler::convert_type(
6965
unique_offsets.reserve(source_column.row_count());
7066
int64_t bytes = 0;
7167
int32_t unique_offset_count = 0;
72-
auto dest_ptr = reinterpret_cast<int32_t*>(dest_column.bytes_at(mapping.offset_bytes_, source_column.row_count() * sizeof(int32_t)));
68+
auto dest_ptr = reinterpret_cast<int32_t*>(
69+
dest_column.bytes_at(mapping.offset_bytes_, source_column.row_count() * sizeof(int32_t))
70+
);
7371

7472
util::BitSet bitset;
7573
util::BitSet::bulk_insert_iterator inserter(bitset);
@@ -78,7 +76,12 @@ void ArrowStringHandler::convert_type(
7876
// TODO: This can't be right if the column was sparse as it has only been decoded, not expanded
7977
for (auto en = input_data.cbegin<ArcticStringColumnTag, IteratorType::ENUMERATED>(); en != end; ++en) {
8078
if (is_a_string(en->value())) {
81-
auto [entry, is_emplaced] = unique_offsets.try_emplace(en->value(), DictEntry{static_cast<int32_t>(unique_offset_count), bytes, string_pool->get_const_view(en->value())});
79+
auto [entry, is_emplaced] = unique_offsets.try_emplace(
80+
en->value(),
81+
DictEntry{
82+
static_cast<int32_t>(unique_offset_count), bytes, string_pool->get_const_view(en->value())
83+
}
84+
);
8285
if (is_emplaced) {
8386
bytes += entry->second.strv.size();
8487
unique_offsets_in_order.push_back(en->value());
@@ -102,14 +105,22 @@ void ArrowStringHandler::convert_type(
102105
create_dense_bitmap(mapping.offset_bytes_, bitset, dest_column, AllocationType::DETACHABLE);
103106
} // else there weren't any Nones or NaNs
104107
// bitset.count() == 0 is the special case where all of the rows contained None or NaN. In this case, do not create
105-
// the extra string and offset buffers. string_dict_from_block will then do the right thing and call minimal_strings_dict
108+
// the extra string and offset buffers. string_dict_from_block will then do the right thing and call
109+
// minimal_strings_dict
106110
if (bitset.count() > 0) {
107-
auto& string_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE);
108-
auto& offsets_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::OFFSET, (unique_offsets_in_order.size() + 1) * sizeof(int64_t), AllocationType::DETACHABLE);
111+
auto& string_buffer = dest_column.create_extra_buffer(
112+
mapping.offset_bytes_, ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE
113+
);
114+
auto& offsets_buffer = dest_column.create_extra_buffer(
115+
mapping.offset_bytes_,
116+
ExtraBufferType::OFFSET,
117+
(unique_offsets_in_order.size() + 1) * sizeof(int64_t),
118+
AllocationType::DETACHABLE
119+
);
109120
// Then go through unique_offsets to fill up the offset and string buffers.
110121
auto offsets_ptr = reinterpret_cast<int64_t*>(offsets_buffer.data());
111122
auto string_ptr = reinterpret_cast<char*>(string_buffer.data());
112-
for (auto unique_offset: unique_offsets_in_order) {
123+
for (auto unique_offset : unique_offsets_in_order) {
113124
const auto& entry = unique_offsets[unique_offset];
114125
*offsets_ptr++ = entry.string_buffer_pos_;
115126
memcpy(string_ptr, entry.strv.data(), entry.strv.size());
@@ -123,17 +134,11 @@ TypeDescriptor ArrowStringHandler::output_type(const TypeDescriptor&) const {
123134
return make_scalar_type(DataType::UTF_DYNAMIC32);
124135
}
125136

126-
int ArrowStringHandler::type_size() const {
127-
return sizeof(uint32_t);
128-
}
137+
int ArrowStringHandler::type_size() const { return sizeof(uint32_t); }
129138

130139
void ArrowStringHandler::default_initialize(
131-
ChunkedBuffer& /*buffer*/,
132-
size_t /*offset*/,
133-
size_t /*byte_size*/,
134-
const DecodePathData& /*shared_data*/,
135-
std::any& /*handler_data*/) const {
136-
137-
}
140+
ChunkedBuffer& /*buffer*/, size_t /*offset*/, size_t /*byte_size*/, const DecodePathData& /*shared_data*/,
141+
std::any& /*handler_data*/
142+
) const {}
138143

139144
} // namespace arcticdb

0 commit comments

Comments
 (0)