@@ -64,32 +64,24 @@ constexpr int64_t kUnknownNullCount = -1;
6464// /
6565// / This data structure is a self-contained representation of the memory and
6666// / metadata inside an Arrow array data structure (called vectors in Java). The
67- // / classes arrow:: Array and its subclasses provide strongly-typed accessors
67+ // / Array class and its concrete subclasses provide strongly-typed accessors
6868// / with support for the visitor pattern and other affordances.
6969// /
7070// / This class is designed for easy internal data manipulation, analytical data
71- // / processing, and data transport to and from IPC messages. For example, we
72- // / could cast from int64 to float64 like so:
71+ // / processing, and data transport to and from IPC messages.
7372// /
74- // / Int64Array arr = GetMyData();
75- // / auto new_data = arr.data()->Copy();
76- // / new_data->type = arrow::float64();
77- // / DoubleArray double_arr(new_data);
73+ // / This class is also useful in an analytics setting where memory may be
74+ // / efficiently reused. For example, computing the Abs of a numeric array
75+ // / should return null iff the input is null: therefore, an Abs function can
76+ // / reuse the validity bitmap (a Buffer) of its input as the validity bitmap
77+ // / of its output.
7878// /
79- // / This object is also useful in an analytics setting where memory may be
80- // / reused. For example, if we had a group of operations all returning doubles,
81- // / say:
82- // /
83- // / Log(Sqrt(Expr(arr)))
84- // /
85- // / Then the low-level implementations of each of these functions could have
86- // / the signatures
87- // /
88- // / void Log(const ArrayData& values, ArrayData* out);
89- // /
90- // / As another example a function may consume one or more memory buffers in an
91- // / input array and replace them with newly-allocated data, changing the output
92- // / data type as well.
79+ // / This class is meant mostly for immutable data access. Any mutable access
80+ // / (either to ArrayData members or to the contents of its Buffers) should take
81+ // / into account the fact that ArrayData instances are typically wrapped in a
82+ // / shared_ptr and can therefore have multiple owners at any given time.
83+ // / Consequently, mutable access is discouraged except when initially populating
84+ // / the ArrayData.
9385struct ARROW_EXPORT ArrayData {
9486 ArrayData () = default ;
9587
@@ -194,25 +186,37 @@ struct ARROW_EXPORT ArrayData {
194186 return *this ;
195187 }
196188
189+ // / \brief Return a shallow copy of this ArrayData
197190 std::shared_ptr<ArrayData> Copy () const { return std::make_shared<ArrayData>(*this ); }
198191
199- // / \brief Copy all buffers and children recursively to destination MemoryManager
192+ // / \brief Deep copy this ArrayData to destination memory manager
200193 // /
201- // / This utilizes MemoryManager::CopyBuffer to create a new ArrayData object
202- // / recursively copying the buffers and all child buffers to the destination
203- // / memory manager. This includes dictionaries if applicable.
194+ // / Returns a new ArrayData object with buffers and all child buffers
195+ // / copied to the destination memory manager. This includes dictionaries
196+ // / if applicable.
204197 Result<std::shared_ptr<ArrayData>> CopyTo (
205198 const std::shared_ptr<MemoryManager>& to) const ;
206- // / \brief View or Copy this ArrayData to destination memory manager.
199+
200+ // / \brief View or copy this ArrayData to destination memory manager
207201 // /
208202 // / Tries to view the buffer contents on the given memory manager's device
209203 // / if possible (to avoid a copy) but falls back to copying if a no-copy view
210204 // / isn't supported.
211205 Result<std::shared_ptr<ArrayData>> ViewOrCopyTo (
212206 const std::shared_ptr<MemoryManager>& to) const ;
213207
208+ // / \brief Return the null-ness of a given array element
209+ // /
210+ // / Calling `IsNull(i)` is the same as `!IsValid(i)`.
214211 bool IsNull (int64_t i) const { return !IsValid (i); }
215212
213+ // / \brief Return the validity of a given array element
214+ // /
215+ // / For most data types, this will simply query the validity bitmap.
216+ // / For union and run-end-encoded arrays, the underlying child data is
217+ // / queried instead.
218+ // / For dictionary arrays, this reflects the validity of the dictionary
219+ // / index, but the corresponding dictionary value might still be null.
216220 bool IsValid (int64_t i) const {
217221 if (buffers[0 ] != NULLPTR) {
218222 return bit_util::GetBit (buffers[0 ]->data (), i + offset);
@@ -230,7 +234,19 @@ struct ARROW_EXPORT ArrayData {
230234 return null_count.load () != length;
231235 }
232236
233- // Access a buffer's data as a typed C pointer
237+ // / \brief Access a buffer's data as a typed C pointer
238+ // /
239+ // / \param i the buffer index
240+ // / \param absolute_offset the offset into the buffer
241+ // /
242+ // / If `absolute_offset` is non-zero, the type `T` must match the
243+ // / layout of buffer number `i` for the array's data type; otherwise
244+ // / offset computation would be incorrect.
245+ // /
246+ // / If the given buffer is bit-packed (such as a validity bitmap, or
247+ // / the data buffer of a boolean array), then `absolute_offset` must be
248+ // / zero for correct results, and any bit offset must be applied manually
249+ // / by the caller.
234250 template <typename T>
235251 inline const T* GetValues (int i, int64_t absolute_offset) const {
236252 if (buffers[i]) {
@@ -240,13 +256,27 @@ struct ARROW_EXPORT ArrayData {
240256 }
241257 }
242258
259+ // / \brief Access a buffer's data as a typed C pointer
260+ // /
261+ // / \param i the buffer index
262+ // /
263+ // / This method uses the array's offset to index into buffer number `i`.
264+ // /
265+ // / Calling this method on a bit-packed buffer (such as a validity bitmap, or
266+ // / the data buffer of a boolean array) will lead to incorrect results.
267+ // / You should instead call `GetValues(i, 0)` and apply the bit offset manually.
243268 template <typename T>
244269 inline const T* GetValues (int i) const {
245270 return GetValues<T>(i, offset);
246271 }
247272
248- // Like GetValues, but returns NULLPTR instead of aborting if the underlying
249- // buffer is not a CPU buffer.
273+ // / \brief Access a buffer's data as a typed C pointer
274+ // /
275+ // / \param i the buffer index
276+ // / \param absolute_offset the offset into the buffer
277+ // /
278+ // / Like `GetValues(i, absolute_offset)`, but returns nullptr if the given buffer
279+ // / is not a CPU buffer.
250280 template <typename T>
251281 inline const T* GetValuesSafe (int i, int64_t absolute_offset) const {
252282 if (buffers[i] && buffers[i]->is_cpu ()) {
@@ -256,12 +286,24 @@ struct ARROW_EXPORT ArrayData {
256286 }
257287 }
258288
289+ // / \brief Access a buffer's data as a typed C pointer
290+ // /
291+ // / \param i the buffer index
292+ // /
293+ // / Like `GetValues(i)`, but returns nullptr if the given buffer is not a CPU buffer.
259294 template <typename T>
260295 inline const T* GetValuesSafe (int i) const {
261296 return GetValuesSafe<T>(i, offset);
262297 }
263298
264- // Access a buffer's data as a typed C pointer
299+ // / \brief Access a buffer's data as a mutable typed C pointer
300+ // /
301+ // / \param i the buffer index
302+ // / \param absolute_offset the offset into the buffer
303+ // /
304+ // / Like `GetValues(i, absolute_offset)`, but allows mutating buffer contents.
305+ // / This should only be used when initially populating the ArrayData, before
306+ // / it is attached to an Array instance.
265307 template <typename T>
266308 inline T* GetMutableValues (int i, int64_t absolute_offset) {
267309 if (buffers[i]) {
@@ -271,43 +313,62 @@ struct ARROW_EXPORT ArrayData {
271313 }
272314 }
273315
316+ // / \brief Access a buffer's data as a mutable typed C pointer
317+ // /
318+ // / \param i the buffer index
319+ // /
320+ // / Like `GetValues(i)`, but allows mutating buffer contents.
321+ // / This should only be used when initially populating the ArrayData, before
322+ // / it is attached to an Array instance.
274323 template <typename T>
275324 inline T* GetMutableValues (int i) {
276325 return GetMutableValues<T>(i, offset);
277326 }
278327
279328 // / \brief Construct a zero-copy slice of the data with the given offset and length
280329 // /
281- // / The associated `ArrayStatistics` is always discarded in a sliced
282- // / `ArrayData`. Because `ArrayStatistics` in the original
283- // / `ArrayData` may be invalid in a sliced `ArrayData`. If you want
284- // / to reuse statistics in the original `ArrayData`, you need to do
285- // / it by yourself.
286- // /
287- // / If the specified slice range has the same range as the original
288- // / `ArrayData`, we can reuse statistics in the original
289- // / ` ArrayData`. Because it has the same data as the original
290- // / `ArrayData`. But the associated `ArrayStatistics` is discarded
291- // / in this case too. Use `Copy()` instead for the case .
330+ // / This method applies the given slice to this ArrayData, taking into account
331+ // / its existing offset and length.
332+ // / If the given `length` is too large, the slice length is clamped so as not
333+ // / to go past the offset end.
334+ // / If the given `offset` is too large, or if either `offset` or `length` is negative,
335+ // / behavior is undefined.
336+ // /
337+ // / The associated ArrayStatistics is always discarded in a sliced
338+ // / ArrayData, even if the slice is trivially equal to the original ArrayData.
339+ // / If you want to reuse the statistics from the original ArrayData, you must
340+ // / explicitly reattach them .
292341 std::shared_ptr<ArrayData> Slice (int64_t offset, int64_t length) const ;
293342
294- // / \brief Input-checking variant of Slice
343+ // / \brief Construct a zero-copy slice of the data with the given offset and length
295344 // /
296- // / An Invalid Status is returned if the requested slice falls out of bounds.
297- // / Note that unlike Slice, `length` isn't clamped to the available buffer size.
345+ // / Like `Slice(offset, length)`, but returns an error if the requested slice
346+ // / falls out of bounds.
347+ // / Unlike Slice, `length` isn't clamped to the available buffer size.
298348 Result<std::shared_ptr<ArrayData>> SliceSafe (int64_t offset, int64_t length) const ;
299349
350+ // / \brief Set the cached physical null count
351+ // /
352+ // / \param v the number of nulls in the ArrayData
353+ // /
354+ // / This should only be used when initially populating the ArrayData, if
355+ // / it is possible to compute the null count without visiting the entire validity
356+ // / bitmap. In most cases, relying on `GetNullCount` is sufficient.
300357 void SetNullCount (int64_t v) { null_count.store (v); }
301358
302- // / \brief Return physical null count, or compute and set it if it's not known
359+ // / \brief Return the physical null count
360+ // /
361+ // / The null count is lazily computed from the array's validity bitmap,
362+ // / if not already cached.
303363 int64_t GetNullCount () const ;
304364
305- // / \brief Return true if the data has a validity bitmap and the physical null
306- // / count is known to be non-zero or not yet known.
365+ // / \brief Return true if the array may have nulls in its validity bitmap
307366 // /
308- // / Note that this is not the same as MayHaveLogicalNulls, which also checks
309- // / for the presence of nulls in child data for types like unions and run-end
310- // / encoded types.
367+ // / This method returns true if the data has a validity bitmap, and the physical
368+ // / null count is either known to be non-zero or not yet known.
369+ // /
370+ // / Unlike `MayHaveLogicalNulls`, this does not check for the presence of nulls
371+ // / in child data for data types such as unions and run-end encoded types.
311372 // /
312373 // / \see HasValidityBitmap
313374 // / \see MayHaveLogicalNulls
@@ -317,18 +378,20 @@ struct ARROW_EXPORT ArrayData {
317378 return null_count.load () != 0 && buffers[0 ] != NULLPTR;
318379 }
319380
320- // / \brief Return true if the data has a validity bitmap
381+ // / \brief Return true if the array has a validity bitmap
321382 bool HasValidityBitmap () const { return buffers[0 ] != NULLPTR; }
322383
323- // / \brief Return true if the validity bitmap may have 0's in it, or if the
324- // / child arrays (in the case of types without a validity bitmap) may have
325- // / nulls, or if the dictionary of dictionary array may have nulls.
384+ // / \brief Return true if the array may have logical nulls
385+ // /
386+ // / Unlike `MayHaveNulls`, this method checks for null child values
387+ // / for types without a validity bitmap, such as unions and run-end encoded
388+ // / types, and for null dictionary values for dictionary types.
326389 // /
327- // / This is not a drop-in replacement for MayHaveNulls, as historically
328- // / MayHaveNulls() has been used to check for the presence of a validity
329- // / bitmap that needs to be checked .
390+ // / This implies that `MayHaveLogicalNulls` may return true for arrays that
391+ // / don't have a top-level validity bitmap. It is therefore necessary
392+ // / to call `HasValidityBitmap` before accessing a top-level validity bitmap .
330393 // /
331- // / Code that previously used MayHaveNulls() and then dealt with the validity
394+ // / Code that previously used MayHaveNulls and then dealt with the validity
332395 // / bitmap directly can be fixed to handle all types correctly without
333396 // / performance degradation when handling most types by adopting
334397 // / HasValidityBitmap and MayHaveLogicalNulls.
@@ -373,13 +436,12 @@ struct ARROW_EXPORT ArrayData {
373436 return null_count.load () != 0 ;
374437 }
375438
376- // / \brief Computes the logical null count for arrays of all types including
377- // / those that do not have a validity bitmap like union and run-end encoded
378- // / arrays
439+ // / \brief Compute the logical null count for arrays of all types
379440 // /
380441 // / If the array has a validity bitmap, this function behaves the same as
381- // / GetNullCount. For types that have no validity bitmap, this function will
382- // / recompute the null count every time it is called.
442+ // / GetNullCount. For arrays that have no validity bitmap but whose values
443+ // / may be logically null (such as union arrays and run-end encoded arrays),
444+ // / this function recomputes the null count every time it is called.
383445 // /
384446 // / \see GetNullCount
385447 int64_t ComputeLogicalNullCount () const ;
0 commit comments