Skip to content

Commit dc7c109

Browse files
committed
Add: Missing sz_sequence_t helpers
1 parent 944804e commit dc7c109

File tree

2 files changed

+80
-47
lines changed

2 files changed

+80
-47
lines changed

include/stringzilla/types.h

Lines changed: 67 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -461,68 +461,71 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void
461461

462462
#pragma region API Signature Types
463463

464-
/** @brief Signature of ::sz_hash. */
464+
/** @brief Signature of `sz_hash`. */
465465
typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t, sz_u64_t);
466466

467-
/** @brief Signature of ::sz_hash_state_init. */
467+
/** @brief Signature of `sz_hash_state_init`. */
468468
typedef void (*sz_hash_state_init_t)(struct sz_hash_state_t *, sz_u64_t);
469469

470-
/** @brief Signature of ::sz_hash_state_stream. */
470+
/** @brief Signature of `sz_hash_state_stream`. */
471471
typedef void (*sz_hash_state_stream_t)(struct sz_hash_state_t *, sz_cptr_t, sz_size_t);
472472

473-
/** @brief Signature of ::sz_hash_state_fold. */
473+
/** @brief Signature of `sz_hash_state_fold`. */
474474
typedef sz_u64_t (*sz_hash_state_fold_t)(struct sz_hash_state_t const *);
475475

476-
/** @brief Signature of ::sz_bytesum. */
476+
/** @brief Signature of `sz_bytesum`. */
477477
typedef sz_u64_t (*sz_bytesum_t)(sz_cptr_t, sz_size_t);
478478

479-
/** @brief Signature of ::sz_generate. */
479+
/** @brief Signature of `sz_generate`. */
480480
typedef void (*sz_generate_t)(sz_ptr_t, sz_size_t, sz_u64_t);
481481

482-
/** @brief Signature of ::sz_equal. */
482+
/** @brief Signature of `sz_equal`. */
483483
typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t);
484484

485-
/** @brief Signature of ::sz_order. */
485+
/** @brief Signature of `sz_order`. */
486486
typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t);
487487

488-
/** @brief Signature of ::sz_look_up_transform. */
489-
typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t);
488+
/** @brief Signature of `sz_lookup`. */
489+
typedef void (*sz_lookup_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t);
490490

491-
/** @brief Signature of ::sz_move. */
491+
/** @brief Signature of `sz_move`. */
492492
typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t);
493493

494-
/** @brief Signature of ::sz_fill. */
494+
/** @brief Signature of `sz_fill`. */
495495
typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t);
496496

497-
/** @brief Signature of ::sz_find_byte. */
497+
/** @brief Signature of `sz_find_byte`. */
498498
typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t);
499499

500-
/** @brief Signature of ::sz_find. */
500+
/** @brief Signature of `sz_find`. */
501501
typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t);
502502

503-
/** @brief Signature of ::sz_find_set. */
503+
/** @brief Signature of `sz_find_set`. */
504504
typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *);
505505

506-
/** @brief Signature of ::sz_hamming_distance. */
507-
typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t);
506+
/** @brief Signature of `sz_hamming_distance`. */
507+
typedef sz_status_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_size_t *);
508508

509-
/** @brief Signature of ::sz_edit_distance. */
510-
typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *);
509+
/** @brief Signature of `sz_levenshtein_distance`. */
510+
typedef sz_status_t (*sz_levenshtein_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t,
511+
sz_memory_allocator_t *, sz_size_t *);
511512

512-
/** @brief Signature of ::sz_alignment_score. */
513-
typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *,
514-
sz_error_cost_t, sz_memory_allocator_t *);
513+
/** @brief Signature of `sz_needleman_wunsch_score`. */
514+
typedef sz_status_t (*sz_needleman_wunsch_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *,
515+
sz_error_cost_t, sz_memory_allocator_t *, sz_ssize_t *);
515516

516-
/** @brief Signature of ::sz_sequence_argsort. */
517-
typedef sz_bool_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *);
517+
/** @brief Signature of `sz_sequence_argsort`. */
518+
typedef sz_status_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *,
519+
sz_bool_t *);
518520

519-
/** @brief Signature of ::sz_pgrams_sort. */
520-
typedef sz_bool_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *);
521+
/** @brief Signature of `sz_pgrams_sort`. */
522+
typedef sz_status_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *,
523+
sz_bool_t *);
521524

522-
/** @brief Signature of ::sz_sequence_argsort_stable. */
525+
/** @brief Signature of `sz_sequence_argsort_stable`. */
523526
typedef sz_sequence_argsort_t sz_sequence_argsort_stable_t;
524527

525-
/** @brief Signature of ::sz_pgrams_sort_stable. */
528+
/** @brief Signature of `sz_pgrams_sort_stable`. */
526529
typedef sz_pgrams_sort_t sz_pgrams_sort_stable_t;
527530

528531
#pragma endregion
@@ -683,9 +686,17 @@ SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_le
683686

684687
#pragma region String Sequences API
685688

686-
typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
687-
typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t);
689+
/** @brief Signature of `sz_sequence_t::get_start` used to get the start of a member string at a given index. */
690+
typedef sz_cptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t);
691+
/** @brief Signature of `sz_sequence_t::get_length` used to get the length of a member string at a given index. */
692+
typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t);
688693

694+
/**
695+
* @brief Structure to represent an ordered collection of strings.
696+
* It's a generic structure that can be used to represent a sequence of strings in different layouts.
697+
* It can be easily combined with Apache Arrow and its tape-like concatenated strings.
698+
* @sa sz_sequence_from_null_terminated_strings
699+
*/
689700
typedef struct sz_sequence_t {
690701
void const *handle;
691702
sz_size_t count;
@@ -694,20 +705,12 @@ typedef struct sz_sequence_t {
694705
} sz_sequence_t;
695706

696707
/**
697-
* @brief Initiates the sequence structure from a tape layout, used by Apache Arrow.
698-
* Expects ::offsets to contains `count + 1` entries, the last pointing at the end
699-
* of the last string, indicating the total length of the ::tape.
700-
*/
701-
SZ_PUBLIC void sz_sequence_from_u32tape( //
702-
sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, sz_sequence_t *sequence);
703-
704-
/**
705-
* @brief Initiates the sequence structure from a tape layout, used by Apache Arrow.
706-
* Expects ::offsets to contains `count + 1` entries, the last pointing at the end
707-
* of the last string, indicating the total length of the ::tape.
708+
* @brief Initiates the sequence structure from a typical C-style strings array, like `char *[]`.
709+
* @param[in] start Pointer to the array of strings.
710+
* @param[in] count Number of strings in the array.
711+
* @param[out] sequence Sequence structure to initialize.
708712
*/
709-
SZ_PUBLIC void sz_sequence_from_u64tape( //
710-
sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, sz_sequence_t *sequence);
713+
SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_size_t count, sz_sequence_t *sequence);
711714

712715
#pragma endregion
713716

@@ -857,7 +860,7 @@ SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap
857860
SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); }
858861

859862
/**
860-
* @brief Select bits from either ::a or ::b depending on the value of ::mask bits.
863+
* @brief Select bits from either @p a or @p b depending on the value of @p mask bits.
861864
*
862865
* Similar to `_mm_blend_epi16` intrinsic on x86.
863866
* Described in the "Bit Twiddling Hacks" by Sean Eron Anderson.
@@ -987,7 +990,7 @@ SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) {
987990
}
988991

989992
/**
990-
* @brief Compute the smallest power of two greater than or equal to ::x.
993+
* @brief Compute the smallest power of two greater than or equal to @p x.
991994
*/
992995
SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
993996
// Unlike the commonly used trick with `clz` intrinsics, this approach is valid across the whole range of `x`.
@@ -1149,6 +1152,25 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void
11491152
*(sz_ptr_t)buffer = *(sz_cptr_t)&length;
11501153
}
11511154

1155+
/**
 * @brief Member-start callback installed by `sz_sequence_from_null_terminated_strings`.
 * @param[in] handle Type-erased pointer to the first element of an array of `sz_cptr_t` strings.
 * @param[in] i Index of the requested string within that array.
 * @return The string pointer stored at position @p i of the array.
 */
SZ_PUBLIC sz_cptr_t _sz_sequence_from_null_terminated_strings_get_start(void const *handle, sz_size_t i) {
    // The handle is the strings array itself, so the start of a member is just the i-th entry.
    return ((sz_cptr_t const *)handle)[i];
}
1159+
1160+
/**
 * @brief Member-length callback installed by `sz_sequence_from_null_terminated_strings`.
 * @param[in] handle Type-erased pointer to the first element of an array of `sz_cptr_t` strings.
 * @param[in] i Index of the requested string within that array.
 * @return Number of bytes preceding the first NUL byte of the string at position @p i.
 */
SZ_PUBLIC sz_size_t _sz_sequence_from_null_terminated_strings_get_length(void const *handle, sz_size_t i) {
    sz_cptr_t const first = ((sz_cptr_t const *)handle)[i];
    sz_cptr_t cursor = first;
    // Walk forward to the terminating NUL; the distance travelled is the length.
    while (*cursor) ++cursor;
    return (sz_size_t)(cursor - first);
}
1166+
1167+
/**
 * @brief Fills a sequence structure so that it views a C-style array of NUL-terminated strings.
 * @param[in] start Pointer to the array of strings; stored as the sequence handle, not copied.
 * @param[in] count Number of strings in the array.
 * @param[out] sequence Sequence structure whose handle, count, and accessor callbacks are overwritten.
 */
SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_size_t count, sz_sequence_t *sequence) {
    // Install the accessors that know how to interpret the handle as an array of C strings...
    sequence->get_length = _sz_sequence_from_null_terminated_strings_get_length;
    sequence->get_start = _sz_sequence_from_null_terminated_strings_get_start;
    // ...then record the data they operate on.
    sequence->count = count;
    sequence->handle = (void const *)start;
}
1173+
11521174
#pragma endregion
11531175

11541176
#ifdef __cplusplus

scripts/test.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1576,8 +1576,8 @@ void test_replacements(std::size_t lookup_tables_to_try = 128, std::size_t slice
15761576
std::size_t slice_offset = std::rand() % (body.length());
15771577
std::size_t slice_length = std::rand() % (body.length() - slice_offset);
15781578

1579-
sz::transform<char>(sz::string_view(body.data() + slice_offset, slice_length), lut,
1580-
const_cast<char *>(transformed.data()) + slice_offset);
1579+
sz::lookup<char>(sz::string_view(body.data() + slice_offset, slice_length), lut,
1580+
const_cast<char *>(transformed.data()) + slice_offset);
15811581
for (std::size_t i = 0; i != slice_length; ++i) {
15821582
assert(transformed[slice_offset + i] == lut[body[slice_offset + i]]);
15831583
}
@@ -1592,6 +1592,17 @@ static void test_sequence_algorithms() {
15921592
using strs_t = std::vector<std::string>;
15931593
using order_t = std::vector<sz::sorted_idx_t>;
15941594

1595+
// Make sure the helper functions work as expected.
1596+
{
    // Wrap a plain array of C strings into a sequence and verify every field the helper fills in.
    sz_sequence_t sequence;
    sz_cptr_t strings[] = {"banana", "apple", "cherry"};
    sz_sequence_from_null_terminated_strings(strings, 3, &sequence);
    // The member holding the number of strings is named `count`; `sz_sequence_t` has no `size` field.
    assert(sequence.count == 3);
    // The start callback must return the original pointers, in the original order.
    assert(sequence.get_start(sequence.handle, 0) == "banana"_sv);
    assert(sequence.get_start(sequence.handle, 1) == "apple"_sv);
    assert(sequence.get_start(sequence.handle, 2) == "cherry"_sv);
    // The length callback must count the bytes up to, but not including, the NUL terminator.
    assert(sequence.get_length(sequence.handle, 0) == 6);
    assert(sequence.get_length(sequence.handle, 1) == 5);
    assert(sequence.get_length(sequence.handle, 2) == 6);
}
1605+
15951606
// Basic tests with predetermined orders.
15961607
assert_scoped(strs_t x({"a", "b", "c", "d"}), (void)0, sz::argsort(x) == order_t({0u, 1u, 2u, 3u}));
15971608
assert_scoped(strs_t x({"b", "c", "d", "a"}), (void)0, sz::argsort(x) == order_t({3u, 0u, 1u, 2u}));

0 commit comments

Comments
 (0)