@@ -88,8 +88,8 @@ struct similarity_memory_requirements {
8888 * @param[in] first_length The length of the first string in characters/codepoints.
8989 * @param[in] second_length The length of the second string in characters/codepoints.
9090 * @param[in] max_magnitude_change The absolute value of the maximum change in nearby cells.
91- * @param[in] bytes_per_character The number of bytes per character, 4 for UTF-32, 1 for ASCII.
92- * @param[in] word_alignment The alignment of the data in bytes, 4 for CUDA, 64 for AVX-512.
91+ * @param[in] bytes_per_char The number of bytes per character, 4 for UTF-32, 1 for ASCII.
92+ * @param[in] register_width The register width in bytes that the data is aligned to, 4 for CUDA, 64 for AVX-512.
9393 *
9494 * To understand the @p max_magnitude_change parameter, consider the following example:
9595 * - substitution costs ranging from -16 to +15
@@ -99,8 +99,8 @@ struct similarity_memory_requirements {
9999 constexpr similarity_memory_requirements ( //
100100 size_t first_length, size_t second_length, //
101101 size_t max_magnitude_change, //
102- size_t bytes_per_character, //
103- size_t word_alignment ) noexcept {
102+ size_t bytes_per_char, //
103+ size_t register_width ) noexcept {
104104
105105 // Each diagonal in the DP matrix is at most 1 cell longer than the shorter string.
106106 size_t shorter_length = sz_min_of_two (first_length, second_length);
@@ -125,12 +125,11 @@ struct similarity_memory_requirements {
125125
126126 // For each string we need to copy its contents, and allocate 3 bands proportional to the length
127127 // of the shorter string with each cell being big enough to hold the length of the longer one.
128- // The diagonals should be aligned to `word_alignment` bytes to allow for SIMD operations.
129- this ->bytes_per_diagonal = round_up_to_multiple<size_t >(max_diagonal_length * bytes_per_cell, word_alignment);
130- this ->total = //
131- 3 * bytes_per_diagonal + //
132- round_up_to_multiple<size_t >(first_length * bytes_per_character, word_alignment) + //
133- round_up_to_multiple<size_t >(second_length * bytes_per_character, word_alignment);
128+ // The diagonals should be aligned to `register_width` bytes to allow for SIMD operations.
129+ this ->bytes_per_diagonal = round_up_to_multiple<size_t >(max_diagonal_length * bytes_per_cell, register_width);
130+ size_t first_length_bytes = round_up_to_multiple<size_t >(first_length * bytes_per_char, register_width);
131+ size_t second_length_bytes = round_up_to_multiple<size_t >(second_length * bytes_per_char, register_width);
132+ this ->total = 3 * bytes_per_diagonal + first_length_bytes + second_length_bytes;
134133 }
135134};
136135
@@ -218,7 +217,11 @@ struct linear_scorer<first_iterator_type_, second_iterator_type_, score_type_, s
218217 score_t const *scores_pre_substitution, score_t const *scores_pre_insertion, score_t const *scores_pre_deletion,
219218 score_t *scores_new) noexcept {
220219
221- #pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
220+ #if (capability_k & sz_cap_parallel_k)
221+ #pragma omp parallel for simd
222+ #else
223+ #pragma omp simd
224+ #endif
222225 for (sz_size_t i = 0 ; i < n; ++i) {
223226 score_t pre_substitution = scores_pre_substitution[i];
224227 score_t pre_insertion = scores_pre_insertion[i];
@@ -1685,7 +1688,11 @@ struct linear_scorer<char const *, char const *, sz_u16_t, error_costs_uniform_t
16851688 sz_u16_t const *scores_pre_substitution, sz_u16_t const *scores_pre_insertion, //
16861689 sz_u16_t const *scores_pre_deletion, sz_u16_t *scores_new) noexcept {
16871690
1688- #pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
1691+ #if (capability_k & sz_cap_parallel_k)
1692+ #pragma omp parallel for simd
1693+ #else
1694+ #pragma omp simd
1695+ #endif
16891696 // In this variant we will need at most (64 * 1024 / 32) = 2048 loops per diagonal.
16901697 for (sz_size_t i = 0 ; i < n; i += 32 )
16911698 slice (first_reversed_slice, second_slice, i, n, scores_pre_substitution, scores_pre_insertion,
@@ -1749,7 +1756,11 @@ struct linear_scorer<sz_rune_t const *, sz_rune_t const *, sz_u16_t, error_costs
17491756 sz_u16_t const *scores_pre_substitution, sz_u16_t const *scores_pre_insertion, //
17501757 sz_u16_t const *scores_pre_deletion, sz_u16_t *scores_new) noexcept {
17511758
1752- #pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
1759+ #if (capability_k & sz_cap_parallel_k)
1760+ #pragma omp parallel for simd
1761+ #else
1762+ #pragma omp simd
1763+ #endif
17531764 // In this variant we will need at most (64 * 1024 / 16) = 4096 loops per diagonal.
17541765 for (sz_size_t i = 0 ; i < n; i += 16 )
17551766 slice (first_reversed_slice, second_slice, i, n, scores_pre_substitution, scores_pre_insertion,
@@ -2040,7 +2051,11 @@ struct linear_scorer<constant_iterator<char>, char const *, sz_i16_t, error_cost
20402051
20412052 sz_size_t const count_slices = n / 64 ;
20422053
2043- #pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
2054+ #if (capability_k & sz_cap_parallel_k)
2055+ #pragma omp parallel for simd
2056+ #else
2057+ #pragma omp simd
2058+ #endif
20442059 // Progress through the row 64 characters at a time.
20452060 for (sz_size_t idx_slice = 0 ; idx_slice != count_slices; ++idx_slice)
20462061 slice_64chars (second_slice, idx_slice * 64 , gap, scores_pre_substitution, scores_pre_insertion, scores_new);
0 commit comments