Skip to content

Commit 41e1a6e

Browse files
committed
Fix: Using OpenMP directives
1 parent 1879aeb commit 41e1a6e

File tree

1 file changed

+29
-14
lines changed

1 file changed

+29
-14
lines changed

include/stringcuzilla/similarity.hpp

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ struct similarity_memory_requirements {
8888
* @param[in] first_length The length of the first string in characters/codepoints.
8989
* @param[in] second_length The length of the second string in characters/codepoints.
9090
* @param[in] max_magnitude_change The absolute value of the maximum change in nearby cells.
91-
* @param[in] bytes_per_character The number of bytes per character, 4 for UTF-32, 1 for ASCII.
92-
* @param[in] word_alignment The alignment of the data in bytes, 4 for CUDA, 64 for AVX-512.
91+
* @param[in] bytes_per_char The number of bytes per character, 4 for UTF-32, 1 for ASCII.
92+
* @param[in] register_width The register width in bytes that the buffers are aligned to, 4 for CUDA, 64 for AVX-512.
9393
*
9494
* To understand the @p max_magnitude_change parameter, consider the following example:
9595
* - substitution costs ranging from -16 to +15
@@ -99,8 +99,8 @@ struct similarity_memory_requirements {
9999
constexpr similarity_memory_requirements( //
100100
size_t first_length, size_t second_length, //
101101
size_t max_magnitude_change, //
102-
size_t bytes_per_character, //
103-
size_t word_alignment) noexcept {
102+
size_t bytes_per_char, //
103+
size_t register_width) noexcept {
104104

105105
// Each diagonal in the DP matrix is at most 1 cell longer than the shorter string.
106106
size_t shorter_length = sz_min_of_two(first_length, second_length);
@@ -125,12 +125,11 @@ struct similarity_memory_requirements {
125125

126126
// For each string we need to copy its contents, and allocate 3 bands proportional to the length
127127
// of the shorter string with each cell being big enough to hold the length of the longer one.
128-
// The diagonals should be aligned to `word_alignment` bytes to allow for SIMD operations.
129-
this->bytes_per_diagonal = round_up_to_multiple<size_t>(max_diagonal_length * bytes_per_cell, word_alignment);
130-
this->total = //
131-
3 * bytes_per_diagonal + //
132-
round_up_to_multiple<size_t>(first_length * bytes_per_character, word_alignment) + //
133-
round_up_to_multiple<size_t>(second_length * bytes_per_character, word_alignment);
128+
// The diagonals should be aligned to `register_width` bytes to allow for SIMD operations.
129+
this->bytes_per_diagonal = round_up_to_multiple<size_t>(max_diagonal_length * bytes_per_cell, register_width);
130+
size_t first_length_bytes = round_up_to_multiple<size_t>(first_length * bytes_per_char, register_width);
131+
size_t second_length_bytes = round_up_to_multiple<size_t>(second_length * bytes_per_char, register_width);
132+
this->total = 3 * bytes_per_diagonal + first_length_bytes + second_length_bytes;
134133
}
135134
};
136135

@@ -218,7 +217,11 @@ struct linear_scorer<first_iterator_type_, second_iterator_type_, score_type_, s
218217
score_t const *scores_pre_substitution, score_t const *scores_pre_insertion, score_t const *scores_pre_deletion,
219218
score_t *scores_new) noexcept {
220219

221-
#pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
220+
#if (capability_k & sz_cap_parallel_k)
221+
#pragma omp parallel for simd
222+
#else
223+
#pragma omp simd
224+
#endif
222225
for (sz_size_t i = 0; i < n; ++i) {
223226
score_t pre_substitution = scores_pre_substitution[i];
224227
score_t pre_insertion = scores_pre_insertion[i];
@@ -1685,7 +1688,11 @@ struct linear_scorer<char const *, char const *, sz_u16_t, error_costs_uniform_t
16851688
sz_u16_t const *scores_pre_substitution, sz_u16_t const *scores_pre_insertion, //
16861689
sz_u16_t const *scores_pre_deletion, sz_u16_t *scores_new) noexcept {
16871690

1688-
#pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
1691+
#if (capability_k & sz_cap_parallel_k)
1692+
#pragma omp parallel for simd
1693+
#else
1694+
#pragma omp simd
1695+
#endif
16891696
// In this variant we will need at most (64 * 1024 / 32) = 2048 loop iterations per diagonal.
16901697
for (sz_size_t i = 0; i < n; i += 32)
16911698
slice(first_reversed_slice, second_slice, i, n, scores_pre_substitution, scores_pre_insertion,
@@ -1749,7 +1756,11 @@ struct linear_scorer<sz_rune_t const *, sz_rune_t const *, sz_u16_t, error_costs
17491756
sz_u16_t const *scores_pre_substitution, sz_u16_t const *scores_pre_insertion, //
17501757
sz_u16_t const *scores_pre_deletion, sz_u16_t *scores_new) noexcept {
17511758

1752-
#pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
1759+
#if (capability_k & sz_cap_parallel_k)
1760+
#pragma omp parallel for simd
1761+
#else
1762+
#pragma omp simd
1763+
#endif
17531764
// In this variant we will need at most (64 * 1024 / 16) = 4096 loop iterations per diagonal.
17541765
for (sz_size_t i = 0; i < n; i += 16)
17551766
slice(first_reversed_slice, second_slice, i, n, scores_pre_substitution, scores_pre_insertion,
@@ -2040,7 +2051,11 @@ struct linear_scorer<constant_iterator<char>, char const *, sz_i16_t, error_cost
20402051

20412052
sz_size_t const count_slices = n / 64;
20422053

2043-
#pragma omp parallel for simd if (capability_k & sz_cap_parallel_k)
2054+
#if (capability_k & sz_cap_parallel_k)
2055+
#pragma omp parallel for simd
2056+
#else
2057+
#pragma omp simd
2058+
#endif
20442059
// Progress through the row 64 characters at a time.
20452060
for (sz_size_t idx_slice = 0; idx_slice != count_slices; ++idx_slice)
20462061
slice_64chars(second_slice, idx_slice * 64, gap, scores_pre_substitution, scores_pre_insertion, scores_new);

0 commit comments

Comments
 (0)