Skip to content

Commit f656577

Browse files
committed
Improve: Fix minor inconsistencies
1 parent 5ea0698 commit f656577

File tree

8 files changed

+159
-105
lines changed

8 files changed

+159
-105
lines changed

.clang-format

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ NamespaceIndentation: None
66
ColumnLimit: 120
77
ReflowComments: true
88
UseTab: Never
9+
IndentPPDirectives: None
910

1011
AlignConsecutiveAssignments: false
1112
AlignConsecutiveDeclarations: false

.vscode/settings.json

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
3+
"C_Cpp.dimInactiveRegions": false,
34
// This may cause overheating.
45
// https://github.com/microsoft/vscode-cpptools/issues/1816
56
"C_Cpp.workspaceParsingPriority": "low",
@@ -17,13 +18,15 @@
1718
},
1819
"cmake.sourceDirectory": "${workspaceRoot}",
1920
"cSpell.words": [
21+
"aesdec",
2022
"allowoverlap",
2123
"aminoacid",
2224
"aminoacids",
2325
"Apostolico",
2426
"Appleby",
2527
"ASAN",
2628
"ashvardanian",
29+
"Aumasson",
2730
"Baeza",
2831
"basicsize",
2932
"bigram",
@@ -32,17 +35,24 @@
3235
"bioinformatics",
3336
"Bitap",
3437
"bitcast",
38+
"bitceil",
3539
"BLOSUM",
40+
"Borwein",
41+
"Brase",
3642
"Brumme",
43+
"Byteset",
44+
"bytesum",
3745
"carray",
3846
"Cawley",
47+
"chardet",
3948
"cheminformatics",
4049
"cibuildwheel",
4150
"CONCAT",
4251
"constexpr",
4352
"copydoc",
4453
"Corasick",
4554
"cptr",
55+
"DRBG",
4656
"endregion",
4757
"endswith",
4858
"Eron",
@@ -51,7 +61,9 @@
5161
"getitem",
5262
"getslice",
5363
"Giancarlo",
64+
"Giordano",
5465
"Gonnet",
66+
"Gotoh",
5567
"Haswell",
5668
"Heikki",
5769
"hexdigits",
@@ -65,6 +77,7 @@
6577
"isprintable",
6678
"itemsize",
6779
"Jaccard",
80+
"Kaitchuck",
6881
"Karp",
6982
"keeplinebreaks",
7083
"keepseparator",
@@ -82,13 +95,18 @@
8295
"memcpy",
8396
"Merkle-Damgård",
8497
"Mersenne",
98+
"misalign",
8599
"MODINIT",
100+
"Morten",
101+
"Mosè",
86102
"MSVC",
87103
"napi",
88104
"nargsf",
89105
"ndim",
90106
"Needleman",
91107
"newfunc",
108+
"ngram",
109+
"ngrams",
92110
"NOARGS",
93111
"noexcept",
94112
"NOMINMAX",
@@ -97,14 +115,19 @@
97115
"numpy",
98116
"octdigits",
99117
"octogram",
118+
"pgram",
119+
"pgrams",
120+
"Plouffe",
100121
"printables",
101122
"pytest",
102123
"Pythonic",
103124
"qsort",
104125
"quadgram",
105126
"Raita",
106127
"readlines",
128+
"Reini",
107129
"releasebuffer",
130+
"repr",
108131
"rfind",
109132
"rfinds",
110133
"richcompare",
@@ -116,6 +139,7 @@
116139
"rsplits",
117140
"rstrip",
118141
"SIMD",
142+
"sklearn",
119143
"Skylake",
120144
"splitlines",
121145
"ssize",
@@ -138,18 +162,21 @@
138162
"Vardanian",
139163
"VBMI",
140164
"vectorcallfunc",
165+
"Vectorizer",
141166
"Wagner",
142167
"whitespaces",
143168
"Wunsch",
144169
"XDECREF",
170+
"xmms",
171+
"Yann",
172+
"Yaroshevskiy",
145173
"Zilla"
146174
],
147175
"editor.formatOnSave": true,
148176
"editor.rulers": [
149177
120
150178
],
151179
"files.associations": {
152-
"*.tcc": "cpp",
153180
"__bit_reference": "cpp",
154181
"__bits": "cpp",
155182
"__config": "cpp",
@@ -168,12 +195,14 @@
168195
"__tree": "cpp",
169196
"__tuple": "cpp",
170197
"__verbose_abort": "cpp",
198+
"*.tcc": "cpp",
171199
"algorithm": "cpp",
172200
"any": "cpp",
173201
"array": "cpp",
174202
"atomic": "cpp",
175203
"bit": "cpp",
176204
"bitset": "cpp",
205+
"cassert": "cpp",
177206
"cctype": "cpp",
178207
"charconv": "c",
179208
"chrono": "cpp",
@@ -231,6 +260,7 @@
231260
"semaphore": "cpp",
232261
"set": "cpp",
233262
"shared_mutex": "cpp",
263+
"sort.h": "c",
234264
"source_location": "cpp",
235265
"span": "cpp",
236266
"sstream": "cpp",
@@ -269,6 +299,6 @@
269299
"xstring": "cpp",
270300
"xtr1common": "cpp",
271301
"xtree": "cpp",
272-
"xutility": "cpp",
302+
"xutility": "cpp"
273303
}
274304
}

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,11 +1072,11 @@ Similar to Python it also defines the commonly used character sets.
10721072
auto protein = sz::string::random(300, "ARNDCQEGHILKMFPSTWYV"); // static method
10731073
auto dna = sz::basic_string<custom_allocator>::random(3_000_000_000, "ACGT");
10741074
1075-
dna.randomize("ACGT"); // `noexcept` pre-allocated version
1076-
dna.randomize(&std::rand, "ACGT"); // pass any generator, like `std::mt19937`
1075+
dna.fill_random("ACGT"); // `noexcept` pre-allocated version
1076+
dna.fill_random(&std::rand, "ACGT"); // pass any generator, like `std::mt19937`
10771077
10781078
char uuid[36];
1079-
sz::randomize(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer
1079+
sz::fill_random(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer
10801080
```
10811081

10821082
### Bulk Replacements

include/stringzilla/hash.h

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,7 @@ SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64
843843
__m128i k1 = _mm_xor_si128(seed_vec, pi0);
844844
__m128i k2 = _mm_xor_si128(seed_vec, pi1);
845845

846-
// The first 128 bits of the "sum" and "AES" blocks are the same
846+
// The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state
847847
state->aes.xmm = k1;
848848
state->sum.xmm = k2;
849849
}
@@ -1559,6 +1559,8 @@ SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state) {
15591559

15601560
SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
15611561

1562+
// For short strings the "masked loads" are identical to Skylake-X and
1563+
// the "logic" is identical to Haswell.
15621564
if (length <= 16) {
15631565
// Initialize the AES block with a given seed
15641566
_sz_hash_minimal_t state;
@@ -1611,6 +1613,7 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed)
16111613
_sz_hash_minimal_update_haswell(&state, data3_vec.xmm);
16121614
return _sz_hash_minimal_finalize_haswell(&state, length);
16131615
}
1616+
// This is where the logic differs from Skylake-X and other pre-Ice Lake CPUs:
16141617
else {
16151618
// Use a larger state to handle the main loop and add different offsets
16161619
// to different lanes of the register
@@ -1716,6 +1719,64 @@ SZ_PUBLIC void sz_fill_random_ice(sz_ptr_t output, sz_size_t length, sz_u64_t no
17161719
}
17171720
}
17181721

1722+
/**
1723+
* @brief A wider parallel analog of `_sz_hash_minimal_t`. It is not used for computing individual hashes,
1724+
* but for hashing 4 separate @b short strings in parallel, each under 16 bytes long.
1725+
* Useful for higher-level Database and Machine Learning operations.
*
* Every field is a 512-bit register that the `_sz_hash_minimal_x4_*_ice` helpers process as
* four 128-bit lanes (via `_mm512_aesenc_epi128`), one lane per input string.
1726+
*/
1727+
typedef struct _sz_hash_minimal_x4_t {
1728+
// State advanced with one AES encryption round per update.
sz_u512_vec_t aes;
1729+
// State advanced with a byte shuffle followed by a 64-bit addition per update.
sz_u512_vec_t sum;
1730+
// The seed broadcast to every 64-bit word; the string lengths are added to it on finalization.
sz_u512_vec_t key;
1731+
} _sz_hash_minimal_x4_t;
1732+
1733+
/**
* @brief Initializes the 4-way "minimal" hash state from a single seed, shared by all 4 lanes.
* @param[out] state State to initialize; its `key`, `aes`, and `sum` registers are all overwritten.
* @param[in] seed 64-bit seed, broadcast to every 64-bit word of the 512-bit registers.
*/
SZ_INTERNAL void _sz_hash_minimal_x4_init_ice(_sz_hash_minimal_x4_t *state, sz_u64_t seed) {
1734+
1735+
// The key is made from the seed and half of it will be mixed with the length in the end
1736+
__m512i seed_vec = _mm512_set1_epi64(seed);
1737+
state->key.zmm = seed_vec;
1738+
1739+
// XOR the user-supplied keys with the two "pi" constants
1740+
sz_u64_t const *pi = _sz_hash_pi_constants();
1741+
// NOTE(review): `_mm512_load_si512` is the aligned load, so this presumably relies on the table
// returned by `_sz_hash_pi_constants` being 64-byte aligned and at least 128 bytes long - confirm.
__m512i pi0 = _mm512_load_si512((__m512i const *)(pi));
1742+
__m512i pi1 = _mm512_load_si512((__m512i const *)(pi + 8));
1743+
// The loads above fetch entire 512-bit values, but only the first 128 bits of each are used,
1744+
// replicated 4 times across the register. The `_mm512_shuffle_i64x2` is supposed to
1745+
// be faster than `_mm512_broadcast_i64x2` on Ice Lake.
1746+
pi0 = _mm512_shuffle_i64x2(pi0, pi0, 0);
1747+
pi1 = _mm512_shuffle_i64x2(pi1, pi1, 0);
1748+
__m512i k1 = _mm512_xor_si512(seed_vec, pi0);
1749+
__m512i k2 = _mm512_xor_si512(seed_vec, pi1);
1750+
1751+
// The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state
1752+
state->aes.zmm = k1;
1753+
state->sum.zmm = k2;
1754+
}
1755+
1756+
/**
* @brief Finalizes the 4-way "minimal" hash state, producing one 64-bit value per 128-bit lane.
* @param[in] state State to read; it is not modified.
* @param[in] length0 Length mixed into the lowest 128-bit lane.
* @param[in] length1 Length mixed into the second 128-bit lane.
* @param[in] length2 Length mixed into the third 128-bit lane.
* @param[in] length3 Length mixed into the highest 128-bit lane.
* @return 256-bit register with four 64-bit results; word `i` comes from the lane that received `length<i>`.
*/
SZ_INTERNAL __m256i _sz_hash_minimal_x4_finalize_ice(_sz_hash_minimal_x4_t const *state, //
1757+
sz_size_t length0, sz_size_t length1, sz_size_t length2,
1758+
sz_size_t length3) {
1759+
// `_mm512_set_epi64` takes its arguments from the highest word to the lowest, so every length
// lands in the low 64 bits of its 128-bit lane and the high 64 bits of the lane stay zero.
__m512i const padded_lengths = _mm512_set_epi64(0, length3, 0, length2, 0, length1, 0, length0);
1760+
// Mix the length into the key
1761+
__m512i key_with_length = _mm512_add_epi64(state->key.zmm, padded_lengths);
1762+
// Combine the "sum" and the "AES" blocks
1763+
__m512i mixed_registers = _mm512_aesenc_epi128(state->sum.zmm, state->aes.zmm);
1764+
// Make sure the "key" mixes enough with the state,
1765+
// as with fewer than 2 rounds - SMHasher fails
1766+
__m512i mixed_within_register =
1767+
_mm512_aesenc_epi128(_mm512_aesenc_epi128(mixed_registers, key_with_length), mixed_registers);
1768+
// Extract the low 64 bits from each 128-bit lane - weirdly using the `permutexvar` instruction
1769+
// is cheaper than compressing instructions like `_mm512_maskz_compress_epi64`.
// The index vector moves words 0, 2, 4, 6 into the low 256 bits, which the cast then returns.
1770+
return _mm512_castsi512_si256(
1771+
_mm512_permutexvar_epi64(_mm512_set_epi64(0, 0, 0, 0, 6, 4, 2, 0), mixed_within_register));
1772+
}
1773+
1774+
/**
* @brief Absorbs four 128-bit blocks, one per lane, into the 4-way "minimal" hash state.
* @param[inout] state State to update; its `aes` and `sum` registers are modified, `key` is untouched.
* @param[in] blocks 512-bit register holding one 128-bit input block per lane.
*/
SZ_INTERNAL void _sz_hash_minimal_x4_update_ice(_sz_hash_minimal_x4_t *state, __m512i blocks) {
1775+
// NOTE(review): aligned load - presumably `_sz_hash_u8x16x4_shuffle` returns a 64-byte aligned table; confirm.
__m512i const shuffle_mask = _mm512_load_si512((__m512i const *)_sz_hash_u8x16x4_shuffle());
1776+
// One AES round per lane, using the input block as the round key
state->aes.zmm = _mm512_aesenc_epi128(state->aes.zmm, blocks);
1777+
// Permute the bytes of the running sum within each lane, then add the input as 64-bit words
state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), blocks);
1778+
}
1779+
17191780
#pragma clang attribute pop
17201781
#pragma GCC pop_options
17211782
#endif // SZ_USE_ICE

include/stringzilla/memory.h

Lines changed: 3 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
*
66
* Includes core APIs for contiguous memory operations:
77
*
8-
* - @b `sz_copy` - analog to `memcpy`, probably the most common operation in a computer
9-
* - @b `sz_move` - analog to `memmove`, allowing overlapping memory regions, often used in string manipulation
10-
* - @b `sz_fill` - analog to `memset`, often used to initialize memory with a constant value, like zero
8+
* - @b `sz_copy` - analog to @b `memcpy`, probably the most common operation in a computer
9+
* - @b `sz_move` - analog to @b `memmove`, allowing overlapping memory regions, often used in string manipulation
10+
* - @b `sz_fill` - analog to @b `memset`, often used to initialize memory with a constant value, like zero
1111
* - @b `sz_lookup` - Look-Up Table @b (LUT) transformation of a string, mapping each byte to a new value
1212
* - TODO: @b `sz_lookup_utf8` - LUT transformation of a UTF8 string, which can be used for normalization
13-
* - TODO: @b `sz_detect_encoding` - detects the character encoding similar to "iconv" or "chardet" tools
1413
*
1514
* All of the core APIs receive the target output buffer as the first argument,
1615
* and aim to minimize the number of "store" instructions, especially unaligned ones,
@@ -1084,62 +1083,6 @@ SZ_PUBLIC void sz_lookup_ice(sz_ptr_t target, sz_size_t length, sz_cptr_t source
10841083
}
10851084
}
10861085

1087-
enum sz_encoding_t {
1088-
sz_encoding_unknown_k = 0,
1089-
sz_encoding_ascii_k = 1,
1090-
sz_encoding_utf8_k = 2,
1091-
sz_encoding_utf16_k = 3,
1092-
sz_encoding_utf32_k = 4,
1093-
sz_encoding_jwt_k = 5,
1094-
sz_encoding_base64_k = 6,
1095-
// Low priority encodings:
1096-
sz_encoding_utf8bom_k = 7,
1097-
sz_encoding_utf16le_k = 8,
1098-
sz_encoding_utf16be_k = 9,
1099-
sz_encoding_utf32le_k = 10,
1100-
sz_encoding_utf32be_k = 11,
1101-
};
1102-
1103-
// Character Set Detection is one of the most commonly performed operations in data processing with
1104-
// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer),
1105-
// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem.
1106-
// All of them are notoriously slow.
1107-
//
1108-
// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites.
1109-
// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding):
1110-
// - ISO-8859-1: 1.2%
1111-
// - Windows-1252: 0.3%
1112-
// - Windows-1251: 0.2%
1113-
// - EUC-JP: 0.1%
1114-
// - Shift JIS: 0.1%
1115-
// - EUC-KR: 0.1%
1116-
// - GB2312: 0.1%
1117-
// - Windows-1250: 0.1%
1118-
// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings
1119-
// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and
1120-
// the rest.
1121-
//
1122-
// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime
1123-
// and focuses more on incremental validation & transcoding, rather than detection.
1124-
//
1125-
// So we need a very fast and efficient way of determining
1126-
SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) {
1127-
// https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp
1128-
// https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81
1129-
// https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661
1130-
// https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788
1131-
1132-
// We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory
1133-
// have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints
1134-
// with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte
1135-
// codepoints. In the case of emojis, we deal with 4-byte codepoints.
1136-
// We can also use the idea, that misaligned reads are quite cheap on modern CPUs.
1137-
int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1;
1138-
sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32);
1139-
sz_unused(text && length);
1140-
return sz_false_k;
1141-
}
1142-
11431086
#pragma clang attribute pop
11441087
#pragma GCC pop_options
11451088
#endif // SZ_USE_ICE

include/stringzilla/similarity.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ SZ_INTERNAL sz_status_t _sz_levenshtein_distance_skewed_diagonals_serial( //
309309
}
310310

311311
// TODO: Generalize to remove the following asserts!
312-
_sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix.");
312+
_sz_assert(bound >= longer_length && "For bounded search the method should only evaluate one band of the matrix.");
313313
_sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet.");
314314
sz_unused(longer_length && bound);
315315

@@ -860,7 +860,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( //
860860

861861
// Check if we can exit early - if none of the diagonal's values are smaller than the upper distance bound.
862862
__mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
863-
if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1;
863+
if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound;
864864
}
865865

866866
// Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles.
@@ -891,7 +891,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( //
891891

892892
// Check if we can exit early - if none of the diagonal's values are smaller than the upper distance bound.
893893
__mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
894-
if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1;
894+
if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound;
895895
}
896896

897897
// Now let's handle the bottom right triangle.
@@ -915,7 +915,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( //
915915

916916
// Check if we can exit early - if none of the diagonal's values are smaller than the upper distance bound.
917917
__mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
918-
if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1;
918+
if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound;
919919

920920
// In every following iteration we use a shorter prefix of each register,
921921
// but we don't need to update the `next_diagonal_mask` anymore... except for the early exit.

0 commit comments

Comments
 (0)