Improve: Fix minor inconsistencies

ashvardanian · ashvardanian · commit f656577f60ec · 2025-03-09T05:39:45.000Z
diff --git a/.clang-format b/.clang-format
@@ -6,6 +6,7 @@ NamespaceIndentation: None
 ColumnLimit: 120
 ReflowComments: true
 UseTab: Never
+IndentPPDirectives: None
 
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,5 +1,6 @@
 {
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
+  "C_Cpp.dimInactiveRegions": false,
   // This may cause overheating.
   // https://github.com/microsoft/vscode-cpptools/issues/1816
   "C_Cpp.workspaceParsingPriority": "low",
@@ -17,13 +18,15 @@
   },
   "cmake.sourceDirectory": "${workspaceRoot}",
   "cSpell.words": [
+    "aesdec",
     "allowoverlap",
     "aminoacid",
     "aminoacids",
     "Apostolico",
     "Appleby",
     "ASAN",
     "ashvardanian",
+    "Aumasson",
     "Baeza",
     "basicsize",
     "bigram",
@@ -32,17 +35,24 @@
     "bioinformatics",
     "Bitap",
     "bitcast",
+    "bitceil",
     "BLOSUM",
+    "Borwein",
+    "Brase",
     "Brumme",
+    "Byteset",
+    "bytesum",
     "carray",
     "Cawley",
+    "chardet",
     "cheminformatics",
     "cibuildwheel",
     "CONCAT",
     "constexpr",
     "copydoc",
     "Corasick",
     "cptr",
+    "DRBG",
     "endregion",
     "endswith",
     "Eron",
@@ -51,7 +61,9 @@
     "getitem",
     "getslice",
     "Giancarlo",
+    "Giordano",
     "Gonnet",
+    "Gotoh",
     "Haswell",
     "Heikki",
     "hexdigits",
@@ -65,6 +77,7 @@
     "isprintable",
     "itemsize",
     "Jaccard",
+    "Kaitchuck",
     "Karp",
     "keeplinebreaks",
     "keepseparator",
@@ -82,13 +95,18 @@
     "memcpy",
     "Merkle-Damgård",
     "Mersenne",
+    "misalign",
     "MODINIT",
+    "Morten",
+    "Mosè",
     "MSVC",
     "napi",
     "nargsf",
     "ndim",
     "Needleman",
     "newfunc",
+    "ngram",
+    "ngrams",
     "NOARGS",
     "noexcept",
     "NOMINMAX",
@@ -97,14 +115,19 @@
     "numpy",
     "octdigits",
     "octogram",
+    "pgram",
+    "pgrams",
+    "Plouffe",
     "printables",
     "pytest",
     "Pythonic",
     "qsort",
     "quadgram",
     "Raita",
     "readlines",
+    "Reini",
     "releasebuffer",
+    "repr",
     "rfind",
     "rfinds",
     "richcompare",
@@ -116,6 +139,7 @@
     "rsplits",
     "rstrip",
     "SIMD",
+    "sklearn",
     "Skylake",
     "splitlines",
     "ssize",
@@ -138,18 +162,21 @@
     "Vardanian",
     "VBMI",
     "vectorcallfunc",
+    "Vectorizer",
     "Wagner",
     "whitespaces",
     "Wunsch",
     "XDECREF",
+    "xmms",
+    "Yann",
+    "Yaroshevskiy",
     "Zilla"
   ],
   "editor.formatOnSave": true,
   "editor.rulers": [
     120
   ],
   "files.associations": {
-    "*.tcc": "cpp",
     "__bit_reference": "cpp",
     "__bits": "cpp",
     "__config": "cpp",
@@ -168,12 +195,14 @@
     "__tree": "cpp",
     "__tuple": "cpp",
     "__verbose_abort": "cpp",
+    "*.tcc": "cpp",
     "algorithm": "cpp",
     "any": "cpp",
     "array": "cpp",
     "atomic": "cpp",
     "bit": "cpp",
     "bitset": "cpp",
+    "cassert": "cpp",
     "cctype": "cpp",
     "charconv": "c",
     "chrono": "cpp",
@@ -231,6 +260,7 @@
     "semaphore": "cpp",
     "set": "cpp",
     "shared_mutex": "cpp",
+    "sort.h": "c",
     "source_location": "cpp",
     "span": "cpp",
     "sstream": "cpp",
@@ -269,6 +299,6 @@
     "xstring": "cpp",
     "xtr1common": "cpp",
     "xtree": "cpp",
-    "xutility": "cpp",
+    "xutility": "cpp"
   }
 }
diff --git a/README.md b/README.md
@@ -1072,11 +1072,11 @@ Similar to Python it also defines the commonly used character sets.
 auto protein = sz::string::random(300, "ARNDCQEGHILKMFPSTWYV"); // static method
 auto dna = sz::basic_string<custom_allocator>::random(3_000_000_000, "ACGT");
 
-dna.randomize("ACGT"); // `noexcept` pre-allocated version
-dna.randomize(&std::rand, "ACGT"); // pass any generator, like `std::mt19937`
+dna.fill_random("ACGT"); // `noexcept` pre-allocated version
+dna.fill_random(&std::rand, "ACGT"); // pass any generator, like `std::mt19937`
 
 char uuid[36];
-sz::randomize(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer
+sz::fill_random(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer
 ```
 
 ### Bulk Replacements
diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h
@@ -843,7 +843,7 @@ SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64
     __m128i k1 = _mm_xor_si128(seed_vec, pi0);
     __m128i k2 = _mm_xor_si128(seed_vec, pi1);
 
-    // The first 128 bits of the "sum" and "AES" blocks are the same
+    // The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state
     state->aes.xmm = k1;
     state->sum.xmm = k2;
 }
@@ -1559,6 +1559,8 @@ SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state) {
 
 SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
 
+    // For short strings the "masked loads" are identical to Skylake-X and
+    // the "logic" is identical to Haswell.
     if (length <= 16) {
         // Initialize the AES block with a given seed
         _sz_hash_minimal_t state;
@@ -1611,6 +1613,7 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed)
         _sz_hash_minimal_update_haswell(&state, data3_vec.xmm);
         return _sz_hash_minimal_finalize_haswell(&state, length);
     }
+    // This is where the logic differs from Skylake-X and other pre-Ice Lake CPUs:
     else {
         // Use a larger state to handle the main loop and add different offsets
         // to different lanes of the register
@@ -1716,6 +1719,64 @@ SZ_PUBLIC void sz_fill_random_ice(sz_ptr_t output, sz_size_t length, sz_u64_t no
     }
 }
 
+/**
+ *  @brief  A 4x wider analog of `_sz_hash_minimal_t`, which is not used for computing individual hashes,
+ *          but for hashing 4x separate @b short strings in parallel, each under 16 bytes long.
+ *          Useful for higher-level Database and Machine Learning operations.
+ */
+typedef struct _sz_hash_minimal_x4_t {
+    sz_u512_vec_t aes; // Per-lane AES state, advanced with `_mm512_aesenc_epi128` on every update
+    sz_u512_vec_t sum; // Per-lane additive state, byte-shuffled and summed with the input on every update
+    sz_u512_vec_t key; // Broadcasted seed, combined with the string lengths on finalization
+} _sz_hash_minimal_x4_t;
+
+SZ_INTERNAL void _sz_hash_minimal_x4_init_ice(_sz_hash_minimal_x4_t *state, sz_u64_t seed) {
+
+    // The key is the broadcasted seed; the low half of each 128-bit lane is mixed with a length in the end
+    __m512i seed_vec = _mm512_set1_epi64(seed);
+    state->key.zmm = seed_vec;
+
+    // XOR the user-supplied seed with the two "pi" constants
+    sz_u64_t const *pi = _sz_hash_pi_constants();
+    __m512i pi0 = _mm512_load_si512((__m512i const *)(pi));
+    __m512i pi1 = _mm512_load_si512((__m512i const *)(pi + 8));
+    // Both are aligned 512-bit loads (assumes a 64-byte aligned table - TODO confirm), but only the first
+    // 128 bits of each are used, replicated 4 times across the register. The `_mm512_shuffle_i64x2` is
+    // supposed to be faster than `_mm512_broadcast_i64x2` on Ice Lake.
+    pi0 = _mm512_shuffle_i64x2(pi0, pi0, 0);
+    pi1 = _mm512_shuffle_i64x2(pi1, pi1, 0);
+    __m512i k1 = _mm512_xor_si512(seed_vec, pi0);
+    __m512i k2 = _mm512_xor_si512(seed_vec, pi1);
+
+    // The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state
+    state->aes.zmm = k1;
+    state->sum.zmm = k2;
+}
+
+SZ_INTERNAL __m256i _sz_hash_minimal_x4_finalize_ice(_sz_hash_minimal_x4_t const *state, //
+                                                     sz_size_t length0, sz_size_t length1, sz_size_t length2,
+                                                     sz_size_t length3) {
+    __m512i const padded_lengths = _mm512_set_epi64(0, length3, 0, length2, 0, length1, 0, length0);
+    // Mix each length into the low 64 bits of the matching 128-bit lane of the key
+    __m512i key_with_length = _mm512_add_epi64(state->key.zmm, padded_lengths);
+    // Combine the "sum" and the "AES" blocks
+    __m512i mixed_registers = _mm512_aesenc_epi128(state->sum.zmm, state->aes.zmm);
+    // Make sure the "key" mixes enough with the state,
+    // as with fewer than 2 rounds SMHasher fails
+    __m512i mixed_within_register =
+        _mm512_aesenc_epi128(_mm512_aesenc_epi128(mixed_registers, key_with_length), mixed_registers);
+    // Pack the low 64 bits of each 128-bit lane into the returned 256 bits - weirdly, the `permutexvar`
+    // instruction is cheaper than compressing instructions like `_mm512_maskz_compress_epi64`.
+    return _mm512_castsi512_si256(
+        _mm512_permutexvar_epi64(_mm512_set_epi64(0, 0, 0, 0, 6, 4, 2, 0), mixed_within_register));
+}
+
+SZ_INTERNAL void _sz_hash_minimal_x4_update_ice(_sz_hash_minimal_x4_t *state, __m512i blocks) {
+    __m512i const shuffle_mask = _mm512_load_si512((__m512i const *)_sz_hash_u8x16x4_shuffle());
+    state->aes.zmm = _mm512_aesenc_epi128(state->aes.zmm, blocks); // AES round per lane, keyed by the input
+    state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), blocks);
+}
+
 #pragma clang attribute pop
 #pragma GCC pop_options
 #endif            // SZ_USE_ICE
diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h
@@ -5,12 +5,11 @@
  *
  *  Includes core APIs for contiguous memory operations:
  *
- *  - @b `sz_copy` - analog to `memcpy`, probably the most common operation in a computer
- *  - @b `sz_move` - analog to `memmove`, allowing overlapping memory regions, often used in string manipulation
- *  - @b `sz_fill` - analog to `memset`, often used to initialize memory with a constant value, like zero
+ *  - @b `sz_copy` - analog to @b `memcpy`, probably the most common operation in a computer
+ *  - @b `sz_move` - analog to @b `memmove`, allowing overlapping memory regions, often used in string manipulation
+ *  - @b `sz_fill` - analog to @b `memset`, often used to initialize memory with a constant value, like zero
  *  - @b `sz_lookup` - Look-Up Table @b (LUT) transformation of a string, mapping each byte to a new value
  *  - TODO: @b `sz_lookup_utf8` - LUT transformation of a UTF8 string, which can be used for normalization
- *  - TODO: @b `sz_detect_encoding` - detects the character encoding similar to "iconv" or "chardet" tools
  *
  *  All of the core APIs receive the target output buffer as the first argument,
  *  and aim to minimize the number of "store" instructions, especially unaligned ones,
@@ -1084,62 +1083,6 @@ SZ_PUBLIC void sz_lookup_ice(sz_ptr_t target, sz_size_t length, sz_cptr_t source
     }
 }
 
-enum sz_encoding_t {
-    sz_encoding_unknown_k = 0,
-    sz_encoding_ascii_k = 1,
-    sz_encoding_utf8_k = 2,
-    sz_encoding_utf16_k = 3,
-    sz_encoding_utf32_k = 4,
-    sz_encoding_jwt_k = 5,
-    sz_encoding_base64_k = 6,
-    // Low priority encodings:
-    sz_encoding_utf8bom_k = 7,
-    sz_encoding_utf16le_k = 8,
-    sz_encoding_utf16be_k = 9,
-    sz_encoding_utf32le_k = 10,
-    sz_encoding_utf32be_k = 11,
-};
-
-// Character Set Detection is one of the most commonly performed operations in data processing with
-// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer),
-// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem.
-// All of them are notoriously slow.
-//
-// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites.
-// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding):
-// - ISO-8859-1: 1.2%
-// - Windows-1252: 0.3%
-// - Windows-1251: 0.2%
-// - EUC-JP: 0.1%
-// - Shift JIS: 0.1%
-// - EUC-KR: 0.1%
-// - GB2312: 0.1%
-// - Windows-1250: 0.1%
-// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings
-// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and
-// the rest.
-//
-// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime
-// and focuses more on incremental validation & transcoding, rather than detection.
-//
-// So we need a very fast and efficient way of determining
-SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) {
-    // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788
-
-    // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory
-    // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints
-    // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte
-    // codepoints. In the case of emojis, we deal with 4-byte codepoints.
-    // We can also use the idea, that misaligned reads are quite cheap on modern CPUs.
-    int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1;
-    sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32);
-    sz_unused(text && length);
-    return sz_false_k;
-}
-
 #pragma clang attribute pop
 #pragma GCC pop_options
 #endif            // SZ_USE_ICE
diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h
@@ -309,7 +309,7 @@ SZ_INTERNAL sz_status_t _sz_levenshtein_distance_skewed_diagonals_serial( //
     }
 
     // TODO: Generalize to remove the following asserts!
-    _sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix.");
+    _sz_assert(bound >= longer_length && "For bounded search the method should only evaluate one band of the matrix.");
     _sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet.");
     sz_unused(longer_length && bound);
 
@@ -860,7 +860,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( //
 
         // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound.
         __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
-        if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1;
+        if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound;
     }
 
     // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles.
@@ -891,7 +891,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( //
 
         // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound.
         __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
-        if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1;
+        if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound;
     }
 
     // Now let's handle the bottom right triangle.
@@ -915,7 +915,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( //
 
         // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound.
         __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
-        if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1;
+        if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound;
 
         // In every following iterations we take use a shorter prefix of each register,
         // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit.
diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp
diff --git a/rust/lib.rs b/rust/lib.rs