Docs: StringCuZilla design choices

ashvardanian · ashvardanian · commit f42aa85b50ff · 2025-04-01T11:54:53.000Z
diff --git a/include/stringcuzilla/features.hpp b/include/stringcuzilla/features.hpp
@@ -1,6 +1,6 @@
 /**
  *  @brief  Hardware-accelerated feature extractions for string collections.
- *  @file   features.h
+ *  @file   features.hpp
  *  @author Ash Vardanian
  *
  *  The `sklearn.feature_extraction` module for @b TF-IDF, `CountVectorizer`, and `HashingVectorizer`
@@ -28,8 +28,8 @@
  *    - output hashes into a high-dimensional bit-vector.
  *
  */
-#ifndef STRINGZILLA_FEATURES_H_
-#define STRINGZILLA_FEATURES_H_
+#ifndef STRINGZILLA_FEATURES_HPP_
+#define STRINGZILLA_FEATURES_HPP_
 
 #include "types.h"
 
@@ -142,4 +142,4 @@ SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) {
 #ifdef __cplusplus
 }
 #endif // __cplusplus
-#endif // STRINGZILLA_FEATURES_H_
+#endif // STRINGZILLA_FEATURES_HPP_
diff --git a/include/stringcuzilla/find_many.hpp b/include/stringcuzilla/find_many.hpp
@@ -0,0 +1,38 @@
+/**
+ *  @brief  Hardware-accelerated multi-pattern exact substring search.
+ *  @file   find_many.hpp
+ *  @author Ash Vardanian
+ *
+ *  One of the most broadly used algorithms in string processing is the multi-pattern Aho-Corasick
+ *  algorithm, that constructs a trie from the patterns, transforms it into a finite state machine,
+ *  and then uses it to search for all patterns in the text in a single pass.
+ *
+ *  One of its biggest issues is the memory consumption, as the naive implementation requires each
+ *  state to be proportional to the size of the alphabet, or 256 for byte-level processing. Such dense
+ *  representations simplify transition lookup down to a single memory access, but that access can be
+ *  expensive if the memory doesn't fit into the CPU caches for really large vocabulary sizes.
+ *
+ *  Addressing this, we provide a sparse layout variant of the FSM, that uses predicated SIMD instructions
+ *  to rapidly probe the transitions and find the next state. This allows us to use a much smaller state,
+ *  fitting in L1/L2 caches much more frequently.
+ */
+#ifndef STRINGZILLA_FIND_MANY_HPP_
+#define STRINGZILLA_FIND_MANY_HPP_
+
+#include "types.h"
+
+#include "compare.h" // `sz_compare`
+#include "memory.h"  // `sz_copy`
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma region Core API
+
+#pragma endregion // Core API
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // STRINGZILLA_FIND_MANY_HPP_
diff --git a/include/stringcuzilla/similarity.cuh b/include/stringcuzilla/similarity.cuh
@@ -1,9 +1,10 @@
 /**
  *  @brief  CUDA-accelerated string similarity utilities.
- *  @file   similarities.cuh
+ *  @file   similarity.cuh
  *  @author Ash Vardanian
  *
- *  Includes core APIs:
+ *  Unlike the OpenMP backend, which also has single-pair similarity scores, the CUDA backend focuses on
+ *  batch-processing of large collections of strings, generally, assigning a single warp to each string pair:
  *
  *  - `sz::cuda::levenshtein_distances` & `sz::cuda::levenshtein_distances_utf8` for Levenshtein edit-distances.
  *  - `sz::cuda::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
diff --git a/include/stringcuzilla/similarity.hpp b/include/stringcuzilla/similarity.hpp
diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h
@@ -25,11 +25,15 @@
  *  - `stringzilla.h` - umbrella header for the core C API.
  *  - `stringzilla.hpp` - umbrella header for the core C++ API.
  *
- *  It also provides many higher-level algorithms, mostly implemented in C++ with OpenMP and CUDA,
- *  also exposed via the stable C 99 ABI, but requiring C++17 and CUDA 17 compilers to build the shared libraries:
+ *  It also provides many higher-level parallel algorithms, mostly implemented in C++ with OpenMP and CUDA, also exposed
+ *  via the stable C 99 ABI, but requiring C++17 and CUDA 17 compilers to build the shared @b StringCuZilla libraries:
  *
- *  - `similarity.hpp` - similarity measures, like Levenshtein distance, Needleman-Wunsch, & Smith-Waterman alignment.
- *  - `features.hpp` - feature extraction for TF-IDF and other Machine Learning algorithms.
+ *  - `similarity.{hpp,cuh}` - similarity measures, like Levenshtein, Needleman-Wunsch, & Smith-Waterman scores.
+ *  - `features.{hpp,cuh}` - feature extraction for TF-IDF and other Machine Learning algorithms.
+ *  - `find_many.{hpp,cuh}` - Aho-Corasick multi-pattern search.
+ *
+ *  The core implementations of those algorithms are mostly structured as callable structure templates, as opposed to
+ *  template functions, to simplify specialized overloads and the reuse of state between invocations.
  *
  *  @section    Compilation Settings
  *

Original file line number	Diff line number	Diff line change
`@@ -1,9 +1,10 @@`
`1`	`1`	`/**`
`2`	`2`	`* @brief CUDA-accelerated string similarity utilities.`
`3`		`- * @file similarities.cuh`
	`3`	`+ * @file similarity.cuh`
`4`	`4`	`* @author Ash Vardanian`
`5`	`5`	`*`
`6`		`- * Includes core APIs:`
	`6`	`+ * Unlike the OpenMP backend, which also has single-pair similarity scores, the CUDA backend focuses on`
	`7`	`+ * batch-processing of large collections of strings, generally, assigning a single warp to each string pair:`
`7`	`8`	`*`
`8`	`9`	* - `sz::cuda::levenshtein_distances` & `sz::cuda::levenshtein_distances_utf8` for Levenshtein edit-distances.
`9`	`10`	* - `sz::cuda::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment.
Original file line number	Diff line number	Diff line change
`@@ -25,11 +25,15 @@`
`25`	`25`	* - `stringzilla.h` - umbrella header for the core C API.
`26`	`26`	* - `stringzilla.hpp` - umbrella header for the core C++ API.
`27`	`27`	`*`
`28`		`- * It also provides many higher-level algorithms, mostly implemented in C++ with OpenMP and CUDA,`
`29`		`- * also exposed via the stable C 99 ABI, but requiring C++17 and CUDA 17 compilers to build the shared libraries:`
	`28`	`+ * It also provides many higher-level parallel algorithms, mostly implemented in C++ with OpenMP and CUDA, also exposed`
	`29`	`+ * via the stable C 99 ABI, but requiring C++17 and CUDA 17 compilers to build the shared @b StringCuZilla libraries:`
`30`	`30`	`*`
`31`		- * - `similarity.hpp` - similarity measures, like Levenshtein distance, Needleman-Wunsch, & Smith-Waterman alignment.
`32`		- * - `features.hpp` - feature extraction for TF-IDF and other Machine Learning algorithms.
	`31`	+ * - `similarity.{hpp,cuh}` - similarity measures, like Levenshtein, Needleman-Wunsch, & Smith-Waterman scores.
	`32`	+ * - `features.{hpp,cuh}` - feature extraction for TF-IDF and other Machine Learning algorithms.
	`33`	+ * - `find_many.{hpp,cuh}` - Aho-Corasick multi-pattern search.
	`34`	`+ *`
	`35`	`+ * The core implementations of those algorithms are mostly structured as callable structure templates, as opposed to`
	`36`	`+ * template functions, to simplify specialized overloads and the reuse of state between invocations.`
`33`	`37`	`*`
`34`	`38`	`* @section Compilation Settings`
`35`	`39`	`*`