33
33
detailed explanation. */
34
34
35
35
/**
36
+ * @internal
36
37
* @brief Mode for counting the number of occurrences of a substring
37
38
*/
38
39
#define FAST_COUNT 0  /* mode value 0; the sibling mode values are FAST_SEARCH (1) and FAST_RSEARCH (2) */
39
40
40
41
/**
42
+ * @internal
41
43
* @brief Mode for performing a forward search for a substring
42
44
*/
43
45
#define FAST_SEARCH 1  /* mode value 1; the sibling mode values are FAST_COUNT (0) and FAST_RSEARCH (2) */
44
46
45
47
/**
48
+ * @internal
46
49
* @brief Mode for performing a reverse (backward) search for a substring
47
50
*/
48
51
#define FAST_RSEARCH 2  /* mode value 2; the sibling mode values are FAST_COUNT (0) and FAST_SEARCH (1) */
49
52
50
53
/**
54
+ * @file_internal
51
55
* @brief Defines the bloom filter width based on the value of LONG_BIT.
52
56
*
53
57
* This macro sets the value of `STRINGLIB_BLOOM_WIDTH` depending on the
67
71
#endif
68
72
69
73
/**
74
+ * @file_internal
70
75
* @brief Adds a character to the bloom filter mask.
71
76
*
72
77
* This macro sets the bit in the bloom filter `mask` corresponding to the
80
85
((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1 )))))
81
86
82
87
/**
88
+ * @file_internal
83
89
* @brief Checks if a character is present in the bloom filter mask.
84
90
*
85
91
* This macro checks if the bit corresponding to the character `ch` is set
92
98
/* Function-like macro: there must be no whitespace between the macro name
 * and its parameter list, otherwise the preprocessor defines an object-like
 * macro whose replacement text starts with "(mask, ch)".
 *
 * Evaluates to a non-zero value when the bit selected by
 * (ch & (STRINGLIB_BLOOM_WIDTH - 1)) is set in `mask`, and to zero
 * otherwise. `mask` is parenthesized so that an expression argument
 * (e.g. `a | b`) binds as a whole before the `&`. */
#define STRINGLIB_BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH - 1))))
94
100
95
- #define FORWARD_DIRECTION 1 // /< Defines the forward search direction
96
- #define BACKWARD_DIRECTION (-1 ) // /< Defines the backward search direction
97
-
98
101
/**
102
+ * @file_internal
99
103
* @brief Threshold for using memchr or wmemchr in character search.
100
104
*
101
105
* If the search length exceeds this value, memchr/wmemchr is used.
104
108
105
109
106
110
/**
111
+ * @internal
107
112
* @brief A checked indexer for buffers of a specified character type.
108
113
*
109
114
* This structure provides safe indexing into a buffer with boundary checks.
110
115
*
116
+ * @internal
117
+ *
111
118
* @tparam char_type The type of characters stored in the buffer.
112
119
*/
113
120
template <typename char_type>
@@ -335,6 +342,7 @@ struct CheckedIndexer {
335
342
336
343
337
344
/**
345
+ * @internal
338
346
* @brief Finds the first occurrence of a specified character in a
339
347
* given range of a buffer.
340
348
*
@@ -387,6 +395,7 @@ find_char(CheckedIndexer<char_type> s, Py_ssize_t n, char_type ch)
387
395
}
388
396
389
397
/**
398
+ * @internal
390
399
* @brief Finds the last occurrence of a specified character in a
391
400
* given range of a buffer.
392
401
*
@@ -418,6 +427,7 @@ rfind_char(CheckedIndexer<char_type> s, Py_ssize_t n, char_type ch)
418
427
419
428
420
429
/**
430
+ * @file_internal
421
431
* @brief Conditional logging for string fast search.
422
432
*
423
433
* Set to 1 to enable logging macros.
@@ -445,11 +455,15 @@ rfind_char(CheckedIndexer<char_type> s, Py_ssize_t n, char_type ch)
445
455
#endif
446
456
447
457
/**
458
+ * @file_internal
448
459
* @brief Perform a lexicographic search for the maximal suffix in
449
460
* a given string.
450
461
*
451
462
* This function searches through the `needle` string to find the
452
463
* maximal suffix, which is essentially the largest lexicographic suffix.
464
+ * Expressed in Python:
465
+ * - max(needle[i:] for i in range(len(needle)+1))
466
+ *
453
467
* Additionally, it computes the period of the right half of the string.
454
468
*
455
469
* @param needle The string to search in.
@@ -513,6 +527,7 @@ lex_search(CheckedIndexer<char_type> needle, Py_ssize_t len_needle,
513
527
}
514
528
515
529
/**
530
+ * @file_internal
516
531
* @brief Perform a critical factorization on a string.
517
532
*
518
533
* This function splits the input string into two parts where the local
@@ -575,32 +590,38 @@ factorize(CheckedIndexer<char_type> needle,
575
590
576
591
577
592
/**
593
+ * @file_internal
578
594
* @brief Internal macro to define the shift type used in the table.
579
595
*/
580
596
#define SHIFT_TYPE uint8_t  /* 8-bit unsigned; MAX_SHIFT (UINT8_MAX) is the largest value it can hold */
581
597
582
598
/**
599
+ * @file_internal
583
600
* @brief Internal macro to define the maximum shift value.
584
601
*/
585
602
#define MAX_SHIFT UINT8_MAX  /* 255: the largest value representable in SHIFT_TYPE (uint8_t) */
586
603
587
604
588
605
/**
606
+ * @file_internal
589
607
* @brief Internal macro to define the number of bits for the table size.
590
608
*/
591
609
#define TABLE_SIZE_BITS 6u  /* 6 bits, so TABLE_SIZE (1U << TABLE_SIZE_BITS) is 64 */
592
610
593
611
/**
612
+ * @file_internal
594
613
* @brief Internal macro to define the table size based on TABLE_SIZE_BITS.
595
614
*/
596
615
#define TABLE_SIZE (1U << TABLE_SIZE_BITS)  /* always a power of two; 64 while TABLE_SIZE_BITS is 6u */
597
616
598
617
/**
618
+ * @file_internal
599
619
* @brief Internal macro to define the table mask used for bitwise operations.
600
620
*/
601
621
#define TABLE_MASK (TABLE_SIZE - 1U )  /* low TABLE_SIZE_BITS bits set (TABLE_SIZE is a power of two); 63 while TABLE_SIZE is 64 */
602
622
603
623
/**
624
+ * @file_internal
604
625
* @brief Struct to store precomputed data for string search algorithms.
605
626
*
606
627
* This structure holds all the necessary precomputed values needed
@@ -621,6 +642,7 @@ struct search_prep_data {
621
642
622
643
623
644
/**
645
+ * @file_internal
624
646
* @brief Preprocesses the needle (substring) for optimized string search.
625
647
*
626
648
* This function performs preprocessing on the given needle (substring)
@@ -695,6 +717,7 @@ preprocess(CheckedIndexer<char_type> needle, Py_ssize_t len_needle,
695
717
}
696
718
697
719
/**
720
+ * @file_internal
698
721
* @brief Searches for a needle (substring) within a haystack (string)
699
722
* using the Two-Way string matching algorithm.
700
723
*
@@ -858,6 +881,7 @@ two_way(CheckedIndexer<char_type> haystack, Py_ssize_t len_haystack,
858
881
859
882
860
883
/**
884
+ * @file_internal
861
885
* @brief Finds the first occurrence of a needle (substring) within a haystack (string).
862
886
*
863
887
* This function applies the two-way string matching algorithm to efficiently
@@ -884,6 +908,7 @@ two_way_find(CheckedIndexer<char_type> haystack, Py_ssize_t len_haystack,
884
908
885
909
886
910
/**
911
+ * @file_internal
887
912
* @brief Counts the occurrences of a needle (substring) within a haystack (string).
888
913
*
889
914
* This function applies the two-way string matching algorithm to count how many
@@ -937,6 +962,7 @@ two_way_count(CheckedIndexer<char_type> haystack, Py_ssize_t len_haystack,
937
962
#undef LOG_LINEUP
938
963
939
964
/**
965
+ * @internal
940
966
* @brief A function that searches for a substring `p` in the
941
967
* string `s` using a bloom filter to optimize character matching.
942
968
*
@@ -1022,6 +1048,7 @@ default_find(CheckedIndexer<char_type> s, Py_ssize_t n,
1022
1048
1023
1049
1024
1050
/**
1051
+ * @internal
1025
1052
* @brief Performs an adaptive string search using a bloom filter and fallback
1026
1053
* to two-way search for large data.
1027
1054
*
@@ -1109,6 +1136,7 @@ adaptive_find(CheckedIndexer<char_type> s, Py_ssize_t n,
1109
1136
1110
1137
1111
1138
/**
1139
+ * @internal
1112
1140
* @brief Performs a reverse Boyer-Moore string search.
1113
1141
*
1114
1142
* This function searches for the last occurrence of a pattern in a string,
@@ -1176,6 +1204,7 @@ default_rfind(CheckedIndexer<char_type> s, Py_ssize_t n,
1176
1204
1177
1205
1178
1206
/**
1207
+ * @internal
1179
1208
* @brief Counts occurrences of a specified character in a given string.
1180
1209
*
1181
1210
* This function iterates through the string `s` and counts how many times
@@ -1208,6 +1237,7 @@ countchar(CheckedIndexer<char_type> s, Py_ssize_t n,
1208
1237
1209
1238
1210
1239
/**
1240
+ * @internal
1211
1241
* @brief Searches for occurrences of a substring `p` in the string `s`
1212
1242
* using various optimized search algorithms.
1213
1243
*
0 commit comments