|
8 | 8 | #include "stdlib.h" |
9 | 9 |
|
10 | 10 | #include "ctypes.h" |
| 11 | +#include "mem.h" |
11 | 12 | #include "stdio.h" |
12 | 13 |
|
| 14 | +/** |
| 15 | + * @brief Алгоритм Fuzzy search |
| 16 | + * |
| 17 | + * @param[in] text Текст |
| 18 | + * @param[in] query Запрос |
| 19 | + * @param[in] build_score Оценка за построение |
| 20 | + * @param score Оценка |
| 21 | + * @param score_len Длина оценка |
| 22 | + * |
| 23 | + * @return int Оценка |
| 24 | + */ |
| 25 | +int fuzzy_search(const char* text, const char* query, int build_score, int** score, u32* score_len) { |
| 26 | + u32 total_score = 0; |
| 27 | + if (build_score) { // Build score is an optimization when searching through large database |
| 28 | + (*score) = (int*)kmalloc(sizeof(int) * strlen(text)); |
| 29 | + memset(*score, 0, sizeof(int) * strlen(text)); |
| 30 | + *score_len = strlen(text); |
| 31 | + } |
| 32 | + |
| 33 | + u32 first_character_boosts = 1; |
| 34 | + |
| 35 | + for (u32 t_idx = 0; t_idx < strlen(text); t_idx++) { |
| 36 | + char t = |
| 37 | + tolower(text[t_idx]); // NOTE(deter0): to lower performs kind of strangely probably due to UTF8 |
| 38 | + for (u32 q_idx = 0; q_idx < strlen(query); q_idx++) { |
| 39 | + char q = tolower(query[q_idx]); |
| 40 | + |
| 41 | + if (t == q) { |
| 42 | + // Start of word awards more but falls off fast |
| 43 | + if (t_idx == 0 || (t_idx > 0 && isspace(text[t_idx - 1]))) { |
| 44 | + int factor = 8 / (first_character_boosts++); |
| 45 | + |
| 46 | + if (build_score) { |
| 47 | + (*score)[t_idx] += factor; |
| 48 | + } |
| 49 | + total_score += factor; |
| 50 | + } else { |
| 51 | + if (build_score) { |
| 52 | + (*score)[t_idx]++; |
| 53 | + } |
| 54 | + total_score++; |
| 55 | + } |
| 56 | + |
| 57 | + u32 streak = 0; |
| 58 | + for (u32 s_idx = 1; s_idx < strlen(query) - q_idx; s_idx++) { |
| 59 | + char sq = tolower(query[q_idx + s_idx]); |
| 60 | + char st = tolower(text[t_idx + s_idx]); |
| 61 | + |
| 62 | + if (sq != st) { |
| 63 | + break; |
| 64 | + } |
| 65 | + streak++; |
| 66 | + |
| 67 | + // Beginning of string yields few points more; eg -> "Term" :: "Terminus", "Fluent |
| 68 | + // Terminal" |
| 69 | + if (((float)t_idx / (float)strlen(text)) <= 0.35) { |
| 70 | + streak++; |
| 71 | + } |
| 72 | + |
| 73 | + int factor = streak * 3 / (strlen(query) * 0.2); |
| 74 | + if (build_score) { |
| 75 | + (*score)[t_idx + s_idx] += factor; |
| 76 | + } |
| 77 | + total_score += factor; |
| 78 | + } |
| 79 | + |
| 80 | + // (N * (N + 1) ) /2 |
| 81 | + // (*score)[t_idx] += (streak * (streak + 1)) / 2; |
| 82 | + t_idx += streak; |
| 83 | + } |
| 84 | + } |
| 85 | + } |
| 86 | + |
| 87 | + return total_score; |
| 88 | +} |
| 89 | + |
13 | 90 | void booltochar(u8 value, u8* str) { |
14 | 91 | if (value) { |
15 | 92 | strcpy(str, (u8*)"true"); |
|
0 commit comments