+import { entries, isNotNil, many, Many, mapKeys } from '@seedcompany/common';
 import { Query } from 'cypher-query-builder';
+import { pickBy } from 'lodash';
+import { LiteralUnion } from 'type-fest';
+import { CypherExpression, exp, isExp } from './cypher-expression';
+import { db } from './cypher-functions';

 /**
- * Query a full text index for results.
- *
- * NOTE: the `query` is Lucene syntax. If this is coming from user input, consider
- * using the {@link escapeLuceneSyntax} function.
+ * @see https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/full-text-indexes/
 */
-export const fullTextQuery =
-  (index: string, query: string, yieldTerms = ['node']) =>
-  (q: Query) =>
-    q.raw(
-      'CALL db.index.fulltext.queryNodes($index, $query)' +
-        (yieldTerms && yieldTerms.length > 0
-          ? ' YIELD ' + yieldTerms.join(', ')
-          : ''),
-      {
-        index,
-        // fallback to "" when no query is given, so that no results are
-        // returned instead of the procedure failing
-        query: query.trim() || '""',
-      },
-    );
+export const FullTextIndex = (config: {
+  indexName: string;
+  labels: Many<string>;
+  properties: Many<string>;
+  analyzer?: Analyzer;
+  /**
+   * Apply index updates in a background thread "as soon as possible",
+   * instead of during the transaction commit as other indexes do.
+   */
+  eventuallyConsistent?: boolean;
+}) => {
+  const quote = (q: string) => `'${q}'`;
+
+  const { indexName } = config;
+
+  return {
+    /**
+     * Query to create the full text index (if needed).
+     */
+    create: () => {
+      const parsedConfig = {
+        analyzer: config.analyzer ? quote(config.analyzer) : undefined,
+        // eslint-disable-next-line @typescript-eslint/naming-convention
+        eventually_consistent: config.eventuallyConsistent
+          ? exp(config.eventuallyConsistent)
+          : undefined,
+      };
+      // Drop unset entries once, so the OPTIONS clause only ever renders
+      // values that were actually configured.
+      const definedConfig = pickBy(parsedConfig, isNotNil);
+      const options =
+        entries(definedConfig).length > 0
+          ? {
+              indexConfig: mapKeys(definedConfig, (k) => `fulltext.${k}`)
+                .asRecord,
+            }
+          : undefined;
+      const query = `
+        CREATE FULLTEXT INDEX ${indexName} IF NOT EXISTS
+        FOR (n:${many(config.labels).join('|')})
+        ON EACH ${exp(many(config.properties).map((p) => `n.${p}`))}
+        ${options ? `OPTIONS ${exp(options)}` : ''}
+      `;
+      return (q: Query) => q.raw(query);
+    },
+
+    /**
+     * Query the full text index for results.
+     *
+     * NOTE: the `query` is Lucene syntax.
+     * If this is coming from user input, consider using the {@link escapeLuceneSyntax} function.
+     */
+    search: (
+      query: string,
+      config: {
+        yield?: Many<string>;
+        skip?: number;
+        limit?: number;
+        analyzer?: Analyzer;
+      } = {},
+    ) => {
+      const { yield: yieldTerms = ['node'], ...options } = config;
+
+      // fallback to "" when no query is given, so that no results are
+      // returned instead of the procedure failing
+      query = query.trim() || '""';
+
+      return (q: Query) =>
+        q
+          .call(db.index.fulltext.queryNodes(indexName, query, options))
+          .yield(yieldTerms);
+    },
+  };
+};

 export const escapeLuceneSyntax = (query: string) =>
   query
     .replace(/[![\]~)(+\-:?*"^&|{}\\/]/g, (char) => `\\${char}`)
     .replace(/\b(OR|AND|NOT)\b/g, (char) => `"${char}"`);
+
+/** Signature of the `db.index.fulltext.queryNodes` procedure for query building. */
+export const IndexFullTextQueryNodes = (
+  indexName: string,
+  query: string,
+  options?:
+    | {
+        skip?: number;
+        limit?: number;
+        analyzer?: string;
+      }
+    | CypherExpression,
+) => ({
+  name: 'db.index.fulltext.queryNodes',
+  args: {
+    indexName,
+    query,
+    // Only pass options when given as an expression or when at least one value is set.
+    ...(options &&
+    (isExp(options) || Object.values(options).filter(isNotNil).length > 0)
+      ? { options }
+      : undefined),
+  },
+});
+
+type Analyzer = LiteralUnion<KnownAnalyzer, string>;
+
+/**
+ * List from Neo4j with:
+ * CALL db.index.fulltext.listAvailableAnalyzers()
+ */
+type KnownAnalyzer =
+  // Analyzer that uses ASCIIFoldingFilter to remove accents (diacritics).
+  // Otherwise, it behaves as a standard English analyzer.
+  // Note: This analyzer may have unexpected behavior, such as tokenizing, for all non-ASCII numbers and symbols.
+  | 'standard-folding'
+  // A simple analyzer that tokenizes at non-letter boundaries.
+  // No stemming or filtering.
+  // Works okay for most European languages, but is terrible for languages where words aren't separated by spaces, such as many Asian languages.
+  | 'simple'
+  // Stop analyzer tokenizes at non-letter characters, and filters out English stop words.
+  // This differs from the 'classic' and 'standard' analyzers in that it makes no effort to recognize special terms,
+  // like likely product names, URLs or email addresses.
+  | 'stop'
+  // Keyword analyzer "tokenizes" the text as a single term.
+  // Useful for zip codes, IDs, etc., where complete and exact matches are desired.
+  | 'keyword'
+  // The standard analyzer.
+  // Tokenizes on non-letter boundaries and filters out English stop words and punctuation.
+  // Does no stemming, but takes care to keep likely product names, URLs and email addresses as single terms.
+  | 'standard'
+  // Breaks text into terms by characters that have the unicode WHITESPACE property.
+  | 'unicode_whitespace'
+  // Tokenizes into sequences of alphanumeric, numeric, URL, email, southeast asian terms,
+  // and into terms of individual ideographic and hiragana characters.
+  // English stop words are filtered out.
+  | 'url'
+  // English analyzer with stemming and stop word filtering.
+  | 'english'
+  // Tokenizes into sequences of alphanumeric, numeric, URL, email, southeast asian terms,
+  // and into terms of individual ideographic and hiragana characters.
+  // English stop words are filtered out.
+  | 'url_or_email'
+  // The default analyzer.
+  // Similar to the 'standard' analyzer, but filters no stop words.
+  // Tokenizes on non-letter boundaries and filters out punctuation.
+  // Does no stemming, but takes care to keep likely product names, URLs and email addresses as single terms.
+  | 'standard-no-stop-words'
+  // Classic Lucene analyzer. Similar to 'standard', but with worse unicode support.
+  | 'classic'
+  // Tokenizes into sequences of alphanumeric, numeric, URL, email, southeast asian terms,
+  // and into terms of individual ideographic and hiragana characters.
+  // English stop words are filtered out.
+  | 'email'
+  // Breaks text into terms by characters that are considered "Java whitespace".
+  | 'whitespace';
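
For context, here is a minimal sketch of how the new factory would be consumed. The module path, the LanguageIndex definition, and the rendered Cypher in the comments are illustrative assumptions, not something this diff ships:

import { Query } from 'cypher-query-builder';
// Hypothetical module path for this sketch.
import { escapeLuceneSyntax, FullTextIndex } from './full-text.query';

// Hypothetical index definition.
const LanguageIndex = FullTextIndex({
  indexName: 'LanguageName',
  labels: 'LanguageName',
  properties: 'value',
  analyzer: 'standard-folding',
});

declare const query: Query; // however the app obtains its query object
declare const userInput: string;

// One-time setup, e.g. in a migration. Should emit roughly:
//   CREATE FULLTEXT INDEX LanguageName IF NOT EXISTS
//   FOR (n:LanguageName)
//   ON EACH [n.value]
//   OPTIONS { indexConfig: { `fulltext.analyzer`: 'standard-folding' } }
LanguageIndex.create()(query);

// Searching. `search` takes Lucene syntax, so raw user input is escaped first.
LanguageIndex.search(escapeLuceneSyntax(userInput), {
  yield: ['node', 'score'],
  limit: 10,
})(query);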
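
A quick check of what the pre-existing escaper does to hostile input (the literal is just an example):

// Lucene special characters are backslash-escaped and the boolean
// operators are quoted, so user input can't alter the query structure.
escapeLuceneSyntax('foo AND (bar)');
// => 'foo "AND" \\(bar\\)'  i.e. the text  foo "AND" \(bar\)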
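
And the shape produced by the new procedure signature helper, which presumably backs the `db.index.fulltext.queryNodes` call used by `search` (the index name here is again hypothetical):

// Empty/nil-only options are dropped entirely rather than rendering
// an empty map in the CALL clause.
IndexFullTextQueryNodes('LanguageName', 'foo*', { limit: 10 });
// => { name: 'db.index.fulltext.queryNodes',
//      args: { indexName: 'LanguageName', query: 'foo*', options: { limit: 10 } } }

IndexFullTextQueryNodes('LanguageName', 'foo*', {});
// => { name: 'db.index.fulltext.queryNodes',
//      args: { indexName: 'LanguageName', query: 'foo*' } }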