+import { entries, isNotNil, many, Many, mapKeys } from '@seedcompany/common';
 import { Query } from 'cypher-query-builder';
+import { pickBy } from 'lodash';
+import { LiteralUnion } from 'type-fest';
+import { CypherExpression, exp, isExp } from './cypher-expression';
+import { db } from './cypher-functions';

 /**
- * Query a full text index for results.
- *
- * NOTE: the `query` is Lucene syntax. If this is coming from user input, consider
- * using the {@link escapeLuceneSyntax} function.
+ * @see https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/full-text-indexes/
 */
-export const fullTextQuery =
-  (index: string, query: string, yieldTerms = ['node']) =>
-  (q: Query) =>
-    q.raw(
-      'CALL db.index.fulltext.queryNodes($index, $query)' +
-        (yieldTerms && yieldTerms.length > 0
-          ? ' YIELD ' + yieldTerms.join(', ')
-          : ''),
-      {
-        index,
-        // fallback to "" when no query is given, so that no results are
-        // returned instead of the procedure failing
-        query: query.trim() || '""',
-      },
-    );
+export const FullTextIndex = (config: {
+  indexName: string;
+  labels: Many<string>;
+  properties: Many<string>;
+  analyzer?: Analyzer;
+  /**
+   * Apply index updates in a background thread "as soon as possible",
+   * instead of during the transaction commit as other indexes do.
+   */
+  eventuallyConsistent?: boolean;
+}) => {
+  const quote = (q: string) => `'${q}'`;
+
+  const { indexName } = config;
+
+  return {
+    /**
+     * Query to create the full text index (if needed).
+     */
+    create: () => {
+      const parsedConfig = {
+        analyzer: config.analyzer ? quote(config.analyzer) : undefined,
+        // eslint-disable-next-line @typescript-eslint/naming-convention
+        eventually_consistent: config.eventuallyConsistent
+          ? exp(config.eventuallyConsistent)
+          : undefined,
+      };
+      // Drop unset entries once, so the OPTIONS clause only ever renders
+      // values that were actually configured.
+      const definedConfig = pickBy(parsedConfig, isNotNil);
+      const options =
+        entries(definedConfig).length > 0
+          ? {
+              indexConfig: mapKeys(definedConfig, (k) => `fulltext.${k}`)
+                .asRecord,
+            }
+          : undefined;
+      const query = `
+        CREATE FULLTEXT INDEX ${indexName} IF NOT EXISTS
+        FOR (n:${many(config.labels).join('|')})
+        ON EACH ${exp(many(config.properties).map((p) => `n.${p}`))}
+        ${options ? `OPTIONS ${exp(options)}` : ''}
+      `;
+      return (q: Query) => q.raw(query);
+    },
+
+    /**
+     * Query the full text index for results.
+     *
+     * NOTE: the `query` is Lucene syntax.
+     * If this is coming from user input, consider using the {@link escapeLuceneSyntax} function.
+     */
+    search: (
+      query: string,
+      config: {
+        yield?: Many<string>;
+        skip?: number;
+        limit?: number;
+        analyzer?: Analyzer;
+      } = {},
+    ) => {
+      const { yield: yieldTerms = ['node'], ...options } = config;
+
+      // fallback to "" when no query is given, so that no results are
+      // returned instead of the procedure failing
+      query = query.trim() || '""';
+
+      return (q: Query) =>
+        q
+          .call(db.index.fulltext.queryNodes(indexName, query, options))
+          .yield(yieldTerms);
+    },
+  };
+};

 export const escapeLuceneSyntax = (query: string) =>
   query
     .replace(/[![\]~)(+\-:?*"^&|{}\\/]/g, (char) => `\\${char}`)
     .replace(/\b(OR|AND|NOT)\b/g, (char) => `"${char}"`);
+
+/** Signature of the `db.index.fulltext.queryNodes` procedure for query building. */
+export const IndexFullTextQueryNodes = (
+  indexName: string,
+  query: string,
+  options?:
+    | {
+        skip?: number;
+        limit?: number;
+        analyzer?: string;
+      }
+    | CypherExpression,
+) => ({
+  name: 'db.index.fulltext.queryNodes',
+  args: {
+    indexName,
+    query,
+    // Only pass options when given as an expression or when at least one value is set.
+    ...(options &&
+    (isExp(options) || Object.values(options).filter(isNotNil).length > 0)
+      ? { options }
+      : undefined),
+  },
+});
+
+type Analyzer = LiteralUnion<KnownAnalyzer, string>;
+
+/**
+ * List from Neo4j with:
+ * CALL db.index.fulltext.listAvailableAnalyzers()
+ */
+type KnownAnalyzer =
+  // Analyzer that uses ASCIIFoldingFilter to remove accents (diacritics).
+  // Otherwise, it behaves as a standard English analyzer.
+  // Note: This analyzer may have unexpected behavior, such as tokenizing, for all non-ASCII numbers and symbols.
+  | 'standard-folding'
+  // A simple analyzer that tokenizes at non-letter boundaries.
+  // No stemming or filtering.
+  // Works okay for most European languages, but is terrible for languages where words aren't separated by spaces, such as many Asian languages.
+  | 'simple'
+  // Stop analyzer tokenizes at non-letter characters, and filters out English stop words.
+  // This differs from the 'classic' and 'standard' analyzers in that it makes no effort to recognize special terms,
+  // like likely product names, URLs or email addresses.
+  | 'stop'
+  // Keyword analyzer "tokenizes" the text as a single term.
+  // Useful for zip codes, IDs, etc., where complete and exact matches are desired.
+  | 'keyword'
+  // The standard analyzer.
+  // Tokenizes on non-letter boundaries and filters out English stop words and punctuation.
+  // Does no stemming, but takes care to keep likely product names, URLs and email addresses as single terms.
+  | 'standard'
+  // Breaks text into terms by characters that have the unicode WHITESPACE property.
+  | 'unicode_whitespace'
+  // Tokenizes into sequences of alphanumeric, numeric, URL, email, southeast asian terms,
+  // and into terms of individual ideographic and hiragana characters.
+  // English stop words are filtered out.
+  | 'url'
+  // English analyzer with stemming and stop word filtering.
+  | 'english'
+  // Tokenizes into sequences of alphanumeric, numeric, URL, email, southeast asian terms,
+  // and into terms of individual ideographic and hiragana characters.
+  // English stop words are filtered out.
+  | 'url_or_email'
+  // The default analyzer.
+  // Similar to the 'standard' analyzer, but filters no stop words.
+  // Tokenizes on non-letter boundaries and filters out punctuation.
+  // Does no stemming, but takes care to keep likely product names, URLs and email addresses as single terms.
+  | 'standard-no-stop-words'
+  // Classic Lucene analyzer. Similar to 'standard', but with worse unicode support.
+  | 'classic'
+  // Tokenizes into sequences of alphanumeric, numeric, URL, email, southeast asian terms,
+  // and into terms of individual ideographic and hiragana characters.
+  // English stop words are filtered out.
+  | 'email'
+  // Breaks text into terms by characters that are considered "Java whitespace".
+  | 'whitespace';
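
For context, here is a minimal sketch of how the new factory would be consumed. The module path, the LanguageIndex definition, and the rendered Cypher in the comments are illustrative assumptions, not something this diff ships:

import { Query } from 'cypher-query-builder';
// Hypothetical module path for this sketch.
import { escapeLuceneSyntax, FullTextIndex } from './full-text.query';

// Hypothetical index definition.
const LanguageIndex = FullTextIndex({
  indexName: 'LanguageName',
  labels: 'LanguageName',
  properties: 'value',
  analyzer: 'standard-folding',
});

declare const query: Query; // however the app obtains its query object
declare const userInput: string;

// One-time setup, e.g. in a migration. Should emit roughly:
//   CREATE FULLTEXT INDEX LanguageName IF NOT EXISTS
//   FOR (n:LanguageName)
//   ON EACH [n.value]
//   OPTIONS { indexConfig: { `fulltext.analyzer`: 'standard-folding' } }
LanguageIndex.create()(query);

// Searching. `search` takes Lucene syntax, so raw user input is escaped first.
LanguageIndex.search(escapeLuceneSyntax(userInput), {
  yield: ['node', 'score'],
  limit: 10,
})(query);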
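
A quick check of what the pre-existing escaper does to hostile input (the literal is just an example):

// Lucene special characters are backslash-escaped and the boolean
// operators are quoted, so user input can't alter the query structure.
escapeLuceneSyntax('foo AND (bar)');
// => 'foo "AND" \\(bar\\)'  i.e. the text  foo "AND" \(bar\)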
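
And the shape produced by the new procedure signature helper, which presumably backs the `db.index.fulltext.queryNodes` call used by `search` (the index name here is again hypothetical):

// Empty/nil-only options are dropped entirely rather than rendering
// an empty map in the CALL clause.
IndexFullTextQueryNodes('LanguageName', 'foo*', { limit: 10 });
// => { name: 'db.index.fulltext.queryNodes',
//      args: { indexName: 'LanguageName', query: 'foo*', options: { limit: 10 } } }

IndexFullTextQueryNodes('LanguageName', 'foo*', {});
// => { name: 'db.index.fulltext.queryNodes',
//      args: { indexName: 'LanguageName', query: 'foo*' } }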