Lucene Linguistics: multiple profiles sample app

radu-gheorghe · radu-gheorghe · commit dd0cde7da946 · 2026-01-22T17:27:21.000+02:00
diff --git a/examples/lucene-linguistics/multiple-profiles/.gitignore b/examples/lucene-linguistics/multiple-profiles/.gitignore
@@ -0,0 +1 @@
+components
diff --git a/examples/lucene-linguistics/multiple-profiles/README.md b/examples/lucene-linguistics/multiple-profiles/README.md
@@ -0,0 +1,68 @@
+<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
+
+<picture>
+  <source media="(prefers-color-scheme: dark)" srcset="https://assets.vespa.ai/logos/Vespa-logo-green-RGB.svg">
+  <source media="(prefers-color-scheme: light)" srcset="https://assets.vespa.ai/logos/Vespa-logo-dark-RGB.svg">
+  <img alt="#Vespa" width="200" src="https://assets.vespa.ai/logos/Vespa-logo-dark-RGB.svg" style="margin-bottom: 25px;">
+</picture>
+
+# Vespa sample applications - Lucene Linguistics 
+
+This app demonstrates how to use multiple analyzer profiles in [Lucene Linguistics](https://docs.vespa.ai/en/linguistics/lucene-linguistics.html).
+
+You can bind different fields to different analyzer profiles in the schema. Here, we have three analyzers in [services.xml](app/services.xml):
+- `lowerFolding`: [standard tokenizer](https://lucene.apache.org/core/9_11_1/core/org/apache/lucene/analysis/standard/StandardTokenizer.html) + [lowercase](https://lucene.apache.org/core/9_11_1/core/org/apache/lucene/analysis/LowerCaseFilter.html) and [ASCII folding](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.html) token filters
+- `lowerFoldingStemming`: lowerFolding + [kStem for English](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/en/KStemFilterFactory.html)
+- `lowerFoldingStemmingSynonyms`: lowerFoldingStemming + [synonym expansion](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/synonym/SynonymGraphFilterFactory.html)
+
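+We have three fields in the schema: `language` (which sets the document language via `set_language`) and two text fields bound to analyzer profiles: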
+- `title`: bound to `lowerFolding`
+- `description`: bound to `lowerFoldingStemming` at write time, and `lowerFoldingStemmingSynonyms` at search time. We want to expand synonyms at search time only; it doesn't make sense to do it on both sides.
+
+In this example, we only use English, but you can combine this with multiple languages if you want to. The steps to do this are:
+1. In `services.xml`, define an analyzer for each profile+language combination.
+   - Use the `default` profile for fields that are not bound to a specific profile.
+2. In the schema, use a `linguistics` block to bind the field to the profile (or profiles, if you need different profiles for index and search).
+3. Use [language tags and detection](https://docs.vespa.ai/en/linguistics/linguistics.html#language-handling) as before.
+
+## Deploy the application
+Follow the [app deploy guide](https://docs.vespa.ai/en/basics/deploy-an-application)
+through the <code>vespa deploy</code> step, cloning `examples/lucene-linguistics/multiple-profiles` instead of `album-recommendation`.
+
+## Feed the sample document
+
+```bash
+vespa feed ext/*.json
+```
+
+## Run test queries
+
+This query confirms that ASCII folding works on the `title` field, because `åao` in the query matches `åäö` in the document:
+```bash
+curl -s -X POST -d '{
+  "yql":"select * from sources * where title contains \"åao\"",
+  "presentation.summary": "debug-text-tokens",
+  "model.locale": "en",
+  "trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
+```
+
+You can also force a different profile for the query via `model.type.profile`. This query on the `title` field will match "dubious" with "special" (our test synonym expansion):
+
+```bash
+curl -s -X POST -d '{
+  "yql":"select * from sources * where title contains \"dubious\"",
+  "model.type.profile": "lowerFoldingStemmingSynonyms",
+  "presentation.summary": "debug-text-tokens",
+  "model.locale": "en",
+  "trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
+```
+
+For the `description` field, the search-time profile already does synonym expansion (as defined in [the schema](app/schemas/doc.sd)). So it will match "dubious" with "special" out of the box:
+
+```bash
+curl -s -X POST -d '{
+  "yql":"select * from sources * where description contains \"dubious\"",
+  "presentation.summary": "debug-text-tokens",
+  "model.locale": "en",
+  "trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
+```
diff --git a/examples/lucene-linguistics/multiple-profiles/app/.vespaignore b/examples/lucene-linguistics/multiple-profiles/app/.vespaignore
@@ -0,0 +1,6 @@
+# This file excludes unnecessary files from the application package. See
+# https://docs.vespa.ai/en/reference/vespaignore.html for more information.
+.DS_Store
+.gitignore
+README.md
+ext/
diff --git a/examples/lucene-linguistics/multiple-profiles/app/lucene-linguistics/en/synonyms.txt b/examples/lucene-linguistics/multiple-profiles/app/lucene-linguistics/en/synonyms.txt
@@ -0,0 +1,2 @@
+# using Solr synonyms format (default for synonymGraph token filter)
+dubious =>special
diff --git a/examples/lucene-linguistics/multiple-profiles/app/schemas/doc.sd b/examples/lucene-linguistics/multiple-profiles/app/schemas/doc.sd
@@ -0,0 +1,47 @@
+schema doc {
+
+    document doc {
+        field language type string {
+            indexing: set_language | summary | index
+            match: word
+        }
+
+        field title type string {
+            indexing: summary | index
+            # single profile: the same analyzer configuration (lowerFolding in services.xml) is used for both indexing and searching
+            linguistics {
+                profile: lowerFolding
+            }
+            index: enable-bm25
+        }
+
+        field description type string {
+            indexing: summary | index
+            # separate profiles: the analyzer configuration can differ between indexing and searching
+            # typical use-case: synonym expansion, done here at search time only (lowerFoldingStemmingSynonyms)
+            linguistics {
+                profile {
+                    index: lowerFoldingStemming
+                    search: lowerFoldingStemmingSynonyms
+                }
+            }
+            index: enable-bm25
+        }
+    }
+
+    document-summary debug-text-tokens {
+        summary documentid {}
+        summary language {}
+        summary title {}
+        summary description {}
+        summary title_tokens {
+            source: title
+            tokens
+        }
+        summary description_tokens {
+            source: description
+            tokens
+        }
+        from-disk
+    }
+}
diff --git a/examples/lucene-linguistics/multiple-profiles/app/services.xml b/examples/lucene-linguistics/multiple-profiles/app/services.xml
@@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<services version="1.0" minimum-required-vespa-version="8.315.19">
+  <container id="container" version="1.0">
+    <component id="linguistics"
+               class="com.yahoo.language.lucene.LuceneLinguistics"
+               bundle="lucene-linguistics">
+      <config name="com.yahoo.language.lucene.lucene-analysis">
+        <!-- directory inside the application package that holds synonyms (and potentially other analyzer files) -->
+        <configDir>lucene-linguistics</configDir>
+        <analysis>
+          <!-- each key pairs a profile with a language; the profile names the analyzer configuration used in the schema and at query time -->
+          <item key="profile=lowerFolding;language=en">
+            <tokenizer>
+              <name>standard</name>
+            </tokenizer>
+            <tokenFilters>
+              <item>
+                <name>lowercase</name>
+              </item>
+              <item>
+                <name>asciiFolding</name>
+              </item>
+            </tokenFilters>
+          </item>
+
+          <item key="profile=lowerFoldingStemming;language=en">
+            <tokenizer>
+              <name>standard</name>
+            </tokenizer>
+            <tokenFilters>
+              <item>
+                <name>lowercase</name>
+              </item>
+              <item>
+                <name>asciiFolding</name>
+              </item>
+              <item>
+                <name>kStem</name>
+              </item>
+            </tokenFilters>
+          </item>
+
+          <item key="profile=lowerFoldingStemmingSynonyms;language=en">
+            <tokenizer>
+              <name>standard</name>
+            </tokenizer>
+            <tokenFilters>
+              <item>
+                <name>lowercase</name>
+              </item>
+              <item>
+                <name>asciiFolding</name>
+              </item>
+              <item>
+                <name>kStem</name>
+              </item>
+              <item>
+                <name>synonymGraph</name>
+                <conf>
+                  <item key="synonyms">en/synonyms.txt</item>
+                </conf>
+              </item>
+            </tokenFilters>
+          </item>
+        </analysis>
+      </config>
+    </component>
+    <document-processing/>
+    <document-api/>
+    <search/>
+  </container>
+  <content id="content" version="1.0">
+    <min-redundancy>1</min-redundancy>
+    <documents>
+      <document type="doc" mode="index"/>
+    </documents>
+  </content>
+</services>
diff --git a/examples/lucene-linguistics/multiple-profiles/ext/en.json b/examples/lucene-linguistics/multiple-profiles/ext/en.json
@@ -0,0 +1,8 @@
+{
+	"put": "id:en:doc::1", 
+	"fields": {
+		"title": "Title with special characters åäö",
+		"description": "No character specials here",
+		"language": "en"
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# using Solr synonyms format (default for synonymGraph token filter)`
	`2`	`+dubious =>special`