Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions album-recommendation-java/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ to see the custom metric in
<a href="http://localhost:19092/metrics/v1/values?consumer=my-metrics" data-proofer-ignore>
http://localhost:19092/metrics/v1/values?consumer=my-metrics</a>

This code uses a [Counter](https://github.com/vespa-engine/vespa/blob/master/container-core/src/main/java/com/yahoo/metrics/simple/Counter.java) -
A [Gauge](https://github.com/vespa-engine/vespa/blob/master/container-core/src/main/java/com/yahoo/metrics/simple/Gauge.java)
This code uses a [Counter](https://github.com/vespa-engine/vespa/blob/master/container-disc/src/main/java/com/yahoo/metrics/simple/Counter.java).
A [Gauge](https://github.com/vespa-engine/vespa/blob/master/container-disc/src/main/java/com/yahoo/metrics/simple/Gauge.java)
example with a dimension could look like:

````
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
components
61 changes: 61 additions & 0 deletions examples/lucene-linguistics/custom-analyzer-non-java/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->

<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://assets.vespa.ai/logos/Vespa-logo-green-RGB.svg">
<source media="(prefers-color-scheme: light)" srcset="https://assets.vespa.ai/logos/Vespa-logo-dark-RGB.svg">
<img alt="#Vespa" width="200" src="https://assets.vespa.ai/logos/Vespa-logo-dark-RGB.svg" style="margin-bottom: 25px;">
</picture>

# Vespa sample applications - Lucene Linguistics

This app demonstrates how to use a custom analyzer in [Lucene Linguistics](https://docs.vespa.ai/en/linguistics/lucene-linguistics.html) without Java.

This is useful when default analyzers (e.g., language analyzers) do not meet your needs. For example, on a text like:
```text
c++ developer (*nix OS)
```

You'd often lose the `++` part. A [Pattern tokenizer](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/pattern/PatternTokenizer.html) can help here ([services.xml snippet](app/services.xml)):

```xml
<tokenizer>
<name>pattern</name>
<conf>
<!-- Split on spaces and parentheses only -->
<item key="pattern">\s|\(|\)</item>
</conf>
</tokenizer>
```

For all the character filters, tokenizers and token filters available, check out the [Lucene analysis-common Javadoc](https://lucene.apache.org/core/9_11_1/analysis/common/allclasses-index.html).


## Deploy the application
Follow [app deploy guide](https://docs.vespa.ai/en/basics/deploy-an-application)
through the <code>vespa deploy</code> step, cloning `examples/lucene-linguistics/custom-analyzer-non-java` instead of `album-recommendation`.

## Feed test data
Feed the sample document:

```bash
vespa feed ext/*.json
```

## Run a test query
```bash
curl -s -X POST -d '{
"yql":"select * from sources * where text contains \"c++\"",
"presentation.summary": "debug-text-tokens",
"model.locale": "en",
"trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
```

You'd see the document match and its tokens:
```
"text_tokens": [
"c++",
"developer",
"*nix",
"os"
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# This file excludes unnecessary files from the application package. See
# https://docs.vespa.ai/en/reference/vespaignore.html for more information.
.DS_Store
.gitignore
README.md
ext/
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
schema doc {

    document doc {

        # Language of the document ("en" in the sample feed)
        field language type string {
            indexing: set_language | summary | index
            match: word
        }

        # Free text run through the custom analyzer; BM25 is enabled on its index
        field text type string {
            indexing: summary | index
            index: enable-bm25
        }
    }

    # The default fieldset contains only the text field
    fieldset default {
        fields: text
    }

    # Debug summary requested by the README query (presentation.summary);
    # text_tokens exposes the tokens of the text field
    document-summary debug-text-tokens {
        summary documentid {}
        summary language {}
        summary text {}
        summary text_tokens {
            source: text
            tokens
        }
        from-disk
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="utf-8" ?>
<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
<services version="1.0" minimum-required-vespa-version="8.315.19">

    <!-- Container cluster: Lucene Linguistics component plus document processing, document API and search -->
    <container id="container" version="1.0">
        <component id="linguistics"
                   class="com.yahoo.language.lucene.LuceneLinguistics"
                   bundle="lucene-linguistics">
            <config name="com.yahoo.language.lucene.lucene-analysis">
                <analysis>
                    <!-- Analyzer registered under the key "en" -->
                    <item key="en">
                        <tokenizer>
                            <name>pattern</name>
                            <conf>
                                <!-- Split on whitespace and parentheses only, so tokens such as "c++" and "*nix" stay intact -->
                                <item key="pattern">\s|\(|\)</item>
                            </conf>
                        </tokenizer>
                        <tokenFilters>
                            <!-- Lowercase each token, e.g. "OS" is emitted as "os" -->
                            <item>
                                <name>lowercase</name>
                            </item>
                        </tokenFilters>
                    </item>
                </analysis>
            </config>
        </component>
        <document-processing/>
        <document-api/>
        <search/>
    </container>

    <!-- Content cluster holding documents of type "doc" in index mode -->
    <content id="content" version="1.0">
        <min-redundancy>1</min-redundancy>
        <documents>
            <document type="doc" mode="index"/>
        </documents>
    </content>

</services>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"put": "id:en:doc::1",
"fields": {
"text": "c++ developer (*nix OS)",
"language": "en"
}
}
1 change: 1 addition & 0 deletions examples/lucene-linguistics/multiple-profiles/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
components
68 changes: 68 additions & 0 deletions examples/lucene-linguistics/multiple-profiles/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->

<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://assets.vespa.ai/logos/Vespa-logo-green-RGB.svg">
<source media="(prefers-color-scheme: light)" srcset="https://assets.vespa.ai/logos/Vespa-logo-dark-RGB.svg">
<img alt="#Vespa" width="200" src="https://assets.vespa.ai/logos/Vespa-logo-dark-RGB.svg" style="margin-bottom: 25px;">
</picture>

# Vespa sample applications - Lucene Linguistics

This app demonstrates how to use multiple analyzer profiles in [Lucene Linguistics](https://docs.vespa.ai/en/linguistics/lucene-linguistics.html).

You can bind different fields to different analyzer profiles in the schema. Here, we have three analyzers in [services.xml](app/services.xml):
- `lowerFolding`: [standard tokenizer](https://lucene.apache.org/core/9_11_1/core/org/apache/lucene/analysis/standard/StandardTokenizer.html) + [lowercase](https://lucene.apache.org/core/9_11_1/core/org/apache/lucene/analysis/LowerCaseFilter.html) and [ASCII folding](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.html) token filters
- `lowerFoldingStemming`: lowerFolding + [kStem for English](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/en/KStemFilterFactory.html)
- `lowerFoldingStemmingSynonyms`: lowerFoldingStemming + [synonym expansion](https://lucene.apache.org/core/9_11_1/analysis/common/org/apache/lucene/analysis/synonym/SynonymGraphFilterFactory.html)

Besides `language`, the schema has two text fields, each bound to one or more of these profiles:
- `title`: bound to `lowerFolding`
- `description`: bound to `lowerFoldingStemming` at write time, and `lowerFoldingStemmingSynonyms` at search time. We want to expand synonyms at search time only, since it doesn't make sense to do it on both sides.

In this example, we only use English, but you can combine this with multiple languages if you want to. Steps to do this are:
1. In `services.xml`, define an analyzer for each profile+language combination.
- Use `default` profile for fields that are not bound to a specific profile.
2. In the schema, use `linguistics` block to bind the field to the profile (or profiles, if you need different profiles for index and search).
3. Use [language tags and detection](https://docs.vespa.ai/en/linguistics/linguistics.html#language-handling) as before.

## Deploy the application
Follow [app deploy guide](https://docs.vespa.ai/en/basics/deploy-an-application)
through the <code>vespa deploy</code> step, cloning `examples/lucene-linguistics/multiple-profiles` instead of `album-recommendation`.

## Feed the sample document

```bash
vespa feed ext/*.json
```

## Run test queries

This will confirm that ASCII folding is working on the `title` field, because it will match `åao` with `åäö`:
```bash
curl -s -X POST -d '{
"yql":"select * from sources * where title contains \"åao\"",
"presentation.summary": "debug-text-tokens",
"model.locale": "en",
"trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
```

You can also force a different profile for the query via `model.type.profile`. This will match "dubious" with "special" (our test synonym expansion):

```bash
curl -s -X POST -d '{
"yql":"select * from sources * where title contains \"dubious\"",
"model.type.profile": "lowerFoldingStemmingSynonyms",
"presentation.summary": "debug-text-tokens",
"model.locale": "en",
"trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
```

For the `description` field, the search-time profile already does synonym expansion (as defined in [the schema](app/schemas/doc.sd)), so it will match "dubious" with "special" out of the box:

```bash
curl -s -X POST -d '{
"yql":"select * from sources * where description contains \"dubious\"",
"presentation.summary": "debug-text-tokens",
"model.locale": "en",
"trace.level":2}' -H "Content-Type: application/json" 'http://localhost:8080/search/' | jq .
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# This file excludes unnecessary files from the application package. See
# https://docs.vespa.ai/en/reference/vespaignore.html for more information.
.DS_Store
.gitignore
README.md
ext/
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# using Solr synonyms format (default for synonymGraph token filter)
dubious =>special
47 changes: 47 additions & 0 deletions examples/lucene-linguistics/multiple-profiles/app/schemas/doc.sd
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
schema doc {

    document doc {

        # Language of the document ("en" in the sample feed)
        field language type string {
            indexing: set_language | summary | index
            match: word
        }

        field title type string {
            indexing: summary | index
            # one profile name covers both indexing and searching
            linguistics {
                profile: lowerFolding
            }
            index: enable-bm25
        }

        field description type string {
            indexing: summary | index
            # index and search strings may be analyzed with different profiles;
            # the usual reason is synonym expansion, typically applied at search time only
            linguistics {
                profile {
                    index: lowerFoldingStemming
                    search: lowerFoldingStemmingSynonyms
                }
            }
            index: enable-bm25
        }
    }

    # Debug summary requested by the README queries (presentation.summary);
    # the *_tokens entries expose the tokens of their source field
    document-summary debug-text-tokens {
        summary documentid {}
        summary language {}
        summary title {}
        summary description {}
        summary title_tokens {
            source: title
            tokens
        }
        summary description_tokens {
            source: description
            tokens
        }
        from-disk
    }
}
79 changes: 79 additions & 0 deletions examples/lucene-linguistics/multiple-profiles/app/services.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="utf-8" ?>
<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
<services version="1.0" minimum-required-vespa-version="8.315.19">

    <!-- Container cluster: Lucene Linguistics component plus document processing, document API and search -->
    <container id="container" version="1.0">
        <component id="linguistics"
                   class="com.yahoo.language.lucene.LuceneLinguistics"
                   bundle="lucene-linguistics">
            <config name="com.yahoo.language.lucene.lucene-analysis">
                <!-- Directory inside the application package holding synonyms (and possibly other analyzer files) -->
                <configDir>lucene-linguistics</configDir>
                <analysis>

                    <!-- A profile names an analyzer configuration; refer to it from the schema and at query time -->

                    <!-- lowerFolding: standard tokenizer, then lowercase and ASCII folding -->
                    <item key="profile=lowerFolding;language=en">
                        <tokenizer>
                            <name>standard</name>
                        </tokenizer>
                        <tokenFilters>
                            <item>
                                <name>lowercase</name>
                            </item>
                            <item>
                                <name>asciiFolding</name>
                            </item>
                        </tokenFilters>
                    </item>

                    <!-- lowerFoldingStemming: same chain as lowerFolding, followed by kStem -->
                    <item key="profile=lowerFoldingStemming;language=en">
                        <tokenizer>
                            <name>standard</name>
                        </tokenizer>
                        <tokenFilters>
                            <item>
                                <name>lowercase</name>
                            </item>
                            <item>
                                <name>asciiFolding</name>
                            </item>
                            <item>
                                <name>kStem</name>
                            </item>
                        </tokenFilters>
                    </item>

                    <!-- lowerFoldingStemmingSynonyms: same chain as lowerFoldingStemming, followed by synonym expansion -->
                    <item key="profile=lowerFoldingStemmingSynonyms;language=en">
                        <tokenizer>
                            <name>standard</name>
                        </tokenizer>
                        <tokenFilters>
                            <item>
                                <name>lowercase</name>
                            </item>
                            <item>
                                <name>asciiFolding</name>
                            </item>
                            <item>
                                <name>kStem</name>
                            </item>
                            <item>
                                <name>synonymGraph</name>
                                <conf>
                                    <!-- Synonyms file path; configDir above names the directory where such files are stored -->
                                    <item key="synonyms">en/synonyms.txt</item>
                                </conf>
                            </item>
                        </tokenFilters>
                    </item>

                </analysis>
            </config>
        </component>
        <document-processing/>
        <document-api/>
        <search/>
    </container>

    <!-- Content cluster holding documents of type "doc" in index mode -->
    <content id="content" version="1.0">
        <min-redundancy>1</min-redundancy>
        <documents>
            <document type="doc" mode="index"/>
        </documents>
    </content>

</services>
8 changes: 8 additions & 0 deletions examples/lucene-linguistics/multiple-profiles/ext/en.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"put": "id:en:doc::1",
"fields": {
"title": "Title with special characters åäö",
"description": "No character specials here",
"language": "en"
}
}