Skip to content

Commit f32a53c

Browse files
authored
Merge pull request #16 from alldroll/next_word_suggestion
Next word suggestion
2 parents 2e32cdb + 25423b5 commit f32a53c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+1928
-1167
lines changed

README.md

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# Suggest
22

3-
Library for Top-k Approximate String Matching.
3+
Library for Top-k Approximate String Matching, autocomplete and spell checking.
44

5+
[![Build Status](https://travis-ci.com/alldroll/suggest.svg?branch=master)](https://travis-ci.com/alldroll/suggest)
56
[![Go Report Card](https://goreportcard.com/badge/github.com/alldroll/suggest)](https://goreportcard.com/report/github.com/alldroll/suggest)
67
[![GoDoc](https://godoc.org/github.com/alldroll/suggest?status.svg)](https://godoc.org/github.com/alldroll/suggest)
78

@@ -15,16 +16,17 @@ The library was mostly inspired by
1516

1617
## Purpose
1718

18-
Let's imagine you have a website, for instance a pharmacy website.
19+
Let's imagine you have a website, for instance, a pharmacy website.
1920
There could be a lot of dictionaries, such as a list of medical drugs,
2021
a list of cities (countries), where you can deliver your goods and so on.
21-
Some of these dictionaries could be a pretty large, and it might be a
22+
Some of these dictionaries could be pretty large, and it might be
2223
tedious for a customer to choose the correct option from the dictionary.
2324
Having the possibility of `Top-k approximate string search` in a dictionary
24-
is a significant in these cases.
25+
is significant in these cases.
2526

26-
This library provides API and the simple `http service` for such purposes.
27+
Also, the library provides spell checking functionality that allows you to predict the next word.
2728

29+
The library provides API and the simple `HTTP service` for such purposes.
2830

2931
## Demo
3032

@@ -55,8 +57,9 @@ of choosing a similarity, type of metric and topK.
5557
## Usage
5658

5759
```go
58-
// The dictionary, on which we expect fuzzy search
59-
dictionary := dictionary.NewInMemoryDictionary([]string{
60+
// we create InMemoryDictionary. Here we can use anything we want,
61+
// for example SqlDictionary, CDBDictionary and so on
62+
dict := dictionary.NewInMemoryDictionary([]string{
6063
"Nissan March",
6164
"Nissan Juke",
6265
"Nissan Maxima",
@@ -67,10 +70,7 @@ dictionary := dictionary.NewInMemoryDictionary([]string{
6770
"Toyota Corona",
6871
})
6972

70-
// create suggest service
71-
service := suggest.NewService()
72-
73-
// here we describe our index configuration
73+
// describe index configuration
7474
indexDescription := suggest.IndexDescription{
7575
Name: "cars", // name of the dictionary
7676
NGramSize: 3, // size of the nGram
@@ -79,15 +79,17 @@ indexDescription := suggest.IndexDescription{
7979
Alphabet: []string{"english", "$"}, // alphabet of allowed chars (other chars will be replaced with pad symbol)
8080
}
8181

82-
// create runtime search index builder (because we don't have indexed data)
83-
builder, err := suggest.NewRAMBuilder(dictionary, indexDescription)
82+
// create runtime search index builder
83+
builder, err := suggest.NewRAMBuilder(dict, indexDescription)
8484

8585
if err != nil {
8686
log.Fatalf("Unexpected error: %v", err)
8787
}
8888

89-
// asking our service for adding a new search index with the given configuration
90-
if err := service.AddIndex(indexDescription.Name, dictionary, builder); err != nil {
89+
service := suggest.NewService()
90+
91+
// add a new search index with the given configuration
92+
if err := service.AddIndex(indexDescription.Name, dict, builder); err != nil {
9193
log.Fatalf("Unexpected error: %v", err)
9294
}
9395

@@ -116,16 +118,32 @@ fmt.Println(values)
116118

117119
## Suggest eval
118120

119-
Eval command is a command line tool for approximate string search.
121+
Eval command is a command-line tool for approximate string search.
120122

121123
## Suggest indexer
122124

123125
Indexer command builds a search index with the given [configuration](##index-description-format).
124-
Generated data is required by `DISC` implementation of a index driver.
126+
Generated data is required by `DISC` implementation of an index driver.
125127

126128
## Suggest service-run
127129

128-
Runs a http web server with suggest methods.
130+
Runs HTTP webserver with suggest methods.
131+
132+
## Language model ngram-count
133+
134+
Creates n-gram counts in the Google n-grams format
135+
136+
## Language model build-lm
137+
138+
Builds a binary representation of a stupid-backoff language model and writes it to disk
139+
140+
## Language model eval
141+
142+
Eval command is a cli for lm scoring
143+
144+
## Spellchecker
145+
146+
Cli for spell checking
129147

130148
### REST API
131149

@@ -221,9 +239,3 @@ Returns a list of managed dictionaries
221239
222240
* **Code:** 500 SERVER ERROR <br />
223241
**Content:** `description`
224-
225-
## TODO
226-
227-
* Autocomplete (to improve initial prototype)
228-
* NGram language model
229-
* Spellchecker

pkg/analysis/filter_tokenizer.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package analysis
2+
3+
import "strings"
4+
5+
// filterTokenizer performs tokenize and filter operations
6+
type filterTokenizer struct {
7+
tokenizer Tokenizer
8+
filter TokenFilter
9+
}
10+
11+
// NewFilterTokenizer creates a new instance of filter tokenizer
12+
func NewFilterTokenizer(tokenizer Tokenizer, filter TokenFilter) Tokenizer {
13+
return &filterTokenizer{
14+
tokenizer: tokenizer,
15+
filter: filter,
16+
}
17+
}
18+
19+
// Tokenize splits the given text on a sequence of tokens
20+
func (t *filterTokenizer) Tokenize(text string) []Token {
21+
text = strings.ToLower(text)
22+
text = strings.Trim(text, " ")
23+
24+
tokens := t.tokenizer.Tokenize(text)
25+
26+
return t.filter.Filter(tokens)
27+
}

pkg/analysis/ngram_tokenizer.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package analysis
2+
3+
const maxN = 8
4+
5+
// NewNGramTokenizer creates a new instance of Tokenizer
6+
func NewNGramTokenizer(nGramSize int) Tokenizer {
7+
return &nGramTokenizer{
8+
nGramSize: nGramSize,
9+
}
10+
}
11+
12+
type nGramTokenizer struct {
13+
nGramSize int
14+
}
15+
16+
// Tokenize splits the given text on a sequence of tokens
17+
func (t *nGramTokenizer) Tokenize(text string) []Token {
18+
if len(text) < t.nGramSize {
19+
return []Token{}
20+
}
21+
22+
result := make([]Token, 0, len(text)-t.nGramSize+1)
23+
prevIndexes := [maxN]int{}
24+
i := 0
25+
26+
for index := range text {
27+
i++
28+
29+
if i > t.nGramSize {
30+
top := prevIndexes[(i-t.nGramSize)%t.nGramSize]
31+
nGram := text[top:index]
32+
result = appendUnique(result, nGram)
33+
}
34+
35+
prevIndexes[i%t.nGramSize] = index
36+
}
37+
38+
top := prevIndexes[(i+1)%t.nGramSize]
39+
nGram := text[top:]
40+
result = appendUnique(result, nGram)
41+
42+
return result
43+
}
44+
45+
// https://blog.golang.org/profiling-go-programs
46+
func appendUnique(a []Token, x Token) []Token {
47+
for _, y := range a {
48+
if x == y {
49+
return a
50+
}
51+
}
52+
53+
return append(a, x)
54+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package analysis
2+
3+
import (
4+
"reflect"
5+
"testing"
6+
)
7+
8+
func TestTokenizeNGrams(t *testing.T) {
9+
cases := []struct {
10+
word string
11+
k int
12+
ngrams []Token
13+
}{
14+
{
15+
"tet",
16+
2,
17+
[]Token{"te", "et"},
18+
},
19+
{
20+
"te",
21+
2,
22+
[]Token{"te"},
23+
},
24+
{
25+
"testing",
26+
3,
27+
[]Token{"tes", "est", "sti", "tin", "ing"},
28+
},
29+
{
30+
"жигули",
31+
2,
32+
[]Token{"жи", "иг", "гу", "ул", "ли"},
33+
},
34+
{
35+
"",
36+
2,
37+
[]Token{},
38+
},
39+
{
40+
"lalala",
41+
2,
42+
[]Token{"la", "al"},
43+
},
44+
}
45+
46+
for _, c := range cases {
47+
tokenizer := NewNGramTokenizer(c.k)
48+
actual := tokenizer.Tokenize(c.word)
49+
50+
if !reflect.DeepEqual(actual, c.ngrams) {
51+
t.Errorf(
52+
"Test Fail, expected %v, got %v",
53+
c.ngrams,
54+
actual,
55+
)
56+
}
57+
}
58+
}
59+
60+
func BenchmarkNGramTokenizer(b *testing.B) {
61+
tokenizer := NewNGramTokenizer(3)
62+
63+
for i := 0; i < b.N; i++ {
64+
tokenizer.Tokenize("abcdefghkl123456йцукен")
65+
}
66+
}

pkg/analysis/normalizer.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package analysis
2+
3+
import (
	"strings"

	"github.com/alldroll/suggest/pkg/alphabet"
)
6+
7+
type normalizeFilter struct {
8+
chars alphabet.Alphabet
9+
pad string
10+
}
11+
12+
// NewNormalizerFilter returns tokens filter
13+
func NewNormalizerFilter(chars alphabet.Alphabet, pad string) TokenFilter {
14+
return &normalizeFilter{
15+
chars: chars,
16+
pad: pad,
17+
}
18+
}
19+
20+
// Filter filters the given list with described behaviour
21+
func (f *normalizeFilter) Filter(list []Token) []Token {
22+
for i, token := range list {
23+
res := ""
24+
25+
for _, r := range token {
26+
if f.chars.Has(r) {
27+
res += string(r)
28+
} else {
29+
res += f.pad
30+
}
31+
}
32+
33+
list[i] = res
34+
}
35+
36+
return list
37+
}

pkg/analysis/tokenizer.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package analysis
2+
3+
// Token is a string with an assigned and thus identified meaning
type Token = string

// Tokenizer performs splitting the given text on a sequence of tokens
type Tokenizer interface {
	// Tokenize splits the given text on a sequence of tokens
	Tokenize(text string) []Token
}

// TokenFilter is responsible for removing, modifying and altering the given token flow
type TokenFilter interface {
	// Filter filters the given list with described behaviour
	Filter(list []Token) []Token
}

pkg/analysis/word_tokenizer.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package analysis
2+
3+
import (
4+
"unicode/utf8"
5+
6+
"github.com/alldroll/suggest/pkg/alphabet"
7+
)
8+
9+
// NewWordTokenizer creates a new instance of Tokenizer
10+
func NewWordTokenizer(alphabet alphabet.Alphabet) Tokenizer {
11+
return &wordTokenizer{
12+
alphabet: alphabet,
13+
}
14+
}
15+
16+
// tokenizer implements Tokenizer interface
17+
type wordTokenizer struct {
18+
alphabet alphabet.Alphabet
19+
}
20+
21+
// Tokenize splits the given text on a sequence of tokens
22+
func (t *wordTokenizer) Tokenize(text string) []Token {
23+
words := []Token{}
24+
wordStart, wordLen := -1, 0
25+
26+
for i, char := range text {
27+
if t.alphabet.Has(char) {
28+
if wordStart == -1 {
29+
wordStart = i
30+
}
31+
32+
wordLen += utf8.RuneLen(char)
33+
} else {
34+
if wordStart != -1 {
35+
words = append(words, text[wordStart:wordStart+wordLen])
36+
}
37+
38+
wordStart, wordLen = -1, 0
39+
}
40+
}
41+
42+
if wordStart != -1 {
43+
words = append(words, text[wordStart:wordStart+wordLen])
44+
}
45+
46+
return words
47+
}

0 commit comments

Comments
 (0)