Skip to content

Commit 9933957

Browse files
committed
Add new features to ngram tokenizer, closes #1038
1 parent 38170a4 commit 9933957

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

docs/pipeline/data/tokenizer.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ tokenizer = Tokenizer(ngrams={
3131
"ngrams": 3, "lpad": " ", "rpad": " ", "unique": True
3232
})
3333
tokenizer("text to tokenize")
34+
35+
# Tokenize into edge ngrams
36+
tokenizer = Tokenizer(ngrams={"nmin": 2, "nmax": 5, "edge": True})
37+
tokenizer("text to tokenize")
3438
```
3539

3640
## Configuration-driven example

src/python/txtai/pipeline/data/tokenizer.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,18 +150,24 @@ def ngramtokenize(self, text):
150150

151151
# Ngram configuration
152152
number = self.ngrams.get("ngrams", 3)
153+
nmin = self.ngrams.get("nmin", number)
154+
nmax = self.ngrams.get("nmax", number)
155+
153156
lpad = self.ngrams.get("lpad", "")
154157
rpad = self.ngrams.get("rpad", "")
155158
unique = self.ngrams.get("unique", False)
159+
edge = self.ngrams.get("edge", False)
156160

157161
# Split on non-word characters and apply optional word padding
158162
words = [f"{lpad}{x}{rpad}" for x in re.split(r"\W+", text.lower()) if x.strip()]
159163

160164
# Generate ngrams
161165
ngrams = []
162166
for word in words:
163-
for x in range(0, len(word) - number + 1):
164-
ngrams.append(word[x : x + number])
167+
for n in range(nmin, min(nmax, len(word)) + 1):
168+
for x in range(0, len(word) - n + 1):
169+
if not edge or x == 0:
170+
ngrams.append(word[x : x + n])
165171

166172
# Reduce to unique ngrams, if necessary and return
167173
return list(set(ngrams)) if unique else ngrams

0 commit comments

Comments
 (0)