2 files changed: +12 −2

src/python/txtai/pipeline/data

@@ -31,6 +31,10 @@ tokenizer = Tokenizer(ngrams={
3131 " ngrams" : 3 , " lpad" : " " , " rpad" : " " , " unique" : True
3232})
3333tokenize(" text to tokenize" )
34+
35+ # Tokenize into edge ngrams
36+ tokenizer = Tokenizer(ngrams = {" nmin" : 2 , " nmax" : 5 , " edge" : True })
37+ tokenizer(" text to tokenize" )
3438```
3539
3640## Configuration-driven example
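The added documentation lines demonstrate the new edge ngram mode. As a minimal usage sketch, assuming the Tokenizer is imported from the txtai package and stays callable on a string as in the example above:

```python
from txtai.pipeline import Tokenizer

# Edge ngrams keep only character ngrams anchored at the start of each word.
# With nmin=2 and nmax=5, the word "tokenize" contributes ["to", "tok", "toke", "token"].
tokenizer = Tokenizer(ngrams={"nmin": 2, "nmax": 5, "edge": True})
print(tokenizer("text to tokenize"))
```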
@@ -150,18 +150,24 @@ def ngramtokenize(self, text):

        # Ngram configuration
        number = self.ngrams.get("ngrams", 3)
+        nmin = self.ngrams.get("nmin", number)
+        nmax = self.ngrams.get("nmax", number)
+
        lpad = self.ngrams.get("lpad", "")
        rpad = self.ngrams.get("rpad", "")
        unique = self.ngrams.get("unique", False)
+        edge = self.ngrams.get("edge", False)

        # Split on non-word characters and apply optional word padding
        words = [f"{lpad}{x}{rpad}" for x in re.split(r"\W+", text.lower()) if x.strip()]

        # Generate ngrams
        ngrams = []
        for word in words:
-            for x in range(0, len(word) - number + 1):
-                ngrams.append(word[x : x + number])
+            for n in range(nmin, min(nmax, len(word)) + 1):
+                for x in range(0, len(word) - n + 1):
+                    if not edge or x == 0:
+                        ngrams.append(word[x : x + n])

        # Reduce to unique ngrams, if necessary and return
        return list(set(ngrams)) if unique else ngrams
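A self-contained sketch mirroring the loop above (the edge_ngrams helper name is illustrative, not part of txtai), showing how nmin, nmax and edge interact:

```python
import re

def edge_ngrams(text, nmin=3, nmax=3, lpad="", rpad="", unique=False, edge=False):
    # Split on non-word characters and apply optional padding to each word
    words = [f"{lpad}{x}{rpad}" for x in re.split(r"\W+", text.lower()) if x.strip()]

    tokens = []
    for word in words:
        # Ngram lengths run from nmin to nmax, capped by the word length
        for n in range(nmin, min(nmax, len(word)) + 1):
            for x in range(0, len(word) - n + 1):
                # Edge mode keeps only the ngram anchored at the start of the word
                if not edge or x == 0:
                    tokens.append(word[x : x + n])

    # Optionally reduce to unique ngrams
    return list(set(tokens)) if unique else tokens

print(edge_ngrams("text to tokenize", nmin=2, nmax=5, edge=True))
# ['te', 'tex', 'text', 'to', 'to', 'tok', 'toke', 'token']
```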