@@ -1,25 +1,26 @@
-''' Full-text searcher for headwords/phrases/examples/definitions'''
+""" Full-text searcher for headwords/phrases/examples/definitions"""
 
-from __future__ import absolute_import
-
-import re
+import fnmatch
 import os.path
+import re
 from operator import itemgetter
-import fnmatch
 
 from whoosh import index as wh_index
-from whoosh.fields import Schema, STORED, IDLIST, ID, TEXT
-from whoosh.analysis import StandardAnalyzer, Filter
-from whoosh.query import Variations, Term, Or, And
-from whoosh.qparser import QueryParser, \
-    RangePlugin, BoostPlugin, WildcardPlugin, OperatorsPlugin
-from whoosh.highlight import WholeFragmenter, HtmlFormatter
-from whoosh.collectors import WrappingCollector, \
-    UnlimitedCollector, TopCollector
+from whoosh.analysis import Filter, StandardAnalyzer
+from whoosh.collectors import TopCollector, UnlimitedCollector, WrappingCollector
+from whoosh.fields import ID, IDLIST, STORED, TEXT, Schema
+from whoosh.highlight import HtmlFormatter, WholeFragmenter
+from whoosh.qparser import (
+    BoostPlugin,
+    OperatorsPlugin,
+    QueryParser,
+    RangePlugin,
+    WildcardPlugin,
+)
+from whoosh.query import And, Or, Term, Variations
 
-from .utils.cdb import CDBReader, CDBMaker, CDBError
-from .utils.text import normalize_token, normalize_index_key,\
-    enc_utf8, dec_utf8
+from .utils.cdb import CDBError, CDBMaker, CDBReader
+from .utils.text import dec_utf8, enc_utf8, normalize_index_key, normalize_token
 
 
 class IndexError(Exception):
@@ -46,9 +47,10 @@ def abort(self):
         self._aborted = True
 
 
-#-----------------
+# -----------------
 # Word Variations
-#-----------------
+# -----------------
+
 
 class VariationsReader(object):
     def __init__(self, path):
@@ -68,13 +70,13 @@ def close(self):
         self._reader = None
 
     def get_variations(self, word):
-        r = set((word, ))
+        r = set((word,))
         try:
             s = self._reader[enc_utf8(word)]
         except KeyError:
             return r
 
-        r.update(dec_utf8(w) for w in s.split(b'\0'))
+        r.update(dec_utf8(w) for w in s.split(b"\0"))
         return r
 
 
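
Note: the variations store is a CDB file mapping a headword to its inflected
forms, NUL-separated and UTF-8 encoded. get_variations() always includes the
word itself, so a missing entry degrades gracefully. A hedged sketch, with a
hypothetical path and hypothetical stored data:

    reader = VariationsReader("variations.cdb")
    reader.get_variations("run")  # {"run", "ran", "running"} if stored; {"run"} otherwise
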
@@ -83,18 +85,18 @@ def __init__(self, f):
         self._writer = CDBMaker(f)
 
     def add(self, word, variations):
-        self._writer.add(
-            enc_utf8(word),
-            b'\0'.join(enc_utf8(v) for v in variations))
+        self._writer.add(enc_utf8(word), b"\0".join(enc_utf8(v) for v in variations))
 
     def finalize(self):
         self._writer.finalize()
 
 
 def my_variations(var_reader):
     if var_reader:
+
         def f(fieldname, text, boost=1.0):
             return MyVariations(var_reader, fieldname, text, boost)
+
         return f
     else:
         return Term
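
Note: my_variations() is a termclass factory for whoosh's QueryParser: given a
VariationsReader it produces MyVariations queries, which also match a term's
stored inflected forms; given None it falls back to the plain Term class. This
is how Searcher's constructor below consumes it:

    parser = QueryParser("content", _schema, termclass=my_variations(self._var_reader))
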
@@ -116,43 +118,47 @@ def _words(self, ixreader):
             return cache[text]
         else:
             fieldname = self.fieldname
-            words = [word for word in self.__var_reader.get_variations(text)
-                     if (fieldname, word) in ixreader]
+            words = [
+                word
+                for word in self.__var_reader.get_variations(text)
+                if (fieldname, word) in ixreader
+            ]
             cache[text] = words
             return words
 
     def __deepcopy__(self, x):
-        return MyVariations(self.__var_reader,
-                            self.__fieldname, self.__text, self.__boost)
+        return MyVariations(
+            self.__var_reader, self.__fieldname, self.__text, self.__boost
+        )
 
 
-#-----------------
+# -----------------
 # Index Schema
-#-----------------
+# -----------------
+
 
 class _AccentFilter(Filter):
     def __call__(self, tokens):
         for t in tokens:
             t.text = normalize_token(t.text)
             yield t
 
-_stopwords = frozenset(('a', 'an'))
-_analyzer = (StandardAnalyzer(stoplist=_stopwords) | _AccentFilter())
+
+_stopwords = frozenset(("a", "an"))
+_analyzer = StandardAnalyzer(stoplist=_stopwords) | _AccentFilter()
 _schema = Schema(
-    content=TEXT(
-        stored=True,
-        spelling=True,
-        analyzer=_analyzer),
+    content=TEXT(stored=True, spelling=True, analyzer=_analyzer),
     data=STORED,  # tuple (label, path, prio, sortkey)
     itemtype=ID,
-    asfilter=IDLIST
+    asfilter=IDLIST,
 )
-_schema['content'].scorable = False
+_schema["content"].scorable = False
 
 
-#-----------------
+# -----------------
 # Maker
-#-----------------
+# -----------------
+
 
 class Maker(object):
     def __init__(self, index_dir):
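
Note: in this schema, content holds the analyzed entry text (stored so hits can
be highlighted), itemtype tags the kind of entry, asfilter restricts a search
to particular sources, and data carries the opaque (label, path, prio, sortkey)
tuple through to the result list. Scoring on content is switched off; results
are presumably ordered by the stored prio and sortkey instead (itemgetter is
imported above, likely for that sort).
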
@@ -167,13 +173,12 @@ def __init__(self, index_dir):
         self._writer = index.writer()
         self._committed = False
 
-    def add_item(self,
-                 itemtype, content, asfilter, label, path, prio, sortkey):
+    def add_item(self, itemtype, content, asfilter, label, path, prio, sortkey):
         self._writer.add_document(
             itemtype=itemtype,
             content=content,
             asfilter=asfilter,
-            data=(label, path, prio, normalize_index_key(sortkey))
+            data=(label, path, prio, normalize_index_key(sortkey)),
         )
 
     def commit(self):
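
Note: a minimal indexing session with Maker, using hypothetical values for
every field (the parameter order follows add_item above):

    maker = Maker("index_dir")
    maker.add_item("hw", "apple pie", "dict1", "apple pie", "d/apple", 0, "apple pie")
    maker.commit()
    maker.close()
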
@@ -189,9 +194,10 @@ def close(self):
         self._writer = None
 
 
-#-----------------
+# -----------------
 # Searcher
-#-----------------
+# -----------------
+
 
 class Searcher(object):
     def __init__(self, index_dir, var_path):
@@ -204,26 +210,33 @@ def __init__(self, index_dir, var_path):
         self._var_reader = self._make_var_reader(var_path)
 
         op = OperatorsPlugin(
-            And=r"\bAND\b|&", Or=None,  # r"\bOR\b|\|",
-            Not=r"\bNOT\b|\s+-", AndMaybe=None, Require=None)
-        parser = QueryParser('content', _schema,
-                             termclass=my_variations(self._var_reader))
+            And=r"\bAND\b|&",
+            Or=None,  # r"\bOR\b|\|",
+            Not=r"\bNOT\b|\s+-",
+            AndMaybe=None,
+            Require=None,
+        )
+        parser = QueryParser(
+            "content", _schema, termclass=my_variations(self._var_reader)
+        )
         parser.remove_plugin_class(RangePlugin)
         parser.remove_plugin_class(BoostPlugin)
         parser.remove_plugin_class(WildcardPlugin)
         parser.replace_plugin(op)
         self._parser = parser
 
-        parser_wild = QueryParser('content', _schema,
-                                  termclass=my_variations(self._var_reader))
+        parser_wild = QueryParser(
+            "content", _schema, termclass=my_variations(self._var_reader)
+        )
         parser_wild.remove_plugin_class(RangePlugin)
         parser_wild.remove_plugin_class(BoostPlugin)
         parser_wild.replace_plugin(op)
         self._parser_wild = parser_wild
 
-        op_filter = OperatorsPlugin(And=r"\bAND\b", Or=r"\bOR\b",
-                                    Not=None, AndMaybe=None, Require=None)
-        asf_parser = QueryParser('asfilter', _schema)
+        op_filter = OperatorsPlugin(
+            And=r"\bAND\b", Or=r"\bOR\b", Not=None, AndMaybe=None, Require=None
+        )
+        asf_parser = QueryParser("asfilter", _schema)
         asf_parser.replace_plugin(op_filter)
         self._asf_parser = asf_parser
 
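
Note: three parsers are set up here. _parser serves ordinary queries
(RangePlugin, BoostPlugin and WildcardPlugin removed), _parser_wild keeps
WildcardPlugin for patterns containing * or ?, and _asf_parser handles the
asfilter field with its own AND/OR operators. A hedged reading of the operator
regexes gives roughly this user-facing syntax:

    apple AND pie    (or: apple & pie)
    apple NOT pie    (or: apple -pie)
    app*             (routed to _parser_wild by search() below)

OR is disabled for the content parsers (its regex is commented out) but stays
available in the asfilter parser.
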
@@ -257,16 +270,17 @@ def make_collector(self, limit=None):
         else:
             return AbortableCollector(TopCollector(limit))
 
-    def search(self, collector, query_str1=None, query_str2=None,
-               itemtypes=(), highlight=False):
+    def search(
+        self, collector, query_str1=None, query_str2=None, itemtypes=(), highlight=False
+    ):
 
         # rejects '*' and '?'
         if query_str1:
             for kw in (s.strip() for s in query_str1.split()):
                 if not kw.replace("*", "").replace("?", "").strip():
                     return []
 
-        wildcard = (query_str1 and any(c in query_str1 for c in "*?"))
+        wildcard = query_str1 and any(c in query_str1 for c in "*?")
 
         parser = self._parser_wild if wildcard else self._parser
         asf_parser = self._asf_parser
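
Note: the guard above returns early when any keyword consists solely of
wildcard characters (for example "*" or "??"), while the mere presence of * or
? in query_str1 is what selects _parser_wild over _parser.
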
@@ -283,10 +297,9 @@ def search(self, collector, query_str1=None, query_str2=None,
 
         if itemtypes:
             if len(itemtypes) > 1:
-                andlist.append(
-                    Or([Term('itemtype', t) for t in itemtypes]))
+                andlist.append(Or([Term("itemtype", t) for t in itemtypes]))
             else:
-                andlist.append(Term('itemtype', itemtypes[0]))
+                andlist.append(Term("itemtype", itemtypes[0]))
 
         query = And(andlist)
 
@@ -296,7 +309,8 @@ def search(self, collector, query_str1=None, query_str2=None,
         if highlight:
             hits.fragmenter = WholeFragmenter()
             hits.formatter = HtmlFormatter(
-                tagname='span', classname='s_match', termclass='s_term')
+                tagname="span", classname="s_match", termclass="s_term"
+            )
 
         if wildcard and query_str1:
             pat = query_str1.replace("-", "").replace(" ", "")
@@ -307,17 +321,17 @@ def search(self, collector, query_str1=None, query_str2=None,
         for hit in hits:
             if collector.aborted:
                 return []
-            (label, path, prio, sortkey) = hit['data']
+            (label, path, prio, sortkey) = hit["data"]
 
             if wildcard and query_str1:
                 if not wildmatch.match(sortkey):
                     continue
 
             if highlight:
                 if query_str1:
-                    text = hit.highlights('content')
+                    text = hit.highlights("content")
                 else:
-                    text = hit['content']
+                    text = hit["content"]
             else:
                 text = None
 
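
Note: on the wildcard path, hits are additionally filtered by matching the
stored sortkey against wildmatch; its construction from pat falls between these
hunks and is not shown here, though fnmatch is imported above, presumably for
exactly that wildcard-to-regex translation.
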
@@ -328,4 +342,3 @@ def search(self, collector, query_str1=None, query_str2=None,
 
         # Return
         return results
-
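
Taken together, a hedged end-to-end sketch of the search side; the index
directory, variations path, query string and itemtype values are all
hypothetical:

    searcher = Searcher("index_dir", "variations.cdb")
    collector = searcher.make_collector(limit=50)
    results = searcher.search(
        collector,
        query_str1="apple AND pie",
        itemtypes=("hw",),
        highlight=True,
    )
    # per the loop above, each entry is built from the stored label, path,
    # prio and sortkey, plus the highlighted text when requested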