1414 QSortFilterProxyModel ,
1515 Qt ,
1616 QUrl ,
17+ QAbstractTableModel ,
1718)
1819from AnyQt .QtWidgets import (
1920 QAbstractItemView ,
3738from orangewidget .utils .listview import ListViewSearch
3839
3940from orangecontrib .text .corpus import Corpus
41+ from Orange .data import ContinuousVariable
4042
4143HTML = """
4244<!doctype html>
@@ -140,7 +142,7 @@ def _count_matches(content: List[str], regex: re.Pattern, state: TaskState) -> i
140142 return matches
141143
142144
143- class DocumentListModel (QAbstractListModel ):
145+ class DocumentListModel (QAbstractTableModel ):
144146 """
145147 Custom model for listing documents. Using custom model since Orange's
146148 pylistmodel is too slow for large number of documents
@@ -150,6 +152,7 @@ def __init__(self, *args, **kwargs):
150152 super ().__init__ (* args , ** kwargs )
151153 self .__visible_data = []
152154 self .__filter_content = []
155+ self .__match_counts = []
153156
154157 def data (self , index : QModelIndex , role : int = Qt .DisplayRole ) -> Any :
155158 if role == Qt .DisplayRole :
@@ -160,12 +163,41 @@ def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
160163 def rowCount (self , parent : QModelIndex = None , * args , ** kwargs ) -> int :
161164 return len (self .__visible_data )
162165
def setup_data(self, data: List[str], content: List[str], match_counts: List[int] = None):
    """
    Replace the whole content of the model and reset it.

    Parameters
    ----------
    data
        Strings returned for the first column, one per document.
    content
        Strings returned for ``Qt.UserRole``, one per document.
    match_counts
        Values returned for the second column, one per document. When
        omitted (or empty) every document gets a count of 0.

    Raises
    ------
    AssertionError
        If ``match_counts`` is given and its length differs from ``data``.
    """
    if match_counts:
        # data() indexes titles and counts with the same row; a shorter
        # list would only surface later as an IndexError inside the view
        assert len(match_counts) == len(data)
    else:
        match_counts = [0] * len(data)
    self.beginResetModel()
    self.__visible_data = data
    self.__filter_content = content
    self.__match_counts = match_counts
    self.endResetModel()
168172
def set_match_counts(self, match_counts: List[int]):
    """
    Update match counts for each document.

    ``match_counts`` must have one entry per document currently in the
    model. The change is announced through ``dataChanged`` for the
    match-count column (column 1) only, since titles are not touched.
    """
    assert len(match_counts) == len(self.__visible_data)
    self.__match_counts = match_counts
    last_row = self.rowCount() - 1
    if last_row < 0:
        # empty model: index(-1, 1) would be invalid, nothing to announce
        return
    self.dataChanged.emit(self.index(0, 1), self.index(last_row, 1))
178+
def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
    """
    Return data for display or filtering.

    - ``Qt.DisplayRole``: column 0 is the document title, column 1 its
      match count.
    - ``Qt.UserRole``: the filter content of the document in that row.

    Any other role or column, an invalid index, or a row outside the
    current data yields ``None``.
    """
    # NOTE(review): the class shows an earlier ``data`` definition as well;
    # being defined later, this one is the definition in effect.
    if not index.isValid():
        # an invalid index reports row -1, which would otherwise silently
        # address the last document
        return None
    row = index.row()
    if not 0 <= row < len(self.__visible_data):
        return None
    if role == Qt.DisplayRole:
        column = index.column()
        if column == 0:
            return self.__visible_data[row]
        if column == 1:
            return self.__match_counts[row]
    elif role == Qt.UserRole:
        return self.__filter_content[row]
    return None
190+
def columnCount(self, parent=None):
    """
    Return number of columns (2: title and match count).

    A valid ``parent`` refers to an item of the table; items of a flat
    table have no children, so 0 is returned in that case.
    """
    if parent is not None and parent.isValid():
        return 0
    return 2
194+
def headerData(self, section, orientation, role=Qt.DisplayRole):
    """
    Return column header titles.

    For the horizontal header and ``Qt.DisplayRole`` the title of column
    ``section`` is returned, or ``None`` when ``section`` is not one of
    the two columns. Every other request is delegated to the base class.
    """
    if orientation == Qt.Horizontal and role == Qt.DisplayRole:
        titles = ("Title", "Match Count")
        # explicit range check: a negative section must not wrap around
        if 0 <= section < len(titles):
            return titles[section]
        return None
    return super().headerData(section, orientation, role)
200+
169201 def update_filter_content (self , content : List [str ]):
170202 assert len (content ) == len (self .__visible_data )
171203 self .__filter_content = content
@@ -383,13 +415,16 @@ def __init__(self):
383415 self .doc_list .setSelectionMode (QTableView .ExtendedSelection )
384416 self .doc_list .setEditTriggers (QAbstractItemView .NoEditTriggers )
385417 self .doc_list .horizontalHeader ().setSectionResizeMode (QHeaderView .Stretch )
386- self .doc_list .horizontalHeader ().setVisible (False )
418+ self .doc_list .horizontalHeader ().setVisible (True )
387419 self .splitter .addWidget (self .doc_list )
420+ self .doc_list .setSortingEnabled (True )
388421
389422 self .doc_list_model = DocumentListModel ()
390423 proxy_model = DocumentsFilterProxyModel ()
391424 proxy_model .setSourceModel (self .doc_list_model )
392425 self .doc_list .setModel (proxy_model )
426+ self .doc_list .setSortingEnabled (True )
427+ self .doc_list .sortByColumn (1 , Qt .DescendingOrder )
393428 self .doc_list .selectionModel ().selectionChanged .connect (self .selection_changed )
394429 # Document contents
395430 self .doc_webview = gui .WebviewWidget (self .splitter , debug = False )
@@ -469,7 +504,17 @@ def select_variables(self):
def list_docs(self):
    """List documents into the left scrolling area"""
    contents = self.regenerate_docs()

    # an invalid filter expression is treated like an empty one
    try:
        pattern = re.compile(self.regexp_filter.strip("|"), re.IGNORECASE)
    except re.error:
        pattern = None

    if pattern is not None and pattern.pattern:
        counts = [len(pattern.findall(text)) for text in contents]
    else:
        # nothing to search for: every document has zero matches
        counts = [0 for _ in contents]

    self.doc_list_model.setup_data(self.corpus.titles.tolist(), contents, counts)
473518
474519 def get_selected_indexes (self ) -> Set [int ]:
475520 m = self .doc_list .model ().mapToSource
@@ -597,6 +642,7 @@ def refresh_search(self):
597642 self .Error .invalid_regex .clear ()
598643 if self .corpus is not None :
599644 self .doc_list .model ().set_filter_string (self .regexp_filter )
645+ self .doc_list .setColumnHidden (1 , not bool (self .regexp_filter .strip ("|" )))
600646 if not self .selected_documents :
601647 # when currently selected items are filtered selection is empty
602648 # select first element in the view in that case
@@ -621,8 +667,12 @@ def refresh_search(self):
621667 self .commit .deferred ()
622668
def on_done(self, res: int):
    """
    When matches count is done show the result in the label and update
    match counts.

    ``res`` is the total number of matches, or ``None`` when no count is
    available (shown as "n/a"). Per-document counts are recomputed from
    the model's filter content and pushed to the model only when both a
    compiled regex and a non-empty corpus are present.
    """
    self.n_matches = f"{int(res):,}" if res is not None else "n/a"
    if not (self.compiled_regex and self.corpus):
        return
    find_all = self.compiled_regex.findall
    counts = [len(find_all(doc)) for doc in self.doc_list_model.get_filter_content()]
    self.doc_list_model.set_match_counts(counts)
626676
627677 def on_exception (self , ex ):
628678 raise ex
@@ -649,6 +699,19 @@ def commit(self):
649699 mask [selected_docs ] = 0
650700 unmatched = self .corpus [mask ] if mask .any () else None
651701 annotated_corpus = create_annotated_table (self .corpus , selected_docs )
702+
703+ if annotated_corpus is not None :
704+ match_counts = self .doc_list_model ._DocumentListModel__match_counts
705+ match_var = ContinuousVariable ("Match Count" )
706+
707+ domain = annotated_corpus .domain
708+ new_domain = Domain (
709+ domain .attributes ,
710+ domain .class_vars ,
711+ domain .metas + (match_var ,)
712+ )
713+ annotated_corpus = Corpus (new_domain , annotated_corpus .X , annotated_corpus .Y , np .column_stack ([annotated_corpus .metas , np .array (match_counts , dtype = object ).reshape (- 1 , 1 )]))
714+
652715 self .Outputs .matching_docs .send (matched )
653716 self .Outputs .other_docs .send (unmatched )
654717 self .Outputs .corpus .send (annotated_corpus )
0 commit comments