Skip to content

Commit 6e0a0e1

Browse files
committed
fix match count in corpus viewer
1 parent 8831881 commit 6e0a0e1

File tree

2 files changed

+100
-23
lines changed

2 files changed

+100
-23
lines changed

orangecontrib/text/widgets/owcorpusviewer.py

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
QSortFilterProxyModel,
1515
Qt,
1616
QUrl,
17+
QAbstractTableModel,
1718
)
1819
from AnyQt.QtWidgets import (
1920
QAbstractItemView,
@@ -37,6 +38,7 @@
3738
from orangewidget.utils.listview import ListViewSearch
3839

3940
from orangecontrib.text.corpus import Corpus
41+
from Orange.data import ContinuousVariable
4042

4143
HTML = """
4244
<!doctype html>
@@ -140,7 +142,7 @@ def _count_matches(content: List[str], regex: re.Pattern, state: TaskState) -> i
140142
return matches
141143

142144

143-
class DocumentListModel(QAbstractListModel):
145+
class DocumentListModel(QAbstractTableModel):
144146
"""
145147
Custom model for listing documents. Using custom model since Orange's
146148
pylistmodel is too slow for large number of documents
@@ -150,6 +152,7 @@ def __init__(self, *args, **kwargs):
150152
super().__init__(*args, **kwargs)
151153
self.__visible_data = []
152154
self.__filter_content = []
155+
self.__match_counts = []
153156

154157
def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
155158
if role == Qt.DisplayRole:
@@ -160,12 +163,41 @@ def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
160163
def rowCount(self, parent: QModelIndex = None, *args, **kwargs) -> int:
161164
return len(self.__visible_data)
162165

163-
def setup_data(self, data: List[str], content: List[str]):
166+
def setup_data(self, data: List[str], content: List[str], match_counts: List[int] = None):
164167
self.beginResetModel()
165168
self.__visible_data = data
166169
self.__filter_content = content
170+
self.__match_counts = match_counts or [0] * len(data)
167171
self.endResetModel()
168172

173+
def set_match_counts(self, match_counts: List[int]):
174+
"""Update match counts for each document"""
175+
assert len(match_counts) == len(self.__visible_data)
176+
self.__match_counts = match_counts
177+
self.dataChanged.emit(self.index(0, 0), self.index(self.rowCount() - 1, 1))
178+
179+
def data(self, index: QModelIndex, role: int = Qt.DisplayRole) -> Any:
180+
"""Return data for display or filtering"""
181+
row = index.row()
182+
col = index.column() if index.isValid() else 0
183+
if role == Qt.DisplayRole:
184+
if col == 0:
185+
return self.__visible_data[row]
186+
elif col == 1:
187+
return self.__match_counts[row]
188+
elif role == Qt.UserRole:
189+
return self.__filter_content[row]
190+
191+
def columnCount(self, parent=None):
192+
"""Return number of columns (2: title and match count)"""
193+
return 2
194+
195+
def headerData(self, section, orientation, role):
196+
"""Return column header titles"""
197+
if orientation == Qt.Horizontal and role == Qt.DisplayRole:
198+
return ["Title", "Match Count"][section]
199+
return super().headerData(section, orientation, role)
200+
169201
def update_filter_content(self, content: List[str]):
170202
assert len(content) == len(self.__visible_data)
171203
self.__filter_content = content
@@ -383,13 +415,16 @@ def __init__(self):
383415
self.doc_list.setSelectionMode(QTableView.ExtendedSelection)
384416
self.doc_list.setEditTriggers(QAbstractItemView.NoEditTriggers)
385417
self.doc_list.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
386-
self.doc_list.horizontalHeader().setVisible(False)
418+
self.doc_list.horizontalHeader().setVisible(True)
387419
self.splitter.addWidget(self.doc_list)
420+
self.doc_list.setSortingEnabled(True)
388421

389422
self.doc_list_model = DocumentListModel()
390423
proxy_model = DocumentsFilterProxyModel()
391424
proxy_model.setSourceModel(self.doc_list_model)
392425
self.doc_list.setModel(proxy_model)
426+
self.doc_list.setSortingEnabled(True)
427+
self.doc_list.sortByColumn(1, Qt.DescendingOrder)
393428
self.doc_list.selectionModel().selectionChanged.connect(self.selection_changed)
394429
# Document contents
395430
self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)
@@ -469,7 +504,17 @@ def select_variables(self):
469504
def list_docs(self):
470505
"""List documents into the left scrolling area"""
471506
docs = self.regenerate_docs()
472-
self.doc_list_model.setup_data(self.corpus.titles.tolist(), docs)
507+
match_counts = []
508+
509+
try:
510+
regex = re.compile(self.regexp_filter.strip("|"), re.IGNORECASE)
511+
except re.error:
512+
regex = re.compile("")
513+
514+
for doc in docs:
515+
match_counts.append(len(regex.findall(doc)) if regex.pattern else 0)
516+
517+
self.doc_list_model.setup_data(self.corpus.titles.tolist(), docs, match_counts)
473518

474519
def get_selected_indexes(self) -> Set[int]:
475520
m = self.doc_list.model().mapToSource
@@ -597,6 +642,7 @@ def refresh_search(self):
597642
self.Error.invalid_regex.clear()
598643
if self.corpus is not None:
599644
self.doc_list.model().set_filter_string(self.regexp_filter)
645+
self.doc_list.setColumnHidden(1, not bool(self.regexp_filter.strip("|")))
600646
if not self.selected_documents:
601647
# when currently selected items are filtered selection is empty
602648
# select first element in the view in that case
@@ -621,8 +667,12 @@ def refresh_search(self):
621667
self.commit.deferred()
622668

623669
def on_done(self, res: int):
624-
"""When matches count is done show the result in the label"""
670+
"""When matches count is done show the result in the label and update match counts"""
625671
self.n_matches = f"{int(res):,}" if res is not None else "n/a"
672+
if self.compiled_regex and self.corpus:
673+
docs = self.doc_list_model.get_filter_content()
674+
match_counts = [len(self.compiled_regex.findall(doc)) for doc in docs]
675+
self.doc_list_model.set_match_counts(match_counts)
626676

627677
def on_exception(self, ex):
628678
raise ex
@@ -649,6 +699,19 @@ def commit(self):
649699
mask[selected_docs] = 0
650700
unmatched = self.corpus[mask] if mask.any() else None
651701
annotated_corpus = create_annotated_table(self.corpus, selected_docs)
702+
703+
if annotated_corpus is not None:
704+
match_counts = self.doc_list_model._DocumentListModel__match_counts
705+
match_var = ContinuousVariable("Match Count")
706+
707+
domain = annotated_corpus.domain
708+
new_domain = Domain(
709+
domain.attributes,
710+
domain.class_vars,
711+
domain.metas + (match_var,)
712+
)
713+
annotated_corpus = Corpus(new_domain, annotated_corpus.X, annotated_corpus.Y, np.column_stack([annotated_corpus.metas, np.array(match_counts, dtype=object).reshape(-1, 1)]))
714+
652715
self.Outputs.matching_docs.send(matched)
653716
self.Outputs.other_docs.send(unmatched)
654717
self.Outputs.corpus.send(annotated_corpus)

orangecontrib/text/widgets/tests/test_owcorpusviewer.py

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,25 +32,25 @@ def test_data(self):
3232
self.assertListEqual(model.get_filter_content(), contents)
3333
self.assertEqual(model.rowCount(), 3)
3434

35-
self.assertEqual(model.data(model.index(0)), documents[0])
36-
self.assertEqual(model.data(model.index(1)), documents[1])
37-
self.assertEqual(model.data(model.index(2)), documents[2])
35+
self.assertEqual(model.data(model.index(0, 0)), documents[0])
36+
self.assertEqual(model.data(model.index(1, 0)), documents[1])
37+
self.assertEqual(model.data(model.index(2, 0)), documents[2])
3838

3939
def test_data_method(self):
4040
model = DocumentListModel()
4141
documents = ["Doc 1", "Doc 2", "Doc 3"]
4242
contents = ["bar", "foo", "bar foo"]
4343
model.setup_data(documents, contents)
4444

45-
self.assertEqual(model.data(model.index(0), Qt.DisplayRole), documents[0])
46-
self.assertEqual(model.data(model.index(1), Qt.DisplayRole), documents[1])
47-
self.assertEqual(model.data(model.index(2), Qt.DisplayRole), documents[2])
45+
self.assertEqual(model.data(model.index(0, 0), Qt.DisplayRole), documents[0])
46+
self.assertEqual(model.data(model.index(1, 0), Qt.DisplayRole), documents[1])
47+
self.assertEqual(model.data(model.index(2, 0), Qt.DisplayRole), documents[2])
4848

49-
self.assertEqual(model.data(model.index(0), Qt.UserRole), contents[0])
50-
self.assertEqual(model.data(model.index(1), Qt.UserRole), contents[1])
51-
self.assertEqual(model.data(model.index(2), Qt.UserRole), contents[2])
49+
self.assertEqual(model.data(model.index(0, 0), Qt.UserRole), contents[0])
50+
self.assertEqual(model.data(model.index(1, 0), Qt.UserRole), contents[1])
51+
self.assertEqual(model.data(model.index(2, 0), Qt.UserRole), contents[2])
5252

53-
self.assertIsNone(model.data(model.index(2), Qt.BackgroundRole))
53+
self.assertIsNone(model.data(model.index(2, 0), Qt.BackgroundRole))
5454

5555
def test_update_filter_content(self):
5656
model = DocumentListModel()
@@ -59,9 +59,9 @@ def test_update_filter_content(self):
5959
model.setup_data(documents, contents)
6060

6161
model.update_filter_content(["a", "b", "c"])
62-
self.assertEqual(model.data(model.index(0), Qt.UserRole), "a")
63-
self.assertEqual(model.data(model.index(1), Qt.UserRole), "b")
64-
self.assertEqual(model.data(model.index(2), Qt.UserRole), "c")
62+
self.assertEqual(model.data(model.index(0, 0), Qt.UserRole), "a")
63+
self.assertEqual(model.data(model.index(1, 0), Qt.UserRole), "b")
64+
self.assertEqual(model.data(model.index(2, 0), Qt.UserRole), "c")
6565

6666
with self.assertRaises(AssertionError):
6767
model.update_filter_content(
@@ -119,26 +119,34 @@ def test_search(self):
119119
self.send_signal(self.widget.Inputs.corpus, self.corpus)
120120
self.widget.regexp_filter = "Human"
121121
self.widget.refresh_search()
122+
self.wait_until_finished()
123+
124+
sel_model = self.widget.doc_list.selectionModel()
125+
sel_model.select(sel_model.model().index(0, 0), QItemSelectionModel.Select | QItemSelectionModel.Rows)
126+
122127
self.process_events()
123128
out_corpus = self.get_output(self.widget.Outputs.matching_docs)
129+
self.assertIsNotNone(out_corpus)
124130
self.assertEqual(len(out_corpus), 1)
125-
self.assertEqual(self.widget.n_matches, 7)
131+
self.assertEqual(int(self.widget.n_matches), 7)
126132

127133
# first document is selected, when filter with word that is not in
128134
# selected document, first of shown documents is selected
129135
self.widget.regexp_filter = "graph"
130136
self.widget.refresh_search()
137+
self.wait_until_finished()
131138
self.process_events()
132139
self.assertEqual(1, len(self.get_output(self.widget.Outputs.matching_docs)))
133140
# word count doesn't depend on selection
134-
self.assertEqual(self.widget.n_matches, 7)
141+
self.assertEqual(int(self.widget.n_matches), 7)
135142

136143
# when filter is removed, matched words is 0
137144
self.widget.regexp_filter = ""
138145
self.widget.refresh_search()
146+
self.wait_until_finished()
139147
self.process_events()
140148
self.wait_until_finished()
141-
self.assertEqual(self.widget.n_matches, 0)
149+
self.assertEqual(int(self.widget.n_matches), 0)
142150

143151
def test_invalid_regex(self):
144152
# Error is shown when invalid regex is entered
@@ -205,7 +213,7 @@ def test_output(self):
205213
)
206214
self.assertEqual(8, len(self.get_output(self.widget.Outputs.other_docs)))
207215
self.assertEqual(
208-
len(self.corpus.domain.metas) + 1,
216+
len(self.corpus.domain.metas) + 2,
209217
len(self.get_output(self.widget.Outputs.corpus).domain.metas),
210218
)
211219

@@ -370,7 +378,13 @@ def test_migrate_settings(self):
370378
domain = self.corpus.domain
371379
self.assertListEqual(self.widget.display_features, [domain["Category"]])
372380
self.assertListEqual(self.widget.search_features, [domain["Text"]])
373-
381+
382+
def test_match_count_is_in_metas(self):
383+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
384+
self.widget.doc_list.selectAll()
385+
output = self.get_output(self.widget.Outputs.corpus)
386+
meta_names = [var.name for var in output.domain.metas]
387+
self.assertIn("Match Count", meta_names)
374388

375389
if __name__ == "__main__":
376390
unittest.main()

0 commit comments

Comments
 (0)