Skip to content

Commit 360e72f

Browse files
committed
Corpus - automated input/output summaries
1 parent 89cbaef commit 360e72f

File tree

2 files changed

+70
-0
lines changed

2 files changed

+70
-0
lines changed

orangecontrib/text/corpus.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@
2626
from orangecontrib.text.vectorization.base import get_unique_names
2727
from orangecontrib.text.vectorization import BowVectorizer
2828

29+
try:
30+
from orangewidget.utils.signals import summarize, PartialSummary
31+
# import to check if Table summary is available
32+
from Orange.widgets.utils import state_summary
33+
except ImportError:
34+
summarize, PartialSummary = None, None
35+
2936

3037
def get_sample_corpora_dir():
3138
path = os.path.dirname(__file__)
@@ -657,3 +664,23 @@ def arrays_equal(a, b):
657664
np.array_equal(self.pos_tags, other.pos_tags) and
658665
self.domain == other.domain and
659666
self.ngram_range == other.ngram_range)
667+
668+
669+
if summarize:
    # Older orange-widget-base releases do not ship ``summarize``; the guarded
    # import leaves it as None in that case and nothing is registered here.
    @summarize.register(Corpus)
    def summarize_(corpus: Corpus) -> PartialSummary:
        """
        Build the automated input/output summary for a Corpus.

        The summary registered for Table is computed first; one extra line
        is then appended to its details: token/type totals when the corpus
        has tokens, otherwise a note that it is not preprocessed.
        """
        base = summarize.dispatch(Table)(corpus)

        if corpus.has_tokens():
            n_tokens = sum(len(doc_tokens) for doc_tokens in corpus.tokens)
            n_types = len(corpus.dictionary)
            extra_line = (
                f"<br/><nobr>Total tokens: {n_tokens}, "
                f"Total types: {n_types}</nobr>"
            )
        else:
            extra_line = "<br/><nobr>Corpus is not preprocessed</nobr>"

        return PartialSummary(base.summary, base.details + extra_line)

orangecontrib/text/tests/test_corpus.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import os
22
import pickle
33
import unittest
4+
from unittest import skipIf
45

56
import numpy as np
67
from numpy.testing import assert_array_equal
8+
from orangecontrib.text.preprocess import RegexpTokenizer
79
from scipy.sparse import csr_matrix, issparse
810

911
from Orange.data import Table, DiscreteVariable, StringVariable, Domain, ContinuousVariable
@@ -12,6 +14,11 @@
1214
from orangecontrib.text.corpus import Corpus
1315
from orangecontrib.text.tag import AveragedPerceptronTagger
1416

17+
try:
18+
from orangewidget.utils.signals import summarize
19+
except ImportError:
20+
summarize = None
21+
1522

1623
class CorpusTests(unittest.TestCase):
1724
def setUp(self):
@@ -629,5 +636,41 @@ def test_pickle_corpus(self):
629636
pickle.dumps(c)
630637

631638

639+
@skipIf(summarize is None, "summarize is not available for orange-widget-base<4.13")
class TestCorpusSummaries(unittest.TestCase):
    """Tests for the summary registered for Corpus via ``summarize``."""

    @staticmethod
    def _expected_details(corpus, last_line):
        """Return the details text expected for ``corpus``.

        The first four lines are shared by both tests; ``last_line`` is the
        corpus-specific line appended after them.
        """
        variable_count = len(corpus.domain.variables) + len(corpus.domain.metas)
        shared_lines = (
            f"<nobr>{len(corpus)} instances, {variable_count} variables</nobr><br/>"
            f"<nobr>Features: — (no missing values)</nobr><br/>"
            f"<nobr>Target: categorical</nobr><br/>"
            f"<nobr>Metas: string</nobr><br/>"
        )
        return shared_lines + last_line

    def test_corpus_not_preprocessed(self):
        """Details of a raw corpus end with the 'not preprocessed' note"""
        raw_corpus = Corpus.from_file("book-excerpts")

        expected = self._expected_details(
            raw_corpus, f"<nobr>Corpus is not preprocessed</nobr>"
        )
        result = summarize.dispatch(Corpus)(raw_corpus)
        self.assertEqual(140, result.summary)
        self.assertEqual(expected, result.details)

    def test_corpus_preprocessed(self):
        """Details of a tokenized corpus end with token and type totals"""
        tokenized = RegexpTokenizer()(Corpus.from_file("book-excerpts"))

        expected = self._expected_details(
            tokenized, f"<nobr>Total tokens: 128020, Total types: 11712</nobr>"
        )
        result = summarize.dispatch(Corpus)(tokenized)
        self.assertEqual(140, result.summary)
        self.assertEqual(expected, result.details)
673+
674+
632675
if __name__ == "__main__":
633676
unittest.main()

0 commit comments

Comments
 (0)