Skip to content

Commit f7b2e3e

Browse files
committed
Corpus - automated input/output summaries
1 parent 0ce67c1 commit f7b2e3e

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

orangecontrib/text/corpus.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010
import scipy.sparse as sp
1111
from gensim import corpora
12+
from orangewidget.utils.signals import summarize, PartialSummary
1213

1314
from Orange.data import (
1415
Variable,
@@ -657,3 +658,20 @@ def arrays_equal(a, b):
657658
np.array_equal(self.pos_tags, other.pos_tags) and
658659
self.domain == other.domain and
659660
self.ngram_range == other.ngram_range)
661+
662+
663+
@summarize.register(Corpus)
def summarize_(corpus: Corpus) -> PartialSummary:
    """
    Build the automated input/output summary for a Corpus.

    The summary registered for Table is computed first and reused as-is;
    its details are extended with one extra line: token and type counts
    when the corpus has tokens, otherwise a note that the corpus is not
    preprocessed.
    """
    base = summarize.dispatch(Table)(corpus)

    if corpus.has_tokens():
        # Tokens are only read on this branch, i.e. when has_tokens() is true.
        n_tokens = sum(len(doc_tokens) for doc_tokens in corpus.tokens)
        n_types = len(corpus.dictionary)
        suffix = (
            f"<br/><nobr>Total tokens: {n_tokens}, "
            f"Total types: {n_types}</nobr>"
        )
    else:
        suffix = "<br/><nobr>Corpus is not preprocessed</nobr>"

    return PartialSummary(base.summary, base.details + suffix)

orangecontrib/text/tests/test_corpus.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import numpy as np
66
from numpy.testing import assert_array_equal
7+
from orangecontrib.text.preprocess import RegexpTokenizer
8+
from orangewidget.utils.signals import Input, summarize
79
from scipy.sparse import csr_matrix, issparse
810

911
from Orange.data import Table, DiscreteVariable, StringVariable, Domain, ContinuousVariable
@@ -629,5 +631,40 @@ def test_pickle_corpus(self):
629631
pickle.dumps(c)
630632

631633

634+
class TestCorpusSummaries(unittest.TestCase):
    """Tests for the summary dispatched for Corpus objects."""

    @staticmethod
    def _table_details(corpus):
        """Return the leading details lines shared by both expected summaries."""
        domain = corpus.domain
        n_features = len(domain.variables) + len(domain.metas)
        return (
            f"<nobr>{len(corpus)} instances, {n_features} variables</nobr><br/>"
            "<nobr>Features: — (no missing values)</nobr><br/>"
            "<nobr>Target: categorical</nobr><br/>"
            "<nobr>Metas: string</nobr><br/>"
        )

    def test_corpus_not_preprocessed(self):
        """Details of a freshly loaded corpus end with the 'not preprocessed' note."""
        raw = Corpus.from_file("book-excerpts")

        expected = (
            self._table_details(raw)
            + "<nobr>Corpus is not preprocessed</nobr>"
        )

        result = summarize.dispatch(Corpus)(raw)
        self.assertEqual(140, result.summary)
        self.assertEqual(expected, result.details)

    def test_corpus_preprocessed(self):
        """Details of a tokenized corpus end with the token and type counts."""
        tokenized = RegexpTokenizer()(Corpus.from_file("book-excerpts"))

        expected = (
            self._table_details(tokenized)
            + "<nobr>Total tokens: 128020, Total types: 11712</nobr>"
        )

        result = summarize.dispatch(Corpus)(tokenized)
        self.assertEqual(140, result.summary)
        self.assertEqual(expected, result.details)
667+
668+
632669
# Run the test suite only when this module is executed directly.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)