|
1 | 1 | import os |
2 | 2 | import pickle |
3 | 3 | import unittest |
| 4 | +from unittest import skipIf |
4 | 5 |
|
5 | 6 | import numpy as np |
6 | 7 | from numpy.testing import assert_array_equal |
| 8 | +from orangecontrib.text.preprocess import RegexpTokenizer |
7 | 9 | from scipy.sparse import csr_matrix, issparse |
8 | 10 |
|
9 | 11 | from Orange.data import Table, DiscreteVariable, StringVariable, Domain, ContinuousVariable |
|
12 | 14 | from orangecontrib.text.corpus import Corpus |
13 | 15 | from orangecontrib.text.tag import AveragedPerceptronTagger |
14 | 16 |
|
# `summarize` is an optional import: when it cannot be imported the name is
# bound to None, which the skipIf guard on TestCorpusSummaries checks for.
try:
    from orangewidget.utils.signals import summarize
except ImportError:
    summarize = None
| 21 | + |
15 | 22 |
|
16 | 23 | class CorpusTests(unittest.TestCase): |
17 | 24 | def setUp(self): |
@@ -629,5 +636,41 @@ def test_pickle_corpus(self): |
629 | 636 | pickle.dumps(c) |
630 | 637 |
|
631 | 638 |
|
@skipIf(
    summarize is None,
    "summarize is not available for orange-widget-base<4.13",
)
class TestCorpusSummaries(unittest.TestCase):
    """Tests for the summary object that ``summarize`` builds for a Corpus."""

    @staticmethod
    def _expected_details(corpus, closing_line):
        """Return the details markup expected for ``corpus``.

        The first line reports the number of documents and the combined count
        of domain variables and metas; ``closing_line`` is the final
        ``<nobr>`` element, which is the only part that differs between the
        tests in this class.
        """
        variable_count = len(corpus.domain.variables) + len(corpus.domain.metas)
        parts = [
            f"<nobr>{len(corpus)} instances, {variable_count} variables</nobr><br/>",
            "<nobr>Features: — (no missing values)</nobr><br/>",
            "<nobr>Target: categorical</nobr><br/>",
            "<nobr>Metas: string</nobr><br/>",
            closing_line,
        ]
        return "".join(parts)

    def test_corpus_not_preprocessed(self):
        """Summary of a freshly loaded corpus reports it as not preprocessed."""
        corpus = Corpus.from_file("book-excerpts")
        expected = self._expected_details(
            corpus, "<nobr>Corpus is not preprocessed</nobr>"
        )

        # Look up the handler registered for Corpus and apply it directly.
        summarize_corpus = summarize.dispatch(Corpus)
        result = summarize_corpus(corpus)

        self.assertEqual(140, result.summary)
        self.assertEqual(expected, result.details)

    def test_corpus_preprocessed(self):
        """Summary of a tokenized corpus reports token and type totals."""
        loaded = Corpus.from_file("book-excerpts")
        tokenizer = RegexpTokenizer()
        corpus = tokenizer(loaded)
        expected = self._expected_details(
            corpus, "<nobr>Total tokens: 128020, Total types: 11712</nobr>"
        )

        # Look up the handler registered for Corpus and apply it directly.
        summarize_corpus = summarize.dispatch(Corpus)
        result = summarize_corpus(corpus)

        self.assertEqual(140, result.summary)
        self.assertEqual(expected, result.details)
| 673 | + |
| 674 | + |
# Run this module's tests through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
0 commit comments