Review notes (text before the first "diff --git" line is ignored by git apply):
- This patch was rebuilt from a copy whose line breaks and indentation had
  been collapsed. Whitespace of context lines is a best guess, so apply it
  with `git apply --ignore-whitespace` (or `patch -l`).
- The "index" blob ids are those of the original patch; the post-image ids
  no longer match the edited content.
- TODO(review): confirm that the widget settings machinery actually calls
  OWCorpus.save_settings/load_settings; within this patch only the new test
  calls them.

diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 614e2349c..65e9e6aed 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -23,6 +23,7 @@
 import scipy.sparse as sp
 
 from orangecontrib.text.language import ISO2LANG
+from orangecontrib.text.path import fix_relative_path, fix_absolute_path
 
 
 def get_sample_corpora_dir():
@@ -594,7 +595,12 @@ def from_table_rows(cls, source, row_indices):
         return c
 
     @classmethod
-    def from_file(cls, filename, sheet=None):
+    def from_file(cls, filename, sheet=None, relative_to=None):
+        # `relative_to` is an optional base directory: a relative `filename`
+        # is resolved against it before the file is looked up.
+        if relative_to:
+            filename = fix_absolute_path(filename, relative_to)
+
         if not os.path.exists(filename):  # check the default location
             abs_path = os.path.join(get_sample_corpora_dir(), filename)
             if not abs_path.endswith('.tab'):
@@ -609,6 +615,13 @@ def from_file(cls, filename, sheet=None):
         name = table.name
         table = cls.from_numpy(table.domain, table.X, table.Y, table.metas, table.W, attributes=table.attributes)
         table.name = name
+
+        # Save relative path if possible (for reopening later)
+        if relative_to:
+            table.attributes["path"] = fix_relative_path(filename, relative_to)
+        else:
+            table.attributes["path"] = filename
+
         return table
 
     @staticmethod
diff --git a/orangecontrib/text/path.py b/orangecontrib/text/path.py
new file mode 100644
index 000000000..3d925be32
--- /dev/null
+++ b/orangecontrib/text/path.py
@@ -0,0 +1,26 @@
+"""Helpers for converting file paths between relative and absolute form."""
+
+import os
+
+
+def fix_relative_path(path, base):
+    """Return `path` expressed relative to the directory `base`.
+
+    If `os.path.relpath` raises ValueError (on Windows this happens when
+    `path` and `base` are on different drives), `path` is returned unchanged.
+    """
+    try:
+        return os.path.relpath(path, base)
+    except ValueError:
+        return path
+
+
+def fix_absolute_path(path, base):
+    """Return an absolute version of `path`.
+
+    A relative `path` is joined onto `base` and normalised with
+    `os.path.abspath`; an absolute `path` is returned unchanged.
+    """
+    if not os.path.isabs(path):
+        return os.path.abspath(os.path.join(base, path))
+    return path
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 6baa1690d..041cf9663 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -24,6 +24,7 @@
     migrate_language_name,
 )
 from orangecontrib.text.widgets.utils import widgets, QSize
+from orangecontrib.text.path import fix_relative_path, fix_absolute_path
 
 
 class CorpusContextHandler(DomainContextHandler):
@@ -369,6 +370,38 @@ def describe(features):
             ('Other features', describe(domain.attributes)),
             ('Target', describe(domain.class_vars)),
         ))
+
+    def save_settings(self, settings):
+        """Store the corpus path in `settings` under "corpus_path".
+
+        The path is written relative to the workflow file's directory when
+        `self.workflow_file` is set, and unchanged otherwise. Nothing is
+        written when `self.corpus_path` is missing or empty.
+        """
+        corpus_path = getattr(self, "corpus_path", None)
+        if not corpus_path:
+            return
+        workflow_file = getattr(self, "workflow_file", None)
+        if workflow_file:
+            base = os.path.dirname(workflow_file)
+            corpus_path = fix_relative_path(corpus_path, base)
+        settings["corpus_path"] = corpus_path
+
+    def load_settings(self, settings):
+        """Restore `self.corpus_path` from `settings["corpus_path"]`.
+
+        A stored relative path is resolved against the workflow file's
+        directory. When no workflow file is known (attribute missing, None or
+        empty) the stored value is used as is.
+        """
+        path = settings.get("corpus_path")
+        # hasattr() alone is true for `workflow_file = None`, which would
+        # make os.path.dirname() raise TypeError; test the value instead.
+        workflow_file = getattr(self, "workflow_file", None)
+        if path and workflow_file:
+            base = os.path.dirname(workflow_file)
+            path = fix_absolute_path(path, base)
+        self.corpus_path = path
 
     @classmethod
     def migrate_context(cls, context, version):
diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py
index 259964526..5b94df8ac 100644
--- a/orangecontrib/text/widgets/tests/test_owcorpus.py
+++ b/orangecontrib/text/widgets/tests/test_owcorpus.py
@@ -1,6 +1,8 @@
 import os
 import tempfile
 import unittest
+import shutil
+import pickle
 
 import numpy as np
 from Orange.data import Table, Domain, StringVariable, ContinuousVariable
@@ -430,6 +432,40 @@ def test_migrate_settings(self):
         self.wait_until_finished(widget=widget)
         self.assertIsNone(widget.language)
 
+    def test_relative_corpus_path_serialization(self):
+        """
+        Test if relative paths are properly saved and reloaded.
+        """
+        # Create a dummy corpus file
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            corpus = Corpus.from_file("book-excerpts")
+            corpus_path = os.path.join(tmp_dir, "test.corpus")
+            with open(corpus_path, "wb") as f:
+                pickle.dump(corpus, f)
+
+            # Simulate loading the file into widget
+            self.widget.workflow_file = os.path.join(tmp_dir, "workflow.ows")
+            self.widget.corpus_path = corpus_path
+
+            settings = {}
+            self.widget.save_settings(settings)
+            # The stored path must be relative to the workflow file
+            self.assertEqual(settings["corpus_path"], "test.corpus")
+
+            # Simulate moving workflow and corpus to new directory
+            with tempfile.TemporaryDirectory() as new_dir:
+                new_corpus = os.path.join(new_dir, "test.corpus")
+                new_workflow = os.path.join(new_dir, "workflow.ows")
+                shutil.copy2(corpus_path, new_corpus)
+
+                # Load the saved settings, unmodified, in a new widget
+                restored = self.create_widget(OWCorpus)
+                restored.workflow_file = new_workflow
+                restored.load_settings(settings)
+
+                self.assertTrue(os.path.exists(restored.corpus_path))
+                self.assertTrue(os.path.isabs(restored.corpus_path))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
index 8463d615b..09af5400a 100644
--- a/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
+++ b/orangecontrib/text/widgets/tests/test_owcorpusviewer.py
@@ -122,7 +122,7 @@ def test_search(self):
         self.process_events()
         out_corpus = self.get_output(self.widget.Outputs.matching_docs)
         self.assertEqual(len(out_corpus), 1)
-        self.assertEqual(self.widget.n_matches, "7")
+        self.assertEqual(int(self.widget.n_matches), 7)
 
         # first document is selected, when filter with word that is not in
         # selected document, first of shown documents is selected
@@ -131,14 +131,14 @@ def test_search(self):
         self.process_events()
         self.assertEqual(1, len(self.get_output(self.widget.Outputs.matching_docs)))
         # word count doesn't depend on selection
-        self.assertEqual(self.widget.n_matches, "7")
+        self.assertEqual(int(self.widget.n_matches), 7)
 
         # when filter is removed, matched words is 0
         self.widget.regexp_filter = ""
         self.widget.refresh_search()
         self.process_events()
         self.wait_until_finished()
-        self.assertEqual(self.widget.n_matches, "0")
+        self.assertEqual(int(self.widget.n_matches), 0)
 
     def test_invalid_regex(self):
         # Error is shown when invalid regex is entered