Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import scipy.sparse as sp

from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.path import fix_relative_path, fix_absolute_path


def get_sample_corpora_dir():
Expand Down Expand Up @@ -594,7 +595,10 @@ def from_table_rows(cls, source, row_indices):
return c

@classmethod
def from_file(cls, filename, sheet=None):
def from_file(cls, filename, sheet=None, relative_to=None):
if relative_to:
filename = fix_absolute_path(filename, relative_to)

if not os.path.exists(filename): # check the default location
abs_path = os.path.join(get_sample_corpora_dir(), filename)
if not abs_path.endswith('.tab'):
Expand All @@ -609,6 +613,13 @@ def from_file(cls, filename, sheet=None):
name = table.name
table = cls.from_numpy(table.domain, table.X, table.Y, table.metas, table.W, attributes=table.attributes)
table.name = name

# Save relative path if possible (for reopening later)
if relative_to:
table.attributes["path"] = fix_relative_path(filename, relative_to)
else:
table.attributes["path"] = filename

return table

@staticmethod
Expand Down Expand Up @@ -654,7 +665,6 @@ def retain_preprocessing(orig, new, key=...):
new._set_unique_titles()
new._infer_text_features()


@summarize.register(Corpus)
def summarize_corpus(corpus: Corpus) -> PartialSummary:
"""
Expand Down
14 changes: 14 additions & 0 deletions orangecontrib/text/path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os

def fix_relative_path(path, base):
    """Express ``path`` relative to ``base`` when that is possible.

    ``os.path.relpath`` raises ``ValueError`` when no relative form can be
    computed (for example an empty ``path``, or paths on different Windows
    drives); in that case ``path`` is handed back untouched.
    """
    try:
        relative = os.path.relpath(path, base)
    except ValueError:
        relative = path
    return relative

def fix_absolute_path(path, base):
    """Resolve ``path`` to an absolute path, anchoring it at ``base``.

    A ``path`` that is already absolute is returned unchanged. Otherwise it
    is joined onto ``base`` and normalised with ``os.path.abspath``.
    """
    if os.path.isabs(path):
        return path
    joined = os.path.join(base, path)
    return os.path.abspath(joined)
17 changes: 17 additions & 0 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
migrate_language_name,
)
from orangecontrib.text.widgets.utils import widgets, QSize
from orangecontrib.text.path import fix_relative_path, fix_absolute_path


class CorpusContextHandler(DomainContextHandler):
Expand Down Expand Up @@ -369,6 +370,22 @@ def describe(features):
('Other features', describe(domain.attributes)),
('Target', describe(domain.class_vars)),
))

def save_settings(self, settings):
    """Record the widget's corpus path in ``settings["corpus_path"]``.

    Nothing is written when the widget has no (non-empty) ``corpus_path``.
    When a non-empty ``workflow_file`` is set, the path is passed through
    ``fix_relative_path`` with the workflow's directory as base; otherwise
    the path is stored exactly as it is.
    """
    corpus_path = getattr(self, "corpus_path", None)
    if not corpus_path:
        return

    workflow_file = getattr(self, "workflow_file", None)
    if workflow_file:
        workflow_dir = os.path.dirname(workflow_file)
        corpus_path = fix_relative_path(corpus_path, workflow_dir)
    settings["corpus_path"] = corpus_path

def load_settings(self, settings):
    """Restore ``self.corpus_path`` from ``settings["corpus_path"]``.

    When both a stored path and a non-empty ``workflow_file`` are
    available, the stored path is passed through ``fix_absolute_path`` with
    the workflow's directory as base. In every other case the stored value
    (possibly ``None``) is assigned unchanged.
    """
    path = settings.get("corpus_path")
    # Check the value, not just the attribute's existence: a widget whose
    # ``workflow_file`` is None would otherwise reach os.path.dirname(None)
    # and raise TypeError. This mirrors the check in ``save_settings``.
    workflow_file = getattr(self, "workflow_file", None)
    if path and workflow_file:
        base = os.path.dirname(workflow_file)
        self.corpus_path = fix_absolute_path(path, base)
    else:
        self.corpus_path = path

@classmethod
def migrate_context(cls, context, version):
Expand Down
34 changes: 34 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import tempfile
import unittest
import shutil
import pickle

import numpy as np
from Orange.data import Table, Domain, StringVariable, ContinuousVariable
Expand Down Expand Up @@ -430,6 +432,38 @@ def test_migrate_settings(self):
self.wait_until_finished(widget=widget)
self.assertIsNone(widget.language)

def test_relative_corpus_path_serialization(self):
    """
    Test that the corpus path is saved relative to the workflow file and
    is restored as an absolute path after workflow and corpus are moved.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Create a dummy corpus file next to the simulated workflow
        corpus = Corpus.from_file("book-excerpts")
        corpus_path = os.path.join(tmp_dir, "test.corpus")
        with open(corpus_path, "wb") as f:
            pickle.dump(corpus, f)

        # Simulate loading the file into widget
        self.widget.workflow_file = os.path.join(tmp_dir, "workflow.ows")
        self.widget.corpus_path = corpus_path

        settings = {}
        self.widget.save_settings(settings)
        # Corpus and workflow share a directory, so the saved path must be
        # just the file name.
        self.assertEqual(settings["corpus_path"], "test.corpus")

        # Simulate moving workflow and corpus to new directory
        with tempfile.TemporaryDirectory() as new_dir:
            new_corpus = os.path.join(new_dir, "test.corpus")
            new_workflow = os.path.join(new_dir, "workflow.ows")
            shutil.copy2(corpus_path, new_corpus)

            # Simulate loading settings in new widget. The settings are
            # passed exactly as saved, so this is a real round trip and
            # not a check against a hand-built value.
            restored = self.create_widget(OWCorpus)
            restored.workflow_file = new_workflow
            restored.load_settings(settings)

            self.assertTrue(os.path.isabs(restored.corpus_path))
            self.assertTrue(os.path.exists(restored.corpus_path))
            self.assertEqual(
                restored.corpus_path, os.path.abspath(new_corpus)
            )

if __name__ == "__main__":
unittest.main()
6 changes: 3 additions & 3 deletions orangecontrib/text/widgets/tests/test_owcorpusviewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_search(self):
self.process_events()
out_corpus = self.get_output(self.widget.Outputs.matching_docs)
self.assertEqual(len(out_corpus), 1)
self.assertEqual(self.widget.n_matches, "7")
self.assertEqual(int(self.widget.n_matches), 7)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR should be rebased and then this test is no longer necessary.


# first document is selected, when filter with word that is not in
# selected document, first of shown documents is selected
Expand All @@ -131,14 +131,14 @@ def test_search(self):
self.process_events()
self.assertEqual(1, len(self.get_output(self.widget.Outputs.matching_docs)))
# word count doesn't depend on selection
self.assertEqual(self.widget.n_matches, "7")
self.assertEqual(int(self.widget.n_matches), 7)

# when filter is removed, matched words is 0
self.widget.regexp_filter = ""
self.widget.refresh_search()
self.process_events()
self.wait_until_finished()
self.assertEqual(self.widget.n_matches, "0")
self.assertEqual(int(self.widget.n_matches), 0)

def test_invalid_regex(self):
# Error is shown when invalid regex is entered
Expand Down