CentreForDigitalHumanities
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/.vscode/launch.json b/.vscode/launch.json
Lines changed: 7 additions & 6 deletions
diff --git a/CITATION.cff b/CITATION.cff
Lines changed: 2 additions & 2 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/backend/addcorpus/python_corpora/load_corpus.py b/backend/addcorpus/python_corpora/load_corpus.py
Lines changed: 13 additions & 33 deletions
diff --git a/backend/addcorpus/python_corpora/tests/test_corpusimport.py b/backend/addcorpus/python_corpora/tests/test_corpusimport.py
Lines changed: 8 additions & 39 deletions
diff --git a/backend/addcorpus/python_corpora/tests/test_times_source.py b/backend/addcorpus/python_corpora/tests/test_times_source.py
Lines changed: 1 addition & 1 deletion
diff --git a/backend/addcorpus/schemas/corpus.schema.json b/backend/addcorpus/schemas/corpus.schema.json
Lines changed: 1 addition & 1 deletion
diff --git a/backend/addcorpus/tests/test_field_order.py b/backend/addcorpus/tests/test_field_order.py
Lines changed: 23 additions & 41 deletions
diff --git a/backend/corpora/dbnl/tests/test_dbnl_extraction.py b/backend/corpora/dbnl/tests/test_dbnl_extraction.py
Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ body:
       label: Where did you find the bug?
       description: Please add where you found the bug.
       options:
-        - label: https://ianalyzer.hum.uu.nl
+        - label: https://textcavator.hum.uu.nl
         - label: https://peopleandparliament.hum.uu.nl
         - label: https://peace.sites.uu.nl
         - label: a server hosted elsewhere (i.e. not by the research software lab)
@@ -58,7 +58,7 @@ body:
       description: |
         How can a developer replicate the issue? Please provide any information you can. For
         example: "I went to
-        https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then
+        https://textcavator.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then
         clicked on Download CSV. I pressed cancel and then I clicked Download CSV again."
     validations:
       required: true
 
@@ -6,7 +6,7 @@
     "configurations": [
         {
             "name": "django: runserver",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
             "args": ["runserver"],
@@ -15,7 +15,7 @@
         },
         {
             "name": "django: shell",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
             "args": ["shell"],
@@ -24,16 +24,17 @@
         },
         {
             "name": "django: index",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
-            "args": ["index", "${input:corpusName}"],
+            "args": ["index", "${input:corpusName}", "--delete"],
             "django": true,
-            "justMyCode": true
+            "justMyCode": true,
+            "console": "internalConsole"
         },
         {
             "name": "django: loadcorpora",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/backend/manage.py",
             "args": ["loadcorpora"],
 
@@ -35,5 +35,5 @@ keywords:
   - elasticsearch
   - natural language processing
 license: MIT
-version: 5.26.0
-date-released: '2025-12-11'
+version: 5.27.0
+date-released: '2026-02-02'
@@ -22,7 +22,7 @@ For corpora included in Textcavator, the backend includes a definition file that
 
 ## Usage
 
-If you are interested in using Textcavator, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
+If you are interested in using Textcavator, the most straightforward way to get started is to visit [textcavator.hum.uu.nl](https://textcavator.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
 
 Textcavator does not have an "upload data" option (yet!). If you are interested in using Textcavator as a way to publish your dataset, or to make it easier to search and analyse, you can go about this two ways:
 
 
@@ -1,59 +1,39 @@
 import logging
-import re
 import sys
-from importlib import util
-from os.path import abspath, dirname
+from os.path import dirname
+from django.utils.module_loading import import_string
+from typing import Type, Optional, Dict
+from inspect import getabsfile
 
 from addcorpus.python_corpora.corpus import CorpusDefinition
 from django.conf import settings
 
 logger = logging.getLogger(__name__)
-from addcorpus.python_corpora.corpus import CorpusDefinition
-
 
-def corpus_path(corpus_name):
-    return abspath(settings.CORPORA.get(corpus_name))
 
-def corpus_dir(corpus_name):
+def corpus_dir(corpus_name: str) -> str:
     """Gets the absolute path to the corpus definition directory
 
     Arguments:
         corpus_name {str} -- Key of the corpus in CORPORA object in settings
     """
-    return dirname(corpus_path(corpus_name))
+    corpus = load_corpus_definition(corpus_name)
+    return dirname(getabsfile(corpus.__class__))
 
-def load_corpus_definition(corpus_name) -> CorpusDefinition:
-    filepath = corpus_path(corpus_name)
-    try:
-        corpus_spec = util.spec_from_file_location(
-            corpus_name,
-            filepath)
 
-        corpus_mod = util.module_from_spec(corpus_spec)
-    except FileNotFoundError:
-        logger.critical(
-            'No module describing the corpus "{0}" found in the specified file path:\
-            {1}'.format(corpus_name, filepath)
-        )
-        raise
+def load_corpus_definition(corpus_name: str) -> CorpusDefinition:
+    """Imports and instantiates the corpus definition registered for a corpus
+
+    Arguments:
+        corpus_name {str} -- Key of the corpus in CORPORA object in settings
+
+    Returns:
+        A new instance (not the class itself) of the class found at the
+        dotted import path stored under `corpus_name` in settings.CORPORA.
+        The class is called without arguments.
+
+    Raises:
+        Whatever `import_string` raises when the path cannot be resolved,
+        e.g. ModuleNotFoundError when the module does not exist.
+    """
+    import_path = settings.CORPORA.get(corpus_name)
+    return import_string(import_path)()
 
-    corpus_spec.loader.exec_module(corpus_mod)
-    # assume the class name is the same as the corpus name,
-    # allowing for differences in camel case vs. lower case
-    regex = re.compile('[^a-zA-Z]')
-    corpus_name = regex.sub('', corpus_name).lower()
-    endpoint = next((attr for attr in dir(corpus_mod)
-                     if attr.lower() == corpus_name), None)
-    corpus_class = getattr(corpus_mod, endpoint)
-    return corpus_class()
 
-def _try_loading_corpus_definition(corpus_name, stderr=sys.stderr):
+def _try_loading_corpus_definition(corpus_name, stderr=sys.stderr) -> Optional[CorpusDefinition]:
     try:
         return load_corpus_definition(corpus_name)
     except Exception as e:
         logger.exception('Could not load corpus %s: %s', corpus_name, e)
 
-def load_all_corpus_definitions(stderr=sys.stderr):
+
+def load_all_corpus_definitions(stderr=sys.stderr) -> Dict[str, CorpusDefinition]:
     '''
     Return a dict with corpus names and corpus definition objects.
     '''
 
@@ -16,12 +16,12 @@ def test_key_error(db, settings):
 
 def test_import_error(db, settings):
     ''' Verify that exceptions is correctly raised
-    - in case the file path in config.CORPORA is faulty
+    - in case the path in config.CORPORA is faulty
     '''
 
-    settings.CORPORA = {'times2': '/somewhere/times/times.py'}
+    settings.CORPORA = {'times2': 'somewhere.times.times.Times'}
 
-    with pytest.raises(FileNotFoundError) as e:
+    with pytest.raises(ModuleNotFoundError) as e:
         load_corpus.load_corpus_definition('times2')
 
     # corpus should not be included when
@@ -30,40 +30,9 @@ def test_import_error(db, settings):
     assert 'times2' not in corpora
     assert not Corpus.objects.filter(name='times2')
 
-mock_corpus_definition = '''
-class Times():
-    title = "Times"
-    description = "Newspaper archive, 1785-2010"
-    fields = []
-    es_index = 'some-other-name'
-'''
 
-@pytest.fixture()
-def temp_times_definition(tmpdir, settings):
-    '''Provide a temporary definition files for the
-    times corpus
-    '''
-    testdir = tmpdir.mkdir('/testdir')
-
-    with open(os.path.join(testdir, 'times.py'), 'w') as f:
-        f.write(mock_corpus_definition)
-    path_testfile = str(testdir)+'/times.py'
-
-    settings.CORPORA = {'times': path_testfile}
-
-def test_import_from_anywhere(db, temp_times_definition):
-    ''' Verify that the corpus definition
-    can live anywhere in the file system
-    '''
-    corpus_definitions = load_corpus.load_all_corpus_definitions()
-    assert 'times' in corpus_definitions
-    corpus = corpus_definitions['times']
-    assert corpus.title == 'Times'
-
-def test_corpus_dir_is_absolute(db, temp_times_definition):
-    corpus_dir = load_corpus.corpus_dir('times')
-    assert os.path.isabs(corpus_dir)
-
-def test_mismatch_corpus_index_names(temp_times_definition):
-    times = load_corpus.load_corpus_definition('times')
-    assert times.es_index == 'some-other-name'
+def test_corpus_dir(db, settings, basic_mock_corpus):
+    '''
+    corpus_dir should give an absolute path to a directory that contains
+    both mock_csv_corpus.py and source_data
+    '''
+    directory = load_corpus.corpus_dir(basic_mock_corpus)
+    assert os.path.isabs(directory)
+    entries = os.listdir(directory)
+    assert 'mock_csv_corpus.py' in entries
+    assert 'source_data' in entries
@@ -9,7 +9,7 @@
 @pytest.fixture()
 def times_test_settings(settings):
     settings.CORPORA = {
-        'times': join(settings.BASE_DIR, 'corpora/times/times.py')
+        'times': 'corpora.times.times.Times',
     }
     settings.TIMES_DATA = join(settings.BASE_DIR, 'addcorpus/python_corpora/tests')
     settings.TIMES_ES_INDEX = 'test-times'
 
@@ -43,7 +43,7 @@
                         "oration",
                         "book",
                         "letter",
-                        "poerty",
+                        "poetry",
                         "social",
                         "informative"
                     ]
 
@@ -1,41 +1,23 @@
-import shutil
-import os
-
-from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora
-
-def test_field_order_python_corpus(small_mock_corpus, admin_client, tmpdir, settings):
-	# check field order matches corpus definition
-	response = admin_client.get('/api/corpus/')
-	corpus_data = next(c for c in response.data if c['name'] == small_mock_corpus)
-	field_names = [field['name'] for field in corpus_data['fields']]
-	assert field_names == ['date', 'title', 'content', 'genre']
-
-	# copy corpus definition into tmpdir
-	current_dir = os.path.join(settings.BASE_DIR, 'corpora_test', 'small')
-	shutil.copytree(current_dir, tmpdir, dirs_exist_ok=True)
-
-	definition_path = os.path.join(tmpdir, 'small_mock_corpus.py')
-
-	with open(definition_path, 'r') as definition_file:
-		definition_str = definition_file.read()
-
-	# replace `fields = [...]` line in file to change field order
-	definition_str = definition_str.replace(
-		'fields = [date, title_field, content, genre]',
-		'fields = [title_field, content, genre, date]'
-	)
-
-	# save edited definition
-	with open(definition_path, 'w') as definition_file:
-		definition_file.write(definition_str)
-
-	# check order has changed
-	settings.CORPORA[small_mock_corpus] = definition_path
-	load_and_save_all_corpora()
-
-    # Okay this test will never work because it actually just looks at the es_index ordering of fields....
-	response = admin_client.get('/api/corpus/')
-	corpus_data = next(c for c in response.data if c['name'] == small_mock_corpus)
-
-	field_names = [field['name'] for field in corpus_data['fields']]
-	assert field_names == ['title', 'content', 'genre', 'date']
+from addcorpus.models import Corpus
+from addcorpus.python_corpora.load_corpus import load_corpus_definition
+from addcorpus.python_corpora.save_corpus import _save_corpus_configuration
+
+def test_field_order_python_corpus(small_mock_corpus, admin_client):
+    '''
+    The corpus API should list fields in the order of the saved corpus
+    definition, also after the definition's fields are reversed and re-saved
+    '''
+
+    def listed_field_names():
+        # field names of the mock corpus, in the order the API returns them
+        api_response = admin_client.get('/api/corpus/')
+        entry = next(c for c in api_response.data if c['name'] == small_mock_corpus)
+        return [field['name'] for field in entry['fields']]
+
+    # check field order matches corpus definition
+    assert listed_field_names() == ['date', 'title', 'content', 'genre']
+
+    # update field order
+    corpus = Corpus.objects.get(name=small_mock_corpus)
+    definition = load_corpus_definition(small_mock_corpus)
+    definition.fields = list(reversed(definition.fields))
+    _save_corpus_configuration(corpus, definition)
+
+    # check order is updated
+    assert listed_field_names() == ['genre', 'content', 'title', 'date']
@@ -13,8 +13,8 @@ def dbnl_corpus(settings):
     settings.DBNL_DATA = os.path.join(here, 'data')
     # for testing purposes, also add the metadata helper corpus
     settings.CORPORA = {
-        'dbnl': os.path.join(here, '..', 'dbnl.py'),
-        'dbnl_metadata': os.path.join(here, '..', 'dbnl_metadata.py'),
+        'dbnl': 'corpora.dbnl.dbnl.DBNL',
+        'dbnl_metadata': 'corpora.dbnl.dbnl_metadata.DBNLMetadata',
     }
     return 'dbnl'
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`@pytest.fixture()`
`10`	`10`	`def times_test_settings(settings):`
`11`	`11`	`settings.CORPORA = {`
`12`		`- 'times': join(settings.BASE_DIR, 'corpora/times/times.py')`
	`12`	`+ 'times': 'corpora.times.times.Times',`
`13`	`13`	`}`
`14`	`14`	`settings.TIMES_DATA = join(settings.BASE_DIR, 'addcorpus/python_corpora/tests')`
`15`	`15`	`settings.TIMES_ES_INDEX = 'test-times'`
Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@`
`43`	`43`	`"oration",`
`44`	`44`	`"book",`
`45`	`45`	`"letter",`
`46`		`- "poerty",`
	`46`	`+ "poetry",`
`47`	`47`	`"social",`
`48`	`48`	`"informative"`
`49`	`49`	`]`