Skip to content

Commit 29f02e2

Browse files
Merge branch 'release/5.27.0'
2 parents 5803f26 + 99524b6 commit 29f02e2

File tree

117 files changed

+1991
-2011
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

117 files changed

+1991
-2011
lines changed

.github/ISSUE_TEMPLATE/bug_report.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ body:
3535
label: Where did you find the bug?
3636
description: Please add where you found the bug.
3737
options:
38-
- label: https://ianalyzer.hum.uu.nl
38+
- label: https://textcavator.hum.uu.nl
3939
- label: https://peopleandparliament.hum.uu.nl
4040
- label: https://peace.sites.uu.nl
4141
- label: a server hosted elsewhere (i.e. not by the research software lab)
@@ -58,7 +58,7 @@ body:
5858
description: |
5959
How can a developer replicate the issue? Please provide any information you can. For
6060
example: "I went to
61-
https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then
61+
https://textcavator.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then
6262
clicked on Download CSV. I pressed cancel and then I clicked Download CSV again."
6363
validations:
6464
required: true

.vscode/launch.json

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"configurations": [
77
{
88
"name": "django: runserver",
9-
"type": "python",
9+
"type": "debugpy",
1010
"request": "launch",
1111
"program": "${workspaceFolder}/backend/manage.py",
1212
"args": ["runserver"],
@@ -15,7 +15,7 @@
1515
},
1616
{
1717
"name": "django: shell",
18-
"type": "python",
18+
"type": "debugpy",
1919
"request": "launch",
2020
"program": "${workspaceFolder}/backend/manage.py",
2121
"args": ["shell"],
@@ -24,16 +24,17 @@
2424
},
2525
{
2626
"name": "django: index",
27-
"type": "python",
27+
"type": "debugpy",
2828
"request": "launch",
2929
"program": "${workspaceFolder}/backend/manage.py",
30-
"args": ["index", "${input:corpusName}"],
30+
"args": ["index", "${input:corpusName}", "--delete"],
3131
"django": true,
32-
"justMyCode": true
32+
"justMyCode": true,
33+
"console": "internalConsole"
3334
},
3435
{
3536
"name": "django: loadcorpora",
36-
"type": "python",
37+
"type": "debugpy",
3738
"request": "launch",
3839
"program": "${workspaceFolder}/backend/manage.py",
3940
"args": ["loadcorpora"],

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,5 @@ keywords:
3535
- elasticsearch
3636
- natural language processing
3737
license: MIT
38-
version: 5.26.0
39-
date-released: '2025-12-11'
38+
version: 5.27.0
39+
date-released: '2026-02-02'

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ For corpora included in Textcavator, the backend includes a definition file that
2222

2323
## Usage
2424

25-
If you are interested in using Textcavator, the most straightforward way to get started is to visit [ianalyzer.hum.uu.nl](https://ianalyzer.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
25+
If you are interested in using Textcavator, the most straightforward way to get started is to visit [textcavator.hum.uu.nl](https://textcavator.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
2626

2727
Textcavator does not have an "upload data" option (yet!). If you are interested in using Textcavator as a way to publish your dataset, or to make it easier to search and analyse, you can go about this two ways:
2828

backend/addcorpus/python_corpora/load_corpus.py

Lines changed: 13 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,39 @@
11
import logging
2-
import re
32
import sys
4-
from importlib import util
5-
from os.path import abspath, dirname
3+
from os.path import dirname
4+
from django.utils.module_loading import import_string
5+
from typing import Type, Optional, Dict
6+
from inspect import getabsfile
67

78
from addcorpus.python_corpora.corpus import CorpusDefinition
89
from django.conf import settings
910

1011
logger = logging.getLogger(__name__)
11-
from addcorpus.python_corpora.corpus import CorpusDefinition
12-
1312

14-
def corpus_path(corpus_name):
15-
return abspath(settings.CORPORA.get(corpus_name))
1613

17-
def corpus_dir(corpus_name):
14+
def corpus_dir(corpus_name: str) -> str:
1815
"""Gets the absolute path to the corpus definition directory
1916
2017
Arguments:
2118
corpus_name {str} -- Key of the corpus in CORPORA object in settings
2219
"""
23-
return dirname(corpus_path(corpus_name))
20+
corpus = load_corpus_definition(corpus_name)
21+
return dirname(getabsfile(corpus.__class__))
2422

25-
def load_corpus_definition(corpus_name) -> CorpusDefinition:
26-
filepath = corpus_path(corpus_name)
27-
try:
28-
corpus_spec = util.spec_from_file_location(
29-
corpus_name,
30-
filepath)
3123

32-
corpus_mod = util.module_from_spec(corpus_spec)
33-
except FileNotFoundError:
34-
logger.critical(
35-
'No module describing the corpus "{0}" found in the specified file path:\
36-
{1}'.format(corpus_name, filepath)
37-
)
38-
raise
24+
def load_corpus_definition(corpus_name) -> CorpusDefinition:
    """Create the corpus definition object registered under `corpus_name`.

    Looks up the dotted import path stored for `corpus_name` in
    `settings.CORPORA`, imports the class found at that path and returns
    a new instance of it.

    Arguments:
        corpus_name {str} -- Key of the corpus in CORPORA object in settings

    Returns:
        An instance of the imported class (not the class itself).

    Raises:
        ModuleNotFoundError -- if the module named in the import path
            cannot be found.
    """
    import_path = settings.CORPORA.get(corpus_name)
    # import_string resolves the dotted path to the class object; calling it
    # builds the definition instance that callers work with.
    return import_string(import_path)()
3927

40-
corpus_spec.loader.exec_module(corpus_mod)
41-
# assume the class name is the same as the corpus name,
42-
# allowing for differences in camel case vs. lower case
43-
regex = re.compile('[^a-zA-Z]')
44-
corpus_name = regex.sub('', corpus_name).lower()
45-
endpoint = next((attr for attr in dir(corpus_mod)
46-
if attr.lower() == corpus_name), None)
47-
corpus_class = getattr(corpus_mod, endpoint)
48-
return corpus_class()
4928

50-
def _try_loading_corpus_definition(corpus_name, stderr=sys.stderr):
29+
def _try_loading_corpus_definition(corpus_name, stderr=sys.stderr) -> Optional[CorpusDefinition]:
5130
try:
5231
return load_corpus_definition(corpus_name)
5332
except Exception as e:
5433
logger.exception('Could not load corpus %s: %s', corpus_name, e)
5534

56-
def load_all_corpus_definitions(stderr=sys.stderr):
35+
36+
def load_all_corpus_definitions(stderr=sys.stderr) -> Dict[str, CorpusDefinition]:
5737
'''
5838
Return a dict with corpus names and corpus definition objects.
5939
'''

backend/addcorpus/python_corpora/tests/test_corpusimport.py

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@ def test_key_error(db, settings):
1616

1717
def test_import_error(db, settings):
1818
''' Verify that an exception is correctly raised
19-
- in case the file path in config.CORPORA is faulty
19+
- in case the path in config.CORPORA is faulty
2020
'''
2121

22-
settings.CORPORA = {'times2': '/somewhere/times/times.py'}
22+
settings.CORPORA = {'times2': 'somewhere.times.times.Times'}
2323

24-
with pytest.raises(FileNotFoundError) as e:
24+
with pytest.raises(ModuleNotFoundError) as e:
2525
load_corpus.load_corpus_definition('times2')
2626

2727
# corpus should not be included when
@@ -30,40 +30,9 @@ def test_import_error(db, settings):
3030
assert 'times2' not in corpora
3131
assert not Corpus.objects.filter(name='times2')
3232

33-
mock_corpus_definition = '''
34-
class Times():
35-
title = "Times"
36-
description = "Newspaper archive, 1785-2010"
37-
fields = []
38-
es_index = 'some-other-name'
39-
'''
4033

41-
@pytest.fixture()
42-
def temp_times_definition(tmpdir, settings):
43-
'''Provide a temporary definition files for the
44-
times corpus
45-
'''
46-
testdir = tmpdir.mkdir('/testdir')
47-
48-
with open(os.path.join(testdir, 'times.py'), 'w') as f:
49-
f.write(mock_corpus_definition)
50-
path_testfile = str(testdir)+'/times.py'
51-
52-
settings.CORPORA = {'times': path_testfile}
53-
54-
def test_import_from_anywhere(db, temp_times_definition):
55-
''' Verify that the corpus definition
56-
can live anywhere in the file system
57-
'''
58-
corpus_definitions = load_corpus.load_all_corpus_definitions()
59-
assert 'times' in corpus_definitions
60-
corpus = corpus_definitions['times']
61-
assert corpus.title == 'Times'
62-
63-
def test_corpus_dir_is_absolute(db, temp_times_definition):
64-
corpus_dir = load_corpus.corpus_dir('times')
65-
assert os.path.isabs(corpus_dir)
66-
67-
def test_mismatch_corpus_index_names(temp_times_definition):
68-
times = load_corpus.load_corpus_definition('times')
69-
assert times.es_index == 'some-other-name'
34+
def test_corpus_dir(db, settings, basic_mock_corpus):
35+
path = load_corpus.corpus_dir(basic_mock_corpus)
36+
assert os.path.isabs(path)
37+
assert 'mock_csv_corpus.py' in os.listdir(path)
38+
assert 'source_data' in os.listdir(path)

backend/addcorpus/python_corpora/tests/test_times_source.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
@pytest.fixture()
1010
def times_test_settings(settings):
1111
settings.CORPORA = {
12-
'times': join(settings.BASE_DIR, 'corpora/times/times.py')
12+
'times': 'corpora.times.times.Times',
1313
}
1414
settings.TIMES_DATA = join(settings.BASE_DIR, 'addcorpus/python_corpora/tests')
1515
settings.TIMES_ES_INDEX = 'test-times'

backend/addcorpus/schemas/corpus.schema.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
"oration",
4444
"book",
4545
"letter",
46-
"poerty",
46+
"poetry",
4747
"social",
4848
"informative"
4949
]
Lines changed: 23 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,23 @@
1-
import shutil
2-
import os
3-
4-
from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora
5-
6-
def test_field_order_python_corpus(small_mock_corpus, admin_client, tmpdir, settings):
7-
# check field order matches corpus definition
8-
response = admin_client.get('/api/corpus/')
9-
corpus_data = next(c for c in response.data if c['name'] == small_mock_corpus)
10-
field_names = [field['name'] for field in corpus_data['fields']]
11-
assert field_names == ['date', 'title', 'content', 'genre']
12-
13-
# copy corpus definition into tmpdir
14-
current_dir = os.path.join(settings.BASE_DIR, 'corpora_test', 'small')
15-
shutil.copytree(current_dir, tmpdir, dirs_exist_ok=True)
16-
17-
definition_path = os.path.join(tmpdir, 'small_mock_corpus.py')
18-
19-
with open(definition_path, 'r') as definition_file:
20-
definition_str = definition_file.read()
21-
22-
# replace `fields = [...]` line in file to change field order
23-
definition_str = definition_str.replace(
24-
'fields = [date, title_field, content, genre]',
25-
'fields = [title_field, content, genre, date]'
26-
)
27-
28-
# save edited definition
29-
with open(definition_path, 'w') as definition_file:
30-
definition_file.write(definition_str)
31-
32-
# check order has changed
33-
settings.CORPORA[small_mock_corpus] = definition_path
34-
load_and_save_all_corpora()
35-
36-
# Okay this test will never work because it actually just looks at the es_index ordering of fields....
37-
response = admin_client.get('/api/corpus/')
38-
corpus_data = next(c for c in response.data if c['name'] == small_mock_corpus)
39-
40-
field_names = [field['name'] for field in corpus_data['fields']]
41-
assert field_names == ['title', 'content', 'genre', 'date']
1+
from addcorpus.models import Corpus
2+
from addcorpus.python_corpora.load_corpus import load_corpus_definition
3+
from addcorpus.python_corpora.save_corpus import _save_corpus_configuration
4+
5+
def test_field_order_python_corpus(small_mock_corpus, admin_client):
6+
# check field order matches corpus definition
7+
response = admin_client.get('/api/corpus/')
8+
corpus_data = next(c for c in response.data if c['name'] == small_mock_corpus)
9+
field_names = [field['name'] for field in corpus_data['fields']]
10+
assert field_names == ['date', 'title', 'content', 'genre']
11+
12+
# update field order
13+
corpus = Corpus.objects.get(name=small_mock_corpus)
14+
definition = load_corpus_definition(small_mock_corpus)
15+
definition.fields = list(reversed(definition.fields))
16+
_save_corpus_configuration(corpus, definition)
17+
18+
# check order is updated
19+
response = admin_client.get('/api/corpus/')
20+
corpus_data = next(c for c in response.data if c['name'] == small_mock_corpus)
21+
22+
field_names = [field['name'] for field in corpus_data['fields']]
23+
assert field_names == ['genre', 'content', 'title', 'date']

backend/corpora/dbnl/tests/test_dbnl_extraction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ def dbnl_corpus(settings):
1313
settings.DBNL_DATA = os.path.join(here, 'data')
1414
# for testing purposes, also add the metadata helper corpus
1515
settings.CORPORA = {
16-
'dbnl': os.path.join(here, '..', 'dbnl.py'),
17-
'dbnl_metadata': os.path.join(here, '..', 'dbnl_metadata.py'),
16+
'dbnl': 'corpora.dbnl.dbnl.DBNL',
17+
'dbnl_metadata': 'corpora.dbnl.dbnl_metadata.DBNLMetadata',
1818
}
1919
return 'dbnl'
2020

0 commit comments

Comments
 (0)