Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.6.15
3.13
15 changes: 7 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.6-2020-12-19

FROM python:3.13.2-alpine3.21
WORKDIR /backend
COPY . /backend

RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir poetry==1.0.0 \
RUN apk add gcc build-base \
&& pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir poetry==2.0.1 \
&& pip install uvicorn \
&& poetry config virtualenvs.create false \
&& poetry install --no-dev \
&& pip uninstall --yes poetry
&& poetry install

RUN sed -i "s/CipherString = DEFAULT@SECLEVEL=2/CipherString = DEFAULT@SECLEVEL=1/g" /etc/ssl/openssl.cnf
RUN openssl rand -hex 20 > /backend/.db-version

COPY . /backend

CMD ["poetry", "run", "uvicorn", "app:main", "--port", "8000", "--host", "0.0.0.0", "--workers", "2"]

4 changes: 2 additions & 2 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.6-2020-12-19
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.11-2025-02-10

WORKDIR /app

RUN pip install poetry lxml snakesist python-dotenv diskcache

COPY ./pyproject.toml ./poetry.lock* ./

RUN poetry install
RUN poetry install --no-root

COPY . .
# COPY .env.production .env
8 changes: 4 additions & 4 deletions app/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import requests
from fastapi import FastAPI
from fastapi.openapi.utils import get_openapi
from lxml import etree
from snakesist.exist_client import ExistClient
from starlette.responses import Response, JSONResponse, PlainTextResponse, FileResponse, StreamingResponse
from starlette.requests import Request
from random import choice
from string import ascii_letters
from pathlib import Path
from multiprocessing import Manager
from diskcache import Cache

from service import Service, beacon_service, image_service, letter_index_service
Expand All @@ -28,8 +28,8 @@
]


db = ExistClient(host="db")
# db = ExistClient(host="localhost")
db = ExistClient(host="db", parser=etree.XMLParser(recover=True))
# db = ExistClient(host="localhost", port=8071, parser=etree.XMLParser(recover=True))
db.root_collection = ROOT_COLLECTION
service = Service(db, CFG, watch_updates=True)

Expand Down Expand Up @@ -75,7 +75,7 @@ async def cmif_api():
Get correspondence metadata in CMI format
"""
return XMLResponse(
content=str(db.retrieve_resources("//*:TEI[@type='cmif']").pop())
content=str(db.xpath("//*:TEI[@type='cmif']").pop())
)


Expand Down
1,792 changes: 1,075 additions & 717 deletions poetry.lock

Large diffs are not rendered by default.

31 changes: 17 additions & 14 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@
[tool.poetry]
name = "gregorovius-api"
version = "1.5.5"
version = "1.6.0"
description = "Backend API layer for the Gregorovius Correspondence Edition"
authors = ["Theodor Costea <[email protected]>", "Oliver Pohl <[email protected]>"]
package-mode = false

[tool.poetry.dependencies]
python = "^3.6"
pydantic = "^0.32.2"
snakesist = {version = "0.1.0-b1", allow-prereleases = true}
uvicorn = "^0.9.0"
fastapi = "^0.38.1"
pyyaml = "^5.1"
schedule = "^0.6.0"
xmltodict = "^0.12.0"
diskcache = "^5.4.0"
aiofiles = "0.8.0"
Pillow = "^7"
openpyxl = "^3.1.2"
python = "^3.13"
pydantic = "^2.10.4"
snakesist = "^0.3.0"
uvicorn = "^0.34.0"
fastapi = "^0.115.6"
pyyaml = "^6.0.2"
schedule = "^1.2.2"
xmltodict = "^0.14.2"
diskcache = "^5.6.3"
aiofiles = "24.1.0"
Pillow = "^11.1.0"
openpyxl = "^3.1.5"
requests = "^2.32.3"
gunicorn = "^23.0.0"

[tool.poetry.dev-dependencies]
pytest = "^5.1"
pytest = "^8.3.4"

[build-system]
requires = ["poetry>=0.12"]
Expand Down
12 changes: 9 additions & 3 deletions service/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from types import NoneType

from delb import TagNode
from typing import Dict
from snakesist.exist_client import Resource
from snakesist.exist_client import NodeResource as Resource

from models import EntityMeta

Expand Down Expand Up @@ -43,7 +45,10 @@ def process_property_value(node: TagNode, property_manifest: Dict) -> str:
if "filter" in property_manifest:
output = apply_filter(node[val], property_manifest["filter"])
else:
output = normalize_whitespace(node[val])
output = normalize_whitespace(node[val] if node[val] else '')
# The lookup may yield an empty string; skip those and return the first non-empty attribute match.
if output:
return output
except KeyError:
continue
else:
Expand Down Expand Up @@ -152,8 +157,9 @@ def xml_to_entitymeta(
except (KeyError, TypeError):
print(f"Warning: No @xml:id found for '{entity_name}' item! Item endpoint will not be accessible.")
node_id = ''

return EntityMeta(
id=node_id,
id=str(node_id),
entity=entity_name,
properties=properties
)
98 changes: 13 additions & 85 deletions service/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def __init__(self, db: ExistClient, manifest: Dict, watch_updates: bool = False)
self.manifest_entities = manifest['entities']
self.db = db
self.entities = {
name: self.db.retrieve_resources(manifest["xpath"])
name: self.db.xpath(manifest["xpath"])
for name, manifest in self.manifest_entities.items()
}
try:
Expand All @@ -85,77 +85,6 @@ def __init__(self, db: ExistClient, manifest: Dict, watch_updates: bool = False)
self.id_attr = '{http://www.w3.org/XML/1998/namespace}id'
if watch_updates:
UpdateWatcher(self.db, self.entities)
self._initialize_search_indices()

def _initialize_search_indices(self):
    """
    Create Lucene search configurations according to config.yml,
    store them in the database and initiate reindexing.

    Reads ``self.manifest`` (keys ``entities``, ``collection`` and
    ``collection_alternative``) and sends one XQuery per collection through
    ``self.db.query``. Each query creates the config collection, stores
    ``collection.xconf`` under ``/db/system/config<collection>`` and
    reindexes that collection.

    Raises:
        ValueError: if a ``search_index`` entry lacks a required key
            (``text``, ``type``, ``pattern``, ``inline-qname`` or a
            field's ``name``/``expression``).
    """
    searchable_entities = [
        entity_conf["search_index"]
        for entity_conf in self.manifest["entities"].values()
        if "search_index" in entity_conf
    ]

    # Collect the <text> fragments and join once instead of repeated `+=`.
    text_parts = []
    for unit in searchable_entities:
        try:
            for text in unit["text"]:
                text_parts.append(
                    f"<text {text['type']}='{text['pattern']}'>"
                    f"<inline qname='{text['inline-qname']}'/>"
                )

                if "fields" in text:
                    for field in text["fields"]:
                        text_parts.append(
                            f"<field name='{field['name']}' expression='{field['expression']}' />"
                        )

                if "ignore" in text:
                    text_parts.append(f"<ignore qname='{text['ignore']}'/>")
                text_parts.append("</text>")
        except KeyError as err:
            raise ValueError(f"Error reading search index configuration: {unit}.") from err
    text_config = "".join(text_parts)

    config = (
        "<collection xmlns='http://exist-db.org/collection-config/1.0'>"
        "<index xmlns:tei='http://www.tei-c.org/ns/1.0'>"
        "<fulltext default='none' attributes='false'/>"
        "<lucene>"
        "<analyzer class='org.apache.lucene.analysis.standard.StandardAnalyzer'/>"
        f"{text_config}"
        "</lucene>"
        "</index>"
        "</collection>"
    )

    # Both manifest keys are read up front so a missing key fails before any
    # query is sent.
    collections = (
        self.manifest['collection'],
        self.manifest['collection_alternative'],
    )

    # Each collection gets its own try/except: a failure on one must not
    # prevent the other from being configured. The former third query, which
    # stored the alternative collection's config under the *primary*
    # collection's config path, is dropped as it was a mis-pathed duplicate.
    for collection in collections:
        config_path = f"/db/system/config{collection}"
        try:
            self.db.query(
                f'(xmldb:create-collection("/db/system/config", "{collection}"),'
                f'xmldb:store("{config_path}", "collection.xconf", "{config}"),'
                f'xmldb:reindex("{collection}"))'
            )
        except HTTPError as e:
            # Best effort: report the failure and carry on.
            print(e)

def get_entities(self, entity_name: str) -> List[EntityMeta]:
"""
Expand Down Expand Up @@ -189,17 +118,16 @@ def get_entity(self, entity_name: str, entity_id: str, output_format: str) -> st
if output_format == "xml":
return str(resource.node)
elif output_format == "json":
try:
output = resource.node.css_select("teiHeader").pop()
except IndexError:
output = resource.node.css_select("teiHeader").first
if output is None:
output = resource.node
return xmltodict.parse(str(output))
else:
raise ValueError(
f"Invalid format: {output_format}."
f"Only 'xml' and 'json' are supported."
)
return resource
return str(resource)

def get_search_results(self, entity: str, keyword: str, width: int) -> Dict:
"""
Expand Down Expand Up @@ -253,18 +181,18 @@ def get_search_results(self, entity: str, keyword: str, width: int) -> Dict:
query += "</envelope>"

query_results = self.db.query(query)
results = query_results.css_select("envelope")
results = query_results.cssselect("envelope")
except HTTPError as e:
results = []
output = []
for r in results:
for p in r.xpath(".//p"):
context_previous = p.xpath(".//span[@class='previous']").pop().full_text
context_hi = p.xpath(".//span[@class='hi']").pop().full_text
context_following = p.xpath(".//span[@class='following']").pop().full_text
score = r.xpath(".//score").pop().full_text
entity_id = r.xpath(".//id").pop().full_text
entity_type = r.xpath(".//type").pop().full_text
for p in r.xpath(".//p"):
context_previous = p.xpath(".//span[@class='previous']").pop().text
context_hi = p.xpath(".//span[@class='hi']").pop().text
context_following = p.xpath(".//span[@class='following']").pop().text
score = r.xpath(".//score").pop().text
entity_id = r.xpath(".//id").pop().text
entity_type = r.xpath(".//type").pop().text

entry = {
"score": score,
Expand All @@ -275,7 +203,7 @@ def get_search_results(self, entity: str, keyword: str, width: int) -> Dict:
}

if not should_get_document_id:
entity_related_id = r.xpath(".//related").pop().full_text
entity_related_id = r.xpath(".//related").pop().text
entry["entity_related_id"] = entity_related_id

output.append(entry)
Expand Down
Loading