DHI-Roma · suchmaske · Feb 15, 2025 · Feb 15, 2025
diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-3.6.15
+3.13
diff --git a/Dockerfile b/Dockerfile
@@ -1,17 +1,16 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.6-2020-12-19
-
+FROM python:3.13.2-alpine3.21
 WORKDIR /backend
 COPY . /backend
 
-RUN pip install --no-cache-dir --upgrade pip \
-  && pip install --no-cache-dir poetry==1.0.0 \
+RUN apk add --no-cache gcc build-base \
+  && pip install --no-cache-dir --upgrade pip \
+  && pip install --no-cache-dir poetry==2.0.1 \
   && pip install uvicorn \
   && poetry config virtualenvs.create false \
-  && poetry install --no-dev \
-  && pip uninstall --yes poetry
+  && poetry install --only main
 
-RUN sed -i "s/CipherString = DEFAULT@SECLEVEL=2/CipherString = DEFAULT@SECLEVEL=1/g" /etc/ssl/openssl.cnf
-RUN openssl rand -hex 20 > /backend/.db-version
 
 COPY . /backend
 
+CMD ["poetry", "run", "uvicorn", "app:main", "--port", "8000", "--host", "0.0.0.0", "--workers", "2"]
+
diff --git a/Dockerfile.dev b/Dockerfile.dev
@@ -1,12 +1,12 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.6-2020-12-19
+FROM tiangolo/uvicorn-gunicorn-fastapi:python3.11-2025-02-10
 
 WORKDIR /app
 
 RUN pip install poetry lxml snakesist python-dotenv diskcache
 
 COPY ./pyproject.toml ./poetry.lock* ./
 
-RUN poetry install
+RUN poetry install --no-root
 
 COPY . .
 # COPY .env.production .env
diff --git a/app/controller.py b/app/controller.py
@@ -3,13 +3,13 @@
 import requests
 from fastapi import FastAPI
 from fastapi.openapi.utils import get_openapi
+from lxml import etree
 from snakesist.exist_client import ExistClient
 from starlette.responses import Response, JSONResponse, PlainTextResponse, FileResponse, StreamingResponse
 from starlette.requests import Request
 from random import choice
 from string import ascii_letters
 from pathlib import Path
-from multiprocessing import Manager
 from diskcache import Cache
 
 from service import Service, beacon_service, image_service, letter_index_service
@@ -28,8 +28,8 @@
 ]
 
 
-db = ExistClient(host="db")
-# db = ExistClient(host="localhost")
+db = ExistClient(host="db", parser=etree.XMLParser(recover=True))
+# db = ExistClient(host="localhost", port=8071, parser=etree.XMLParser(recover=True))
 db.root_collection = ROOT_COLLECTION
 service = Service(db, CFG, watch_updates=True)
 
@@ -75,7 +75,7 @@ async def cmif_api():
     Get correspondence metadata in CMI format
     """
     return XMLResponse(
-        content=str(db.retrieve_resources("//*:TEI[@type='cmif']").pop())
+        content=str(db.xpath("//*:TEI[@type='cmif']").pop())
     )
 
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,25 +1,28 @@
 [tool.poetry]
 name = "gregorovius-api"
-version = "1.5.5"
+version = "1.6.0"
 description = "Backend API layer for the Gregorovius Correspondence Edition"
 authors = ["Theodor Costea <[email protected]>", "Oliver Pohl <[email protected]>"]
+package-mode = false
 
 [tool.poetry.dependencies]
-python = "^3.6"
-pydantic = "^0.32.2"
-snakesist = {version = "0.1.0-b1", allow-prereleases = true}
-uvicorn = "^0.9.0"
-fastapi = "^0.38.1"
-pyyaml = "^5.1"
-schedule = "^0.6.0"
-xmltodict = "^0.12.0"
-diskcache = "^5.4.0"
-aiofiles = "0.8.0"
-Pillow = "^7"
-openpyxl = "^3.1.2"
+python = "^3.13"
+pydantic = "^2.10.4"
+snakesist = "^0.3.0"
+uvicorn = "^0.34.0"
+fastapi = "^0.115.6"
+pyyaml = "^6.0.2"
+schedule = "^1.2.2"
+xmltodict = "^0.14.2"
+diskcache = "^5.6.3"
+aiofiles = "24.1.0"
+Pillow = "^11.1.0"
+openpyxl = "^3.1.5"
+requests = "^2.32.3"
+gunicorn = "^23.0.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^5.1"
+pytest = "^8.3.4"
 
 [build-system]
 requires = ["poetry>=0.12"]

diff --git a/service/helpers.py b/service/helpers.py
@@ -1,6 +1,8 @@
+from types import NoneType
+
 from delb import TagNode
 from typing import Dict
-from snakesist.exist_client import Resource
+from snakesist.exist_client import NodeResource as Resource
 
 from models import EntityMeta
 
@@ -43,7 +45,10 @@ def process_property_value(node: TagNode, property_manifest: Dict) -> str:
                 if "filter" in property_manifest:
                     output = apply_filter(node[val], property_manifest["filter"])
                 else:
-                    output = normalize_whitespace(node[val])
+                    output = normalize_whitespace(node[val] if node[val] else '')
+                # An attribute value may be empty; skip it and return the first attribute that yields non-empty output.
+                if output:
+                    return output
             except KeyError:
                 continue
     else:
@@ -152,8 +157,9 @@ def xml_to_entitymeta(
     except (KeyError, TypeError):
         print(f"Warning: No @xml:id found for '{entity_name}' item! Item endpoint will not be accessible.")
         node_id = ''
+
     return EntityMeta(
-        id=node_id,
+        id=str(node_id),
         entity=entity_name,
         properties=properties
     )
diff --git a/service/main.py b/service/main.py
@@ -76,7 +76,7 @@ def __init__(self, db: ExistClient, manifest: Dict, watch_updates: bool = False)
         self.manifest_entities = manifest['entities']
         self.db = db
         self.entities = {
-            name: self.db.retrieve_resources(manifest["xpath"])
+            name: self.db.xpath(manifest["xpath"])
             for name, manifest in self.manifest_entities.items()
         }
         try:
@@ -85,77 +85,6 @@ def __init__(self, db: ExistClient, manifest: Dict, watch_updates: bool = False)
             self.id_attr = '{http://www.w3.org/XML/1998/namespace}id'
         if watch_updates:
             UpdateWatcher(self.db, self.entities)
-        self._initialize_search_indices()
-
-    def _initialize_search_indices(self):
-        """
-        Create Lucene search configurations according to config.yml,
-        store them in the database and initiate reindexing.
-        """
-        searchable_entities = [
-            entity_conf["search_index"] for entity_name, entity_conf in self.manifest["entities"].items()
-            if "search_index" in entity_conf
-        ]
-        text_config = ""
-        for unit in searchable_entities:
-            try:
-                for text in unit["text"]:
-                    text_config += (
-                        f"<text {text['type']}='{text['pattern']}'>"
-                        f"<inline qname='{text['inline-qname']}'/>"
-                    )
-
-                    if "fields" in text:
-                        for field in text["fields"]:
-                            text_config += f"<field name='{field['name']}' expression='{field['expression']}' />"
-
-                    if "ignore" in text:
-                        text_config += f"<ignore  qname='{text['ignore']}'/>"
-                    text_config += "</text>"
-            except KeyError:
-                raise ValueError(f"Error reading search index configuration: {unit}.")
-
-        config = (
-            "<collection xmlns='http://exist-db.org/collection-config/1.0'>"
-            "<index xmlns:tei='http://www.tei-c.org/ns/1.0'>"
-            "<fulltext default='none' attributes='false'/>"
-            "<lucene>"
-            "<analyzer class='org.apache.lucene.analysis.standard.StandardAnalyzer'/>"
-            f"{text_config}"
-            "</lucene>"
-            "</index>"
-            "</collection>"
-        )
-        collection = self.manifest['collection']
-        collection_alternative = self.manifest['collection_alternative']
-
-        config_path = f"/db/system/config{collection}"
-        config_path_alternative = f"/db/system/config{collection_alternative}"
-        try:
-            self.db.query(
-                f'(xmldb:create-collection("/db/system/config", "{collection}"),'
-                f'xmldb:store("{config_path}", "collection.xconf", "{config}"),'
-                f'xmldb:reindex("{collection}"))'
-            )
-
-            self.db.query(
-                f'(xmldb:create-collection("/db/system/config", "{collection_alternative}"),'
-                f'xmldb:store("{config_path_alternative}", "collection.xconf", "{config}"),'
-                f'xmldb:reindex("{collection_alternative}"))'
-            )
-        except HTTPError as e:
-            print(e)
-
-        collection_alt = self.manifest['collection_alternative']
-        config_path_alt = f"/db/system/config{collection}"
-        try:
-            self.db.query(
-                f'(xmldb:create-collection("/db/system/config", "{collection_alt}"),'
-                f'xmldb:store("{config_path_alt}", "collection.xconf", "{config}"),'
-                f'xmldb:reindex("{collection_alt}"))'
-            )
-        except HTTPError as e:
-            print(e)
 
     def get_entities(self, entity_name: str) -> List[EntityMeta]:
         """
@@ -189,17 +118,16 @@ def get_entity(self, entity_name: str, entity_id: str, output_format: str) -> st
             if output_format == "xml":
                 return str(resource.node)
             elif output_format == "json":
-                try:
-                    output = resource.node.css_select("teiHeader").pop()
-                except IndexError:
+                output = resource.node.css_select("teiHeader").first
+                if output is None:
                     output = resource.node
                 return xmltodict.parse(str(output))
             else:
                 raise ValueError(
                     f"Invalid format: {output_format}."
                     f"Only 'xml' and 'json' are supported."
                 )
-        return resource
+        return str(resource)
 
     def get_search_results(self, entity: str, keyword: str, width: int) -> Dict:
         """
@@ -253,18 +181,18 @@ def get_search_results(self, entity: str, keyword: str, width: int) -> Dict:
             query += "</envelope>"
 
             query_results = self.db.query(query)
-            results = query_results.css_select("envelope")
+            results = query_results.cssselect("envelope")
         except HTTPError as e:
             results  = []
         output = []
         for r in results:
-            for p in r.xpath(".//p"): 
-                context_previous = p.xpath(".//span[@class='previous']").pop().full_text
-                context_hi = p.xpath(".//span[@class='hi']").pop().full_text
-                context_following = p.xpath(".//span[@class='following']").pop().full_text
-                score = r.xpath(".//score").pop().full_text
-                entity_id = r.xpath(".//id").pop().full_text
-                entity_type = r.xpath(".//type").pop().full_text
+            for p in r.xpath(".//p"):
+                context_previous = p.xpath(".//span[@class='previous']").pop().text
+                context_hi = p.xpath(".//span[@class='hi']").pop().text
+                context_following = p.xpath(".//span[@class='following']").pop().text
+                score = r.xpath(".//score").pop().text
+                entity_id = r.xpath(".//id").pop().text
+                entity_type = r.xpath(".//type").pop().text
 
                 entry = {
                     "score": score,
@@ -275,7 +203,7 @@ def get_search_results(self, entity: str, keyword: str, width: int) -> Dict:
                 }
 
                 if not should_get_document_id:
-                    entity_related_id = r.xpath(".//related").pop().full_text
+                    entity_related_id = r.xpath(".//related").pop().text
                     entry["entity_related_id"] = entity_related_id
 
                 output.append(entry)