Skip to content

Commit d4568b4

Browse files
authored
feat: migrate from whoosh-based indices to elasticsearch (#54)
1 parent 6b57c4b commit d4568b4

29 files changed

+969
-704
lines changed

.gitlab-ci.yml

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@ test:
1717
- apt-get update && apt-get install make
1818
- python -m pip install --upgrade pip
1919
- pip install uv
20-
- uv sync
20+
- uv sync INDEXING_SUCCESS=0 && \
21+
(make generate-data && INDEXING_SUCCESS=1) || INDEXING_SUCCESS=0 && \
22+
echo "Flushing data to disk..." && \
23+
24+
2125

2226
stage: test
2327
script:
@@ -32,11 +36,19 @@ build:
3236
- docker info
3337
stage: build
3438
script:
35-
- docker build -f docker/dockerfile --cache-from $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME .
36-
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
37-
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
38-
39-
#deploy:
40-
# stage: deploy
41-
# extends:
42-
# .default-deploy
39+
- export REPO_NAME=$CI_REGISTRY_IMAGE/conjugador-elastic
40+
- export IMAGE_NAME=$REPO_NAME:$CI_COMMIT_REF_NAME
41+
- export DOCKER_PATH=docker/elasticsearch/Dockerfile
42+
- docker build -f $DOCKER_PATH --tag $IMAGE_NAME .
43+
- docker push $IMAGE_NAME
44+
45+
- export REPO_NAME=$CI_REGISTRY_IMAGE/conjugador-webserver
46+
- export IMAGE_NAME=$REPO_NAME:$CI_COMMIT_REF_NAME
47+
- export DOCKER_PATH=docker/webserver/Dockerfile
48+
- docker build -f $DOCKER_PATH --tag $IMAGE_NAME .
49+
- docker push $IMAGE_NAME
50+
51+
deploy:
52+
stage: deploy
53+
extends:
54+
.default-deploy

DEV.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,6 @@ Els infinitius a) són necessaris després per poder importar les definicions de
4848

4949
L'aplicació _definitions/extract-to-json.py_ té com a objectiu extreure les definicions del Viccionari en un JSON amb les definicions que després l'ordre extract.py usarà.
5050

51-
# Index
51+
## Index
5252

53-
L'aplicació _indexer/index_creation.py_ s'encarrega de generar un índex amb el motor de cerca Whoosh que després usarem per oferir la cerca i l'autocomplete a la web.
53+
L'aplicació _indexer/index_creation.py_ s'encarrega de generar diversos índexos a Elasticsearch que després usarem per oferir la cerca i l'autocomplete a la web.

Makefile

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
1-
.PHONY: docker-build docker-run generate-data update-data test
1+
.PHONY: docker-build docker-run generate-data update-data test generate-data-without-indexation
22

33
docker-build:
4-
docker build . -t conjugador -f docker/dockerfile;
4+
docker compose build
55

66
docker-run:
7-
docker run -p 8000:8000 -i -t conjugador
7+
docker compose up
88

99
generate-data:
10-
bzip2 -cdk definitions/cawiktionary-latest-pages-meta-current.xml.bz2 > definitions/cawiktionary-latest-pages-meta-current.xml
11-
uv run -m extractor.extract -i
12-
uv run -m definitions.extract-to-json
13-
uv run -m extractor.extract
10+
set -e; \
11+
bzip2 -cdk definitions/cawiktionary-latest-pages-meta-current.xml.bz2 > definitions/cawiktionary-latest-pages-meta-current.xml; \
12+
uv run -m extractor.extract -i; \
13+
uv run -m definitions.extract-to-json; \
14+
uv run -m extractor.extract; \
1415
uv run -m indexer.index_creation
1516

17+
generate-data-without-indexation:
18+
set -e; \
19+
bzip2 -cdk definitions/cawiktionary-latest-pages-meta-current.xml.bz2 > definitions/cawiktionary-latest-pages-meta-current.xml; \
20+
uv run -m extractor.extract -i; \
21+
uv run -m definitions.extract-to-json; \
22+
uv run -m extractor.extract; \
23+
1624
update-data:
1725
# Extract current version
1826
make generate-data

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ content from the spelling dictionary.
66
# How it works
77

88
1. extract.py reads diccionary file and extracts the verbs in JSON format (into data/jsons)
9-
2. index_creation.py reads the jsons and creates a Whoosh index (into data/indexdir)
10-
3. Flash application at web/ serves the content
9+
2. index_creation.py reads the jsons and creates multiple indices in Elasticsearch
10+
3. Flask application at web/ serves the content
1111

1212
# Git clone
1313

definitions/definitions.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class Definitions:
3232
Generates a dictionary of the definitions of all the verbs in catalan and saves it
3333
by default at `data/definitions.txt`.
3434
"""
35+
3536
def _get_revision_text(self, revision: Element) -> str:
3637
for child in revision:
3738
if "text" in child.tag:
@@ -40,7 +41,10 @@ def _get_revision_text(self, revision: Element) -> str:
4041
return ""
4142

4243
def _get_infinitives(self, filename: str) -> list[str]:
43-
words = [line.lower().strip() for line in Path(filename).read_text().splitlines()]
44+
words = [
45+
line.lower().strip()
46+
for line in Path(filename).read_text().splitlines()
47+
]
4448
return words
4549

4650
def generate(

docker-compose.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
services:
  conjugador-elastic:
    image: conjugador-elasticsearch
    build:
      context: .
      dockerfile: ./docker/elasticsearch/Dockerfile
    ports:
      - "9200:9200"

  conjugador-server:
    image: conjugador-webserver
    build:
      context: .
      dockerfile: ./docker/webserver/Dockerfile
    # The web server dials ES_URL at startup; make compose create/start the
    # Elasticsearch container first so the hostname is resolvable.
    depends_on:
      - conjugador-elastic
    environment:
      ES_URL: "http://conjugador-elastic:9200"
    ports:
      - "8000:8000"

docker/dockerfile

Lines changed: 0 additions & 49 deletions
This file was deleted.

docker/elasticsearch/Dockerfile

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# syntax=docker/dockerfile:1
# Stage 1 (build): start a throwaway Elasticsearch, run the indexing pipeline
# against it, then keep only the resulting data directory for the final image.
FROM elasticsearch:8.15.1 AS build
USER root
# bzip2/make/curl are required by `make generate-data`; procps/httping help debugging.
RUN apt-get update && \
    apt-get install -y --no-install-recommends bzip2 curl httping make procps && \
    rm -rf /var/lib/apt/lists/*
COPY --from=ghcr.io/astral-sh/uv:0.9.13 /uv /uvx /bin/

RUN mkdir -p /srv
COPY pyproject.toml uv.lock Makefile /srv/
COPY indexer /srv/indexer
COPY extractor /srv/extractor
COPY definitions /srv/definitions
COPY catalan-dict-tools /srv/catalan-dict-tools
WORKDIR /srv
COPY .python-version /srv/

RUN uv python install $(cat .python-version) && \
    uv sync

# Single-node dev settings; the ES docker image reads these setting-style env vars.
ENV discovery.type=single-node
ENV xpack.security.enabled=false
ENV ES_JAVA_OPTS="-Xms512m -Xmx512m"

# Start Elasticsearch, wait for health, run indexing, then stop.
# NOTE: under `set -e` a bare `make generate-data; MAKE_EXIT_CODE=$?` would abort
# the whole RUN on failure and skip the shutdown/cleanup below, so the exit code
# is captured with `|| MAKE_EXIT_CODE=$?` and re-raised only at the very end.
RUN set -e && \
    runuser -u elasticsearch -- /usr/share/elasticsearch/bin/elasticsearch -d -p /tmp/es.pid && \
    echo "Waiting for Elasticsearch to be ready..." && \
    for i in $(seq 1 180); do \
        if curl -s http://localhost:9200/_cluster/health 2>/dev/null | grep -qE '"status":"(green|yellow)"'; then \
            echo "Elasticsearch is ready!"; \
            break; \
        fi; \
        if [ $i -eq 180 ]; then \
            echo "Elasticsearch failed to start"; \
            exit 1; \
        fi; \
        echo "Waiting... ($i/180)"; \
        sleep 2; \
    done && \
    MAKE_EXIT_CODE=0 && \
    { make generate-data || MAKE_EXIT_CODE=$?; } && \
    echo "Flushing data to disk..." && \
    { curl -X POST "http://localhost:9200/_flush" 2>/dev/null || true; } && \
    sleep 2 && \
    echo "Shutting down Elasticsearch..." && \
    if [ -f /tmp/es.pid ]; then \
        ES_PID=$(cat /tmp/es.pid); \
        kill -TERM $ES_PID 2>/dev/null || true; \
        for i in $(seq 1 60); do \
            if ! kill -0 $ES_PID 2>/dev/null; then \
                echo "Elasticsearch stopped cleanly"; \
                break; \
            fi; \
            if [ $i -eq 60 ]; then \
                echo "Force killing Elasticsearch"; \
                kill -9 $ES_PID 2>/dev/null || true; \
            fi; \
            sleep 1; \
        done; \
    fi && \
    sleep 3 && \
    echo "Removing lock files..." && \
    find /usr/share/elasticsearch/data -name "*.lock" -type f -delete && \
    find /usr/share/elasticsearch/data -name "write.lock" -type f -delete && \
    if [ $MAKE_EXIT_CODE -ne 0 ]; then \
        echo "ERROR: make generate-data failed with exit code $MAKE_EXIT_CODE"; \
        exit $MAKE_EXIT_CODE; \
    fi && \
    echo "Indexing complete!"

# Stage 2 (runtime): clean Elasticsearch image carrying only the pre-built indices.
FROM elasticsearch:8.15.1
USER root
COPY --from=build /usr/share/elasticsearch/data /usr/share/elasticsearch/data
RUN chown -R elasticsearch:elasticsearch /usr/share/elasticsearch/data
ENV discovery.type=single-node
ENV xpack.security.enabled=false
ENV ES_JAVA_OPTS="-Xms512m -Xmx512m"
USER elasticsearch

docker/webserver/Dockerfile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# syntax=docker/dockerfile:1
# Web server image: extracts the verb data (without indexing — that happens in
# the elasticsearch image) and serves the Flask app from /srv/web.
FROM debian:bookworm-slim
# Combine update+install in one layer (separate layers risk a stale apt cache,
# hadolint DL3009) and drop the lists so they don't persist in the image.
# Blanket `apt-get upgrade` removed (DL3005) — bump the base tag instead.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends bzip2 curl make && \
    rm -rf /var/lib/apt/lists/*
COPY --from=ghcr.io/astral-sh/uv:0.9.13 /uv /uvx /bin/

# Dependency manifests first so the uv layers stay cached when only source changes.
COPY pyproject.toml uv.lock /srv/
COPY .python-version /srv/
COPY Makefile /srv/
WORKDIR /srv
RUN uv python install $(cat .python-version)
RUN uv sync
COPY docker/webserver/entrypoint.sh /srv/
COPY catalan-dict-tools /srv/catalan-dict-tools
COPY definitions /srv/definitions
COPY indexer /srv/indexer
COPY extractor /srv/extractor
# Generates the JSON data only; Elasticsearch indexing is done in its own image.
RUN make generate-data-without-indexation
COPY web /srv/web

# Exec form so the entrypoint runs as PID 1 and receives SIGTERM on `docker stop`
# (shell form would wrap it in `/bin/sh -c` and swallow signals).
ENTRYPOINT ["/srv/entrypoint.sh"]

0 commit comments

Comments
 (0)