
Commit a6309c4

Replace Elasticsearch with Whoosh and spaCy for in-process search
Use Whoosh for text and fuzzy search, and spaCy word embeddings for semantic similarity. This removes the need for a separate Elasticsearch service.
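The description above maps to two code paths. As a rough illustration only (the "text" and "name" fields, the query strings, and the index layout are assumptions, not code from this commit): Whoosh serves typo-tolerant text queries from an on-disk index, while spaCy's en_core_web_md vectors score semantic similarity.

```python
# Illustrative sketch of the two search paths described in the commit message.
# Field names, query strings, and the index path are assumptions.
import spacy
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.query import FuzzyTerm

# Whoosh: open the on-disk index and run a typo-tolerant (fuzzy) text query.
ix = index.open_dir("whoosh_index")  # path matches the new .gitignore entry
with ix.searcher() as searcher:
    parser = QueryParser("text", schema=ix.schema, termclass=FuzzyTerm)
    results = searcher.search(parser.parse("firball"), limit=10)  # still matches "fireball"
    names = [hit["name"] for hit in results]  # assumes a stored "name" field

# spaCy: en_core_web_md word vectors give a cheap semantic-similarity score.
nlp = spacy.load("en_core_web_md")
print(nlp("burning sphere").similarity(nlp("fireball")))  # cosine similarity of averaged vectors
```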
1 parent bc05ddf commit a6309c4

File tree

18 files changed: +1813 additions, −2122 deletions


.do/app.yaml

Lines changed: 0 additions & 52 deletions
This file was deleted.

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -19,8 +19,9 @@ db.sqlite3-journal
 # Django static files
 staticfiles/
 
-# Search index
+# Search indexes
 server/whoosh_index/
+whoosh_index/
 
 # IDE and editor configurations
 .vscode/

Dockerfile

Lines changed: 7 additions & 5 deletions
@@ -12,12 +12,14 @@ COPY . /opt/services/open5e-api
 # Install all dependencies from Pipfile
 RUN pipenv install -v
 
+# Download spaCy model for semantic search
+RUN pipenv run python -m spacy download en_core_web_md
+
 # Remove .env file (set your env vars via docker-compose.yml or your hosting provider)
 RUN rm -f .env
 
-# Copy startup script
-COPY start.sh /start.sh
-RUN chmod +x /start.sh
+# Run setup
+RUN pipenv run python manage.py quicksetup
 
-# Run startup script (handles migrations, quicksetup, and gunicorn)
-CMD ["/start.sh"]
+# Run gunicorn
+CMD ["pipenv", "run", "gunicorn", "-b", ":8080", "--workers", "2", "--timeout", "120", "server.wsgi:application"]

Pipfile

Lines changed: 2 additions & 2 deletions
@@ -13,10 +13,10 @@ requests = "*"
 whitenoise = "*"
 gunicorn = "*"
 drf-spectacular = {extras = ["sidecar"], version = "*"}
-elasticsearch = ">=7.0.0,<8.0.0"
 django-haystack = "*"
+Whoosh = "*"
 numpy = "<2.0"
-scikit-learn = "*"
+spacy = ">=3.0"
 
 [dev-packages]
 pytest = "*"

Pipfile.lock

Lines changed: 1421 additions & 667 deletions
Some generated files are not rendered by default.

api/management/commands/quicksetup.py

Lines changed: 26 additions & 52 deletions
@@ -14,15 +14,11 @@ class Command(BaseCommand):
     """Implementation for the `manage.py quicksetup` subcommand."""
 
     def add_arguments(self, parser: argparse.ArgumentParser):
-        """Define arguments for the `manage.py quicksetup` subcommand."""
-
-        # Named (optional) arguments.
         parser.add_argument(
             "--noindex",
             action="store_true",
-            help="Flushes all existing database data before adding new objects.",
+            help="Skip building search indexes.",
         )
-
         parser.add_argument(
             "--clean",
             action="store_true",
@@ -56,15 +52,7 @@ def handle(self, *args, **options):
                 self.stdout.write(self.style.ERROR(
                     'QUICKSETUP FAILED - Fix foreign key constraint violations before proceeding.'
                 ))
-                return # Exit without showing "API setup complete"
-
-            if not options['noindex']:
-                if settings.BUILD_V1_INDEX:
-                    build_haystack_index()
-                else:
-                    self.stdout.write("Skipping v1 index build because of --noindex")
-            else:
-                self.stdout.write('Skipping v1 database population.')
+                return
 
         if settings.INCLUDE_V2_DATA:
             self.stdout.write('Populating the v2 database...')
@@ -77,73 +65,59 @@
                 self.stdout.write(self.style.ERROR(
                     'QUICKSETUP FAILED - Fix foreign key constraint violations before proceeding.'
                 ))
-                return # Exit without showing "API setup complete"
+                return
 
         if not options['noindex']:
             if settings.BUILD_V2_INDEX:
-                self.stdout.write('Building the v2 index with both v1 and v2 data.')
-                build_v1v2_searchindex()
+                self.stdout.write('Building the search index...')
+                build_search_index()
             else:
-                self.stdout.write('Skipping v2 index build because of --noindex.')
+                self.stdout.write('Skipping index build because of --noindex.')
 
         self.stdout.write(self.style.SUCCESS('API setup complete.'))
 
 
-def migrate_db() -> None:
-    """Migrate the local database as needed to incorporate new model updates.
-    This command is added primarily to assist in local development, because
-    checking out and changing branches results in unclean model/dbs."""
-
+def migrate_db():
    call_command('makemigrations')
    call_command('migrate')
 
-def is_dirty() ->None:
-    # TODO switch these over to server settings values.
-    is_dirty=False
-    if Path('./server/whoosh_index').is_dir():
+
+def is_dirty():
+    is_dirty = False
+    if Path('whoosh_index').is_dir():
        print("Found whoosh_index")
-        is_dirty=True
+        is_dirty = True
    if Path(settings.STATIC_ROOT).is_dir():
        print("Found static root")
-        is_dirty=True
+        is_dirty = True
    if Path(settings.DATABASES['default']['NAME']).exists():
        print("Found db file")
-        is_dirty=True
+        is_dirty = True
    return is_dirty
 
-def clean_dir() ->None:
-    if Path('./server/whoosh_index').is_dir():
-        shutil.rmtree(Path('./server/whoosh_index'))
+
+def clean_dir():
+    if Path('whoosh_index').is_dir():
+        shutil.rmtree(Path('whoosh_index'))
    if Path(settings.STATIC_ROOT).is_dir():
        shutil.rmtree(Path(settings.STATIC_ROOT))
    if Path(settings.DATABASES['default']['NAME']).exists():
        Path(settings.DATABASES['default']['NAME']).unlink()
-    vector_index = Path('server/vector_index.pkl')
-    if vector_index.exists():
-        vector_index.unlink()
+    if Path('server/vector_index.pkl').exists():
+        Path('server/vector_index.pkl').unlink()
 
-def import_v1() -> None:
-    """Import the v1 apps' database models."""
+
+def import_v1():
    call_command('import', '--dir', 'data/v1')
 
 
-def import_v2() -> None:
-    """Import the v2 apps' database models."""
+def import_v2():
    call_command('import', '--dir', 'data/v2')
 
 
-def collect_static() -> None:
-    """Collect static files in a single location."""
+def collect_static():
    call_command('collectstatic', '--noinput')
 
 
-def build_haystack_index() -> None:
-    """Freshen the haystack search indexes. This is an internal haystack
-    API that is being called, and only applies to v1 data."""
-    print("THIS ENTIRE COMMAND HAS BEEN DEPRECATED! EXPECT ERRORS.")
-    call_command('update_index', '--remove')
-
-def build_v1v2_searchindex() -> None:
-    """Builds the custom search index defined in the api_v2 management
-    commands. Only adds the v1 data."""
-    call_command('buildindex','--v1','--v2')
+def build_search_index():
+    call_command('buildindex', '--v1', '--v2')
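The `buildindex` management command that `build_search_index()` invokes is not part of this diff. For orientation, a minimal Whoosh index build might look like the sketch below; the schema fields and sample document are assumptions, not the project's actual schema:

```python
# Illustrative only: a minimal Whoosh index build. The schema fields and the
# sample document are assumptions, not the project's actual `buildindex` code.
import os

from whoosh import index
from whoosh.fields import ID, TEXT, Schema

schema = Schema(
    pk=ID(stored=True, unique=True),
    name=TEXT(stored=True),
    text=TEXT(),
)

os.makedirs("whoosh_index", exist_ok=True)
ix = index.create_in("whoosh_index", schema)

writer = ix.writer()
writer.add_document(
    pk="spell-fireball",
    name="Fireball",
    text="A bright streak flashes from your pointing finger to a point you choose...",
)
writer.commit()  # writes the segment files under whoosh_index/
```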

docker-compose.yml

Lines changed: 0 additions & 27 deletions
@@ -1,42 +1,15 @@
 services:
-  elasticsearch:
-    image: elasticsearch:8.11.0
-    container_name: elasticsearch
-    environment:
-      - discovery.type=single-node
-      - xpack.security.enabled=false
-      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
-      - http.max_content_length=100mb
-      - indices.memory.index_buffer_size=20%
-    ports:
-      - "9200:9200"
-    volumes:
-      - elasticsearch_data:/usr/share/elasticsearch/data
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=60s && curl -f http://localhost:9200/_cat/nodes?format=json | grep -q '\"name\"' || exit 1"]
-      interval: 15s
-      timeout: 30s
-      retries: 10
-      start_period: 60s
-
   web:
     build: .
     container_name: open5e-api
     ports:
       - "8080:8080"
     environment:
       - SECRET_KEY=${SECRET_KEY:-your-secret-key-here}
-      - ELASTICSEARCH_URL=http://elasticsearch:9200/
-    depends_on:
-      elasticsearch:
-        condition: service_healthy
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:8080/v2/spells/?limit=1 || exit 1"]
       interval: 30s
       timeout: 10s
       retries: 5
     mem_limit: 1g
     mem_reservation: 512m
-
-volumes:
-  elasticsearch_data:

search/apps.py

Lines changed: 3 additions & 13 deletions
@@ -15,16 +15,6 @@ class SearchConfig(AppConfig):
 
     def ready(self):
         """Initialize search components when Django starts."""
-        try:
-            logger.info("Initializing search components...")
-            from .services import get_tfidf_vectorizer
-
-            # Only initialize the vectorizer factory, don't build index during startup
-            # This prevents hanging when Elasticsearch is not available during Django startup
-            logger.info("TF-IDF vectorizer factory initialized")
-
-            # Vector index will be built lazily on first search request
-            logger.info("Search components initialized successfully")
-        except Exception as e:
-            logger.error(f"Error initializing search components: {e}")
-            # Don't crash the app if search initialization fails
+        # Search components are loaded lazily on first use
+        # No initialization needed during Django startup
+        logger.info("Search app ready - components will load on first use")
