Commit 115ff8d

clean up, lint

1 parent 598c9f8

8 files changed (+374, -309 lines)

.cursor

Lines changed: 1 addition & 1 deletion

```diff
@@ -62,7 +62,7 @@ pipenv run python manage.py indexctl unpack # Unpack indexes after clone
 - **quickindex**: Convenience command that runs `buildindex --v1 --v2` then `indexctl pack`
 - Packed indexes stored in `search/indexes/` (committed to repo)
 - After running `quickindex`, commit `search/indexes/` and `search/index_manifest.json`
-- GitHub Action `rebuild_search_index.yml` can trigger index rebuilds
+- Background re-indexing runs ~60s after server startup (configurable via `SEARCH_INDEX_REFRESH_DELAY`)
 
 ### Fixture Data Loading
 - Fixtures are automatically loaded by quicksetup
```
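
The updated `.cursor` note references settings that `search/apps.py` reads with `getattr` defaults. A minimal sketch of how a deployment might pin them, assuming they live in the project's Django settings module (the exact settings file is an assumption, not part of this commit):

```python
# Hypothetical settings snippet (settings module path assumed, not in this commit).
# Both values are read via getattr() in search/apps.py, so they are optional.
SEARCH_INDEX_REFRESH_DELAY = 60     # seconds after startup; a value <= 0 disables scheduling
SEARCH_INDEX_REFRESH_VECTOR = True  # also rebuild the spaCy vector index

# Scheduling can also be switched off per environment with the
# DISABLE_BACKGROUND_REINDEX=true environment variable (checked in SearchConfig).
```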

.github/workflows/rebuild_search_index.yml

Lines changed: 0 additions & 75 deletions
This file was deleted.

search/apps.py

Lines changed: 35 additions & 9 deletions

```diff
@@ -1,20 +1,46 @@
-"""
-Search app configuration for Open5e API.
-"""
 import logging
+import os
+import sys
 
 from django.apps import AppConfig
 
 logger = logging.getLogger(__name__)
 
 
 class SearchConfig(AppConfig):
-    """Configuration for the search app."""
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'search'
-
+
     def ready(self):
-        """Initialize search components when Django starts."""
-        # Search components are loaded lazily on first use
-        # No initialization needed during Django startup
-        logger.info("Search app ready - components will load on first use")
+        self._maybe_schedule_background_reindex()
+
+    def _maybe_schedule_background_reindex(self):
+        if 'manage.py' in sys.argv:
+            skip_commands = [
+                'quicksetup', 'migrate', 'makemigrations', 'buildindex',
+                'indexctl', 'loaddata', 'import', 'collectstatic', 'shell',
+                'test', 'check', 'rebuild_index', 'update_index'
+            ]
+            if any(cmd in sys.argv for cmd in skip_commands):
+                return
+
+        if os.environ.get('DISABLE_BACKGROUND_REINDEX', '').lower() in ('true', '1', 'yes'):
+            return
+
+        # Only run in the reloader's main process for runserver
+        is_runserver = 'runserver' in sys.argv
+        is_main_process = os.environ.get('RUN_MAIN') == 'true'
+        if is_runserver and not is_main_process:
+            return
+
+        from django.conf import settings
+        from search.background_indexer import schedule_background_reindex
+
+        delay = getattr(settings, 'SEARCH_INDEX_REFRESH_DELAY', 60)
+        rebuild_vector = getattr(settings, 'SEARCH_INDEX_REFRESH_VECTOR', True)
+
+        if delay and delay > 0:
+            schedule_background_reindex(
+                delay_seconds=delay,
+                rebuild_vector=rebuild_vector
+            )
```
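
Since `shell` is in `skip_commands`, opening a Django shell never auto-schedules a re-index. A hedged sketch of triggering one by hand from that shell, using the `trigger_reindex_now` helper added in `search/background_indexer.py` below:

```python
# Inside `pipenv run python manage.py shell`; nothing is auto-scheduled here
# because 'shell' is in skip_commands.
from search.background_indexer import trigger_reindex_now, is_indexing_in_progress

started = trigger_reindex_now(rebuild_vector=False)  # skip the slower spaCy pass
print("started" if started else "a re-index is already running")
print("in progress:", is_indexing_in_progress())
```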

search/background_indexer.py

Lines changed: 221 additions & 0 deletions (new file)

```python
"""
Background search index refresh that runs after server startup.
Ensures indexes stay consistent with the database even when repopulated on deploy.
"""
import threading
import time
import logging
import gc
import pickle
from pathlib import Path

import numpy as np

logger = logging.getLogger(__name__)

_indexing_lock = threading.Lock()
_indexing_in_progress = False
_last_index_time = None


def is_indexing_in_progress():
    return _indexing_in_progress


def get_last_index_time():
    return _last_index_time


def schedule_background_reindex(delay_seconds=60, rebuild_vector=True):
    if delay_seconds <= 0:
        return

    def _delayed_reindex():
        global _indexing_in_progress, _last_index_time

        logger.info(f"Background re-index scheduled in {delay_seconds}s")
        time.sleep(delay_seconds)

        if not _indexing_lock.acquire(blocking=False):
            logger.warning("Background re-index skipped - already in progress")
            return

        try:
            _indexing_in_progress = True
            logger.info("Starting background search index refresh...")
            start_time = time.time()

            _run_reindex(rebuild_vector=rebuild_vector)

            elapsed = time.time() - start_time
            _last_index_time = time.time()
            logger.info(f"Background re-index completed in {elapsed:.1f}s")

        except Exception as e:
            logger.error(f"Background re-index failed: {e}", exc_info=True)
        finally:
            _indexing_in_progress = False
            _indexing_lock.release()

    thread = threading.Thread(target=_delayed_reindex, daemon=True, name="search-reindex")
    thread.start()


def _run_reindex(rebuild_vector=True):
    from django.core.management import call_command

    logger.info("Updating Whoosh index...")
    try:
        call_command('update_index', '--remove', verbosity=1)
    except Exception as e:
        logger.error(f"Whoosh update failed: {e}")
        try:
            call_command('rebuild_index', '--noinput', verbosity=1)
        except Exception as e2:
            logger.error(f"Whoosh rebuild failed: {e2}")

    if rebuild_vector:
        logger.info("Rebuilding vector index...")
        try:
            _rebuild_vector_index()
        except Exception as e:
            logger.error(f"Vector index rebuild failed: {e}")


def _rebuild_vector_index():
    from django.conf import settings
    from api import models as v1
    from api_v2 import models as v2

    try:
        import spacy
    except ImportError:
        logger.warning("spaCy not installed - skipping vector index")
        return

    try:
        nlp = spacy.load("en_core_web_md")
    except OSError:
        logger.warning("spaCy model not found - skipping vector index")
        return

    nlp.select_pipes(disable=["ner", "parser"])

    all_embeddings = []
    all_names = []
    all_metadata = []

    v1_models = [
        (v1.MagicItem, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.Spell, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.Monster, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.CharClass, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.Race, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.Feat, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.Condition, lambda o: o.name + " " + (o.desc or "")[:200]),
        (v1.Background, lambda o: o.name + " " + (o.desc or "")[:200]),
    ]

    v2_models = [
        (v2.Item, lambda o: o.name + " " + (o.as_text() or "")[:200]),
        (v2.Spell, lambda o: o.name + " " + (o.as_text() or "")[:200]),
        (v2.Creature, lambda o: o.name + " " + (o.as_text() or "")[:200]),
        (v2.CharacterClass, lambda o: o.name + " " + (o.as_text() or "")[:200]),
        (v2.Species, lambda o: o.name + " " + (o.as_text() or "")[:200]),
        (v2.Feat, lambda o: o.name + " " + (o.as_text() or "")[:200]),
        (v2.Background, lambda o: o.name + " " + (o.as_text() or "")[:200]),
    ]

    def process_model(model, text_func, schema_version):
        texts = []
        for obj in model.objects.all():
            try:
                text = text_func(obj)
                texts.append(text)
                all_names.append(obj.name)

                doc_key = obj.document.slug if schema_version == 'v1' else obj.document.key
                all_metadata.append({
                    'object_type': model.__name__,
                    'document_pk': doc_key,
                    'schema_version': schema_version,
                    'description': text[:500] if text else ''
                })
            except Exception as e:
                logger.debug(f"Skipping {model.__name__} object: {e}")

        for doc in nlp.pipe(texts, batch_size=50):
            vectors = [token.vector for token in doc if token.has_vector]
            if vectors:
                avg_vector = np.mean(vectors, axis=0)
                norm = np.linalg.norm(avg_vector)
                if norm > 0:
                    avg_vector = avg_vector / norm
                all_embeddings.append(avg_vector)
            else:
                all_embeddings.append(np.zeros(nlp.vocab.vectors_length))

    for model, text_func in v1_models:
        try:
            process_model(model, text_func, 'v1')
        except Exception as e:
            logger.warning(f"Error processing {model.__name__}: {e}")

    for model, text_func in v2_models:
        try:
            process_model(model, text_func, 'v2')
        except Exception as e:
            logger.warning(f"Error processing {model.__name__}: {e}")

    if not all_embeddings:
        logger.warning("No documents found for vector indexing")
        return

    embeddings = np.array(all_embeddings)
    logger.info(f"Vector index: {len(all_names)} documents, shape {embeddings.shape}")

    index_data = {
        "names": all_names,
        "metadata": all_metadata,
        "embeddings": embeddings,
        "vector_size": nlp.vocab.vectors_length
    }

    index_path = Path(settings.BASE_DIR) / "server" / "vector_index.pkl"
    with index_path.open("wb") as fh:
        pickle.dump(index_data, fh)

    # Invalidate cached index
    from search import services
    services._vector_index = None
    services._vector_index_loaded = False
    services._fuzzy_search_cache.clear()

    del all_embeddings, all_names, all_metadata, embeddings, index_data, nlp
    gc.collect()


def trigger_reindex_now(rebuild_vector=True):
    """Trigger immediate re-index. Returns True if started, False if already running."""
    global _indexing_in_progress, _last_index_time

    if not _indexing_lock.acquire(blocking=False):
        return False

    def _run():
        global _indexing_in_progress, _last_index_time
        try:
            _indexing_in_progress = True
            start_time = time.time()
            _run_reindex(rebuild_vector=rebuild_vector)
            _last_index_time = time.time()
            logger.info(f"Manual re-index completed in {time.time() - start_time:.1f}s")
        except Exception as e:
            logger.error(f"Manual re-index failed: {e}", exc_info=True)
        finally:
            _indexing_in_progress = False
            _indexing_lock.release()

    thread = threading.Thread(target=_run, daemon=True, name="search-reindex-manual")
    thread.start()
    return True
```
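
Each stored embedding is L2-normalized at build time, so cosine similarity against a query vector reduces to a dot product. A sketch of one way a consumer might query the pickled index, assuming the same `en_core_web_md` model; the query text and top-k count are illustrative, and this snippet is not part of the commit (`search/services.py` presumably does something similar):

```python
# Hypothetical consumer of server/vector_index.pkl (not part of this commit).
import pickle

import numpy as np
import spacy

with open("server/vector_index.pkl", "rb") as fh:
    index = pickle.load(fh)

nlp = spacy.load("en_core_web_md")
doc = nlp("fire breathing dragon")
query = np.mean([t.vector for t in doc if t.has_vector], axis=0)
query /= np.linalg.norm(query)  # match the normalization applied at build time

scores = index["embeddings"] @ query      # cosine similarity per document
for i in np.argsort(scores)[::-1][:5]:    # five best matches
    print(index["names"][i], index["metadata"][i]["object_type"], f"{scores[i]:.3f}")
```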
