fix(docker): extract wordnet corpus after NLTK download (#61)

qinxuye · web-flow · commit bf7a78af8e90 · 2026-02-28T19:52:54.000+08:00
The wordnet corpus is downloaded as a zip file by default but needs
to be extracted for NLTK to use it properly. Add an explicit unzip command
after the NLTK download to ensure wordnet is available.
diff --git a/docker/Dockerfile.backend b/docker/Dockerfile.backend
@@ -136,6 +136,7 @@ RUN cd /opt/xagent/frontend && npm ci \
     && npm install -g pptxgenjs@4.0.1 \
     && python -m playwright install chromium \
     && python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger')" \
+    && unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora \
     && python -c "import tiktoken; tiktoken.encoding_for_model('gpt-4')" \
     && chmod +x /opt/xagent/deploy/entrypoint.sh