11from __future__ import annotations
22
3- import hashlib
43import os
5- import sys
6- import tarfile
7- import tempfile
8- import urllib .request
94from functools import lru_cache
105from typing import Final , List , Tuple
116
1611
1712CACHE_MAX_SIZE : Final [int ] = 128
1813
19- NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
20- NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{ NLTK_DATA_FILENAME } "
21- NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
22-
23-
24- # NOTE(robinson) - mimic default dir logic from NLTK
25- # https://github.com/nltk/nltk/
26- # blob/8c233dc585b91c7a0c58f96a9d99244a379740d5/nltk/downloader.py#L1046
27- def get_nltk_data_dir () -> str | None :
28- """Locates the directory the nltk data will be saved to. The directory
29- set by the NLTK environment variable takes highest precedence. Otherwise
30- the default is determined by the rules indicated below. Returns None when
31- the directory is not writable.
32-
33- On Windows, the default download directory is
34- ``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the
35- directory containing Python, e.g. ``C:\\ Python311``.
36-
37- On all other platforms, the default directory is the first of
38- the following which exists or which can be created with write
39- permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
40- ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
41- """
42- # Check if we are on GAE where we cannot write into filesystem.
43- if "APPENGINE_RUNTIME" in os .environ :
44- return
45-
46- # Check if we have sufficient permissions to install in a
47- # variety of system-wide locations.
48- for nltkdir in nltk .data .path :
49- if os .path .exists (nltkdir ) and nltk .internals .is_writable (nltkdir ):
50- return nltkdir
51-
52- # On Windows, use %APPDATA%
53- if sys .platform == "win32" and "APPDATA" in os .environ :
54- homedir = os .environ ["APPDATA" ]
55-
56- # Otherwise, install in the user's home directory.
57- else :
58- homedir = os .path .expanduser ("~/" )
59- if homedir == "~/" :
60- raise ValueError ("Could not find a default download directory" )
61-
62- # NOTE(robinson) - NLTK appends nltk_data to the homedir. That's already
63- # present in the tar file so we don't have to do that here.
64- return homedir
65-
6614
6715def download_nltk_packages ():
68- nltk_data_dir = get_nltk_data_dir ()
69-
70- if nltk_data_dir is None :
71- raise OSError ("NLTK data directory does not exist or is not writable." )
72-
73- # Check if the path ends with "nltk_data" and remove it if it does
74- if nltk_data_dir .endswith ("nltk_data" ):
75- nltk_data_dir = os .path .dirname (nltk_data_dir )
76-
77- def sha256_checksum (filename : str , block_size : int = 65536 ):
78- sha256 = hashlib .sha256 ()
79- with open (filename , "rb" ) as f :
80- for block in iter (lambda : f .read (block_size ), b"" ):
81- sha256 .update (block )
82- return sha256 .hexdigest ()
83-
84- with tempfile .TemporaryDirectory () as temp_dir_path :
85- tgz_file_path = os .path .join (temp_dir_path , NLTK_DATA_FILENAME )
86- urllib .request .urlretrieve (NLTK_DATA_URL , tgz_file_path )
87-
88- file_hash = sha256_checksum (tgz_file_path )
89- if file_hash != NLTK_DATA_SHA256 :
90- os .remove (tgz_file_path )
91- raise ValueError (f"SHA-256 mismatch: expected { NLTK_DATA_SHA256 } , got { file_hash } " )
92-
93- # Extract the contents
94- if not os .path .exists (nltk_data_dir ):
95- os .makedirs (nltk_data_dir )
96-
97- with tarfile .open (tgz_file_path , "r:gz" ) as tar :
98- tar .extractall (path = nltk_data_dir )
16+ nltk .download ("averaged_perceptron_tagger_eng" , quiet = True )
17+ nltk .download ("punkt_tab" , quiet = True )
9918
10019
10120def check_for_nltk_package (package_name : str , package_category : str ) -> bool :
@@ -109,10 +28,13 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
10928 try :
11029 nltk .find (f"{ package_category } /{ package_name } " , paths = paths )
11130 return True
112- except LookupError :
31+ except ( LookupError , OSError ) :
11332 return False
11433
11534
35+ # We cache this because we do not want to attempt
36+ # downloading the packages multiple times
37+ @lru_cache ()
11638def _download_nltk_packages_if_not_present ():
11739 """If required NLTK packages are not available, download them."""
11840
0 commit comments