@@ -4,12 +4,7 @@
 import re
 from collections import Counter
 
-import nltk
 import pandas as pd
-from nltk.corpus import stopwords, wordnet
-from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import word_tokenize
-from rake_nltk import Rake
 
 from adscrawler.dbcon.connection import PostgresCon
 from adscrawler.dbcon.queries import (
@@ -59,7 +54,6 @@
     "com",
     "game",
 }
-STOPWORDS = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
 
 
 def clean_text(text: str) -> str:
@@ -101,6 +95,8 @@ def clean_df_text(df: pd.DataFrame, column: str) -> pd.DataFrame:
 
 def count_tokens(phrase: str) -> int:
     """Count the number of tokens in a phrase."""
+    from nltk.tokenize import word_tokenize
+
     return len(word_tokenize(phrase))
 
 
@@ -110,6 +106,9 @@ def extract_keywords_spacy(
     """Extracts noun phrase keywords using spaCy with token limit."""
     # Load spaCy model
     import spacy  # noqa: PLC0415
+    from nltk.corpus import stopwords
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
 
     try:
         nlp = spacy.load("en_core_web_sm")
@@ -126,7 +125,7 @@ def extract_keywords_spacy(
         if chunk.root.text.isalpha():
             # Check token count
             if count_tokens(chunk.text) <= max_tokens:
-                if not any(token.is_stop or token in STOPWORDS for token in chunk):
+                if not any(token.is_stop or token in mystopwords for token in chunk):
                     keywords.append(chunk.text)
 
     keyword_freq = Counter(keywords)
@@ -135,8 +134,16 @@ def extract_keywords_spacy(
 
 def extract_keywords_nltk(text: str, top_n: int = 10) -> list[str]:
     """Extracts lemmatized keywords using NLTK with frequency ranking."""
+    from nltk.tokenize import word_tokenize
+
     words = word_tokenize(text)
     # Ensure necessary NLTK resources are downloaded
+    import nltk
+    from nltk.corpus import stopwords, wordnet
+    from nltk.stem import WordNetLemmatizer
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
+
     nltk.download("punkt", quiet=True)
     nltk.download("stopwords", quiet=True)
     nltk.download("wordnet", quiet=True)
@@ -147,7 +154,7 @@ def extract_keywords_nltk(text: str, top_n: int = 10) -> list[str]:
     processed_words = []
     for word, tag in pos_tags:
         # Only process alphabetic words that aren't stopwords
-        if word.isalpha() and word.lower() not in STOPWORDS:
+        if word.isalpha() and word.lower() not in mystopwords:
             # Convert POS tag to WordNet format for better lemmatization
             tag_first_char = tag[0].lower()
             wordnet_pos = {
@@ -165,6 +172,11 @@ def extract_keywords_nltk(text: str, top_n: int = 10) -> list[str]:
 
 def extract_keywords_rake(text: str, top_n: int = 10, max_tokens: int = 3) -> list[str]:
     """Extracts keywords using RAKE with token limit."""
+    from nltk.corpus import stopwords
+    from rake_nltk import Rake
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
+
     r = Rake()
     r.extract_keywords_from_text(text)
 
@@ -174,7 +186,7 @@ def extract_keywords_rake(text: str, top_n: int = 10, max_tokens: int = 3) -> li
         if count_tokens(phrase) <= max_tokens:
             filtered_phrases.append(phrase)
     filtered_phrases = [
-        phrase for phrase in filtered_phrases if phrase not in STOPWORDS
+        phrase for phrase in filtered_phrases if phrase not in mystopwords
     ]
     return filtered_phrases[:top_n]
 
@@ -185,6 +197,10 @@ def extract_unique_app_keywords_from_text(
     max_tokens: int = 1,
 ) -> list[str]:
     """Extracts keywords using spaCy, NLTK, and RAKE, then returns a unique set."""
+    from nltk.corpus import stopwords
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
+
     text = clean_text(text)
     words_spacy = extract_keywords_spacy(text, top_n, max_tokens)
     words_nltk = extract_keywords_nltk(text, top_n)
@@ -199,7 +215,7 @@ def extract_unique_app_keywords_from_text(
             filtered_keywords.append(kw)
 
     # Remove stopwords from filtered keywords
-    filtered_keywords = [kw for kw in filtered_keywords if kw not in STOPWORDS]
+    filtered_keywords = [kw for kw in filtered_keywords if kw not in mystopwords]
 
     # keywords_base = query_keywords_base(database_connection)
     # matched_base_keywords = keywords_base[
@@ -217,6 +233,9 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
     """Get the global keywords from the database.
     NOTE: This takes about ~5-8GB of RAM for 50k keywords and 200k descriptions. For now run manually.
     """
+    from nltk.corpus import stopwords
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
     df = query_all_store_app_descriptions(
         language_slug="en", database_connection=database_connection
     )
@@ -227,7 +246,7 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
 
     vectorizer = TfidfVectorizer(
         ngram_range=(1, 2),  # Include 1-grams, 2-grams
-        stop_words=list(STOPWORDS),
+        stop_words=list(mystopwords),
         max_df=0.75,  # Ignore terms in >75% of docs (too common)
         min_df=300,  # Ignore terms in <x docs (too rare)
         max_features=50000,
@@ -240,7 +259,7 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
     global_scores = tfidf_matrix.sum(axis=0).A1  # Sum scores per term
     keyword_scores = list(zip(feature_names, global_scores, strict=False))
     keyword_scores.sort(key=lambda x: x[1], reverse=True)
-    global_keywords = [kw for kw, score in keyword_scores if kw not in STOPWORDS]
+    global_keywords = [kw for kw, score in keyword_scores if kw not in mystopwords]
     return global_keywords
 
 
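For reference, a minimal self-contained sketch of the deferred-import pattern the hunks above apply: the heavy NLTK imports and the combined stopword set are built at call time rather than at module import time, so the module can be imported without pulling NLTK in up front. The CUSTOM_STOPWORDS value below is a stand-in holding only the two entries visible in the first hunk, and the NLTK "stopwords"/"punkt" data are assumed to be downloadable at runtime (newer NLTK releases may additionally need "punkt_tab").

CUSTOM_STOPWORDS = {"com", "game"}  # stand-in for the module-level set above


def build_stopwords() -> set[str]:
    """Build the English stopword set plus custom terms at call time."""
    import nltk  # deferred: only needed when keyword extraction actually runs
    from nltk.corpus import stopwords

    nltk.download("stopwords", quiet=True)
    return set(stopwords.words("english")).union(CUSTOM_STOPWORDS)


def count_tokens(phrase: str) -> int:
    """Count the number of tokens in a phrase; nltk is imported on first use."""
    import nltk
    from nltk.tokenize import word_tokenize

    nltk.download("punkt", quiet=True)
    return len(word_tokenize(phrase))


print(count_tokens("deferred import example"))  # 3
print("game" in build_stopwords())  # True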