
Commit 6b3c676

Merge pull request #53 from ddxv/main
Fix some missing files and hide some imports for less commonly used libraries
2 parents 81aefdb + 6a420a3 commit 6b3c676
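
The imports being hidden are the NLTK and rake_nltk dependencies: the diff below moves them from module level into the functions that call them (spaCy was already imported lazily), so importing process_keywords no longer requires those libraries to be installed. A minimal sketch of the same deferred-import pattern, using illustrative names rather than the repo's code:

# Illustrative sketch of the deferred-import pattern, not the repo's code.
CUSTOM_STOPWORDS = {"com", "game"}  # the real module defines a larger set


def extract_keywords(text: str) -> list[str]:
    # Heavy optional dependencies are imported only when the function runs,
    # so importing this module works even without nltk installed.
    from nltk.corpus import stopwords  # noqa: PLC0415
    from nltk.tokenize import word_tokenize  # noqa: PLC0415

    # Requires the "punkt" and "stopwords" NLTK data to have been downloaded.
    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
    return [w for w in word_tokenize(text) if w.isalpha() and w.lower() not in mystopwords]

The trade-off is that the stopword set is now rebuilt inside each function instead of once at module import.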

File tree: 2 files changed (+94, -12 lines)


adscrawler/app_stores/process_keywords.py

Lines changed: 31 additions & 12 deletions
@@ -4,12 +4,7 @@
 import re
 from collections import Counter
 
-import nltk
 import pandas as pd
-from nltk.corpus import stopwords, wordnet
-from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import word_tokenize
-from rake_nltk import Rake
 
 from adscrawler.dbcon.connection import PostgresCon
 from adscrawler.dbcon.queries import (
@@ -59,7 +54,6 @@
     "com",
     "game",
 }
-STOPWORDS = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
 
 
 def clean_text(text: str) -> str:
@@ -101,6 +95,8 @@ def clean_df_text(df: pd.DataFrame, column: str) -> pd.DataFrame:
 
 def count_tokens(phrase: str) -> int:
     """Count the number of tokens in a phrase."""
+    from nltk.tokenize import word_tokenize
+
     return len(word_tokenize(phrase))
 
 
@@ -110,6 +106,9 @@ def extract_keywords_spacy(
     """Extracts noun phrase keywords using spaCy with token limit."""
     # Load spaCy model
     import spacy  # noqa: PLC0415
+    from nltk.corpus import stopwords
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
 
     try:
         nlp = spacy.load("en_core_web_sm")
@@ -126,7 +125,7 @@
         if chunk.root.text.isalpha():
             # Check token count
             if count_tokens(chunk.text) <= max_tokens:
-                if not any(token.is_stop or token in STOPWORDS for token in chunk):
+                if not any(token.is_stop or token in mystopwords for token in chunk):
                     keywords.append(chunk.text)
 
     keyword_freq = Counter(keywords)
@@ -135,8 +134,16 @@
 
 def extract_keywords_nltk(text: str, top_n: int = 10) -> list[str]:
     """Extracts lemmatized keywords using NLTK with frequency ranking."""
+    from nltk.tokenize import word_tokenize
+
     words = word_tokenize(text)
     # Ensure necessary NLTK resources are downloaded
+    import nltk
+    from nltk.corpus import stopwords, wordnet
+    from nltk.stem import WordNetLemmatizer
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
+
     nltk.download("punkt", quiet=True)
     nltk.download("stopwords", quiet=True)
     nltk.download("wordnet", quiet=True)
@@ -147,7 +154,7 @@ def extract_keywords_nltk(text: str, top_n: int = 10) -> list[str]:
     processed_words = []
     for word, tag in pos_tags:
         # Only process alphabetic words that aren't stopwords
-        if word.isalpha() and word.lower() not in STOPWORDS:
+        if word.isalpha() and word.lower() not in mystopwords:
             # Convert POS tag to WordNet format for better lemmatization
             tag_first_char = tag[0].lower()
             wordnet_pos = {
@@ -165,6 +172,11 @@
 
 def extract_keywords_rake(text: str, top_n: int = 10, max_tokens: int = 3) -> list[str]:
     """Extracts keywords using RAKE with token limit."""
+    from nltk.corpus import stopwords
+    from rake_nltk import Rake
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
+
     r = Rake()
     r.extract_keywords_from_text(text)
 
@@ -174,7 +186,7 @@ def extract_keywords_rake(text: str, top_n: int = 10, max_tokens: int = 3) -> li
         if count_tokens(phrase) <= max_tokens:
             filtered_phrases.append(phrase)
     filtered_phrases = [
-        phrase for phrase in filtered_phrases if phrase not in STOPWORDS
+        phrase for phrase in filtered_phrases if phrase not in mystopwords
    ]
     return filtered_phrases[:top_n]
 
@@ -185,6 +197,10 @@ def extract_unique_app_keywords_from_text(
     max_tokens: int = 1,
 ) -> list[str]:
     """Extracts keywords using spaCy, NLTK, and RAKE, then returns a unique set."""
+    from nltk.corpus import stopwords
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
+
     text = clean_text(text)
     words_spacy = extract_keywords_spacy(text, top_n, max_tokens)
     words_nltk = extract_keywords_nltk(text, top_n)
@@ -199,7 +215,7 @@
             filtered_keywords.append(kw)
 
     # Remove stopwords from filtered keywords
-    filtered_keywords = [kw for kw in filtered_keywords if kw not in STOPWORDS]
+    filtered_keywords = [kw for kw in filtered_keywords if kw not in mystopwords]
 
     # keywords_base = query_keywords_base(database_connection)
     # matched_base_keywords = keywords_base[
@@ -217,6 +233,9 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
     """Get the global keywords from the database.
     NOTE: This takes about ~5-8GB of RAM for 50k keywords and 200k descriptions. For now run manually.
     """
+    from nltk.corpus import stopwords
+
+    mystopwords = set(stopwords.words("english")).union(CUSTOM_STOPWORDS)
     df = query_all_store_app_descriptions(
         language_slug="en", database_connection=database_connection
     )
@@ -227,7 +246,7 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
 
     vectorizer = TfidfVectorizer(
         ngram_range=(1, 2),  # Include 1-grams, 2-grams
-        stop_words=list(STOPWORDS),
+        stop_words=list(mystopwords),
         max_df=0.75,  # Ignore terms in >75% of docs (too common)
         min_df=300,  # Ignore terms in <x docs (too rare)
         max_features=50000,
@@ -240,7 +259,7 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
     global_scores = tfidf_matrix.sum(axis=0).A1  # Sum scores per term
     keyword_scores = list(zip(feature_names, global_scores, strict=False))
     keyword_scores.sort(key=lambda x: x[1], reverse=True)
-    global_keywords = [kw for kw, score in keyword_scores if kw not in STOPWORDS]
+    global_keywords = [kw for kw, score in keyword_scores if kw not in mystopwords]
     return global_keywords
 
 
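For context on the TfidfVectorizer hunks above: get_global_keywords ranks candidate keywords by summing each term's TF-IDF weight across all app descriptions. A self-contained sketch of that scoring step on a toy corpus (the real function pulls descriptions from Postgres and passes the combined stopword list, max_df, min_df, and max_features shown in the diff):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "puzzle game with daily challenges",
    "photo editor with filters and stickers",
    "daily word puzzle and brain teasers",
]

vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
tfidf_matrix = vectorizer.fit_transform(docs)

feature_names = vectorizer.get_feature_names_out()
global_scores = tfidf_matrix.sum(axis=0).A1  # sum each term's TF-IDF weight over all docs
keyword_scores = sorted(zip(feature_names, global_scores), key=lambda x: x[1], reverse=True)
print(keyword_scores[:5])  # highest-scoring candidate keywords first
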
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+WITH latest_descriptions AS (
+    SELECT DISTINCT ON
+    (sad.store_app)
+        sad.id AS description_id,
+        sad.store_app,
+        sad.description_short,
+        sad.description,
+        sad.updated_at AS description_last_updated
+    FROM
+        store_apps_descriptions AS sad
+    WHERE
+        sad.language_id = 1
+    ORDER BY
+        sad.store_app ASC,
+        sad.updated_at DESC
+),
+latest_extractions AS (
+    SELECT DISTINCT ON
+    (ak.store_app)
+        ak.store_app,
+        ak.extracted_at AS last_extracted_at
+    FROM
+        app_keywords_extracted AS ak
+    ORDER BY
+        ak.store_app ASC,
+        ak.extracted_at DESC
+),
+base AS (
+    SELECT
+        ld.store_app,
+        ld.description_id,
+        le.last_extracted_at,
+        ld.description_short,
+        ld.description
+    FROM latest_descriptions AS ld
+    LEFT JOIN
+        latest_extractions AS le
+        ON
+            ld.store_app = le.store_app
+    WHERE le.last_extracted_at IS NULL OR (
+        ld.description_last_updated > le.last_extracted_at
+        AND le.last_extracted_at <= NOW() - INTERVAL '7 days'
+    )
+)
+SELECT
+    b.store_app,
+    b.description_id,
+    b.last_extracted_at,
+    b.description_short,
+    b.description
+FROM
+    base AS b
+INNER JOIN app_global_metrics_latest AS agml ON b.store_app = agml.store_app
+ORDER BY
+    (CASE WHEN b.last_extracted_at IS NULL THEN 1 ELSE 0 END) DESC, -- always crawl new ones first
+    (
+        GREATEST(
+            COALESCE(agml.installs, 0),
+            COALESCE(agml.rating_count::BIGINT, 0)
+        )
+        * (10 * COALESCE(EXTRACT(DAY FROM (NOW() - b.last_extracted_at)), 1))
+    ) DESC
+LIMIT :mylimit;
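
The ORDER BY above puts apps with no previous keyword extraction first, then ranks the rest by popularity (the larger of installs and rating count) multiplied by how stale the last extraction is. A rough Python equivalent of that priority score, with illustrative names and assuming the same NULL handling as the SQL:

from datetime import datetime, timezone


def extraction_priority(
    installs: int | None,
    rating_count: int | None,
    last_extracted_at: datetime | None,
) -> tuple[int, int]:
    """Mirror of the SQL ORDER BY: never-extracted apps sort first,
    then popularity * staleness, both descending."""
    never_extracted = 1 if last_extracted_at is None else 0
    popularity = max(installs or 0, rating_count or 0)
    if last_extracted_at is None:
        days_stale = 1  # matches COALESCE(..., 1) in the SQL
    else:
        days_stale = (datetime.now(timezone.utc) - last_extracted_at).days
    return (never_extracted, popularity * 10 * days_stale)


# e.g. apps.sort(key=lambda a: extraction_priority(a.installs, a.rating_count, a.last_extracted_at), reverse=True)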

0 commit comments

Comments
 (0)