
Commit 81aefdb

Merge pull request #52 from ddxv/main
Add keyword processing functions
2 parents 691bde3 + 720761f commit 81aefdb

File tree

136 files changed, +1875 -1313 lines changed


adscrawler/app_stores/process_from_s3.py

Lines changed: 108 additions & 9 deletions
@@ -15,9 +15,11 @@
     get_db_connection,
 )
 from adscrawler.dbcon.queries import (
+    delete_and_insert,
     query_categories,
     query_collections,
     query_countries,
+    query_languages,
     query_store_id_map,
     query_store_id_map_cached,
     upsert_df,

@@ -295,10 +297,14 @@ def manual_import_app_metrics_from_s3(
         use_ssh_tunnel=use_tunnel, config_key="madrone"
     )

+    start_date = datetime.datetime.fromisoformat("2025-10-01").date()
     for snapshot_date in pd.date_range(start_date, end_date, freq="D"):
         snapshot_date = snapshot_date.date()
         for store in [1, 2]:
-            process_app_metrics_to_db(database_connection, store, snapshot_date)
+            try:
+                process_app_metrics_to_db(database_connection, store, snapshot_date)
+            except:
+                process_app_metrics_to_db(database_connection, store, snapshot_date)


 def import_app_metrics_from_s3(

@@ -322,6 +328,11 @@ def process_app_metrics_to_db(
     make_s3_app_country_metrics_history(store, snapshot_date=snapshot_date)
     logger.info(f"date={snapshot_date}, store={store} agg df load")
     df = get_s3_agg_daily_snapshots(snapshot_date, snapshot_date, store)
+    if df.empty:
+        logger.warning(
+            f"No data found for S3 agg app metrics {store=} {snapshot_date=}"
+        )
+        return
     if store == 2:
         # Should be resolved from 11/1/2025
         problem_rows = df["store_id"].str.contains(".0")

@@ -336,11 +347,6 @@ def process_app_metrics_to_db(
         df = df.drop_duplicates(
             ["snapshot_date", "country", "store_id"], keep="last"
         )
-    if df.empty:
-        logger.warning(
-            f"No data found for S3 agg app metrics {store=} {snapshot_date=}"
-        )
-        return
     logger.info(f"date={snapshot_date}, store={store} agg df prep")
     df = prep_app_metrics_history(
         df=df, store=store, database_connection=database_connection

@@ -439,6 +445,7 @@ def app_details_country_history_query(
     # lookback_date_str: str,
     snapshot_date_str: str,
 ) -> str:
+    bucket = CONFIG["s3"]["bucket"]
     if store == 2:
         data_cols = """
             CAST(trackId AS VARCHAR) AS store_id,

@@ -500,7 +507,7 @@ def app_details_country_history_query(
             PARTITION BY store_id, country
             ORDER BY crawled_at DESC, {extra_sort_column}
         ) = 1
-    ) TO 's3://adscrawler/agg-data/app_country_metrics/store={store}/snapshot_date={snapshot_date_str}/'
+    ) TO 's3://{bucket}/agg-data/app_country_metrics/store={store}/snapshot_date={snapshot_date_str}/'
     (FORMAT PARQUET,
     PARTITION_BY (country),
     ROW_GROUP_SIZE 100000,
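The COPY ... TO destination now comes from configuration rather than the hard-coded "adscrawler" literal. A minimal sketch of how the prefix is assembled; the config shape beyond CONFIG["s3"]["bucket"] is an assumption, "adscrawler" is only an example value, and DuckDB's PARTITION_BY (country) then writes one country=XX folder of parquet files under the prefix:

CONFIG = {"s3": {"bucket": "adscrawler"}}  # assumed shape; only ["s3"]["bucket"] is read above

store = 1
snapshot_date_str = "2025-10-01"
bucket = CONFIG["s3"]["bucket"]
destination = (
    f"s3://{bucket}/agg-data/app_country_metrics/"
    f"store={store}/snapshot_date={snapshot_date_str}/"
)
print(destination)
# s3://adscrawler/agg-data/app_country_metrics/store=1/snapshot_date=2025-10-01/
# PARTITION_BY (country) then produces .../country=US/*.parquet and so on.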
@@ -571,7 +578,7 @@ def process_ranks_from_s3(
             logger.info(
                 f"DuckDB {store=} period_start={period_date_str} {country=} files={len(country_parquet_paths)}"
             )
-            wdf = process_parquets_and_insert(
+            wdf = query_store_collection_ranks(
                 country_parquet_paths=country_parquet_paths,
                 period=period,
                 s3_config_key=s3_config_key,

@@ -632,7 +639,7 @@ def process_ranks_from_s3(
     )


-def process_parquets_and_insert(
+def query_store_collection_ranks(
     country_parquet_paths: list[str],
     period: str,
     s3_config_key: str,

@@ -687,3 +694,95 @@ def manual_download_rankings(
     s3_client.download_file(bucket, s3_key, str(local_path))
     df = pd.read_parquet(local_path)
     return df
+
+
+def import_keywords_from_s3(
+    start_date: datetime.date, end_date: datetime.date, database_connection: PostgresCon
+) -> None:
+    language = "en"
+    country_map = query_countries(database_connection)
+    languages_map = query_languages(database_connection)
+    language_dict = languages_map.set_index("language_slug")["id"].to_dict()
+    _language_key = language_dict[language]
+    s3_config_key = "s3"
+    bucket = CONFIG[s3_config_key]["bucket"]
+    for snapshot_date in pd.date_range(start_date, end_date, freq="D"):
+        snapshot_date = snapshot_date.date()
+        for store in [1, 2]:
+            s3_loc = "raw-data/keywords"
+            s3_key = f"{s3_loc}/store={store}/crawled_date={snapshot_date}/"
+            parquet_paths = get_parquet_paths_by_prefix(bucket, s3_key)
+            if len(parquet_paths) == 0:
+                logger.warning(f"No parquet paths found for {s3_key}")
+                continue
+            df = query_keywords_from_s3(parquet_paths, s3_config_key)
+            store_id_map = query_store_id_map_cached(database_connection, store)
+            df["store_app"] = df["store_id"].map(
+                store_id_map.set_index("store_id")["id"].to_dict()
+            )
+            df["country"] = df["country"].map(
+                country_map.set_index("alpha2")["id"].to_dict()
+            )
+            if df["store_app"].isna().any():
+                check_and_insert_new_apps(
+                    database_connection=database_connection,
+                    dicts=df.to_dict(orient="records"),
+                    crawl_source="keywords",
+                    store=store,
+                )
+                store_id_map = query_store_id_map_cached(database_connection, store)
+                df["store_app"] = df["store_id"].map(
+                    store_id_map.set_index("store_id")["id"].to_dict()
+                )
+            delete_and_insert(
+                df=df,
+                table_name="app_keyword_ranks_daily",
+                schema="frontend",
+                database_connection=database_connection,
+                delete_by_keys=["crawled_date"],
+                insert_columns=[
+                    "country",
+                    "keyword_id",
+                    "crawled_date",
+                    "store_app",
+                    "app_rank",
+                ],
+                delete_keys_have_duplicates=True,
+            )
+
+
+def query_keywords_from_s3(
+    parquet_paths: list[str],
+    s3_config_key: str,
+) -> pd.DataFrame:
+    """Query keywords from S3 parquet files."""
+    period_query = f"""WITH all_data AS (
+        SELECT * FROM read_parquet({parquet_paths})
+    ),
+    latest_per_keyword AS (
+        SELECT
+            store,
+            country,
+            keyword_id,
+            rank,
+            MAX(crawled_at) AS latest_crawled_at
+        FROM all_data
+        GROUP BY store, country, keyword_id, rank
+    )
+    SELECT
+        ar.crawled_date,
+        ar.country,
+        ar.store,
+        ar.rank AS app_rank,
+        ar.keyword_id,
+        ar.store_id
+    FROM all_data ar
+    JOIN latest_per_keyword lp
+        ON ar.keyword_id = lp.keyword_id
+        AND ar.store = lp.store
+        AND ar.country = lp.country
+        AND ar.rank = lp.rank
+        AND ar.crawled_at = lp.latest_crawled_at;
+    """
+    duckdb_con = get_duckdb_connection(s3_config_key)
+    return duckdb_con.execute(period_query).df()
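The headline addition here is import_keywords_from_s3 together with its DuckDB helper query_keywords_from_s3, which keeps only the latest crawl per store, country, keyword and rank before loading frontend.app_keyword_ranks_daily. A hedged backfill sketch follows; the function and its signature come from the diff above, while the get_db_connection import path and the "madrone" config key are assumptions based on usage elsewhere in this file:

import datetime

from adscrawler.app_stores.process_from_s3 import import_keywords_from_s3
from adscrawler.dbcon.connection import get_db_connection  # assumed module path

if __name__ == "__main__":
    database_connection = get_db_connection(
        use_ssh_tunnel=False, config_key="madrone"  # assumed config key
    )
    # For each day and store the function lists raw-data/keywords parquet files,
    # maps store_id and country to database ids, inserts any unknown apps, and
    # delete-and-inserts the ranks into frontend.app_keyword_ranks_daily.
    import_keywords_from_s3(
        start_date=datetime.date(2025, 10, 1),
        end_date=datetime.date.today(),
        database_connection=database_connection,
    )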
Lines changed: 100 additions & 24 deletions
@@ -1,4 +1,5 @@
 # noqa: PLC0415
+import datetime
 import os
 import re
 from collections import Counter

@@ -12,11 +13,17 @@

 from adscrawler.dbcon.connection import PostgresCon
 from adscrawler.dbcon.queries import (
+    delete_and_insert,
     query_all_store_app_descriptions,
+    query_apps_to_process_keywords,
     query_keywords_base,
     upsert_df,
 )

+from adscrawler.config import get_logger
+
+logger = get_logger(__name__)
+
 # Custom stopwords to remove personal pronouns & other irrelevant words
 CUSTOM_STOPWORDS = {
     "your",

@@ -72,6 +79,26 @@ def clean_text(text: str) -> str:
     return re.sub(r"[^a-zA-Z\s]", ". ", text.lower())


+def clean_df_text(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    # Note these are same as clean_text function
+    df[column] = (
+        df[column]
+        .str.replace("\r", ". ")
+        .replace("\n", ". ")
+        .replace("\t", ". ")
+        .replace("\xa0", ". ")
+        .replace("•", ". ")
+        .replace("'", "")
+        .replace("’", "")
+        .replace("-", " ")
+        .replace(r"\bhttp\S*", "", regex=True)
+        .replace(r"\bwww\S*", "", regex=True)
+        .replace(r"[^a-zA-Z\s]", ". ", regex=True)
+        .str.lower()
+    )
+    return df
+
+
 def count_tokens(phrase: str) -> int:
     """Count the number of tokens in a phrase."""
     return len(word_tokenize(phrase))
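One behavioral detail, carried over verbatim from the block clean_df_text replaces in get_global_keywords (see the later hunk): after the initial .str.replace, the chained .replace(...) calls without regex=True are pandas Series.replace, which matches whole cell values rather than substrings, so only the regex=True patterns and the final .str.lower() act inside the strings. A small illustration of the difference:

import pandas as pd

s = pd.Series(["line one\nline two"])

# Series.replace without regex=True only matches whole cell values,
# so the embedded newline is left untouched:
print(repr(s.replace("\n", ". ").iloc[0]))  # 'line one\nline two'

# Substring replacement needs .str.replace (or replace(..., regex=True)):
print(repr(s.str.replace("\n", ". ", regex=False).iloc[0]))  # 'line one. line two'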
@@ -152,9 +179,8 @@ def extract_keywords_rake(text: str, top_n: int = 10, max_tokens: int = 3) -> list[str]:
     return filtered_phrases[:top_n]


-def extract_keywords(
+def extract_unique_app_keywords_from_text(
     text: str,
-    database_connection: PostgresCon,
     top_n: int = 2,
     max_tokens: int = 1,
 ) -> list[str]:

@@ -175,13 +201,14 @@ def extract_keywords(
     # Remove stopwords from filtered keywords
     filtered_keywords = [kw for kw in filtered_keywords if kw not in STOPWORDS]

-    keywords_base = query_keywords_base(database_connection)
+    # keywords_base = query_keywords_base(database_connection)
+    # matched_base_keywords = keywords_base[
+    #     keywords_base["keyword_text"].apply(lambda x: x in description_text)
+    # ]

-    matched_base_keywords = keywords_base[
-        keywords_base["keyword_text"].apply(lambda x: x in text)
-    ]
-    matched_base_keywords = matched_base_keywords["keyword_text"].str.strip().tolist()
-    combined_keywords = list(sorted(set(filtered_keywords + matched_base_keywords)))
+    # matched_base_keywords = matched_base_keywords["keyword_text"].str.strip().tolist()
+    # combined_keywords = list(sorted(set(filtered_keywords + matched_base_keywords)))
+    combined_keywords = list(sorted(set(filtered_keywords)))

     return combined_keywords


@@ -194,22 +221,7 @@ def get_global_keywords(database_connection: PostgresCon) -> list[str]:
         language_slug="en", database_connection=database_connection
     )

-    # Note these are same as clean_text function
-    df["description"] = (
-        df["description"]
-        .str.replace("\r", ". ")
-        .replace("\n", ". ")
-        .replace("\t", ". ")
-        .replace("\xa0", ". ")
-        .replace("•", ". ")
-        .replace("'", "")
-        .replace("’", "")
-        .replace("-", " ")
-        .replace(r"\bhttp\S*", "", regex=True)
-        .replace(r"\bwww\S*", "", regex=True)
-        .replace(r"[^a-zA-Z\s]", ". ", regex=True)
-        .str.lower()
-    )
+    df = clean_df_text(df, "description")

     from sklearn.feature_extraction.text import TfidfVectorizer  # noqa: PLC0415


@@ -262,3 +274,67 @@ def insert_global_keywords(database_connection: PostgresCon) -> None:
         index=False,
         schema="public",
     )
+
+
+def process_app_keywords(database_connection: PostgresCon, limit: int) -> None:
+    """Process app keywords.
+
+    While Python might be less efficient than SQL it's more flexible for
+    the query input limiting which apps and when to run.
+    This way apps can be processed in batches and only when really needed.
+    """
+    logger.info(f"Extracting app keywords for {limit} apps")
+    extract_app_keywords_from_descriptions(database_connection, limit)
+    logger.info("Extracted app keywords finished")
+
+
+def extract_app_keywords_from_descriptions(
+    database_connection: PostgresCon, limit: int
+) -> None:
+    """Process keywords for app descriptions."""
+    description_df = query_apps_to_process_keywords(database_connection, limit=limit)
+    keywords_base = query_keywords_base(database_connection)
+    keywords_base["keyword_text"] = (
+        " " + keywords_base["keyword_text"].str.lower() + " "
+    )
+    description_df["description_text"] = (
+        " "
+        + description_df["description_short"]
+        + " "
+        + description_df["description"]
+        + " "
+    ).str.lower()
+    description_df = clean_df_text(description_df, "description_text")
+    all_keywords_dfs = []
+    logger.info(f"Processing {len(description_df)} app descriptions")
+    for _i, row in description_df.iterrows():
+        logger.debug(f"Processing app description: {_i}/{len(description_df)}")
+        description_id = row["description_id"]
+        store_app = row["store_app"]
+        description_text = row["description_text"]
+        matched_base_keywords = keywords_base[
+            keywords_base["keyword_text"].apply(
+                lambda x, text=description_text: x in text
+            )
+        ]
+        keywords_df = pd.DataFrame(
+            matched_base_keywords, columns=["keyword_text", "keyword_id"]
+        )
+        keywords_df["description_id"] = description_id
+        keywords_df["store_app"] = store_app
+        all_keywords_dfs.append(keywords_df)
+    main_keywords_df = pd.concat(all_keywords_dfs)
+    main_keywords_df = main_keywords_df[["store_app", "description_id", "keyword_id"]]
+    main_keywords_df["extracted_at"] = datetime.datetime.now(tz=datetime.UTC)
+    table_name = "app_keywords_extracted"
+    insert_columns = ["store_app", "description_id", "keyword_id", "extracted_at"]
+    logger.info(f"Delete and insert {len(main_keywords_df)} app keywords")
+    delete_and_insert(
+        df=main_keywords_df,
+        table_name=table_name,
+        schema="public",
+        database_connection=database_connection,
+        insert_columns=insert_columns,
+        delete_by_keys=["store_app"],
+        delete_keys_have_duplicates=True,
+    )
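The new extractor pads both the base keywords and the cleaned description text with spaces and lower-cases them, so a base keyword only matches as a whole token rather than inside a longer word. A self-contained sketch of that matching step with toy data (the keyword ids and texts here are invented; query_keywords_base supplies the real ones):

import pandas as pd

# Toy stand-ins for query_keywords_base() and a cleaned description_text.
keywords_base = pd.DataFrame(
    {"keyword_id": [1, 2, 3], "keyword_text": ["chess", "chess puzzles", "car"]}
)
description_text = " play chess puzzles offline "

# Pad with spaces so "car" does not match inside "card", mirroring the diff.
keywords_base["keyword_text"] = " " + keywords_base["keyword_text"].str.lower() + " "
matched = keywords_base[
    keywords_base["keyword_text"].apply(lambda kw: kw in description_text)
]
print(matched["keyword_id"].tolist())  # [1, 2]; "car" (id 3) does not match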

0 commit comments
