Skip to content

Commit 4a5ebd2

Browse files
committed
Crawl keywords to S3 and log
1 parent 6723a16 commit 4a5ebd2

File tree

2 files changed: +22 additions, -16 deletions

2 files changed: +22 additions, -16 deletions

adscrawler/app_stores/scrape_stores.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -197,33 +197,33 @@ def crawl_keyword_cranks(database_connection: PostgresCon) -> None:
197197
language=language,
198198
keyword=keyword,
199199
)
200+
df["keyword_text"] = keyword
201+
df["keyword_id"] = row["keyword_id"]
200202
df["language"] = language.lower()
201203
df["country"] = country.upper()
202-
df["crawled_date"] = datetime.datetime.now(tz=datetime.UTC).date()
204+
df["crawled_at"] = datetime.datetime.now(tz=datetime.UTC)
205+
df["crawled_date"] = df["crawled_at"].dt.date
203206
all_keywords = pd.concat([all_keywords, df], ignore_index=True)
204207
except Exception:
205208
logger.exception(f"Scrape keyword={keyword} hit error, skipping")
206209
raw_keywords_to_s3(all_keywords)
210+
all_keywords = all_keywords.rename(columns={"keyword_id": "keyword"})
211+
key_columns = ["keyword"]
212+
upsert_df(
213+
table_name="keywords_crawled_at",
214+
schema="logging",
215+
insert_columns=["keyword", "crawled_at"],
216+
df=all_keywords[["keyword", "crawled_at"]],
217+
key_columns=key_columns,
218+
database_connection=database_connection,
219+
)
207220

208221

209222
# def import_keywords_from_s3(database_connection: PostgresCon) -> None:
210223
# languages_map = query_languages(database_connection)
211224
# language_dict = languages_map.set_index("language_slug")["id"].to_dict()
212225
# language_key = language_dict[language]
213-
# key_columns = ["keyword"]
214-
# upsert_df(
215-
# table_name="keywords_crawled_at",
216-
# schema="logging",
217-
# insert_columns=["keyword", "crawled_at"],
218-
# df=pd.DataFrame(
219-
# {
220-
# "keyword": [keyword_id],
221-
# "crawled_at": datetime.datetime.now(tz=datetime.UTC),
222-
# }
223-
# ),
224-
# key_columns=key_columns,
225-
# database_connection=database_connection,
226-
# )
226+
#
227227

228228

229229
def scrape_store_ranks(database_connection: PostgresCon, store: int) -> None:

adscrawler/dbcon/sql/query_keywords_to_crawl.sql

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,13 @@ log_crawled_keywords AS (
1212
WHERE
1313
crawled_at > CURRENT_DATE - INTERVAL '7 days'
1414
)
15-
SELECT *
15+
SELECT
16+
store,
17+
keyword_id,
18+
keyword_text,
19+
app_count,
20+
total_apps,
21+
competitiveness_score
1622
FROM
1723
frontend.keyword_scores
1824
WHERE

0 commit comments

Comments
 (0)