Skip to content

Commit c8fd6af

Browse files
committed
Vastly speed up secondary apps query
1 parent ede10ff commit c8fd6af

File tree

4 files changed: +104 -49 lines changed

4 files changed

+104
-49
lines changed

adscrawler/app_stores/apple.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def get_urls_from_html(soup: BeautifulSoup) -> dict:
229229
return urls
230230

231231

232-
def get_privacy_details(html: str, country: str, store_id: str) -> bool:
232+
def get_privacy_details(html: str, country: str, store_id: str) -> dict:
233233
"""
234234
Get privacy details for an iOS app from the App Store.
235235
@@ -265,7 +265,8 @@ def get_privacy_details(html: str, country: str, store_id: str) -> bool:
265265

266266
if not data.get("data") or len(data["data"]) == 0:
267267
raise ValueError("App not found (404)")
268-
return data["data"][0]["attributes"]["privacyDetails"]
268+
privacy_details: dict = data["data"][0]["attributes"]["privacyDetails"]
269+
return privacy_details
269270

270271

271272
def find_privacy_policy_id(soup: BeautifulSoup) -> str | None:
@@ -286,9 +287,10 @@ def find_privacy_policy_id(soup: BeautifulSoup) -> str | None:
286287
# Find the position where "id" starts and extract everything after it
287288
id_position = url.find("id")
288289
if id_position != -1:
289-
id_value = url[id_position + 2 :] # +2 to skip the "id" prefix
290+
id_value: str = url[id_position + 2 :] # +2 to skip the "id" prefix
290291
print(f"ID: {id_value}") # This will print: 1538632801
291292
return id_value
293+
return None
292294

293295

294296
def get_developer_url(result: dict, urls: dict) -> str:
@@ -352,7 +354,7 @@ def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
352354
result_dict: dict = scraper.get_app_details(
353355
store_id, country=country, add_ratings=True, timeout=10, lang=language
354356
)
355-
logger.info(f"{store_id=}, {country=}, {language=} ios store scraped")
357+
logger.info(f"store=2 {country=} {language=} {store_id=} ios store scraped")
356358
return result_dict
357359

358360

adscrawler/app_stores/google.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def scrape_app_gp(store_id: str, country: str, language: str = "en") -> dict:
4343
country=country,
4444
timeout=10,
4545
)
46-
logger.info(f"{store_id=}, {country=}, {language=} play store scraped")
46+
logger.info(f"store=1 {country=} {language=} {store_id=} play store scraped")
4747
return result_dict
4848

4949

adscrawler/dbcon/queries.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,7 @@ def query_store_apps_to_update(
853853
database_connection: PostgresCon,
854854
store: int,
855855
country_priority_group: int,
856-
log_query=False,
856+
log_query: bool = False,
857857
limit: int = 1000,
858858
) -> pd.DataFrame:
859859
short_update_days = CONFIG["crawl-settings"].get("short_update_days", 1)
@@ -871,12 +871,6 @@ def query_store_apps_to_update(
871871
days=max_recrawl_days
872872
)
873873
year_ago_ts = datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(days=365)
874-
rankings_mv_apps = ""
875-
if check_mv_exists(database_connection, "store_apps_in_latest_rankings"):
876-
# TODO: this is not included yet
877-
rankings_mv_apps = (
878-
" OR (sa.id in (select store_app from store_apps_in_latest_rankings))"
879-
)
880874
params = {
881875
"store": store,
882876
"country_crawl_priority": country_priority_group,
@@ -1272,7 +1266,7 @@ def query_api_call_id_for_uuid(mitm_uuid: str, database_connection: PostgresCon)
12721266
api_calls = query_api_calls_id_uuid_map(database_connection)
12731267
filtered_df = api_calls[api_calls["mitm_uuid"] == mitm_uuid]
12741268
assert filtered_df.shape[0] == 1, "Failed to find api_call_id for mitm_uuid"
1275-
api_call_id = filtered_df["api_call_id"].to_numpy()[0]
1269+
api_call_id: int = filtered_df["api_call_id"].to_numpy()[0]
12761270
return api_call_id
12771271

12781272

Lines changed: 95 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,31 @@
1-
WITH countries_to_crawl AS (
1+
WITH target_apps AS (
2+
SELECT
3+
sa.store,
4+
sa.id AS store_app,
5+
sa.store_id,
6+
sa.icon_url_100,
7+
sa.additional_html_scraped_at,
8+
sa.updated_at,
9+
sa.store_last_updated,
10+
agm.installs,
11+
agm.rating_count
12+
FROM
13+
store_apps AS sa
14+
LEFT JOIN app_global_metrics_latest AS agm
15+
ON
16+
sa.id = agm.store_app
17+
WHERE
18+
sa.store = :store
19+
AND (
20+
sa.crawl_result = 1
21+
OR sa.id IN (
22+
SELECT sailr.store_app
23+
FROM
24+
store_apps_in_latest_rankings AS sailr
25+
)
26+
)
27+
),
28+
all_countries_to_crawl AS (
229
SELECT
330
cc.country_id,
431
c.alpha2,
@@ -16,7 +43,30 @@ WITH countries_to_crawl AS (
1643
AND cc.enabled = TRUE
1744
AND cc.priority = :country_crawl_priority
1845
),
19-
latest_crawls AS (
46+
oldest_country_crawls AS (
47+
SELECT
48+
country_id,
49+
MIN(crawled_at) AS crawled_at
50+
FROM
51+
logging.app_country_crawls
52+
GROUP BY
53+
country_id
54+
),
55+
countries_to_crawl AS (
56+
SELECT
57+
actc.country_id,
58+
actc.alpha2,
59+
actc.priority
60+
FROM
61+
all_countries_to_crawl AS actc
62+
LEFT JOIN oldest_country_crawls AS occ
63+
ON
64+
actc.country_id = occ.country_id
65+
ORDER BY
66+
occ.crawled_at NULLS FIRST
67+
LIMIT 3
68+
),
69+
latest_app_crawls AS (
2070
SELECT DISTINCT ON
2171
(
2272
store_app,
@@ -28,14 +78,25 @@ latest_crawls AS (
2878
crawl_result
2979
FROM
3080
logging.app_country_crawls
81+
WHERE
82+
country_id IN (
83+
SELECT cctc.country_id
84+
FROM
85+
countries_to_crawl AS cctc
86+
)
87+
AND store_app IN (
88+
SELECT tta.store_app
89+
FROM
90+
target_apps AS tta
91+
)
3192
ORDER BY
3293
store_app ASC,
3394
country_id ASC,
3495
crawled_at DESC
3596
)
3697
SELECT
3798
sa.store,
38-
sa.id AS store_app,
99+
sa.store_app,
39100
sa.store_id,
40101
ctc.country_id,
41102
ctc.alpha2 AS country_code,
@@ -45,46 +106,44 @@ SELECT
45106
sa.updated_at AS app_updated_at,
46107
lc.crawled_at AS country_crawled_at
47108
FROM
48-
public.store_apps AS sa
109+
target_apps AS sa
49110
CROSS JOIN countries_to_crawl AS ctc
50-
LEFT JOIN app_global_metrics_latest agm ON sa.id = agm.store_app
51-
LEFT JOIN latest_crawls AS lc
111+
LEFT JOIN latest_app_crawls AS lc
52112
ON
53-
sa.id = lc.store_app
113+
sa.store_app = lc.store_app
54114
AND ctc.country_id = lc.country_id
55115
WHERE
56-
sa.store = :store
57-
AND
58-
-- ensure it is at least a valid app to crawl many countries
59-
(sa.crawl_result = 1 OR (sa.id in (select store_app from store_apps_in_latest_rankings)))
60-
AND (
61-
-- Long update conditions
62-
(
63-
lc.crawled_at <= :long_update_ts
64-
AND
65-
sa.store_last_updated >= :year_ago_ts
66-
)
67-
-- Crawl at least once a year conditions
68-
OR (
69-
(lc.crawled_at <= :max_recrawl_ts
70-
OR lc.crawl_result IS NULL)
71-
72-
)
116+
-- Long update conditions
117+
(
118+
lc.crawled_at <= :long_update_ts
119+
AND
120+
sa.store_last_updated >= :year_ago_ts
121+
)
122+
-- Crawl at least once a year conditions
123+
OR (
124+
(
125+
lc.crawled_at <= :max_recrawl_ts
126+
OR lc.crawl_result IS NULL
127+
)
73128
)
74129
ORDER BY
75-
(CASE
76-
WHEN lc.crawl_result IS NULL
77-
THEN 0
78-
ELSE 1
79-
END),
80-
(CASE
81-
WHEN lc.crawled_at < :max_recrawl_ts
82-
THEN 0
83-
ELSE 1
84-
END),
130+
(
131+
CASE
132+
WHEN lc.crawl_result IS NULL
133+
THEN 0
134+
ELSE 1
135+
END
136+
),
137+
(
138+
CASE
139+
WHEN lc.crawled_at < :max_recrawl_ts
140+
THEN 0
141+
ELSE 1
142+
END
143+
),
85144
GREATEST(
86-
COALESCE(agm.installs, 0),
87-
COALESCE(CAST(agm.rating_count AS bigint), 0)
145+
COALESCE(sa.installs, 0),
146+
COALESCE(CAST(sa.rating_count AS bigint), 0)
88147
)
89148
DESC NULLS LAST
90149
LIMIT :mylimit;

0 commit comments

Comments
 (0)