Skip to content

Commit fa03bf2

Browse files
committed
Fix handling of duplicates in iOS apps
1 parent 27adeae commit fa03bf2

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

adscrawler/app_stores/process_from_s3.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -262,13 +262,14 @@ def prep_app_metrics_history(
262262
return df
263263

264264

265-
def manual_import_app_metrics_from_s3() -> None:
265+
def manual_import_app_metrics_from_s3(
266+
start_date: datetime.date, end_date: datetime.date
267+
) -> None:
266268
use_tunnel = False
267269
database_connection = get_db_connection(
268270
use_ssh_tunnel=use_tunnel, config_key="madrone"
269271
)
270-
start_date = datetime.date(2025, 10, 31)
271-
end_date = datetime.date(2025, 10, 31)
272+
272273
for snapshot_date in pd.date_range(start_date, end_date, freq="D"):
273274
snapshot_date = snapshot_date.date()
274275
for store in [1, 2]:
@@ -297,9 +298,19 @@ def process_app_metrics_to_db(
297298
logger.info(f"date={snapshot_date}, store={store} agg df load")
298299
df = get_s3_agg_daily_snapshots(snapshot_date, snapshot_date, store)
299300
if store == 2:
300-
df.loc[df["store_id"].str.contains(".0"), "store_id"] = (
301-
df.loc[df["store_id"].str.contains(".0"), "store_id"].str.split(".").str[0]
302-
)
301+
# Should be resolved from 11/1/2025
302+
problem_rows = df["store_id"].str.contains(".0")
303+
if problem_rows.any():
304+
logger.warning(
305+
f'Apple App IDs: Found {problem_rows.sum()} store_id with ".0" suffix, fixing'
306+
)
307+
df.loc[problem_rows, "store_id"] = (
308+
df.loc[problem_rows, "store_id"].str.split(".").str[0]
309+
)
310+
df["crawled_at"] = df["crawled_at"].sort_values(ascending=True)
311+
df = df.drop_duplicates(
312+
["snapshot_date", "country", "store_id"], keep="last"
313+
)
303314
if df.empty:
304315
logger.warning(
305316
f"No data found for S3 agg app metrics {store=} {snapshot_date=}"

0 commit comments

Comments
 (0)