Commit 2a87b41

Remove exception handling for chunk as already handled by processor
1 parent 0b08c72 commit 2a87b41

File tree

1 file changed: +8 -7 lines changed


adscrawler/app_stores/scrape_stores.py

Lines changed: 8 additions & 7 deletions
@@ -72,21 +72,24 @@ def process_chunk(
     chunk_info = f"{store=} chunk={df_chunk.index[0]}-{df_chunk.index[-1]}/{total_rows}"
     logger.info(f"{chunk_info} start")
     database_connection = get_db_connection(use_ssh_tunnel=use_ssh_tunnel)
+    chunk_results = []
     try:
-        chunk_results = []
         for _, row in df_chunk.iterrows():
             try:
-                result_dict = scrape_app(
+                result = scrape_app(
                     store=store,
                     store_id=row["store_id"],
                     country=row["country_code"].lower(),
                     language=row["language"].lower(),
                 )
-                chunk_results.append(result_dict)
+                chunk_results.append(result)
             except Exception as e:
                 logger.exception(
-                    f"store={row.store}, store_id={row.store_id} update_all_app_info failed with {e}"
+                    f"{chunk_info} store_id={row['store_id']} scrape_app failed: {e}"
                 )
+        if not chunk_results:
+            logger.warning(f"{chunk_info} produced no results.")
+            return
         results_df = pd.DataFrame(chunk_results)
         results_df["crawled_date"] = results_df["crawled_at"].dt.date
         app_details_to_s3(results_df, store=store)
@@ -100,10 +103,7 @@ def process_chunk(
             df_chunk=df_chunk,
         )
         logger.info(f"{chunk_info} finished")
-    except Exception as e:
-        logger.exception(f"{chunk_info} error processing with {e}")
     finally:
-        logger.info(f"{chunk_info} finished")
         if database_connection and hasattr(database_connection, "engine"):
             database_connection.engine.dispose()
             logger.debug(f"{chunk_info} database connection disposed")
@@ -690,6 +690,7 @@ def process_live_app_details(
     for crawl_result, apps_df in results_df.groupby("crawl_result"):
         logger.info(f"{store=} {crawl_result=} processing {len(apps_df)} apps for db")
         if crawl_result != 1:
+            # If bad crawl result, only save minimal info to avoid overwriting good data, ie name
             apps_df = apps_df[["store_id", "store", "crawled_at", "crawl_result"]]
         else:
             apps_df = clean_scraped_df(df=apps_df, store=store)
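
The processor-level handling that the commit message refers to is not part of this diff. As a rough illustration only, a dispatcher along the following lines would catch anything that escapes process_chunk, which is what makes the removed chunk-wide try/except redundant. The run_chunks helper, the num_workers parameter, and the exact process_chunk call signature are assumptions for the sketch, not code from the repository.

# Hypothetical sketch, not the repository's actual dispatcher. It assumes
# process_chunk accepts the parameters visible in the diff (store, df_chunk,
# total_rows, use_ssh_tunnel); run_chunks and num_workers are invented names.
import concurrent.futures
import logging

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


def run_chunks(process_chunk, df: pd.DataFrame, store: int, num_workers: int = 4) -> None:
    """Dispatch chunks to worker processes and log any exception that escapes a chunk."""
    chunks = [c for c in np.array_split(df, num_workers) if not c.empty]
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as pool:
        futures = {
            pool.submit(
                process_chunk,
                store=store,
                df_chunk=chunk,
                total_rows=len(df),
                use_ssh_tunnel=False,
            ): i
            for i, chunk in enumerate(chunks)
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception:
                # Chunk failures surface here, so process_chunk itself no longer
                # needs a broad try/except wrapping the whole chunk.
                logger.exception(f"chunk {futures[future]} failed")

With handling at that level, per-app failures inside the loop are still caught and logged by the inner try/except, while a whole-chunk failure propagates out of process_chunk to the caller.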

0 commit comments
