@@ -72,21 +72,24 @@ def process_chunk(
     chunk_info = f"{store=} chunk={df_chunk.index[0]}-{df_chunk.index[-1]}/{total_rows}"
     logger.info(f"{chunk_info} start")
     database_connection = get_db_connection(use_ssh_tunnel=use_ssh_tunnel)
+    chunk_results = []
     try:
-        chunk_results = []
         for _, row in df_chunk.iterrows():
             try:
-                result_dict = scrape_app(
+                result = scrape_app(
                     store=store,
                     store_id=row["store_id"],
                     country=row["country_code"].lower(),
                     language=row["language"].lower(),
                 )
-                chunk_results.append(result_dict)
+                chunk_results.append(result)
             except Exception as e:
                 logger.exception(
-                    f"store={row.store}, store_id={row.store_id} update_all_app_info failed with {e}"
+                    f"{chunk_info} store_id={row['store_id']} scrape_app failed: {e}"
                 )
+        if not chunk_results:
+            logger.warning(f"{chunk_info} produced no results.")
+            return
         results_df = pd.DataFrame(chunk_results)
         results_df["crawled_date"] = results_df["crawled_at"].dt.date
         app_details_to_s3(results_df, store=store)
@@ -100,10 +103,7 @@ def process_chunk(
             df_chunk=df_chunk,
         )
         logger.info(f"{chunk_info} finished")
-    except Exception as e:
-        logger.exception(f"{chunk_info} error processing with {e}")
     finally:
-        logger.info(f"{chunk_info} finished")
         if database_connection and hasattr(database_connection, "engine"):
             database_connection.engine.dispose()
             logger.debug(f"{chunk_info} database connection disposed")
@@ -690,6 +690,7 @@ def process_live_app_details(
     for crawl_result, apps_df in results_df.groupby("crawl_result"):
         logger.info(f"{store=} {crawl_result=} processing {len(apps_df)} apps for db")
         if crawl_result != 1:
+            # If the crawl result is bad, only save minimal info to avoid overwriting good data (e.g. name)
             apps_df = apps_df[["store_id", "store", "crawled_at", "crawl_result"]]
         else:
             apps_df = clean_scraped_df(df=apps_df, store=store)