@@ -43,6 +43,7 @@
 from adscrawler.dbcon.queries import (
     get_crawl_scenario_countries,
     get_store_app_columns,
+    prepare_for_psycopg,
     query_categories,
     query_collections,
     query_countries,
@@ -53,6 +54,7 @@
     query_store_id_map,
     query_store_id_map_cached,
     query_store_ids,
+    update_from_df,
     upsert_df,
 )
 from adscrawler.packages.storage import get_s3_client
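The two new imports, `prepare_for_psycopg` and `update_from_df`, are helpers in `adscrawler.dbcon.queries` whose bodies are not part of this diff. A minimal sketch of the shapes the call sites below assume (signatures inferred purely from usage in this commit; the real implementations may differ):

```python
import pandas as pd

def prepare_for_psycopg(df: pd.DataFrame) -> pd.DataFrame:
    # Assumed: normalize values so psycopg can bind them, e.g. pd.NA -> None.
    return df.astype(object).where(df.notna(), None)

def update_from_df(
    table_name: str,
    df: pd.DataFrame,
    update_columns: list[str],
    key_columns: list[str],
    database_connection: "PostgresCon",
    return_rows: bool = False,
) -> pd.DataFrame | None:
    # Assumed: UPDATE existing rows matched on key_columns (no inserts),
    # optionally returning the updated rows, including their ids.
    ...
```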
@@ -113,7 +115,7 @@ def update_app_details(
     workers,
     process_icon,
     limit,
-    country_crawl_priority,
+    country_priority_group,
 ):
     """Process apps with dynamic work queue - simple and efficient."""
     log_info = f"Update app details: {store=}"
@@ -122,12 +124,12 @@ def update_app_details(
         store=store,
         database_connection=database_connection,
         limit=limit,
-        country_crawl_priority=country_crawl_priority,
+        country_priority_group=country_priority_group,
     )
     df = df.sort_values("country_code").reset_index(drop=True)
     logger.info(f"{log_info} start {len(df)} apps")
 
-    max_chunk_size = 10000
+    max_chunk_size = 5000
     chunks = []
     # Try keeping countries together for larger end S3 files
     for _country, country_df in df.groupby("country_code"):
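The chunk size drops from 10000 to 5000, presumably trading larger S3 objects for lower per-worker memory. The hunk is cut off mid-loop, so the actual split is not shown; a plausible completion of the per-country chunking, assuming `chunks` collects DataFrames:

```python
for _country, country_df in df.groupby("country_code"):
    # Slice each country's rows into pieces of at most max_chunk_size,
    # so a chunk (and thus an S3 file) never mixes countries.
    for start in range(0, len(country_df), max_chunk_size):
        chunks.append(country_df.iloc[start : start + max_chunk_size])
```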
@@ -696,6 +698,7 @@ def process_live_app_details(
     df_chunk: pd.DataFrame,
 ) -> None:
     for crawl_result, apps_df in results_df.groupby("crawl_result"):
+        logger.info(f"{store=} {crawl_result=} processing {len(apps_df)} apps for db")
         if crawl_result != 1:
             apps_df = apps_df[["store_id", "store", "crawled_at", "crawl_result"]]
         else:
@@ -725,100 +728,85 @@ def process_live_app_details(
             )
         except Exception:
             logger.exception("failed to process app icon")
-        apps_df = apps_df.convert_dtypes(dtype_backend="pyarrow")
-        apps_df = apps_df.replace({pd.NA: None})
+        # I think only coming from S3?
+        # apps_df = apps_df.convert_dtypes(dtype_backend="pyarrow")
+        # apps_df = apps_df.replace({pd.NA: None})
         apps_details_to_db(
             apps_df=apps_df,
             database_connection=database_connection,
+            crawl_result=crawl_result,
         )
 
 
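The pyarrow dtype conversion and `pd.NA` replacement are commented out here; the equivalent normalization now happens once inside `apps_details_to_db` via `prepare_for_psycopg`. A toy illustration of why some normalization is still needed before parameter binding (illustrative values only):

```python
import pandas as pd

df = pd.DataFrame({"installs": pd.array([100, None], dtype="Int64")})
# pd.NA is not a value psycopg can bind as a SQL parameter; it must become None.
records = df.astype(object).where(df.notna(), None).to_dict("records")
assert records[1]["installs"] is None
```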
 def apps_details_to_db(
     apps_df: pd.DataFrame,
     database_connection: PostgresCon,
+    crawl_result: int,
 ) -> None:
     key_columns = ["store", "store_id"]
     if (apps_df["crawl_result"] == 1).all() and apps_df["developer_id"].notna().all():
         apps_df = save_developer_info(apps_df, database_connection)
     insert_columns = [
         x for x in get_store_app_columns(database_connection) if x in apps_df.columns
     ]
-    # Update columns we always want the latest of
-    # Eg name, developer_id
-    store_apps_df = upsert_df(
+    apps_df = prepare_for_psycopg(apps_df)
+    return_rows = crawl_result == 1
+    logger.info(f"{crawl_result=} update store_apps table for {len(apps_df)} apps")
+    store_apps_df = update_from_df(
         table_name="store_apps",
         df=apps_df,
-        insert_columns=insert_columns,
+        update_columns=insert_columns,
         key_columns=key_columns,
         database_connection=database_connection,
-        return_rows=True,
+        return_rows=return_rows,
+    )
+    if store_apps_df is None or store_apps_df.empty or crawl_result != 1:
+        return
+    store_apps_df = store_apps_df.rename(columns={"id": "store_app"})
+    apps_df = pd.merge(
+        apps_df,
+        store_apps_df[["store_id", "store_app"]],
+        how="left",
+        validate="1:1",
+    )
+    upsert_store_apps_descriptions(apps_df, database_connection)
+    save_app_domains(
+        apps_df=apps_df,
+        database_connection=database_connection,
     )
-    if (
-        store_apps_df is not None
-        and not store_apps_df[store_apps_df["crawl_result"] == 1].empty
-    ):
-        store_apps_descriptions = store_apps_df[
-            store_apps_df["crawl_result"] == 1
-        ].copy()
-        store_apps_descriptions = pd.merge(
-            store_apps_descriptions,
-            apps_df[
-                [
-                    "store_id",
-                    "description",
-                    "description_short",
-                    "queried_language",
-                    "store_language_code",
-                ]
-            ],
-            on="store_id",
-        )
-        upsert_store_apps_descriptions(store_apps_descriptions, database_connection)
-    if store_apps_df is not None and not store_apps_df.empty:
-        store_apps_df = store_apps_df.rename(columns={"id": "store_app"})
-        apps_df = pd.merge(
-            apps_df,
-            store_apps_df[["store_id", "store_app"]],
-            how="left",
-            validate="1:1",
-        )
-        save_app_domains(
-            apps_df=apps_df,
-            database_connection=database_connection,
-        )
     return
 
 
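A key behavioral change: `update_from_df` only updates rows that already exist in `store_apps`, and the updated ids are fetched back (`return_rows`) only for successful crawls, so the description and domain steps run for `crawl_result == 1` alone. The function is driven per crawl-result group, as in `process_live_app_details` above:

```python
# Mirrors the call pattern in process_live_app_details: failed crawls update
# only the bookkeeping columns and return early; successful crawls get their
# store_apps ids back for the description/domain follow-ups.
for crawl_result, group_df in results_df.groupby("crawl_result"):
    apps_details_to_db(
        apps_df=group_df,
        database_connection=database_connection,
        crawl_result=crawl_result,
    )
```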
 def upsert_store_apps_descriptions(
-    store_apps_descriptions: pd.DataFrame,
+    apps_df: pd.DataFrame,
     database_connection: PostgresCon,
 ) -> None:
     table_name = "store_apps_descriptions"
     languages_map = query_languages(database_connection)
-    store_apps_descriptions = pd.merge(
-        store_apps_descriptions,
+    apps_df = pd.merge(
+        apps_df,
         languages_map[["id", "language_slug"]],
         how="left",
         left_on="store_language_code",
         right_on="language_slug",
         validate="m:1",
     ).rename(columns={"id": "language_id"})
-    if store_apps_descriptions["language_id"].isna().any():
-        null_ids = store_apps_descriptions["language_id"].isna()
-        null_langs = store_apps_descriptions[
+    if apps_df["language_id"].isna().any():
+        null_ids = apps_df["language_id"].isna()
+        null_langs = apps_df[null_ids][
             ["store_id", "store_language_code"]
         ].drop_duplicates()
         logger.error(f"App descriptions dropping unknown language codes: {null_langs}")
-        store_apps_descriptions = store_apps_descriptions[~null_ids]
-    if store_apps_descriptions.empty:
+        apps_df = apps_df[~null_ids]
+    if apps_df.empty:
         logger.debug("Dropped all descriptions, no language id found")
         return
-    if "description_short" not in store_apps_descriptions.columns:
-        store_apps_descriptions["description_short"] = ""
+    if "description_short" not in apps_df.columns:
+        apps_df["description_short"] = ""
     key_columns = ["store_app", "language_id", "description", "description_short"]
     upsert_df(
         table_name=table_name,
-        df=store_apps_descriptions,
+        df=apps_df,
         insert_columns=key_columns,
         key_columns=key_columns,
         md5_key_columns=["description", "description_short"],
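One subtle fix in this hunk: the old code logged unknown language codes from the full frame (column selection without a row mask), while the new `apps_df[null_ids][...]` masks to the offending rows first. A toy reproduction with hypothetical data:

```python
import pandas as pd

apps_df = pd.DataFrame(
    {
        "store_id": ["com.a", "com.b"],
        "store_language_code": ["en", "zz"],  # "zz" has no languages row
        "language_id": [1, pd.NA],
    }
)
null_ids = apps_df["language_id"].isna()
# New behavior: mask rows first, then select columns, so only the
# unmatched codes are logged.
null_langs = apps_df[null_ids][["store_id", "store_language_code"]].drop_duplicates()
print(null_langs)  # only the com.b / "zz" row
```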