Commit 0ec6d30
For large updates to store_apps table use regular UPDATE and avoid ON CONFLICT
1 parent 5c22840 commit 0ec6d30
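For context, a minimal sketch of the two statement shapes this commit switches between (the column names below are placeholders, not the SQL the repo actually generates). The old upsert path builds an INSERT ... ON CONFLICT, which must attempt an insert and resolve the conflict for every row; the new path issues a plain UPDATE keyed on (store, store_id), which the commit message suggests is cheaper for large batches of rows that already exist.

    # Sketch only; "name" and "installs" stand in for the real update columns.
    upsert_sql = """
        INSERT INTO store_apps (store, store_id, name, installs)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (store, store_id)
        DO UPDATE SET name = EXCLUDED.name, installs = EXCLUDED.installs
    """
    update_sql = """
        UPDATE store_apps
        SET name = %s, installs = %s
        WHERE store = %s AND store_id = %s
    """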

5 files changed: +183 -61 lines changed

adscrawler/app_stores/google.py

Lines changed: 3 additions & 3 deletions
@@ -106,13 +106,13 @@ def clean_google_play_app_df(apps_df: pd.DataFrame) -> pd.DataFrame:
         .fillna("0")
         .astype(int),
         category=apps_df["category"].str.lower(),
+        release_date=pd.to_datetime(
+            apps_df["release_date"], format="%b %d, %Y"
+        ).dt.date,
         store_last_updated=pd.to_datetime(
             apps_df["store_last_updated"],
             unit="s",
         ).fillna(apps_df["release_date"]),
-        release_date=pd.to_datetime(
-            apps_df["release_date"], format="%b %d, %Y"
-        ).dt.date,
     )
     if "developer_name" in apps_df.columns:
         apps_df.loc[apps_df["developer_name"].notna(), "developer_name"] = apps_df.loc[
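As a quick illustration of the parsing involved (toy values, not repo data): release_date strings arrive in the Play-store display format and are parsed with "%b %d, %Y", while store_last_updated arrives as Unix seconds.

    import pandas as pd

    # Toy values only, mimicking the two date formats handled above.
    release_date = pd.to_datetime(
        pd.Series(["Mar 14, 2021", "Jan 02, 2023"]), format="%b %d, %Y"
    ).dt.date
    store_last_updated = pd.to_datetime(
        pd.Series([1_700_000_000.0, float("nan")]), unit="s"
    )  # the missing row stays NaT; the real code backfills it from release_date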

adscrawler/app_stores/process_from_s3.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ def app_details_to_s3(
     df: pd.DataFrame,
     store: int,
 ) -> None:
-    logger.info(f"S3 upload app details {store=} start")
+    logger.info(f"S3 upload app_details {store=}, rows={df.shape[0]} start")
     if store is None:
         raise ValueError("store is required")
     s3_client = get_s3_client()

adscrawler/app_stores/scrape_stores.py

Lines changed: 42 additions & 54 deletions
@@ -43,6 +43,7 @@
 from adscrawler.dbcon.queries import (
     get_crawl_scenario_countries,
     get_store_app_columns,
+    prepare_for_psycopg,
     query_categories,
     query_collections,
     query_countries,
@@ -53,6 +54,7 @@
     query_store_id_map,
     query_store_id_map_cached,
     query_store_ids,
+    update_from_df,
     upsert_df,
 )
 from adscrawler.packages.storage import get_s3_client
@@ -113,7 +115,7 @@ def update_app_details(
     workers,
     process_icon,
     limit,
-    country_crawl_priority,
+    country_priority_group,
 ):
     """Process apps with dynamic work queue - simple and efficient."""
     log_info = f"Update app details: {store=}"
@@ -122,12 +124,12 @@ def update_app_details(
         store=store,
         database_connection=database_connection,
         limit=limit,
-        country_crawl_priority=country_crawl_priority,
+        country_priority_group=country_priority_group,
     )
     df = df.sort_values("country_code").reset_index(drop=True)
     logger.info(f"{log_info} start {len(df)} apps")
 
-    max_chunk_size = 10000
+    max_chunk_size = 5000
     chunks = []
     # Try keeping countries together for larger end S3 files
     for _country, country_df in df.groupby("country_code"):
@@ -696,6 +698,7 @@ def process_live_app_details(
     df_chunk: pd.DataFrame,
 ) -> None:
     for crawl_result, apps_df in results_df.groupby("crawl_result"):
+        logger.info(f"{store=} {crawl_result=} processing {len(apps_df)} apps for db")
         if crawl_result != 1:
             apps_df = apps_df[["store_id", "store", "crawled_at", "crawl_result"]]
         else:
@@ -725,100 +728,85 @@ def process_live_app_details(
             )
         except Exception:
             logger.exception("failed to process app icon")
-        apps_df = apps_df.convert_dtypes(dtype_backend="pyarrow")
-        apps_df = apps_df.replace({pd.NA: None})
+        # I think only coming from S3?
+        # apps_df = apps_df.convert_dtypes(dtype_backend="pyarrow")
+        # apps_df = apps_df.replace({pd.NA: None})
         apps_details_to_db(
             apps_df=apps_df,
             database_connection=database_connection,
+            crawl_result=crawl_result,
         )
 
 
 def apps_details_to_db(
     apps_df: pd.DataFrame,
     database_connection: PostgresCon,
+    crawl_result: int,
 ) -> None:
     key_columns = ["store", "store_id"]
     if (apps_df["crawl_result"] == 1).all() and apps_df["developer_id"].notna().all():
         apps_df = save_developer_info(apps_df, database_connection)
     insert_columns = [
         x for x in get_store_app_columns(database_connection) if x in apps_df.columns
     ]
-    # Update columns we always want the latest of
-    # Eg name, developer_id
-    store_apps_df = upsert_df(
+    apps_df = prepare_for_psycopg(apps_df)
+    return_rows = crawl_result == 1
+    logger.info(f"{crawl_result=} update store_apps table for {len(apps_df)} apps")
+    store_apps_df = update_from_df(
         table_name="store_apps",
         df=apps_df,
-        insert_columns=insert_columns,
+        update_columns=insert_columns,
        key_columns=key_columns,
         database_connection=database_connection,
-        return_rows=True,
+        return_rows=return_rows,
+    )
+    if store_apps_df is None or store_apps_df.empty or crawl_result != 1:
+        return
+    store_apps_df = store_apps_df.rename(columns={"id": "store_app"})
+    apps_df = pd.merge(
+        apps_df,
+        store_apps_df[["store_id", "store_app"]],
+        how="left",
+        validate="1:1",
+    )
+    upsert_store_apps_descriptions(apps_df, database_connection)
+    save_app_domains(
+        apps_df=apps_df,
+        database_connection=database_connection,
     )
-    if (
-        store_apps_df is not None
-        and not store_apps_df[store_apps_df["crawl_result"] == 1].empty
-    ):
-        store_apps_descriptions = store_apps_df[
-            store_apps_df["crawl_result"] == 1
-        ].copy()
-        store_apps_descriptions = pd.merge(
-            store_apps_descriptions,
-            apps_df[
-                [
-                    "store_id",
-                    "description",
-                    "description_short",
-                    "queried_language",
-                    "store_language_code",
-                ]
-            ],
-            on="store_id",
-        )
-        upsert_store_apps_descriptions(store_apps_descriptions, database_connection)
-    if store_apps_df is not None and not store_apps_df.empty:
-        store_apps_df = store_apps_df.rename(columns={"id": "store_app"})
-        apps_df = pd.merge(
-            apps_df,
-            store_apps_df[["store_id", "store_app"]],
-            how="left",
-            validate="1:1",
-        )
-        save_app_domains(
-            apps_df=apps_df,
-            database_connection=database_connection,
-        )
     return
 
 
 def upsert_store_apps_descriptions(
-    store_apps_descriptions: pd.DataFrame,
+    apps_df: pd.DataFrame,
     database_connection: PostgresCon,
 ) -> None:
     table_name = "store_apps_descriptions"
     languages_map = query_languages(database_connection)
-    store_apps_descriptions = pd.merge(
-        store_apps_descriptions,
+    apps_df = pd.merge(
+        apps_df,
         languages_map[["id", "language_slug"]],
         how="left",
         left_on="store_language_code",
         right_on="language_slug",
         validate="m:1",
     ).rename(columns={"id": "language_id"})
-    if store_apps_descriptions["language_id"].isna().any():
-        null_ids = store_apps_descriptions["language_id"].isna()
-        null_langs = store_apps_descriptions[
+    if apps_df["language_id"].isna().any():
+        null_ids = apps_df["language_id"].isna()
+        null_langs = apps_df[null_ids][
             ["store_id", "store_language_code"]
         ].drop_duplicates()
         logger.error(f"App descriptions dropping unknown language codes: {null_langs}")
-        store_apps_descriptions = store_apps_descriptions[~null_ids]
-    if store_apps_descriptions.empty:
+        apps_df = apps_df[~null_ids]
+    if apps_df.empty:
         logger.debug("Dropped all descriptions, no language id found")
         return
-    if "description_short" not in store_apps_descriptions.columns:
-        store_apps_descriptions["description_short"] = ""
+    if "description_short" not in apps_df.columns:
+        apps_df["description_short"] = ""
     key_columns = ["store_app", "language_id", "description", "description_short"]
     upsert_df(
         table_name=table_name,
-        df=store_apps_descriptions,
+        df=apps_df,
         insert_columns=key_columns,
         key_columns=key_columns,
         md5_key_columns=["description", "description_short"],

adscrawler/dbcon/queries.py

Lines changed: 136 additions & 2 deletions
@@ -118,6 +118,140 @@ def insert_df(
     return return_df
 
 
+def prepare_for_psycopg(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    for col in df.select_dtypes(include=["datetimetz", "datetime64[ns]"]):
+        # Convert to object dtype first so it can hold None
+        df[col] = (
+            df[col]
+            .apply(lambda x: x.to_pydatetime() if pd.notna(x) else None)
+            .astype("object")
+        )
+    # Replace NaN (for floats, strings, etc.)
+    df = df.astype(object).where(pd.notna(df), None)
+    return df
+
+
+def update_from_df(
+    df: pd.DataFrame,
+    table_name: str,
+    database_connection: Connection,
+    key_columns: list[str],
+    update_columns: list[str],
+    return_rows: bool = False,
+    schema: str | None = None,
+    md5_key_columns: list[str] | None = None,
+    log: bool = False,
+) -> pd.DataFrame | None:
+    """Perform an UPDATE on a PostgreSQL table from a DataFrame.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame containing update data.
+    table_name : str
+        The name of the target table.
+    database_connection : Connection
+        The database connection object.
+    key_columns : list of str
+        Column name(s) on which to match for the UPDATE.
+    update_columns : list of str
+        Columns to update (excluding key columns).
+    return_rows : bool, optional
+        Whether to return the rows that were updated.
+    schema : str, optional
+        The name of the schema containing the target table.
+    md5_key_columns : list of str, optional
+        Key columns that use MD5 hashing in their index.
+    log : bool, optional
+        Print generated SQL statement for debugging.
+
+    Returns
+    -------
+    pd.DataFrame or None
+        DataFrame of updated rows if return_rows=True, else None.
+
+    """
+    raw_conn = database_connection.engine.raw_connection()
+    # Handle special date columns
+    if "crawled_date" in df.columns and df["crawled_date"].isna().all():
+        df["crawled_date"] = pd.to_datetime(df["crawled_date"]).dt.date
+        df["crawled_date"] = None
+    if "release_date" in df.columns and df["release_date"].isna().all():
+        df["release_date"] = None
+    # Build table identifier
+    table_identifier = Identifier(table_name)
+    if schema:
+        table_identifier = Composed([Identifier(schema), SQL("."), table_identifier])
+    # Build UPDATE SET clause for update_columns only
+    update_set = SQL(", ").join(
+        SQL("{0} = %s").format(Identifier(col)) for col in update_columns
+    )
+    # Build WHERE conditions for key_columns
+    if md5_key_columns:
+        where_conditions = SQL(" AND ").join(
+            (
+                SQL("md5({col}) = %s").format(col=Identifier(col))
+                if col in md5_key_columns
+                else SQL("{col} = %s").format(col=Identifier(col))
+            )
+            for col in key_columns
+        )
+    else:
+        where_conditions = SQL(" AND ").join(
+            SQL("{col} = %s").format(col=Identifier(col)) for col in key_columns
+        )
+    if return_rows:
+        update_query = SQL(
+            """
+            UPDATE {table}
+            SET {update_set}
+            WHERE {where_conditions}
+            RETURNING *
+            """
+        ).format(
+            table=table_identifier,
+            update_set=update_set,
+            where_conditions=where_conditions,
+        )
+    else:
+        update_query = SQL(
+            """
+            UPDATE {table}
+            SET {update_set}
+            WHERE {where_conditions}
+            """
+        ).format(
+            table=table_identifier,
+            update_set=update_set,
+            where_conditions=where_conditions,
+        )
+    if log:
+        logger.info(f"Update query: {update_query.as_string(raw_conn)}")
+    all_columns = update_columns + key_columns
+    with raw_conn.cursor() as cur:
+        # Prepare data
+        data = [
+            tuple(row) for row in df[all_columns].itertuples(index=False, name=None)
+        ]
+        if log:
+            logger.info(f"Update data sample: {data[:5] if len(data) > 5 else data}")
+        # Execute updates
+        if return_rows:
+            all_results = []
+            for row in data:
+                cur.execute(update_query, row)
+                result = cur.fetchall()
+                all_results.extend(result)
+            if all_results:
+                column_names = [desc[0] for desc in cur.description]
+                return_df = pd.DataFrame(all_results, columns=column_names)
+            else:
+                return_df = pd.DataFrame()
+        else:
+            cur.executemany(update_query, data)
+            return_df = None
+    raw_conn.commit()
+    return return_df
+
+
 def upsert_df(
     df: pd.DataFrame,
     table_name: str,
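
A usage sketch for the two new helpers (the example DataFrame and the "last_seen" column are made up; the call follows the signature added above): prepare_for_psycopg converts pandas NaT/NaN/pd.NA values to plain None so the driver can bind them, and update_from_df then issues the UPDATE row by row, either execute-per-row with RETURNING * when return_rows is set, or executemany otherwise.

    # Hypothetical example data; "last_seen" is a placeholder column name.
    df = pd.DataFrame(
        {
            "store": [1, 1],
            "store_id": ["com.example.a", "com.example.b"],
            "name": ["App A", None],
            "last_seen": pd.to_datetime(["2024-01-01", None]),
        }
    )
    df = prepare_for_psycopg(df)  # NaT / NaN become None for psycopg
    updated_rows = update_from_df(
        df=df,
        table_name="store_apps",
        database_connection=database_connection,  # a PostgresCon as used elsewhere
        key_columns=["store", "store_id"],
        update_columns=["name", "last_seen"],
        return_rows=True,  # per-row UPDATE ... RETURNING *, gathered into a DataFrame
    )
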
@@ -717,7 +851,7 @@ def get_crawl_scenario_countries(
 def query_store_apps_to_update(
     database_connection: PostgresCon,
     store: int,
-    country_crawl_priority: int,
+    country_priority_group: int,
     log_query=False,
     limit: int = 1000,
 ) -> pd.DataFrame:
@@ -744,7 +878,7 @@ def query_store_apps_to_update(
     )
     params = {
         "store": store,
-        "country_crawl_priority": country_crawl_priority,
+        "country_crawl_priority": country_priority_group,
         "short_update_ts": short_update_ts,
         "short_update_installs": short_update_installs,
         "short_update_ratings": short_update_ratings,

main.py

Lines changed: 1 addition & 1 deletion
@@ -376,7 +376,7 @@ def update_app_details(self, store: int, country_priority_group: int) -> None:
             use_ssh_tunnel=self.args.use_ssh_tunnel,
             workers=int(self.args.workers),
             process_icon=self.args.process_icons,
-            country_crawl_priority=self.args.country_priority_group,
+            country_priority_group=self.args.country_priority_group,
             limit=self.args.limit_query_rows,
         )
 
