
Commit 48bb328

[FIX] util/pg: ensure the ranges contain at least bucket_size elements
Before this patch, when we exploded a query by range we could end up creating too many queries for which there is no actual record in the range. In extreme cases this caused a memory error due to the huge list of generated queries. Example:

```
=> select min(id), max(id), count(id) from account_journal
+-----+----------+-------+
| min |   max    | count |
|-----+----------+-------|
|   1 | 10000003 |    56 |
+-----+----------+-------+
```

All the journals could be processed in one bucket of >=56 elements, but the high max id caused the generation of many queries that would not process any actual record:

```
...
  File "/tmp/tmpl7nu4jh5/migrations/account/saas~16.2.1.2/pre-migrate.py", line 365, in migrate
    util.explode_execute(cr, query, table="account_journal", alias="journal", bucket_size=1)
  File "/tmp/tmpl7nu4jh5/migrations/util/pg.py", line 226, in explode_execute
    explode_query_range(cr, query, table, alias=alias, bucket_size=bucket_size),
  File "/tmp/tmpl7nu4jh5/migrations/util/pg.py", line 217, in explode_query_range
    return [
  File "/tmp/tmpl7nu4jh5/migrations/util/pg.py", line 218, in <listcomp>
    cr.mogrify(query, {"lower-bound": index, "upper-bound": index + bucket_size - 1}).decode()
  File "/home/odoo/src/odoo/17.0/odoo/sql_db.py", line 316, in mogrify
    return self._obj.mogrify(query, params)
MemoryError
```

With this patch we fall back to precise bucketing, ensuring each bucket has the same number of records from the table, with the possible exception of the last bucket.

closes #56

Signed-off-by: Christophe Simonis (chs) <[email protected]>
1 parent 35b43aa commit 48bb328
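To see the scale of the problem, here is a minimal back-of-the-envelope sketch (not part of the patch) using the numbers from the example above:

```python
# Minimal sketch, not part of the patch: the naive range split used by the old
# implementation, applied to the account_journal example from the commit message.
min_id, max_id, bucket_size = 1, 10000003, 1  # values from the example above

# One query per `bucket_size`-wide id window, whether or not any record falls
# inside it: ~10 million query strings for only 56 actual records.
naive_bucket_count = (max_id + 1 - min_id) // bucket_size
print(naive_bucket_count)  # 10000003 -> building all of them raises MemoryError

# Precise bucketing instead walks the 56 existing ids, so the same call
# produces at most 56 buckets.
```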

File tree

2 files changed: +68 −10 lines changed

src/base/tests/test_util.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -485,6 +485,28 @@ def test_explode_mult_filters(self):
             cr.execute(q)
         self.assertTrue(all(x for (x,) in cr.fetchall()))
 
+    @mute_logger("odoo.upgrade.util.pg.explode_query_range")
+    def test_explode_query_range(self):
+        cr = self.env.cr
+
+        cr.execute("SELECT count(id) FROM res_partner_title")
+        count = cr.fetchone()[0]
+        # ensure we start with at least 10 records
+        for _ in range(10 - count):
+            count += 1
+            self.env["res.partner.title"].create({"name": "x"})
+
+        # set one record with a very high id
+        tid = self.env["res.partner.title"].create({"name": "x"}).id
+        count += 1
+        cr.execute("UPDATE res_partner_title SET id = 10000000 WHERE id = %s", [tid])
+
+        qs = util.explode_query_range(cr, "SELECT 1", table="res_partner_title", bucket_size=count)
+        self.assertEqual(len(qs), 1)  # one bucket should be enough for all records
+
+        qs = util.explode_query_range(cr, "SELECT 1", table="res_partner_title", bucket_size=count - 1)
+        self.assertEqual(len(qs), 1)  # 10% rule for the second bucket: 1 <= 0.1 * (count - 1) since count >= 11
+
     def test_parallel_rowcount(self):
         cr = self.env.cr
         cr.execute("SELECT count(*) FROM res_lang")
```
src/util/pg.py

Lines changed: 46 additions & 10 deletions
```diff
@@ -21,6 +21,11 @@
 except ImportError:
     from UserList import UserList
 
+try:  # noqa: SIM105
+    range = xrange  # noqa: A001
+except NameError:
+    pass
+
 import psycopg2
 from psycopg2 import sql
 
@@ -36,6 +41,7 @@
 _logger = logging.getLogger(__name__)
 
 ON_DELETE_ACTIONS = frozenset(("SET NULL", "CASCADE", "RESTRICT", "NO ACTION", "SET DEFAULT"))
+MAX_BUCKETS = int(os.getenv("MAX_BUCKETS", "150000"))
 
 
 class PGRegexp(str):
@@ -196,27 +202,57 @@ def explode_query_range(cr, query, table, alias=None, bucket_size=10000, prefix=
     alias = alias or table
 
-    cr.execute("SELECT min(id), max(id) FROM {}".format(table))
+    if "{parallel_filter}" not in query:
+        sep_kw = " AND " if re.search(r"\sWHERE\s", query, re.M | re.I) else " WHERE "
+        query += sep_kw + "{parallel_filter}"
+
+    cr.execute(format_query(cr, "SELECT min(id), max(id) FROM {}", table))
     min_id, max_id = cr.fetchone()
     if min_id is None:
         return []  # empty table
+    count = (max_id + 1 - min_id) // bucket_size
+    if count > MAX_BUCKETS:
+        _logger.getChild("explode_query_range").warning(
+            "High number of queries generated (%s); switching to a precise bucketing strategy", count
+        )
+        cr.execute(
+            format_query(
+                cr,
+                """
+                WITH t AS (
+                    SELECT id,
+                           mod(row_number() OVER (ORDER BY id) - 1, %s) AS g
+                      FROM {table}
+                  ORDER BY id
+                ) SELECT array_agg(id ORDER BY id) FILTER (WHERE g = 0),
+                         min(id),
+                         max(id)
+                    FROM t
+                """,
+                table=table,
+            ),
+            [bucket_size],
+        )
+        ids, min_id, max_id = cr.fetchone()
+    else:
+        ids = list(range(min_id, max_id + 1, bucket_size))
 
-    if "{parallel_filter}" not in query:
-        sep_kw = " AND " if re.search(r"\sWHERE\s", query, re.M | re.I) else " WHERE "
-        query += sep_kw + "{parallel_filter}"
+    assert min_id == ids[0] and max_id + 1 != ids[-1]  # sanity checks
+    ids.append(max_id + 1)  # ensure the last bucket covers the whole range
+    # `ids` holds a list of values marking the interval boundaries for all buckets
 
-    if ((max_id - min_id + 1) * 0.9) <= bucket_size:
-        # If there is less than `bucket_size` records (with a 10% tolerance), no need to explode the query.
-        # Force usage of `prefix` in the query to validate it correctness.
-        # If we don't the query may only be valid if there is no split. It avoid scripts to pass the CI but fail in production.
+    if (max_id - min_id + 1) <= 1.1 * bucket_size or (len(ids) == 3 and ids[2] - ids[1] <= 0.1 * bucket_size):
+        # If we return a single query, `parallel_execute` skips spawning new threads. Thus we also return only one
+        # query when there are two buckets and the second would hold at most 10% of `bucket_size` records.
+        # Still, since the query may only be valid when there is no split, we force the usage of `prefix` in the
+        # query to validate its correctness and avoid scripts that pass the CI but fail in production.
         parallel_filter = "{alias}.id IS NOT NULL".format(alias=alias)
         return [query.format(parallel_filter=parallel_filter)]
 
     parallel_filter = "{alias}.id BETWEEN %(lower-bound)s AND %(upper-bound)s".format(alias=alias)
     query = query.replace("%", "%%").format(parallel_filter=parallel_filter)
     return [
-        cr.mogrify(query, {"lower-bound": index, "upper-bound": index + bucket_size - 1}).decode()
-        for index in range(min_id, max_id, bucket_size)
+        cr.mogrify(query, {"lower-bound": ids[i], "upper-bound": ids[i + 1] - 1}).decode() for i in range(len(ids) - 1)
     ]
```
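To make the precise-bucketing query easier to follow, here is a standalone sketch of the same window-function technique; the connection and the table contents (ids 1..56 plus an outlier at 10000000, as in the commit message) are hypothetical:

```python
# Standalone sketch of the precise-bucketing query used above; `conn` and the
# table contents are hypothetical, chosen to mirror the account_journal example.
import psycopg2

conn = psycopg2.connect("dbname=test")  # hypothetical connection
bucket_size = 10
with conn.cursor() as cr:
    cr.execute(
        """
        WITH t AS (
            SELECT id, mod(row_number() OVER (ORDER BY id) - 1, %s) AS g
              FROM account_journal
        ) SELECT array_agg(id ORDER BY id) FILTER (WHERE g = 0), min(id), max(id)
          FROM t
        """,
        [bucket_size],
    )
    ids, min_id, max_id = cr.fetchone()
    # Every `bucket_size`-th existing row gets g = 0, so `ids` contains the
    # lower boundary of each bucket, e.g. [1, 11, 21, 31, 41, 51].
    ids.append(max_id + 1)  # final boundary, as in the patch
    buckets = [(lo, hi - 1) for lo, hi in zip(ids, ids[1:])]
    # -> [(1, 10), (11, 20), (21, 30), (31, 40), (41, 50), (51, 10000000)]
    # six buckets that track the data, instead of ~10 million empty windows
```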
