
Commit 48bb328

[FIX] util/pg: ensure the ranges contain at least bucket_size elements
Before this patch, when we exploded a query by range we could end up creating too many queries for which there is no actual record in the range. In extreme cases this caused a memory error due to the huge list of generated queries. Example:

```
=> select min(id), max(id), count(id) from account_journal
+-----+----------+-------+
| min |   max    | count |
|-----+----------+-------|
|   1 | 10000003 |    56 |
+-----+----------+-------+
```

All the journals could be processed in one bucket of >=56 elements, but the high max id caused the generation of many queries that would not process any actual record:

```
...
  File "/tmp/tmpl7nu4jh5/migrations/account/saas~16.2.1.2/pre-migrate.py", line 365, in migrate
    util.explode_execute(cr, query, table="account_journal", alias="journal", bucket_size=1)
  File "/tmp/tmpl7nu4jh5/migrations/util/pg.py", line 226, in explode_execute
    explode_query_range(cr, query, table, alias=alias, bucket_size=bucket_size),
  File "/tmp/tmpl7nu4jh5/migrations/util/pg.py", line 217, in explode_query_range
    return [
  File "/tmp/tmpl7nu4jh5/migrations/util/pg.py", line 218, in <listcomp>
    cr.mogrify(query, {"lower-bound": index, "upper-bound": index + bucket_size - 1}).decode()
  File "/home/odoo/src/odoo/17.0/odoo/sql_db.py", line 316, in mogrify
    return self._obj.mogrify(query, params)
MemoryError
```

With this patch we fall back to precise bucketing, ensuring each bucket has the same number of records from the table, with the possible exception of the last bucket.

closes #56

Signed-off-by: Christophe Simonis (chs) <[email protected]>
1 parent 35b43aa commit 48bb328
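To see the scale of the problem, here is a minimal back-of-the-envelope sketch (not part of the patch) using the numbers from the example above:

```python
# Minimal sketch, not part of the patch: the naive range split used by the old
# implementation, applied to the account_journal example from the commit message.
min_id, max_id, bucket_size = 1, 10000003, 1  # values from the example above

# One query per `bucket_size`-wide id window, whether or not any record falls
# inside it: ~10 million query strings for only 56 actual records.
naive_bucket_count = (max_id + 1 - min_id) // bucket_size
print(naive_bucket_count)  # 10000003 -> building all of them raises MemoryError

# Precise bucketing instead walks the 56 existing ids, so the same call
# produces at most 56 buckets.
```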

File tree

2 files changed: +68 −10 lines changed

src/base/tests/test_util.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -485,6 +485,28 @@ def test_explode_mult_filters(self):
             cr.execute(q)
         self.assertTrue(all(x for (x,) in cr.fetchall()))
 
+    @mute_logger("odoo.upgrade.util.pg.explode_query_range")
+    def test_explode_query_range(self):
+        cr = self.env.cr
+
+        cr.execute("SELECT count(id) FROM res_partner_title")
+        count = cr.fetchone()[0]
+        # ensure we start with at least 10 records
+        for _ in range(10 - count):
+            count += 1
+            self.env["res.partner.title"].create({"name": "x"})
+
+        # set one record with a very high id
+        tid = self.env["res.partner.title"].create({"name": "x"}).id
+        count += 1
+        cr.execute("UPDATE res_partner_title SET id = 10000000 WHERE id = %s", [tid])
+
+        qs = util.explode_query_range(cr, "SELECT 1", table="res_partner_title", bucket_size=count)
+        self.assertEqual(len(qs), 1)  # one bucket should be enough for all records
+
+        qs = util.explode_query_range(cr, "SELECT 1", table="res_partner_title", bucket_size=count - 1)
+        self.assertEqual(len(qs), 1)  # 10% rule for the second bucket: 1 <= 0.1 * (count - 1) since count >= 11
+
     def test_parallel_rowcount(self):
         cr = self.env.cr
         cr.execute("SELECT count(*) FROM res_lang")
```
src/util/pg.py

Lines changed: 46 additions & 10 deletions
```diff
@@ -21,6 +21,11 @@
 except ImportError:
     from UserList import UserList
 
+try:  # noqa: SIM105
+    range = xrange  # noqa: A001
+except NameError:
+    pass
+
 import psycopg2
 from psycopg2 import sql
 
@@ -36,6 +41,7 @@
 _logger = logging.getLogger(__name__)
 
 ON_DELETE_ACTIONS = frozenset(("SET NULL", "CASCADE", "RESTRICT", "NO ACTION", "SET DEFAULT"))
+MAX_BUCKETS = int(os.getenv("MAX_BUCKETS", "150000"))
 
 
 class PGRegexp(str):
@@ -196,27 +202,57 @@ def explode_query_range(cr, query, table, alias=None, bucket_size=10000, prefix=
     alias = alias or table
 
-    cr.execute("SELECT min(id), max(id) FROM {}".format(table))
+    if "{parallel_filter}" not in query:
+        sep_kw = " AND " if re.search(r"\sWHERE\s", query, re.M | re.I) else " WHERE "
+        query += sep_kw + "{parallel_filter}"
+
+    cr.execute(format_query(cr, "SELECT min(id), max(id) FROM {}", table))
     min_id, max_id = cr.fetchone()
     if min_id is None:
         return []  # empty table
+    count = (max_id + 1 - min_id) // bucket_size
+    if count > MAX_BUCKETS:
+        _logger.getChild("explode_query_range").warning(
+            "High number of queries generated (%s); switching to a precise bucketing strategy", count
+        )
+        cr.execute(
+            format_query(
+                cr,
+                """
+                WITH t AS (
+                    SELECT id,
+                           mod(row_number() OVER (ORDER BY id) - 1, %s) AS g
+                      FROM {table}
+                  ORDER BY id
+                ) SELECT array_agg(id ORDER BY id) FILTER (WHERE g = 0),
+                         min(id),
+                         max(id)
+                    FROM t
+                """,
+                table=table,
+            ),
+            [bucket_size],
+        )
+        ids, min_id, max_id = cr.fetchone()
+    else:
+        ids = list(range(min_id, max_id + 1, bucket_size))
 
-    if "{parallel_filter}" not in query:
-        sep_kw = " AND " if re.search(r"\sWHERE\s", query, re.M | re.I) else " WHERE "
-        query += sep_kw + "{parallel_filter}"
+    assert min_id == ids[0] and max_id + 1 != ids[-1]  # sanity checks
+    ids.append(max_id + 1)  # ensure the last bucket covers the whole range
+    # `ids` holds a list of values marking the interval boundaries for all buckets
 
-    if ((max_id - min_id + 1) * 0.9) <= bucket_size:
-        # If there is less than `bucket_size` records (with a 10% tolerance), no need to explode the query.
-        # Force usage of `prefix` in the query to validate it correctness.
-        # If we don't the query may only be valid if there is no split. It avoid scripts to pass the CI but fail in production.
+    if (max_id - min_id + 1) <= 1.1 * bucket_size or (len(ids) == 3 and ids[2] - ids[1] <= 0.1 * bucket_size):
+        # If we return a single query, `parallel_execute` skips spawning new threads. Thus we also return only one
+        # query when there are two buckets and the second would hold at most 10% of `bucket_size` records.
+        # Still, since the query may only be valid when there is no split, we force the usage of `prefix` in the
+        # query to validate its correctness and avoid scripts that pass the CI but fail in production.
         parallel_filter = "{alias}.id IS NOT NULL".format(alias=alias)
         return [query.format(parallel_filter=parallel_filter)]
 
     parallel_filter = "{alias}.id BETWEEN %(lower-bound)s AND %(upper-bound)s".format(alias=alias)
     query = query.replace("%", "%%").format(parallel_filter=parallel_filter)
     return [
-        cr.mogrify(query, {"lower-bound": index, "upper-bound": index + bucket_size - 1}).decode()
-        for index in range(min_id, max_id, bucket_size)
+        cr.mogrify(query, {"lower-bound": ids[i], "upper-bound": ids[i + 1] - 1}).decode() for i in range(len(ids) - 1)
     ]
```
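To make the precise-bucketing query easier to follow, here is a standalone sketch of the same window-function technique; the connection and the table contents (ids 1..56 plus an outlier at 10000000, as in the commit message) are hypothetical:

```python
# Standalone sketch of the precise-bucketing query used above; `conn` and the
# table contents are hypothetical, chosen to mirror the account_journal example.
import psycopg2

conn = psycopg2.connect("dbname=test")  # hypothetical connection
bucket_size = 10
with conn.cursor() as cr:
    cr.execute(
        """
        WITH t AS (
            SELECT id, mod(row_number() OVER (ORDER BY id) - 1, %s) AS g
              FROM account_journal
        ) SELECT array_agg(id ORDER BY id) FILTER (WHERE g = 0), min(id), max(id)
          FROM t
        """,
        [bucket_size],
    )
    ids, min_id, max_id = cr.fetchone()
    # Every `bucket_size`-th existing row gets g = 0, so `ids` contains the
    # lower boundary of each bucket, e.g. [1, 11, 21, 31, 41, 51].
    ids.append(max_id + 1)  # final boundary, as in the patch
    buckets = [(lo, hi - 1) for lo, hi in zip(ids, ids[1:])]
    # -> [(1, 10), (11, 20), (21, 30), (31, 40), (41, 50), (51, 10000000)]
    # six buckets that track the data, instead of ~10 million empty windows
```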
