Skip to content

Commit c097bf6

Browse files
maximerety authored and fatkodima committed
Optimize Active Record batching further when using ranges
When the `use_ranges: true` option is used, we do not need to return the whole list of values for each range. Doing so wastes network resources and takes longer than necessary, as only the last value from the range is needed. We can instead use OFFSET to peek at the last value of the next batch, rather than returning the whole list of values from the range. There is a trade-off, however, as we need an additional query to confirm the size of the very last batch and get its last value. Unless we only have a handful of small batches, this strategy is a winner overall, as it reduces the time and network resources spent generating batches. For example, `[10, 11, 12, ..., 20]` was previously loaded to generate a relation in the form of `WHERE id > 10 AND id <= 20` for a batch. Since `10` would already be known from the previous iteration, we only need to retrieve the last value in the range (`20`) using a `LIMIT 1 OFFSET ω` construct (ω = batch size - 1), thus avoiding the unnecessary loading and discarding of all other values in the range.
1 parent 5709348 commit c097bf6

File tree

3 files changed

+70
-24
lines changed

3 files changed

+70
-24
lines changed

activerecord/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
* Optimize Active Record batching further when using ranges.
2+
3+
Tested on a PostgreSQL table with 10M records and batches of 10k records, the generation
4+
of relations for the 1000 batches was `4.8x` faster (`6.8s` vs. `1.4s`), used `900x`
5+
less bandwidth (`180MB` vs. `0.2MB`) and allocated `45x` less memory (`490MB` vs. `11MB`).
6+
7+
*Maxime Réty*, *fatkodima*
8+
19
* Include current character length in error messages for index and table name length validations.
210

311
*Joshua Young*

activerecord/lib/active_record/relation/batches.rb

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,35 +435,49 @@ def batch_on_unloaded_relation(relation:, start:, finish:, load:, cursor:, order
435435
if load
436436
records = batch_relation.records
437437
values = records.pluck(*cursor)
438+
values_size = values.size
439+
values_last = values.last
438440
yielded_relation = where(cursor => values).order(batch_orders.to_h)
439441
yielded_relation.load_records(records)
440442
elsif (empty_scope && use_ranges != false) || use_ranges
441-
values = batch_relation.pluck(*cursor)
443+
# Efficiently peek at the last value for the next batch using offset and limit.
444+
values_size = batch_limit
445+
values_last = batch_relation.offset(batch_limit - 1).pick(*cursor)
446+
447+
# If the last value is not found using offset, there is at most one more batch of size < batch_limit.
448+
# Retry by getting the whole list of remaining values so that we have the exact size and last value.
449+
unless values_last
450+
values = batch_relation.pluck(*cursor)
451+
values_size = values.size
452+
values_last = values.last
453+
end
442454

443-
finish = values.last
444-
if finish
445-
yielded_relation = apply_finish_limit(batch_relation, cursor, finish, batch_orders)
455+
# Finally, build the yielded relation if at least one value was found.
456+
if values_last
457+
yielded_relation = apply_finish_limit(batch_relation, cursor, values_last, batch_orders)
446458
yielded_relation = yielded_relation.except(:limit).reorder(batch_orders.to_h)
447459
yielded_relation.skip_query_cache!(false)
448460
end
449461
else
450462
values = batch_relation.pluck(*cursor)
463+
values_size = values.size
464+
values_last = values.last
451465
yielded_relation = where(cursor => values).order(batch_orders.to_h)
452466
end
453467

454-
break if values.empty?
468+
break if values_size == 0
455469

456-
if values.flatten.any?(nil)
470+
if [values_last].flatten.any?(nil)
457471
raise ArgumentError, "Not all of the batch cursor columns were included in the custom select clause "\
458472
"or some columns contain nil."
459473
end
460474

461475
yield yielded_relation
462476

463-
break if values.length < batch_limit
477+
break if values_size < batch_limit
464478

465479
if limit_value
466-
remaining -= values.length
480+
remaining -= values_size
467481

468482
if remaining == 0
469483
# Saves a useless iteration when the limit is a multiple of the
@@ -481,7 +495,7 @@ def batch_on_unloaded_relation(relation:, start:, finish:, load:, cursor:, order
481495
end
482496
operators << (last_order == :desc ? :lt : :gt)
483497

484-
cursor_value = values.last
498+
cursor_value = values_last
485499
batch_relation = batch_condition(relation, cursor, cursor_value, operators)
486500
end
487501

activerecord/test/cases/batches_test.rb

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -335,15 +335,15 @@ def test_in_batches_has_attribute_readers
335335
end
336336

337337
def test_in_batches_should_yield_relation_if_block_given
338-
assert_queries_count(6) do
338+
assert_queries_count(7) do
339339
Post.in_batches(of: 2) do |relation|
340340
assert_kind_of ActiveRecord::Relation, relation
341341
end
342342
end
343343
end
344344

345345
def test_in_batches_should_be_enumerable_if_no_block_given
346-
assert_queries_count(6) do
346+
assert_queries_count(7) do
347347
Post.in_batches(of: 2).each do |relation|
348348
assert_kind_of ActiveRecord::Relation, relation
349349
end
@@ -378,10 +378,10 @@ def test_in_batches_each_record_should_be_ordered_by_id
378378
end
379379

380380
def test_in_batches_update_all_affect_all_records
381-
assert_queries_count(6 + 6) do # 6 selects, 6 updates
381+
assert_queries_count(7 + 6) do # 7 selects, 6 updates
382382
Post.in_batches(of: 2).update_all(title: "updated-title")
383383
end
384-
assert_equal Post.all.pluck(:title), ["updated-title"] * Post.count
384+
assert_equal ["updated-title"] * Post.count, Post.all.pluck(:title)
385385
end
386386

387387
def test_in_batches_update_all_returns_rows_affected
@@ -394,7 +394,7 @@ def test_in_batches_update_all_returns_zero_when_no_batches
394394

395395
def test_in_batches_touch_all_affect_all_records
396396
time = Time.new(2000, 1, 1, 0, 0, 0)
397-
assert_queries_count(6 + 6) do # 6 selects, 6 updates
397+
assert_queries_count(7 + 6) do # 7 selects, 6 updates
398398
Developer.in_batches(of: 2).touch_all(time: time)
399399
end
400400
assert_equal [time] * Developer.count, Developer.all.pluck(:updated_at)
@@ -486,7 +486,7 @@ def test_in_scoped_batches_preserves_order_within_batches
486486
end
487487

488488
def test_in_batches_if_not_loaded_executes_more_queries
489-
assert_queries_count(@total + 1) do
489+
assert_queries_count(@total + 2) do
490490
Post.in_batches(of: 1, load: false) do |relation|
491491
assert_not_predicate relation, :loaded?
492492
end
@@ -617,7 +617,7 @@ def test_in_batches_when_loaded_iterates_using_custom_column
617617
end
618618

619619
def test_in_batches_should_return_relations
620-
assert_queries_count(@total + 1) do
620+
assert_queries_count(@total + 2) do
621621
Post.in_batches(of: 1) do |relation|
622622
assert_kind_of ActiveRecord::Relation, relation
623623
end
@@ -634,46 +634,70 @@ def test_in_batches_should_start_from_the_start_option
634634

635635
def test_in_batches_should_end_at_the_finish_option
636636
post = Post.order("id DESC").where("id <= ?", 5).first
637-
assert_queries_count(7) do
637+
assert_queries_count(8) do
638638
relation = Post.in_batches(of: 1, finish: 5, load: true).reverse_each.first
639639
assert_equal post, relation.last
640640
end
641641
end
642642

643643
def test_in_batches_executes_range_queries_when_unconstrained
644644
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
645+
646+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+ OFFSET \S+\z/i, count: 6) do
647+
assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 1) do
648+
Post.in_batches(of: 2).to_a
649+
end
650+
end
651+
645652
assert_queries_match(/WHERE #{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
646-
Post.in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
653+
relations.each { |relation| assert_kind_of Post, relation.first }
647654
end
648655
end
649656

650657
def test_in_batches_executes_in_queries_when_unconstrained_and_opted_out_of_ranges
651658
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
659+
660+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 6) do
661+
Post.in_batches(of: 2, use_ranges: false).to_a
662+
end
663+
652664
assert_queries_match(/#{quoted_posts_id} IN \(.+\)/i) do
653-
Post.in_batches(of: 2, use_ranges: false) { |relation| assert_kind_of Post, relation.first }
665+
relations.each { |relation| assert_kind_of Post, relation.first }
654666
end
655667
end
656668

657669
def test_in_batches_executes_in_queries_when_constrained
658670
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
671+
672+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 3) do
673+
Post.where("id < ?", 5).in_batches(of: 2).to_a
674+
end
675+
659676
assert_queries_match(/#{quoted_posts_id} IN \(.+\)/i) do
660-
Post.where("id < ?", 5).in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
677+
relations.each { |relation| assert_kind_of Post, relation.first }
661678
end
662679
end
663680

664681
def test_in_batches_executes_range_queries_when_constrained_and_opted_in_into_ranges
665682
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
683+
684+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+ OFFSET \S+\z/i, count: 3) do
685+
assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 1) do
686+
Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true).to_a
687+
end
688+
end
689+
666690
assert_queries_match(/#{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
667-
Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true) { |relation| assert_kind_of Post, relation.first }
691+
relations.each { |relation| assert_kind_of Post, relation.first }
668692
end
669693
end
670694

671695
def test_in_batches_shouldnt_execute_query_unless_needed
672-
assert_queries_count(2) do
696+
assert_queries_count(3) do
673697
Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation }
674698
end
675699

676-
assert_queries_count(1) do
700+
assert_queries_count(2) do
677701
Post.in_batches(of: @total + 1) { |relation| assert_kind_of ActiveRecord::Relation, relation }
678702
end
679703
end
@@ -997,7 +1021,7 @@ def test_find_in_batches_should_return_a_sized_enumerator
9971021

9981022
test ".in_batches bypasses the query cache for its own queries" do
9991023
Post.cache do
1000-
assert_queries_count(2) do
1024+
assert_queries_count(4) do
10011025
Post.in_batches { }
10021026
Post.in_batches { }
10031027
end

0 commit comments

Comments
 (0)