Skip to content

Commit 97dac99

Browse files
authored
Merge pull request rails#51243 from maximerety/optim-in-batches-using-offset
Optimize Active Record batching further when using ranges
2 parents 5709348 + c097bf6 commit 97dac99

File tree

3 files changed

+70
-24
lines changed

3 files changed

+70
-24
lines changed

activerecord/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
* Optimize Active Record batching further when using ranges.
2+
3+
Tested on a PostgreSQL table with 10M records and batches of 10k records, the generation
4+
of relations for the 1000 batches was `4.8x` faster (`6.8s` vs. `1.4s`), used `900x`
5+
less bandwidth (`180MB` vs. `0.2MB`) and allocated `45x` less memory (`490MB` vs. `11MB`).
6+
7+
*Maxime Réty*, *fatkodima*
8+
19
* Include current character length in error messages for index and table name length validations.
210

311
*Joshua Young*

activerecord/lib/active_record/relation/batches.rb

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,35 +435,49 @@ def batch_on_unloaded_relation(relation:, start:, finish:, load:, cursor:, order
435435
if load
436436
records = batch_relation.records
437437
values = records.pluck(*cursor)
438+
values_size = values.size
439+
values_last = values.last
438440
yielded_relation = where(cursor => values).order(batch_orders.to_h)
439441
yielded_relation.load_records(records)
440442
elsif (empty_scope && use_ranges != false) || use_ranges
441-
values = batch_relation.pluck(*cursor)
443+
# Efficiently peek at the last value for the next batch using offset and limit.
444+
values_size = batch_limit
445+
values_last = batch_relation.offset(batch_limit - 1).pick(*cursor)
446+
447+
# If the last value is not found using offset, there is at most one more batch of size < batch_limit.
448+
# Retry by getting the whole list of remaining values so that we have the exact size and last value.
449+
unless values_last
450+
values = batch_relation.pluck(*cursor)
451+
values_size = values.size
452+
values_last = values.last
453+
end
442454

443-
finish = values.last
444-
if finish
445-
yielded_relation = apply_finish_limit(batch_relation, cursor, finish, batch_orders)
455+
# Finally, build the yielded relation if at least one value was found.
456+
if values_last
457+
yielded_relation = apply_finish_limit(batch_relation, cursor, values_last, batch_orders)
446458
yielded_relation = yielded_relation.except(:limit).reorder(batch_orders.to_h)
447459
yielded_relation.skip_query_cache!(false)
448460
end
449461
else
450462
values = batch_relation.pluck(*cursor)
463+
values_size = values.size
464+
values_last = values.last
451465
yielded_relation = where(cursor => values).order(batch_orders.to_h)
452466
end
453467

454-
break if values.empty?
468+
break if values_size == 0
455469

456-
if values.flatten.any?(nil)
470+
if [values_last].flatten.any?(nil)
457471
raise ArgumentError, "Not all of the batch cursor columns were included in the custom select clause "\
458472
"or some columns contain nil."
459473
end
460474

461475
yield yielded_relation
462476

463-
break if values.length < batch_limit
477+
break if values_size < batch_limit
464478

465479
if limit_value
466-
remaining -= values.length
480+
remaining -= values_size
467481

468482
if remaining == 0
469483
# Saves a useless iteration when the limit is a multiple of the
@@ -481,7 +495,7 @@ def batch_on_unloaded_relation(relation:, start:, finish:, load:, cursor:, order
481495
end
482496
operators << (last_order == :desc ? :lt : :gt)
483497

484-
cursor_value = values.last
498+
cursor_value = values_last
485499
batch_relation = batch_condition(relation, cursor, cursor_value, operators)
486500
end
487501

activerecord/test/cases/batches_test.rb

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -335,15 +335,15 @@ def test_in_batches_has_attribute_readers
335335
end
336336

337337
def test_in_batches_should_yield_relation_if_block_given
338-
assert_queries_count(6) do
338+
assert_queries_count(7) do
339339
Post.in_batches(of: 2) do |relation|
340340
assert_kind_of ActiveRecord::Relation, relation
341341
end
342342
end
343343
end
344344

345345
def test_in_batches_should_be_enumerable_if_no_block_given
346-
assert_queries_count(6) do
346+
assert_queries_count(7) do
347347
Post.in_batches(of: 2).each do |relation|
348348
assert_kind_of ActiveRecord::Relation, relation
349349
end
@@ -378,10 +378,10 @@ def test_in_batches_each_record_should_be_ordered_by_id
378378
end
379379

380380
def test_in_batches_update_all_affect_all_records
381-
assert_queries_count(6 + 6) do # 6 selects, 6 updates
381+
assert_queries_count(7 + 6) do # 7 selects, 6 updates
382382
Post.in_batches(of: 2).update_all(title: "updated-title")
383383
end
384-
assert_equal Post.all.pluck(:title), ["updated-title"] * Post.count
384+
assert_equal ["updated-title"] * Post.count, Post.all.pluck(:title)
385385
end
386386

387387
def test_in_batches_update_all_returns_rows_affected
@@ -394,7 +394,7 @@ def test_in_batches_update_all_returns_zero_when_no_batches
394394

395395
def test_in_batches_touch_all_affect_all_records
396396
time = Time.new(2000, 1, 1, 0, 0, 0)
397-
assert_queries_count(6 + 6) do # 6 selects, 6 updates
397+
assert_queries_count(7 + 6) do # 7 selects, 6 updates
398398
Developer.in_batches(of: 2).touch_all(time: time)
399399
end
400400
assert_equal [time] * Developer.count, Developer.all.pluck(:updated_at)
@@ -486,7 +486,7 @@ def test_in_scoped_batches_preserves_order_within_batches
486486
end
487487

488488
def test_in_batches_if_not_loaded_executes_more_queries
489-
assert_queries_count(@total + 1) do
489+
assert_queries_count(@total + 2) do
490490
Post.in_batches(of: 1, load: false) do |relation|
491491
assert_not_predicate relation, :loaded?
492492
end
@@ -617,7 +617,7 @@ def test_in_batches_when_loaded_iterates_using_custom_column
617617
end
618618

619619
def test_in_batches_should_return_relations
620-
assert_queries_count(@total + 1) do
620+
assert_queries_count(@total + 2) do
621621
Post.in_batches(of: 1) do |relation|
622622
assert_kind_of ActiveRecord::Relation, relation
623623
end
@@ -634,46 +634,70 @@ def test_in_batches_should_start_from_the_start_option
634634

635635
def test_in_batches_should_end_at_the_finish_option
636636
post = Post.order("id DESC").where("id <= ?", 5).first
637-
assert_queries_count(7) do
637+
assert_queries_count(8) do
638638
relation = Post.in_batches(of: 1, finish: 5, load: true).reverse_each.first
639639
assert_equal post, relation.last
640640
end
641641
end
642642

643643
def test_in_batches_executes_range_queries_when_unconstrained
644644
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
645+
646+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+ OFFSET \S+\z/i, count: 6) do
647+
assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 1) do
648+
Post.in_batches(of: 2).to_a
649+
end
650+
end
651+
645652
assert_queries_match(/WHERE #{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
646-
Post.in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
653+
relations.each { |relation| assert_kind_of Post, relation.first }
647654
end
648655
end
649656

650657
def test_in_batches_executes_in_queries_when_unconstrained_and_opted_out_of_ranges
651658
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
659+
660+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 6) do
661+
Post.in_batches(of: 2, use_ranges: false).to_a
662+
end
663+
652664
assert_queries_match(/#{quoted_posts_id} IN \(.+\)/i) do
653-
Post.in_batches(of: 2, use_ranges: false) { |relation| assert_kind_of Post, relation.first }
665+
relations.each { |relation| assert_kind_of Post, relation.first }
654666
end
655667
end
656668

657669
def test_in_batches_executes_in_queries_when_constrained
658670
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
671+
672+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 3) do
673+
Post.where("id < ?", 5).in_batches(of: 2).to_a
674+
end
675+
659676
assert_queries_match(/#{quoted_posts_id} IN \(.+\)/i) do
660-
Post.where("id < ?", 5).in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
677+
relations.each { |relation| assert_kind_of Post, relation.first }
661678
end
662679
end
663680

664681
def test_in_batches_executes_range_queries_when_constrained_and_opted_in_into_ranges
665682
quoted_posts_id = Regexp.escape(quote_table_name("posts.id"))
683+
684+
relations = assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+ OFFSET \S+\z/i, count: 3) do
685+
assert_queries_match(/ORDER BY #{quoted_posts_id} ASC LIMIT \S+\z/i, count: 1) do
686+
Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true).to_a
687+
end
688+
end
689+
666690
assert_queries_match(/#{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
667-
Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true) { |relation| assert_kind_of Post, relation.first }
691+
relations.each { |relation| assert_kind_of Post, relation.first }
668692
end
669693
end
670694

671695
def test_in_batches_shouldnt_execute_query_unless_needed
672-
assert_queries_count(2) do
696+
assert_queries_count(3) do
673697
Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation }
674698
end
675699

676-
assert_queries_count(1) do
700+
assert_queries_count(2) do
677701
Post.in_batches(of: @total + 1) { |relation| assert_kind_of ActiveRecord::Relation, relation }
678702
end
679703
end
@@ -997,7 +1021,7 @@ def test_find_in_batches_should_return_a_sized_enumerator
9971021

9981022
test ".in_batches bypasses the query cache for its own queries" do
9991023
Post.cache do
1000-
assert_queries_count(2) do
1024+
assert_queries_count(4) do
10011025
Post.in_batches { }
10021026
Post.in_batches { }
10031027
end

0 commit comments

Comments
 (0)