Skip to content

Commit d141325

Browse files
committed
[E] Optimize text section node searching
1 parent 18c502c commit d141325

22 files changed

+381
-108
lines changed

api/Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ gem "paper_trail-globalid", "~> 0.2"
9393
gem "pg", "~> 1.2"
9494
gem "pg_search", "~> 2.3.6"
9595
gem "premailer-rails", "~> 1.0"
96+
gem "progressbar", "~> 1.13.0"
9697
gem "pry-rails", "~> 0.3.9"
9798
gem "puma", "~> 6.4"
9899
gem "rack", ">= 2.2.13"

api/Gemfile.lock

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ GEM
567567
net-smtp
568568
premailer (~> 1.7, >= 1.7.9)
569569
prism (1.4.0)
570+
progressbar (1.13.0)
570571
pry (0.14.2)
571572
coderay (~> 1.1)
572573
method_source (~> 1.0)
@@ -950,6 +951,7 @@ DEPENDENCIES
950951
pg (~> 1.2)
951952
pg_search (~> 2.3.6)
952953
premailer-rails (~> 1.0)
954+
progressbar (~> 1.13.0)
953955
pry-byebug
954956
pry-rails (~> 0.3.9)
955957
puma (~> 6.4)

api/app/jobs/text_sections/index_current_node_content_job.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ def build_enumerator(text_section, cursor:)
2424
end
2525

2626
# @param [TextSectionNode] text_section_node
27-
# @param [Project] _project
27+
# @param [TextSection] _text_section
2828
# @return [void]
29-
def each_iteration(text_section_node, _project)
29+
def each_iteration(text_section_node, _text_section)
3030
text_section_node.index_contained_content!
3131
end
3232
end
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# frozen_string_literal: true
2+
3+
module TextSections
4+
class MaintainAllCurrentNodesJob < ApplicationJob
5+
include JobIteration::Iteration
6+
7+
queue_as :low_priority
8+
9+
queue_with_priority 500
10+
11+
# @param [String, nil] cursor
12+
# @return [Enumerator]
13+
def build_enumerator(cursor:)
14+
enumerator_builder.active_record_on_records(
15+
TextSection.all,
16+
cursor:,
17+
)
18+
end
19+
20+
# @param [TextSection] text_section
21+
# @return [void]
22+
def each_iteration(text_section)
23+
text_section.maintain_current_nodes!
24+
end
25+
end
26+
end

api/app/models/text_section.rb

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ class TextSection < ApplicationRecord
8383
scope :in_texts, ->(texts) { where(text: texts) }
8484
scope :ordered, -> { order(position: :asc) }
8585

86+
scope :with_unindexed_nodes, -> { where(id: TextSectionNode.sans_search_indexed.select(:text_section_id).distinct) }
87+
8688
multisearches! :body_text
8789

8890
alias_attribute :title, :name
@@ -217,15 +219,20 @@ def extracted_body_content
217219
end
218220

219221
# @return [void]
220-
def index_contained_content!
221-
ManifoldApi::Container["text_sections.index_contained_content"].(self).value!
222+
def index_contained_content!(**options)
223+
ManifoldApi::Container["text_sections.index_contained_content"].(self, **options).value!
222224
end
223225

224226
# @return [void]
225227
def index_nodes!
226228
ManifoldApi::Container["text_sections.index_nodes"].(self).value!
227229
end
228230

231+
# @return [void]
232+
def maintain_current_nodes!
233+
ManifoldApi::Container["text_sections.maintain_current_nodes"].(self).value!
234+
end
235+
229236
private
230237

231238
def extract_content_from!(source, data: [])

api/app/models/text_section_node.rb

Lines changed: 18 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ class TextSectionNode < ApplicationRecord
3434
scope :terminal, -> { where(intermediate: false) }
3535
scope :with_intermediate_tag, -> { where(tag: INTERMEDIATE_TAGS) }
3636

37-
scope :current, -> { joins(:text_section).where(TextSection.arel_table[:body_hash].eq(arel_table[:body_hash])) }
38-
scope :orphaned, -> { joins(:text_section).where(TextSection.arel_table[:body_hash].not_eq(arel_table[:body_hash])) }
37+
scope :current, -> { where(current: true) }
38+
scope :orphaned, -> { where(current: false) }
39+
40+
scope :mismatched_current, -> { joins(:text_section).where(arel_mismatched_current) }
3941

4042
scope :sans_search_indexed, -> { where(search_indexed: false) }
4143

@@ -119,15 +121,17 @@ class << self
119121
# @param [<String>] text_section_ids
120122
# @return [Hash{String, nil => Array}] hits grouped by text section id
121123
def hit_search_for(keyword, text_section_ids: [])
124+
# :nocov:
122125
return {} if text_section_ids.blank?
126+
# :nocov:
123127

124128
query = build_hit_query_for(keyword, text_section_ids:)
125129

126130
hit_filters = text_section_ids.index_with { Search::HitFilter.new }
127131

128132
hit_results = text_section_ids.index_with { [] }.merge(nil => Dry::Core::Constants::EMPTY_ARRAY)
129133

130-
query.each_with_object(hit_results) do |node, hits|
134+
TextSectionNode.find_by_sql(query).each_with_object(hit_results) do |node, hits|
131135
hit_filter = hit_filters.fetch(node.text_section_id)
132136

133137
next unless hit_filter.allow?(node)
@@ -136,89 +140,22 @@ def hit_search_for(keyword, text_section_ids: [])
136140
end
137141
end
138142

139-
private
140-
141-
def arel_content_highlighted_for(keyword, node_hits:)
142-
q = unscoped.keyword_search(keyword)
143-
144-
tsearch = q.__send__(:tsearch)
145-
146-
Arel::Nodes::NamedFunction.new(
147-
"ts_headline",
148-
[
149-
tsearch.__send__(:dictionary),
150-
node_hits[:contained_content],
151-
tsearch.__send__(:arel_wrap, tsearch.__send__(:tsquery)),
152-
Arel::Nodes.build_quoted(tsearch.__send__(:ts_headline_options))
153-
]
154-
)
155-
end
156-
157-
# @param [String] keyword
158-
# @param [<String>] text_section_ids
159-
# @return [ActiveRecord::Relation<TextSectionNode>]
143+
# @api private
144+
# @return [String]
160145
def build_hit_query_for(keyword, text_section_ids:)
161-
inner_query = build_hit_inner_query_for(keyword, text_section_ids:)
162-
163-
node_hits = Arel::Table.new("node_hits")
164-
165-
TextSectionNode.from(inner_query.to_sql, "node_hits")
166-
.reselect(?*)
167-
.select(arel_content_highlighted_for(keyword, node_hits:).as("content_highlighted"))
168-
.where(node_hits[:hit_number].lteq(MAX_HIT_COUNT))
169-
.order(node_hits[:text_section_id].asc)
170-
.order(node_hits[:pg_search_rank].desc)
171-
.order(node_hits[:hit_number].asc)
172-
end
173-
174-
# @param [String] keyword
175-
# @param [<String>] text_section_ids
176-
# @return [Arel::Nodes::Grouping]
177-
def build_hit_inner_query_for(keyword, text_section_ids:)
178-
base_query = all
179-
.keyword_search(keyword)
180-
.current
181-
.where(text_section_id: text_section_ids)
182-
.with_pg_search_rank
183-
184-
hit_number = hit_number_for(base_query)
185-
186-
hit_inner_query = base_query.select(hit_number.as("hit_number")).to_sql
187-
188-
in_text_sections = arel_table[:text_section_id].in(text_section_ids).to_sql
189-
190-
# We need to move this condition inside the pg_search subquery
191-
# or else we fetch way too much
192-
hit_inner_query = hit_inner_query.sub(
193-
/(WHERE\s+)(.+?)(\) AS #{base_query.pg_search_rank_table_alias} ON)/i,
194-
%[\\1 #{in_text_sections} AND (\\2)\\3]
195-
)
196-
197-
arel_grouping(
198-
arel_literal(
199-
hit_inner_query
200-
)
201-
).as("node_hits")
146+
ManifoldApi::Container["text_section_nodes.build_hits_query"].(keyword:, text_section_ids:).value!
202147
end
203148

204-
# @param [ActiveRecord::Relation<TextSectionNode>] query
205-
# @return [Arel::Nodes::NamedFunction]
206-
def hit_number_for(query)
207-
window = search_window_for query
208-
209-
arel_named_fn("row_number").over(window)
210-
end
149+
# @api private
150+
# @return [Arel::Nodes::NotEqual]
151+
def arel_mismatched_current
152+
text_sections = TextSection.arel_table
153+
text_section_nodes = TextSectionNode.arel_table
211154

212-
# @param [ActiveRecord::Relation<TextSectionNode>] query
213-
# @return [Arel::Nodes::Window]
214-
def search_window_for(query)
215-
rank_table = Arel::Table.new(query.pg_search_rank_table_alias)
155+
body_hash_matches = arel_grouping(text_sections[:body_hash].eq(text_section_nodes[:body_hash]))
156+
current = text_section_nodes[:current]
216157

217-
Arel::Nodes::Window.new
218-
.partition(arel_table[:text_section_id])
219-
.order(rank_table[:rank].desc)
220-
.order(arel_table[:node_indices].desc)
221-
.order(arel_table[:id].asc)
158+
body_hash_matches.not_eq(current)
222159
end
223160
end
224161
end
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# frozen_string_literal: true
2+
3+
module TextSectionNodes
4+
# @see TextSectionNodes::HitsQueryBuilder
5+
class BuildHitsQuery
6+
# @return [Dry::Monads::Result]
7+
def call(...)
8+
TextSectionNodes::HitsQueryBuilder.new(...).call
9+
end
10+
end
11+
end

api/app/operations/text_section_nodes/index_contained_content.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@ class IndexContainedContent
1919
string_agg(cn.content, ' ' ORDER BY cn.node_indices) FILTER (WHERE cn.content IS NOT NULL AND cn.content ~ '[^[:space:]]+') AS contained_content
2020
FROM text_section_nodes pn
2121
INNER JOIN text_section_nodes cn ON cn.text_section_id = %<text_section_id>s AND cn.body_hash = %<body_hash>s AND pn.node_path @> cn.node_path
22-
WHERE pn.id = %<text_section_node_id>s
22+
WHERE pn.text_section_id = %<text_section_id>s AND pn.id = %<text_section_node_id>s
2323
GROUP BY pn.id
2424
)
2525
UPDATE text_section_nodes tsn SET
2626
contained_node_uuids = COALESCE(d.contained_node_uuids, '{}'::text[]),
2727
contained_content = CASE WHEN char_length(d.contained_content) <= 4096 THEN d.contained_content ELSE '' END,
2828
search_indexed = TRUE
2929
FROM derived d
30-
WHERE tsn.id = %<text_section_node_id>s
30+
WHERE tsn.text_section_id = %<text_section_id>s AND tsn.id = %<text_section_node_id>s
3131
;
3232
SQL
3333

api/app/operations/text_sections/extrapolate_nodes.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class ExtrapolateNodes
7777
(#{TAG_IS_INTERMEDIATE}) AS intermediate,
7878
CURRENT_TIMESTAMP AS extrapolated_at,
7979
CURRENT_TIMESTAMP AS search_indexed_at,
80-
TRUE AS search_indexed
80+
TRUE AS current
8181
FROM nodes
8282
)
8383
SQL
@@ -95,7 +95,7 @@ class ExtrapolateNodes
9595
intermediate,
9696
extrapolated_at,
9797
search_indexed_at,
98-
search_indexed
98+
current
9999
) SELECT
100100
text_section_id, body_hash,
101101
node_root, node_path, path,
@@ -108,7 +108,7 @@ class ExtrapolateNodes
108108
intermediate,
109109
extrapolated_at,
110110
search_indexed_at,
111-
search_indexed
111+
current
112112
FROM finalized
113113
ON CONFLICT (node_path) DO UPDATE SET
114114
"text_section_id" = EXCLUDED."text_section_id",
@@ -128,6 +128,7 @@ class ExtrapolateNodes
128128
"children_count" = EXCLUDED."children_count",
129129
"intermediate" = EXCLUDED."intermediate",
130130
"extrapolated_at" = EXCLUDED."extrapolated_at",
131+
"current" = TRUE,
131132
"updated_at" =
132133
CASE
133134
WHEN EXCLUDED."text_section_id" IS DISTINCT FROM text_section_nodes."text_section_id"

api/app/operations/text_sections/index_contained_content.rb

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,32 @@ class IndexContainedContent
1818
pn.text_section_id = %<text_section_id>s
1919
AND
2020
pn.body_hash = %<body_hash>s
21+
%<search_indexed_filter>s
2122
GROUP BY pn.id
2223
)
2324
UPDATE text_section_nodes tsn SET
2425
contained_node_uuids = COALESCE(d.contained_node_uuids, '{}'::text[]),
2526
contained_content = CASE WHEN char_length(d.contained_content) <= 4096 THEN d.contained_content ELSE '' END,
2627
search_indexed = TRUE
2728
FROM derived d
28-
WHERE tsn.id = d.text_section_node_id
29+
WHERE tsn.text_section_id = %<text_section_id>s AND tsn.id = d.text_section_node_id
2930
;
3031
SQL
3132

3233
# @param [TextSection] text_section
3334
# @return [Dry::Monads::Result]
34-
def call(text_section)
35+
def call(text_section, unindexed_only: true)
3536
params = {
3637
text_section_id: text_section.quoted_id,
3738
body_hash: text_section.body_hash,
3839
}
3940

41+
if unindexed_only
42+
params[:search_indexed_filter] = "AND pn.search_indexed = FALSE"
43+
else
44+
params[:search_indexed_filter] = ""
45+
end
46+
4047
query = QUERY % params
4148

4249
response = sql_update!(query)

0 commit comments

Comments
 (0)