Skip to content

Commit 29b3f38

Browse files
committed
parse ai citations
1 parent 205e679 commit 29b3f38

14 files changed

+283
-1
lines changed

lib/serp_parser/google/config.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,24 @@ def self.config_block
172172
end
173173
end
174174

175+
# Component describing a single citation link inside the AI overview.
# Nodes matching ".Y3BBE a.H23r4e" are built into SerpParser::Models::AiCitation
# with a :title text field and a :url field read from the "href" attribute.
component :ai_citation do
176+
# NOTE(review): first_seen "2026-12-18" is later than the newest fixture name
# added alongside this config (2026-01-29) — confirm it is not meant to be "2025-12-18".
variant "standard", meta: { first_seen: "2026-12-18" } do
177+
match ".Y3BBE a.H23r4e"
178+
model SerpParser::Models::AiCitation
179+
# NOTE(review): the "." selector presumably targets the matched <a> node itself — confirm against the DSL.
text :title, ".", processors: [ :text, :clean_text ]
180+
url :url, attribute: "href"
181+
end
182+
end
183+
184+
# Element describing the AI overview block: an :answer text field read from
# ".Y3BBE" plus a has_many list of :ai_citation components.
element :ai_overview do
185+
# NOTE(review): first_seen "2026-12-18" is later than the newest fixture name
# added alongside this config (2026-01-29) — confirm it is not meant to be "2025-12-18".
variant "standard", meta: { first_seen: "2026-12-18" } do
186+
container '.FkX2oe[data-subtree="aimc"]'
187+
required_children [ ".Y3BBE" ]
188+
# :strip_inner_style is listed before :clean_text so it is applied first.
text :answer, ".Y3BBE", processors: [ :strip_inner_style, :clean_text ]
189+
has_many :citations, component: :ai_citation
190+
end
191+
end
192+
175193
element :related_searches do
176194
variant "filter_pills", meta: { first_seen: "2025-12-23" } do
177195
container "div.fBctee"

lib/serp_parser/google/parsers/element.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def extract_field_value(field_def)
100100
value = element.text
101101
element_ref = element
102102
processors.each do |p|
103-
if p == :remove_span_elements || p == :find_description_node
103+
if p == :remove_span_elements || p == :find_description_node || p == :strip_inner_style
104104
element_ref = apply_processor(p, value, element_ref)
105105
value = element_ref.text if element_ref.respond_to?(:text)
106106
else
@@ -135,6 +135,8 @@ def apply_processor(processor_name, value, element = nil)
135135
Processors.parse_date(value)
136136
when :remove_span_elements
137137
Processors.remove_span_elements(element) || element
138+
when :strip_inner_style
139+
Processors.strip_inner_style(element) || element
138140
when :extract_number
139141
Processors.extract_number(value)
140142
when :normalize_number
@@ -252,6 +254,9 @@ def extract_component_data(element, variant)
252254
# For FAQ citations, need a non-internal URL
253255
elsif model_class == SerpParser::Models::FaqCitation
254256
return nil unless data[:url] && !data[:url].to_s.start_with?("/")
257+
# For AI citations, need a non-internal URL
258+
elsif model_class == SerpParser::Models::AiCitation
259+
return nil unless data[:url] && !data[:url].to_s.start_with?("/")
255260
end
256261

257262
model_class.new(**data)

lib/serp_parser/google/search.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ def initialize(html_or_doc, schema: nil)
1111
@registry = Config.registry
1212
end
1313

14+
# Parses the AI overview element from the document.
#
# Only the first match returned by the element parser is used.
#
# @return [SerpParser::Models::AiOverviewResult, nil] nil when nothing matched
def ai_overview
  matches = Parsers::Element.find_all(@doc, :ai_overview, @registry)

  if matches.empty?
    nil
  else
    SerpParser::Models::AiOverviewResult.new(**matches.first)
  end
end
19+
1420
def organic_results
1521
results = Parsers::Element.find_all(@doc, :organic_result, @registry)
1622
models = results.map do |data|
@@ -59,6 +65,7 @@ def search_information
5965

6066
def to_h
6167
{
68+
ai_overview: ai_overview&.to_h,
6269
sponsored_results: sponsored_results.map(&:to_h),
6370
organic_results: organic_results.map(&:to_h),
6471
faq_results: faq_results.map(&:to_h),
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
require "uri"
2+
3+
module SerpParser
  module Models
    # A single source link cited by the AI overview.
    #
    # Built from keyword arguments; only :title and :url are read, any other
    # keys are ignored. The position is not taken from the arguments and is
    # set afterwards through the accessor.
    class AiCitation
      attr_reader :title, :url
      attr_accessor :position

      # @param args [Hash] attributes of the citation (:title, :url)
      def initialize(**args)
        @title = args[:title]
        @url = args[:url]
        @position = nil
      end

      # Host of the citation URL with a leading "www." removed.
      #
      # @return [String, nil] nil when the URL is missing, has no host, or
      #   cannot be parsed
      def domain
        return nil unless url

        host = URI.parse(url.to_s).host
        # Anchor at the start of the host: an unanchored match would also eat
        # "www." in the middle of a name ("awww.example.com" -> "aexample.com").
        host&.sub(/\Awww\./, "")
      rescue URI::Error
        # Unparseable URLs are treated as "no domain" rather than an error.
        nil
      end

      # @return [Hash{String => Object}] serialisable representation
      def to_h
        {
          "position" => @position,
          "title" => title,
          "domain" => domain,
          "url" => url
        }
      end
    end
  end
end
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
module SerpParser
  module Models
    # Parsed AI overview: the answer text plus the citations backing it.
    class AiOverviewResult
      attr_reader :answer, :citations

      # @param args [Hash] attributes of the overview (:answer, :citations).
      #   An Array of citations is wrapped in a SerpParser::Collection; any
      #   other value (including nil) is stored unchanged.
      def initialize(**args)
        raw_citations = args[:citations]

        @answer = args[:answer]
        @citations =
          if raw_citations.is_a?(Array)
            SerpParser::Collection.new(raw_citations)
          else
            raw_citations
          end
      end

      # Serialises the overview. The answer is stripped and reported as nil
      # when blank; missing citations serialise as an empty array.
      #
      # @return [Hash{String => Object}]
      def to_h
        cleaned = answer&.strip
        cleaned = nil unless cleaned && !cleaned.empty?

        serialized =
          if citations
            citations.map(&:to_h)
          else
            []
          end

        { "answer" => cleaned, "citations" => serialized }
      end
    end
  end
end

lib/serp_parser/processors.rb

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,18 @@ def self.normalize_number(string, decimals: false)
3434
end
3535
end
3636

37+
# Returns a re-parsed copy of the node with <style> and <script> elements,
# and elements hidden through an inline "display:none" style, removed, so
# that .text on the result excludes embedded CSS/JS and hidden content.
# The node passed in is not modified.
# @param element [Nokogiri::XML::Node] node to clean; any object that cannot
#   be serialised with #to_html is returned unchanged
# @return [Nokogiri::HTML::DocumentFragment, Object] cleaned fragment, or
#   +element+ itself when it does not respond to #to_html
def self.strip_inner_style(element)
  # Every Ruby object responds to :dup, so guard on the method actually used.
  return element unless element.respond_to?(:to_html)

  fragment = Nokogiri::HTML.fragment(element.to_html)
  fragment.css("style, script").each(&:remove)
  # Match on the attribute value so "display: none" / "DISPLAY:NONE" are
  # caught as well as the compact "display:none" spelling.
  fragment.css("[style]").each do |node|
    node.remove if node["style"].to_s.match?(/display\s*:\s*none/i)
  end
  fragment
end
48+
3749
# Remove span elements from description
3850
# @param element [Nokogiri::XML::Element]
3951
# @return [Nokogiri::XML::Element]

spec/files/google/2025-12-23-mobile-matkasse.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"ai_overview": null,
23
"sponsored_results": [],
34
"organic_results": [
45
{

spec/files/google/2025-12-23-mobile-middagsfrid-rabattkod.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"ai_overview": null,
23
"sponsored_results": [],
34
"organic_results": [
45
{

spec/files/google/2026-01-01-mobile-swedbank.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"ai_overview": null,
23
"sponsored_results": [],
34
"organic_results": [
45
{

spec/files/google/2026-01-29-mobile-matkasse.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"ai_overview": null,
23
"sponsored_results": [],
34
"organic_results": [
45
{

0 commit comments

Comments
 (0)