Skip to content

Commit 63fec36

Browse files
committed
Feat: db.query.summary parser
1 parent c4bdf4b commit 63fec36

File tree

6 files changed

+682
-0
lines changed

6 files changed

+682
-0
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
require_relative 'query_summary/cache'
8+
require_relative 'query_summary/tokenizer'
9+
require_relative 'query_summary/parser'
10+
11+
module OpenTelemetry
  module Helpers
    # QuerySummary produces compact, high-level descriptions of SQL queries,
    # listing the key operations and the tables/collections they touch.
    #
    # Example:
    #   QuerySummary.generate_summary("SELECT * FROM users WHERE id = 1")
    #   # => "SELECT users"
    module QuerySummary
      # Resize the shared summary cache (delegates to Cache.configure).
      def self.configure_cache(size: Cache::DEFAULT_SIZE)
        Cache.configure(size: size)
      end

      # Tokenize +query+ and build its summary, memoized per query text.
      # Any failure while tokenizing or parsing degrades to the sentinel
      # 'UNKNOWN' instead of raising, so summary generation can never break
      # the instrumented database call.
      def self.generate_summary(query)
        Cache.fetch(query) do
          Parser.build_summary_from_tokens(Tokenizer.tokenize(query))
        end
      rescue StandardError
        'UNKNOWN'
      end
    end
  end
end
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
module OpenTelemetry
  module Helpers
    module QuerySummary
      # Cache provides a thread-safe, bounded memoization store for query
      # summaries, so identical query strings are only parsed once.
      #
      # Entries are evicted least-recently-used-first once the configured
      # capacity is reached. All access to the underlying Hash — reads
      # included — is guarded by a single Mutex, so concurrent fetch/store
      # calls are safe.
      #
      # @example
      #   Cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users"
      class Cache
        DEFAULT_SIZE = 1000

        # Class instance variables (not @@ class variables) hold the state.
        @cache = {}
        @cache_mutex = Mutex.new
        @cache_size = DEFAULT_SIZE

        # Returns the cached value for +key+, computing and storing it via
        # the block on a miss. A hit refreshes the entry's recency so
        # eviction is true LRU, and the hit value is wrapped in an Array so
        # a cached nil/false still counts as a hit instead of being
        # recomputed. The block runs outside the lock so a slow computation
        # does not serialize unrelated lookups (duplicate concurrent misses
        # may each run the block; last writer wins).
        def self.fetch(key)
          hit = @cache_mutex.synchronize do
            if @cache.key?(key)
              # Delete + re-insert moves the entry to the most-recently-used
              # end of the insertion-ordered Hash.
              value = @cache.delete(key)
              @cache[key] = value
              [value]
            end
          end
          return hit.first if hit

          result = yield
          store(key, result)
          result
        end

        # Updates the maximum number of cached entries. Shrinking below the
        # current occupancy clears the cache rather than partially evicting.
        def self.configure(size: DEFAULT_SIZE)
          @cache_mutex.synchronize do
            @cache_size = size
            @cache.clear if @cache.size > size
          end
        end

        # Inserts +value+ under +key+, evicting the least recently used
        # entry (the Hash's first key, given insertion order plus the
        # re-insert-on-hit in .fetch) when at capacity.
        def self.store(key, value)
          @cache_mutex.synchronize do
            @cache.shift if @cache.size >= @cache_size
            @cache[key] = value
          end
        end
      end
    end
  end
end
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
module OpenTelemetry
  module Helpers
    module QuerySummary
      # Parser builds high-level SQL query summaries from tokenized input.
      #
      # Processes tokens to extract key operations and table names, creating
      # summaries like "SELECT users" or "INSERT INTO orders".
      #
      # Implemented as a single left-to-right pass over the token list with a
      # two-value state machine (:default / :expect_collection) and an explicit
      # skip index, so multi-token constructs (aliases, "UNION ALL",
      # "CREATE TABLE", comma-separated table lists) can consume tokens ahead
      # of the iteration cursor.
      #
      # @example
      #   tokens = [Token.new(:keyword, "SELECT"), Token.new(:identifier, "users")]
      #   Parser.build_summary_from_tokens(tokens) # => "SELECT users"
      class Parser
        # Entry point: walks +tokens+ once, collecting summary fragments and
        # joining them with single spaces. Tokens only need to respond to
        # #type and #value (duck-typed; see Tokenizer::Token).
        def self.build_summary_from_tokens(tokens)
          summary_parts = []
          state = :default # Either :default or :expect_collection
          skip_until = 0 # Next token index to process; allows skipping tokens already consumed by previous operations

          tokens.each_with_index do |token, index|
            next if index < skip_until # Skip already processed tokens

            result = process_token(token, tokens, index, state)

            # Every result hash carries :parts (fragments to append),
            # :new_state, and :next_index (first unconsumed token).
            summary_parts.concat(result[:parts])
            state = result[:new_state]
            skip_until = result[:next_index]
          end

          summary_parts.join(' ')
        end

        # Dispatches one token: first as a main SQL operation keyword, then —
        # if that declined — as a collection (table) name candidate. Falls
        # through to a no-op result that keeps the current state.
        def self.process_token(token, tokens, index, state)
          operation_result = process_main_operation(token, tokens, index, state)
          return operation_result if operation_result[:processed]

          collection_result = process_collection_token(token, tokens, index, state)
          return collection_result if collection_result[:processed]

          { processed: false, parts: [], new_state: state, next_index: index + 1 }
        end

        # Matches the token (case-insensitively) against the recognized SQL
        # operation keywords and returns the appropriate result hash. Emitted
        # fragments preserve the token's original casing (token.value).
        # Note: matching is on the token's text, not its :keyword type, so the
        # tokenizer's keyword classification is not required here.
        def self.process_main_operation(token, tokens, index, current_state)
          case token.value.upcase
          when 'SELECT', 'INSERT', 'DELETE'
            # Emit the operation; table names arrive later via FROM/INTO.
            add_to_summary(token.value, :default, index + 1)
          when 'WITH', 'UPDATE'
            # The very next identifier is the target collection / CTE name.
            add_to_summary(token.value, :expect_collection, index + 1)
          when 'FROM', 'INTO', 'JOIN', 'IN'
            # Not emitted themselves; they announce an upcoming table name.
            trigger_collection_mode(index + 1)
          when 'CREATE', 'ALTER', 'DROP', 'TRUNCATE'
            handle_table_operation(token, tokens, index)
          when 'UNION'
            handle_union(token, tokens, index)
          else
            not_processed(current_state, index + 1)
          end
        end

        # In :expect_collection state, tries to consume the current token as a
        # table/collection name (plus any alias and trailing comma). Stays in
        # :expect_collection across '(' and operators (e.g. subquery openers);
        # any other token silently cancels collection mode.
        def self.process_collection_token(token, tokens, index, state)
          return { processed: false, parts: [], new_state: state, next_index: index + 1 } unless state == :expect_collection

          upcased_value = token.value.upcase

          if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value))
            skip_count = calculate_alias_skip(tokens, index)
            # A comma after the (aliased) name means another table follows,
            # so remain in :expect_collection and consume the comma too.
            new_state = tokens[index + 1 + skip_count]&.value == ',' ? :expect_collection : :default
            skip_count += 1 if tokens[index + 1 + skip_count]&.value == ','

            { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count }
          elsif token.value == '(' || token.type == :operator
            { processed: true, parts: [], new_state: state, next_index: index + 1 }
          else
            { processed: true, parts: [], new_state: :default, next_index: index + 1 }
          end
        end

        # True for token types that can name a table: plain, quoted, or
        # string-literal identifiers.
        def self.identifier_like?(token)
          %i[identifier quoted_identifier string].include?(token.type)
        end

        def self.can_be_table_name?(upcased_value)
          # Keywords that can also be used as table/object names in certain contexts
          %w[TABLE INDEX PROCEDURE VIEW DATABASE].include?(upcased_value)
        end

        # Returns how many tokens after +index+ belong to an alias clause:
        # 2 for "AS alias", 1 for a bare alias identifier, 0 otherwise.
        # NOTE(review): a bare "AS" with nothing after it still skips 2 —
        # harmless since indexing past the end yields nil, but worth knowing.
        def self.calculate_alias_skip(tokens, index)
          if tokens[index + 1]&.value&.upcase == 'AS'
            2 # Skip 'AS' and the alias
          elsif tokens[index + 1]&.type == :identifier
            1 # Skip the alias
          else
            0
          end
        end

        # Result-hash builder: emit one fragment and move to +new_state+.
        def self.add_to_summary(part, new_state, next_index)
          { processed: true, parts: [part], new_state: new_state, next_index: next_index }
        end

        # Result-hash builder: emit nothing, switch to :expect_collection.
        def self.trigger_collection_mode(next_index)
          { processed: true, parts: [], new_state: :expect_collection, next_index: next_index }
        end

        # Result-hash builder: token not handled; state unchanged.
        def self.not_processed(current_state, next_index)
          { processed: false, parts: [], new_state: current_state, next_index: next_index }
        end

        # Emits "UNION ALL" as one fragment when ALL follows; otherwise emits
        # UNION alone. Either way the parser returns to :default state.
        def self.handle_union(token, tokens, index)
          if tokens[index + 1]&.value&.upcase == 'ALL'
            { processed: true, parts: ["#{token.value} #{tokens[index + 1].value}"], new_state: :default, next_index: index + 2 }
          else
            add_to_summary(token.value, :default, index + 1)
          end
        end

        # Handles DDL verbs (CREATE/ALTER/DROP/TRUNCATE): when followed by an
        # object-kind keyword, emits both (e.g. "CREATE TABLE") and expects
        # the object name next; otherwise emits the verb alone.
        # NOTE(review): the object kind is emitted upcased (next_token) while
        # every other fragment preserves original casing — confirm whether
        # this normalization is intentional.
        def self.handle_table_operation(token, tokens, index)
          next_token = tokens[index + 1]&.value&.upcase

          case next_token
          when 'TABLE', 'INDEX', 'PROCEDURE', 'VIEW', 'DATABASE'
            { processed: true, parts: ["#{token.value} #{next_token}"], new_state: :expect_collection, next_index: index + 2 }
          else
            add_to_summary(token.value, :default, index + 1)
          end
        end
      end
    end
  end
end
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
require 'strscan'
8+
9+
module OpenTelemetry
  module Helpers
    module QuerySummary
      # Tokenizer splits a SQL query string into a flat list of typed tokens
      # (keywords, identifiers, literals, operators) for summary generation.
      # Whitespace and comments are recognized but dropped from the output.
      #
      # @example
      #   tokens = Tokenizer.tokenize("SELECT * FROM users WHERE id = 1")
      #   # Returns tokens: [keyword: SELECT], [operator: *], [keyword: FROM], etc.
      class Tokenizer
        # Lightweight pair of token type (e.g. :keyword) and raw text.
        Token = Struct.new(:type, :value)

        # Patterns are tried in declaration order; more specific patterns
        # must precede more general ones (e.g. :keyword before :identifier,
        # :numeric before :operator so signed numbers match as one token).
        TOKEN_REGEX = {
          whitespace: /\s+/,
          comment: %r{--[^\r\n]*|\/\*.*?\*\/}m,
          numeric: /[+-]?(?:0x[0-9a-fA-F]+|\d+\.?\d*(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?)/,
          string: /'(?:''|[^'\r\n])*'?/,
          quoted_identifier: /"(?:""|[^"\r\n])*"|`(?:``|[^`\r\n])*`|\[(?:[^\]\r\n])*\]/,
          keyword: /\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|INTO|JOIN|CREATE|ALTER|DROP|TRUNCATE|WITH|UNION|TABLE|INDEX|PROCEDURE|VIEW|DATABASE)\b/i,
          identifier: /[a-zA-Z_][a-zA-Z0-9_.]*/,
          operator: /<=|>=|<>|!=|[=<>+\-*\/%,;()!?]/
        }.freeze

        # Token types recognized during scanning but omitted from the result.
        EXCLUDED_TYPES = %i[whitespace comment].freeze

        # Scans +query+ left to right, emitting a Token for the first pattern
        # that matches at the current position. Characters no pattern
        # recognizes are skipped one at a time, so scanning always terminates.
        def self.tokenize(query)
          scanner = StringScanner.new(query)
          collected = []

          until scanner.eos?
            consumed = TOKEN_REGEX.find do |type, pattern|
              text = scanner.scan(pattern)
              next false unless text

              collected << Token.new(type, text) unless EXCLUDED_TYPES.include?(type)
              true
            end
            # Nothing matched here: drop a single character and continue.
            scanner.getch if consumed.nil?
          end

          collected
        end
      end
    end
  end
end

0 commit comments

Comments
 (0)