Skip to content

Commit 63fec36

Browse files
committed
Feat: db.query.summary parser
1 parent c4bdf4b commit 63fec36

File tree

6 files changed

+682
-0
lines changed

6 files changed

+682
-0
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
require_relative 'query_summary/cache'
8+
require_relative 'query_summary/tokenizer'
9+
require_relative 'query_summary/parser'
10+
11+
module OpenTelemetry
  module Helpers
    # QuerySummary produces compact, high-level descriptions of SQL queries,
    # listing the key operations and the tables/collections they touch.
    #
    # Example:
    #   QuerySummary.generate_summary("SELECT * FROM users WHERE id = 1")
    #   # => "SELECT users"
    module QuerySummary
      # Resize the shared summary cache (delegates to Cache.configure).
      def self.configure_cache(size: Cache::DEFAULT_SIZE)
        Cache.configure(size: size)
      end

      # Tokenize +query+ and build its summary, memoized per query text.
      # Any failure while tokenizing or parsing degrades to the sentinel
      # 'UNKNOWN' instead of raising, so summary generation can never break
      # the instrumented database call.
      def self.generate_summary(query)
        Cache.fetch(query) do
          Parser.build_summary_from_tokens(Tokenizer.tokenize(query))
        end
      rescue StandardError
        'UNKNOWN'
      end
    end
  end
end
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
module OpenTelemetry
  module Helpers
    module QuerySummary
      # Cache provides a thread-safe, bounded memoization store for query
      # summaries, so identical query strings are only parsed once.
      #
      # Entries are evicted least-recently-used-first once the configured
      # capacity is reached. All access to the underlying Hash — reads
      # included — is guarded by a single Mutex, so concurrent fetch/store
      # calls are safe.
      #
      # @example
      #   Cache.fetch("SELECT * FROM users") { "SELECT users" } # => "SELECT users"
      class Cache
        DEFAULT_SIZE = 1000

        # Class instance variables (not @@ class variables) hold the state.
        @cache = {}
        @cache_mutex = Mutex.new
        @cache_size = DEFAULT_SIZE

        # Returns the cached value for +key+, computing and storing it via
        # the block on a miss. A hit refreshes the entry's recency so
        # eviction is true LRU, and the hit value is wrapped in an Array so
        # a cached nil/false still counts as a hit instead of being
        # recomputed. The block runs outside the lock so a slow computation
        # does not serialize unrelated lookups (duplicate concurrent misses
        # may each run the block; last writer wins).
        def self.fetch(key)
          hit = @cache_mutex.synchronize do
            if @cache.key?(key)
              # Delete + re-insert moves the entry to the most-recently-used
              # end of the insertion-ordered Hash.
              value = @cache.delete(key)
              @cache[key] = value
              [value]
            end
          end
          return hit.first if hit

          result = yield
          store(key, result)
          result
        end

        # Updates the maximum number of cached entries. Shrinking below the
        # current occupancy clears the cache rather than partially evicting.
        def self.configure(size: DEFAULT_SIZE)
          @cache_mutex.synchronize do
            @cache_size = size
            @cache.clear if @cache.size > size
          end
        end

        # Inserts +value+ under +key+, evicting the least recently used
        # entry (the Hash's first key, given insertion order plus the
        # re-insert-on-hit in .fetch) when at capacity.
        def self.store(key, value)
          @cache_mutex.synchronize do
            @cache.shift if @cache.size >= @cache_size
            @cache[key] = value
          end
        end
      end
    end
  end
end
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
module OpenTelemetry
  module Helpers
    module QuerySummary
      # Parser builds high-level SQL query summaries from tokenized input.
      #
      # Processes tokens to extract key operations and table names, creating
      # summaries like "SELECT users" or "INSERT INTO orders".
      #
      # Implemented as a single left-to-right pass over the token list with a
      # two-value state machine (:default / :expect_collection) and an explicit
      # skip index, so multi-token constructs (aliases, "UNION ALL",
      # "CREATE TABLE", comma-separated table lists) can consume tokens ahead
      # of the iteration cursor.
      #
      # @example
      #   tokens = [Token.new(:keyword, "SELECT"), Token.new(:identifier, "users")]
      #   Parser.build_summary_from_tokens(tokens) # => "SELECT users"
      class Parser
        # Entry point: walks +tokens+ once, collecting summary fragments and
        # joining them with single spaces. Tokens only need to respond to
        # #type and #value (duck-typed; see Tokenizer::Token).
        def self.build_summary_from_tokens(tokens)
          summary_parts = []
          state = :default # Either :default or :expect_collection
          skip_until = 0 # Next token index to process; allows skipping tokens already consumed by previous operations

          tokens.each_with_index do |token, index|
            next if index < skip_until # Skip already processed tokens

            result = process_token(token, tokens, index, state)

            # Every result hash carries :parts (fragments to append),
            # :new_state, and :next_index (first unconsumed token).
            summary_parts.concat(result[:parts])
            state = result[:new_state]
            skip_until = result[:next_index]
          end

          summary_parts.join(' ')
        end

        # Dispatches one token: first as a main SQL operation keyword, then —
        # if that declined — as a collection (table) name candidate. Falls
        # through to a no-op result that keeps the current state.
        def self.process_token(token, tokens, index, state)
          operation_result = process_main_operation(token, tokens, index, state)
          return operation_result if operation_result[:processed]

          collection_result = process_collection_token(token, tokens, index, state)
          return collection_result if collection_result[:processed]

          { processed: false, parts: [], new_state: state, next_index: index + 1 }
        end

        # Matches the token (case-insensitively) against the recognized SQL
        # operation keywords and returns the appropriate result hash. Emitted
        # fragments preserve the token's original casing (token.value).
        # Note: matching is on the token's text, not its :keyword type, so the
        # tokenizer's keyword classification is not required here.
        def self.process_main_operation(token, tokens, index, current_state)
          case token.value.upcase
          when 'SELECT', 'INSERT', 'DELETE'
            # Emit the operation; table names arrive later via FROM/INTO.
            add_to_summary(token.value, :default, index + 1)
          when 'WITH', 'UPDATE'
            # The very next identifier is the target collection / CTE name.
            add_to_summary(token.value, :expect_collection, index + 1)
          when 'FROM', 'INTO', 'JOIN', 'IN'
            # Not emitted themselves; they announce an upcoming table name.
            trigger_collection_mode(index + 1)
          when 'CREATE', 'ALTER', 'DROP', 'TRUNCATE'
            handle_table_operation(token, tokens, index)
          when 'UNION'
            handle_union(token, tokens, index)
          else
            not_processed(current_state, index + 1)
          end
        end

        # In :expect_collection state, tries to consume the current token as a
        # table/collection name (plus any alias and trailing comma). Stays in
        # :expect_collection across '(' and operators (e.g. subquery openers);
        # any other token silently cancels collection mode.
        def self.process_collection_token(token, tokens, index, state)
          return { processed: false, parts: [], new_state: state, next_index: index + 1 } unless state == :expect_collection

          upcased_value = token.value.upcase

          if identifier_like?(token) || (token.type == :keyword && can_be_table_name?(upcased_value))
            skip_count = calculate_alias_skip(tokens, index)
            # A comma after the (aliased) name means another table follows,
            # so remain in :expect_collection and consume the comma too.
            new_state = tokens[index + 1 + skip_count]&.value == ',' ? :expect_collection : :default
            skip_count += 1 if tokens[index + 1 + skip_count]&.value == ','

            { processed: true, parts: [token.value], new_state: new_state, next_index: index + 1 + skip_count }
          elsif token.value == '(' || token.type == :operator
            { processed: true, parts: [], new_state: state, next_index: index + 1 }
          else
            { processed: true, parts: [], new_state: :default, next_index: index + 1 }
          end
        end

        # True for token types that can name a table: plain, quoted, or
        # string-literal identifiers.
        def self.identifier_like?(token)
          %i[identifier quoted_identifier string].include?(token.type)
        end

        def self.can_be_table_name?(upcased_value)
          # Keywords that can also be used as table/object names in certain contexts
          %w[TABLE INDEX PROCEDURE VIEW DATABASE].include?(upcased_value)
        end

        # Returns how many tokens after +index+ belong to an alias clause:
        # 2 for "AS alias", 1 for a bare alias identifier, 0 otherwise.
        # NOTE(review): a bare "AS" with nothing after it still skips 2 —
        # harmless since indexing past the end yields nil, but worth knowing.
        def self.calculate_alias_skip(tokens, index)
          if tokens[index + 1]&.value&.upcase == 'AS'
            2 # Skip 'AS' and the alias
          elsif tokens[index + 1]&.type == :identifier
            1 # Skip the alias
          else
            0
          end
        end

        # Result-hash builder: emit one fragment and move to +new_state+.
        def self.add_to_summary(part, new_state, next_index)
          { processed: true, parts: [part], new_state: new_state, next_index: next_index }
        end

        # Result-hash builder: emit nothing, switch to :expect_collection.
        def self.trigger_collection_mode(next_index)
          { processed: true, parts: [], new_state: :expect_collection, next_index: next_index }
        end

        # Result-hash builder: token not handled; state unchanged.
        def self.not_processed(current_state, next_index)
          { processed: false, parts: [], new_state: current_state, next_index: next_index }
        end

        # Emits "UNION ALL" as one fragment when ALL follows; otherwise emits
        # UNION alone. Either way the parser returns to :default state.
        def self.handle_union(token, tokens, index)
          if tokens[index + 1]&.value&.upcase == 'ALL'
            { processed: true, parts: ["#{token.value} #{tokens[index + 1].value}"], new_state: :default, next_index: index + 2 }
          else
            add_to_summary(token.value, :default, index + 1)
          end
        end

        # Handles DDL verbs (CREATE/ALTER/DROP/TRUNCATE): when followed by an
        # object-kind keyword, emits both (e.g. "CREATE TABLE") and expects
        # the object name next; otherwise emits the verb alone.
        # NOTE(review): the object kind is emitted upcased (next_token) while
        # every other fragment preserves original casing — confirm whether
        # this normalization is intentional.
        def self.handle_table_operation(token, tokens, index)
          next_token = tokens[index + 1]&.value&.upcase

          case next_token
          when 'TABLE', 'INDEX', 'PROCEDURE', 'VIEW', 'DATABASE'
            { processed: true, parts: ["#{token.value} #{next_token}"], new_state: :expect_collection, next_index: index + 2 }
          else
            add_to_summary(token.value, :default, index + 1)
          end
        end
      end
    end
  end
end
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright The OpenTelemetry Authors
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
require 'strscan'
8+
9+
module OpenTelemetry
  module Helpers
    module QuerySummary
      # Tokenizer splits a SQL query string into a flat list of typed tokens
      # (keywords, identifiers, literals, operators) for summary generation.
      # Whitespace and comments are recognized but dropped from the output.
      #
      # @example
      #   tokens = Tokenizer.tokenize("SELECT * FROM users WHERE id = 1")
      #   # Returns tokens: [keyword: SELECT], [operator: *], [keyword: FROM], etc.
      class Tokenizer
        # Lightweight pair of token type (e.g. :keyword) and raw text.
        Token = Struct.new(:type, :value)

        # Patterns are tried in declaration order; more specific patterns
        # must precede more general ones (e.g. :keyword before :identifier,
        # :numeric before :operator so signed numbers match as one token).
        TOKEN_REGEX = {
          whitespace: /\s+/,
          comment: %r{--[^\r\n]*|\/\*.*?\*\/}m,
          numeric: /[+-]?(?:0x[0-9a-fA-F]+|\d+\.?\d*(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?)/,
          string: /'(?:''|[^'\r\n])*'?/,
          quoted_identifier: /"(?:""|[^"\r\n])*"|`(?:``|[^`\r\n])*`|\[(?:[^\]\r\n])*\]/,
          keyword: /\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|INTO|JOIN|CREATE|ALTER|DROP|TRUNCATE|WITH|UNION|TABLE|INDEX|PROCEDURE|VIEW|DATABASE)\b/i,
          identifier: /[a-zA-Z_][a-zA-Z0-9_.]*/,
          operator: /<=|>=|<>|!=|[=<>+\-*\/%,;()!?]/
        }.freeze

        # Token types recognized during scanning but omitted from the result.
        EXCLUDED_TYPES = %i[whitespace comment].freeze

        # Scans +query+ left to right, emitting a Token for the first pattern
        # that matches at the current position. Characters no pattern
        # recognizes are skipped one at a time, so scanning always terminates.
        def self.tokenize(query)
          scanner = StringScanner.new(query)
          collected = []

          until scanner.eos?
            consumed = TOKEN_REGEX.find do |type, pattern|
              text = scanner.scan(pattern)
              next false unless text

              collected << Token.new(type, text) unless EXCLUDED_TYPES.include?(type)
              true
            end
            # Nothing matched here: drop a single character and continue.
            scanner.getch if consumed.nil?
          end

          collected
        end
      end
    end
  end
end

0 commit comments

Comments
 (0)