diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a743b7..ff1b630 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## [Unreleased] +### Added + +- Added category methods to `Result` for querying specific types of sensitive information (e.g., `emails`, `emails?`, `email_mapping`) +- Category methods are automatically generated for all default filter types and custom labels +- Category methods always return empty arrays/hashes when no data of that type is found, ensuring they're safe to call without checking + ### Changed - **BREAKING:** Added strict label validation for custom filters. Labels must now start and end with letters and contain only alphabetic characters and single underscores (no consecutive underscores, digits, or special characters). Previously malformed labels will now raise `Error::MalformedLabel`. diff --git a/README.md b/README.md index c331c30..2416a4f 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,76 @@ result.safe? # => false ``` +### Category Methods + +Query the result for specific types of sensitive information using category methods: + +```ruby +result = TopSecret::Text.filter("Ralph can be reached at ralph@example.com or 555-1234") + +# Check if emails were found +result.emails? +# => true + +# Get all emails +result.emails +# => ["ralph@example.com"] + +# Get email mapping +result.email_mapping +# => {:EMAIL_1=>"ralph@example.com"} + +# Similarly for other types +result.people? # => true +result.people # => ["Ralph"] +result.person_mapping # => {:PERSON_1=>"Ralph"} + +result.phone_numbers? 
# => true +result.phone_numbers # => ["555-1234"] +result.phone_number_mapping # => {:PHONE_NUMBER_1=>"555-1234"} +``` + +Available category methods for all default filters: + +- `emails`, `emails?`, `email_mapping` +- `credit_cards`, `credit_cards?`, `credit_card_mapping` +- `phone_numbers`, `phone_numbers?`, `phone_number_mapping` +- `ssns`, `ssns?`, `ssn_mapping` +- `people`, `people?`, `person_mapping` +- `locations`, `locations?`, `location_mapping` + +These methods are always available and return empty arrays/hashes when no sensitive information of that type is found: + +```ruby +result = TopSecret::Text.filter("No sensitive data here") + +result.emails? # => false +result.emails # => [] +result.email_mapping # => {} +``` + +When using custom labels, methods are generated based on the label name. Note that default filter methods remain available and can access the same data: + +```ruby +result = TopSecret::Text.filter( + "user[at]example.com", + email_filter: TopSecret::Filters::Regex.new( + label: "EMAIL_ADDRESS", + regex: /\w+\[at\]\w+\.\w+/ + ) +) + +# Custom label methods (based on EMAIL_ADDRESS) +result.email_addresses # => ["user[at]example.com"] +result.email_addresses? # => true +result.email_address_mapping # => {:EMAIL_ADDRESS_1=>"user[at]example.com"} + +# Default methods still work and return the same data +result.emails # => ["user[at]example.com"] +result.emails? # => true +result.email_mapping # => {:EMAIL_ADDRESS_1=>"user[at]example.com"} +``` + ### Scanning for Sensitive Information Use `TopSecret::Text.scan` to detect sensitive information without redacting the text. 
This is useful when you only need to check if sensitive data exists or get a mapping of what was found: diff --git a/lib/top_secret/constants.rb b/lib/top_secret/constants.rb index 3da8a96..14eae95 100644 --- a/lib/top_secret/constants.rb +++ b/lib/top_secret/constants.rb @@ -25,4 +25,7 @@ module TopSecret # @return [Float] The minimum confidence score for NER filtering MIN_CONFIDENCE_SCORE = 0.5 + + # @return [String] The delimiter used in label names + LABEL_DELIMITER = "_" end diff --git a/lib/top_secret/mapping.rb b/lib/top_secret/mapping.rb index 74d9c9e..453f9ac 100644 --- a/lib/top_secret/mapping.rb +++ b/lib/top_secret/mapping.rb @@ -1,7 +1,40 @@ # frozen_string_literal: true +require "active_support/core_ext/string/inflections" + module TopSecret + # Provides dynamic category methods for querying sensitive information by type. + # + # This module automatically generates methods for accessing sensitive information + # organized by category (emails, credit cards, people, etc.). Methods are available + # for all default filter types and any custom labels used in the mapping. + # + # @example Querying emails + # result = TopSecret::Text.filter("Contact ralph@example.com") + # result.emails? # => true + # result.emails # => ["ralph@example.com"] + # result.email_mapping # => {:EMAIL_1=>"ralph@example.com"} + # + # @example With no matches + # result = TopSecret::Text.filter("No sensitive data") + # result.emails? # => false + # result.emails # => [] + # result.email_mapping # => {} + # + # @example Custom labels + # result = TopSecret::Text.filter( + # "user[at]example.com", + # email_filter: TopSecret::Filters::Regex.new( + # label: "EMAIL_ADDRESS", + # regex: /\w+\[at\]\w+\.\w+/ + # ) + # ) + # result.email_addresses # => ["user[at]example.com"] + # result.email_address_mapping # => {:EMAIL_ADDRESS_1=>"user[at]example.com"} module Mapping + MAPPING_SUFFIX = "_mapping" + PREDICATE_SUFFIX = "?" 
+ # @return [Boolean] Whether sensitive information was found def sensitive? mapping.any? @@ -11,5 +44,147 @@ def sensitive? def safe? !sensitive? end + + def method_missing(method_name, *args, &block) + if mapping_methods.include? method_name + self.class.define_method(method_name) do + build_mapping_method_from method_name + end + + send(method_name) + elsif pluralized_methods.include? method_name + self.class.define_method(method_name) do + build_plural_method_from method_name + end + + send(method_name) + elsif predicate_methods.include? method_name + self.class.define_method(method_name) do + build_predicate_method_from method_name + end + + send(method_name) + elsif mapping_predicate_methods.include? method_name + self.class.define_method(method_name) do + build_mapping_predicate_method_from method_name + end + + send(method_name) + else + super + end + end + + def respond_to_missing?(method_name, include_private = false) + mapping_methods.include?(method_name) || + pluralized_methods.include?(method_name) || + predicate_methods.include?(method_name) || + mapping_predicate_methods.include?(method_name) || + super + end + + # Returns all available types for category methods. + # + # Types are derived from both the mapping keys and default filters. + # For example, with mapping `{EMAIL_1: "test@example.com"}`, the type is `:email`. + # Default filter types (credit_card, email, phone_number, ssn, person, location) + # are always available even when not present in the mapping. + # + # @return [Array] List of available types + # @example + # result = TopSecret::Text.filter("ralph@example.com") + # result.types + # # => [:email, :credit_card, :phone_number, :ssn, :person, :location] + def types + @types ||= all_types.uniq.map(&:to_sym) + end + + private + + def types_from_mapping + mapping.keys.map do |key| + parts = key.to_s.split(TopSecret::LABEL_DELIMITER).reject(&:empty?) 
+ parts[0...-1].join(TopSecret::LABEL_DELIMITER).downcase + end + end + + def types_from_filters + default_filter_objects.map { |filter| filter.label.downcase } + end + + def all_types + types_from_mapping + types_from_filters + end + + def default_filter_objects + [ + TopSecret.credit_card_filter, + TopSecret.email_filter, + TopSecret.phone_number_filter, + TopSecret.ssn_filter, + TopSecret.people_filter, + TopSecret.location_filter + ].compact + end + + def stringified_types + types.map(&:to_s) + end + + def pluralized_methods + @pluralized_methods ||= stringified_types.map(&:pluralize).map(&:to_sym) + end + + def predicate_methods + @predicate_methods ||= pluralized_methods.map { :"#{_1}#{PREDICATE_SUFFIX}" } + end + + def mapping_predicate_methods + @mapping_predicate_methods ||= mapping_methods.map { :"#{_1}#{PREDICATE_SUFFIX}" } + end + + def mapping_methods + @mapping_methods ||= stringified_types.map do |type| + if type.end_with?(MAPPING_SUFFIX) + :"#{type.pluralize}#{MAPPING_SUFFIX}" + else + :"#{type}#{MAPPING_SUFFIX}" + end + end + end + + def build_mapping_method_from(method_name) + type_name = method_name.to_s.delete_suffix(MAPPING_SUFFIX) + + type_name = type_name.singularize if type_name.pluralize == type_name && type_name.singularize.end_with?(MAPPING_SUFFIX) + + type = type_name.upcase + + mapping.select { |key, _| key.start_with? type } + end + + def build_plural_method_from(method_name) + singular = method_name.to_s.singularize + + mapping_method = if singular.end_with?(MAPPING_SUFFIX) + :"#{method_name}#{MAPPING_SUFFIX}" + else + :"#{singular}#{MAPPING_SUFFIX}" + end + + send(mapping_method).values + end + + def build_predicate_method_from(method_name) + plural_method = method_name.to_s.chomp(PREDICATE_SUFFIX).to_sym + + send(plural_method).any? + end + + def build_mapping_predicate_method_from(method_name) + mapping_method = method_name.to_s.chomp(PREDICATE_SUFFIX).to_sym + + send(mapping_method).any? 
+ end end end diff --git a/lib/top_secret/text/global_mapping.rb b/lib/top_secret/text/global_mapping.rb index 838495a..d976f12 100644 --- a/lib/top_secret/text/global_mapping.rb +++ b/lib/top_secret/text/global_mapping.rb @@ -50,11 +50,11 @@ def process_result(result) # @param individual_key [Symbol] The individual key from a filter result # @return [Symbol] The global key with consistent numbering def generate_global_key(individual_key) - label_type = individual_key.to_s.rpartition("_").first + label_type = individual_key.to_s.rpartition(TopSecret::LABEL_DELIMITER).first label_counters[label_type] ||= 0 label_counters[label_type] += 1 - :"#{label_type}_#{label_counters[label_type]}" + :"#{label_type}#{TopSecret::LABEL_DELIMITER}#{label_counters[label_type]}" end end end diff --git a/spec/top_secret/result_spec.rb b/spec/top_secret/result_spec.rb index 4d9a031..e044f34 100644 --- a/spec/top_secret/result_spec.rb +++ b/spec/top_secret/result_spec.rb @@ -38,4 +38,81 @@ end end end + + describe "categorization" do + let(:mapping) do + { + EMAIL_1: "ralph@example.com", + EMAIL_2: "ruby@example.com", + PERSON_1: "Ralph", + IP_ADDRESS_1: "192.168.1.1", + CREDIT_CARD_NUMBER_1: "4242424242424242", + NETWORK_MAPPING_1: "10.0.1.0/24 -> 192.168.1.0/24" + } + end + + it "categorizes by labels" do + expect(subject.emails?).to be true + expect(subject.people?).to be true + expect(subject.credit_card_numbers?).to be true + expect(subject.network_mappings?).to be true + + expect(subject.emails).to eq([ + "ralph@example.com", + "ruby@example.com" + ]) + expect(subject.people).to eq([ + "Ralph" + ]) + expect(subject.credit_card_numbers).to eq([ + "4242424242424242" + ]) + expect(subject.network_mappings).to eq([ + "10.0.1.0/24 -> 192.168.1.0/24" + ]) + + expect(subject.email_mapping).to eq({ + EMAIL_1: "ralph@example.com", + EMAIL_2: "ruby@example.com" + }) + expect(subject.person_mapping).to eq({ + PERSON_1: "Ralph" + }) + expect(subject.credit_card_number_mapping).to eq({ + 
CREDIT_CARD_NUMBER_1: "4242424242424242" + }) + expect(subject.network_mappings_mapping).to eq({ + NETWORK_MAPPING_1: "10.0.1.0/24 -> 192.168.1.0/24" + }) + end + + it "extracts types" do + expect(subject.types).to include( + :email, + :person, + :ip_address, + :credit_card_number, + :network_mapping, + :credit_card, + :phone_number, + :ssn, + :location + ) + end + + it "responds to dynamic methods" do + expect(subject).to respond_to(:emails) + expect(subject).to respond_to(:emails?) + expect(subject).to respond_to(:email_mapping) + expect(subject).to respond_to(:people) + expect(subject).to respond_to(:people?) + expect(subject).to respond_to(:person_mapping) + expect(subject).to respond_to(:credit_card_numbers) + expect(subject).to respond_to(:credit_card_numbers?) + expect(subject).to respond_to(:credit_card_number_mapping) + expect(subject).to respond_to(:network_mappings) + expect(subject).to respond_to(:network_mappings_mapping?) + expect(subject).to respond_to(:network_mappings_mapping) + end + end end diff --git a/spec/top_secret/text_spec.rb b/spec/top_secret/text_spec.rb index 29d4afe..ea18bd8 100644 --- a/spec/top_secret/text_spec.rb +++ b/spec/top_secret/text_spec.rb @@ -43,6 +43,108 @@ expect(result.safe?).to eq(false) end + it "categorizes sensitive information from free text" do + input = <<~TEXT + My name is Ralph + My location is Boston + My email address is user@example.com + My credit card numbers are 4242-4242-4242-4242 and 4141414141414141 + My social security number is 123-45-6789 + My phone number is 555-555-5555 + TEXT + + result = TopSecret::Text.filter(input) + + expect(result.emails).to eq(["user@example.com"]) + expect(result.emails?).to eq(true) + expect(result.email_mapping).to eq({EMAIL_1: "user@example.com"}) + + expect(result.people).to eq(["Ralph"]) + expect(result.people?).to eq(true) + expect(result.person_mapping).to eq({PERSON_1: "Ralph"}) + + expect(result.locations).to eq(["Boston"]) + expect(result.locations?).to eq(true) + 
expect(result.location_mapping).to eq({LOCATION_1: "Boston"}) + + expect(result.credit_cards).to eq(["4242-4242-4242-4242", "4141414141414141"]) + expect(result.credit_cards?).to eq(true) + expect(result.credit_card_mapping).to eq({ + CREDIT_CARD_1: "4242-4242-4242-4242", + CREDIT_CARD_2: "4141414141414141" + }) + + expect(result.ssns).to eq(["123-45-6789"]) + expect(result.ssns?).to eq(true) + expect(result.ssn_mapping).to eq({SSN_1: "123-45-6789"}) + + expect(result.phone_numbers).to eq(["555-555-5555"]) + expect(result.phone_numbers?).to eq(true) + expect(result.phone_number_mapping).to eq({PHONE_NUMBER_1: "555-555-5555"}) + end + + context "when there is no sensitive information" do + before do + stub_ner_entities + end + + it "categorizes sensitive information from free text" do + result = TopSecret::Text.filter("") + + expect(result.emails).to eq([]) + expect(result.emails?).to eq(false) + expect(result.email_mapping).to eq({}) + + expect(result.people).to eq([]) + expect(result.people?).to eq(false) + expect(result.person_mapping).to eq({}) + + expect(result.locations).to eq([]) + expect(result.locations?).to eq(false) + expect(result.location_mapping).to eq({}) + + expect(result.credit_cards).to eq([]) + expect(result.credit_cards?).to eq(false) + expect(result.credit_card_mapping).to eq({}) + + expect(result.ssns).to eq([]) + expect(result.ssns?).to eq(false) + expect(result.ssn_mapping).to eq({}) + + expect(result.phone_numbers).to eq([]) + expect(result.phone_numbers?).to eq(false) + expect(result.phone_number_mapping).to eq({}) + end + end + + context "when a custom label is used" do + it "categorizes sensitive information from free text using that label" do + input = "user[at]example.com" + + result = TopSecret::Text.filter(input, email_filter: TopSecret::Filters::Regex.new( + label: "EMAIL_ADDRESS", + regex: /user\[at\]example\.com/ + )) + + expect(result.email_addresses).to eq([input]) + expect(result.email_addresses?).to eq(true) + 
+ expect(result.email_address_mapping).to eq({EMAIL_ADDRESS_1: input}) + end + + it "categorizes sensitive information from free text using the default label" do + input = "user[at]example.com" + + result = TopSecret::Text.filter(input, email_filter: TopSecret::Filters::Regex.new( + label: "EMAIL_ADDRESS", + regex: /user\[at\]example\.com/ + )) + + expect(result.emails).to eq([input]) + expect(result.emails?).to eq(true) + expect(result.email_mapping).to eq({EMAIL_ADDRESS_1: input}) + end + end + context "when the filters option is passed" do it "overrides existing Regex filters" do input = <<~TEXT @@ -565,6 +667,46 @@ end end + it "categorizes sensitive information from free text" do + result = TopSecret::Text.filter_all([ + "user@example.com" + ]) + + expect(result.items.map(&:emails)).to eq([["user@example.com"]]) + expect(result.items.map(&:emails?)).to eq([true]) + expect(result.items.map(&:email_mapping)).to eq([{EMAIL_1: "user@example.com"}]) + end + + context "when there is no sensitive information" do + it "responds to the default filters" do + result = TopSecret::Text.filter_all([""]) + + expect(result.items.map(&:emails)).to eq([[]]) + expect(result.items.map(&:emails?)).to eq([false]) + expect(result.items.map(&:email_mapping)).to eq([{}]) + + expect(result.items.map(&:credit_cards)).to eq([[]]) + expect(result.items.map(&:credit_cards?)).to eq([false]) + expect(result.items.map(&:credit_card_mapping)).to eq([{}]) + + expect(result.items.map(&:phone_numbers)).to eq([[]]) + expect(result.items.map(&:phone_numbers?)).to eq([false]) + expect(result.items.map(&:phone_number_mapping)).to eq([{}]) + + expect(result.items.map(&:ssns)).to eq([[]]) + expect(result.items.map(&:ssns?)).to eq([false]) + expect(result.items.map(&:ssn_mapping)).to eq([{}]) + + expect(result.items.map(&:people)).to eq([[]]) + expect(result.items.map(&:people?)).to eq([false]) + expect(result.items.map(&:person_mapping)).to eq([{}]) + + expect(result.items.map(&:locations)).to eq([[]])
+ expect(result.items.map(&:locations?)).to eq([false]) + expect(result.items.map(&:location_mapping)).to eq([{}]) + end + end + it "returns TopSecret::Text::BatchResult" do result = TopSecret::Text.filter_all(["", ""])