11# frozen_string_literal: true
22
33require "json"
4+ require "cgi"
45
56module AiBouncer
67 # HTTP request classifier using KNN on pre-computed attack vectors
78 class Classifier
# Attack category labels. The first six are the original categories; the
# remaining eight were added alongside the extended pattern flags.
# Frozen so the shared list cannot be mutated at runtime.
ATTACK_LABELS = %w[
  sqli xss path_traversal command_injection credential_stuffing spam_bot
  scanner ssrf xxe nosql_injection ssti log4shell open_redirect ldap_injection
].freeze
913
1014 attr_reader :model
1115
@@ -52,7 +56,9 @@ def self.request_to_text(method:, path:, body: "", user_agent: "", params: {}, h
5256 # User agent classification
5357 if user_agent && !user_agent . empty?
5458 ua_lower = user_agent . downcase
55- ua_type = if %w[ bot crawler curl python java wget ] . any? { |b | ua_lower . include? ( b ) }
59+ ua_type = if %w[ sqlmap nikto zgrab nmap wpscan dirbuster gobuster nuclei acunetix burp ] . any? { |s | ua_lower . include? ( s ) }
60+ "scanner"
61+ elsif %w[ bot crawler curl python java wget go-http libwww perl mechanize axios node-fetch ruby ] . any? { |b | ua_lower . include? ( b ) }
5662 "bot"
5763 elsif %w[ mozilla chrome safari firefox edge opera ] . any? { |b | ua_lower . include? ( b ) }
5864 "browser"
@@ -76,51 +82,177 @@ def self.request_to_text(method:, path:, body: "", user_agent: "", params: {}, h
7682 # Header analysis
7783 if headers . any?
7884 parts << "HEADERS:#{ headers . size } "
79- # Include header values for pattern detection
8085 headers . each do |name , value |
8186 next if value . nil? || value . empty?
82- # Flag suspicious header names
83- if name . downcase == "referer" || name . downcase == "referrer"
84- parts << "HAS_REFERER"
85- end
87+ name_lower = name . to_s . downcase
88+ parts << "HAS_REFERER" if name_lower == "referer" || name_lower == "referrer"
89+ parts << "HAS_XML_CONTENT" if name_lower == "content-type" && value . to_s . include? ( "xml" )
90+ parts << "HAS_JSON_CONTENT" if name_lower == "content-type" && value . to_s . include? ( "json" )
8691 end
8792 end
8893
89- # Suspicious pattern detection - include headers in combined text
90- header_values = headers . values . compact . join ( ' ' )
94+ # Combine all text for pattern analysis
95+ header_values = headers . values . compact . join ( " " )
9196 combined = "#{ path } #{ body } #{ params . values . join ( ' ' ) } #{ header_values } "
9297
93- if combined =~ /\b (SELECT|INSERT|UPDATE|DELETE|DROP|UNION|OR\s +\d |--|')/i
98+ # URL-decode combined text for better pattern detection
99+ decoded_combined = begin
100+ CGI . unescape ( combined )
101+ rescue StandardError
102+ combined
103+ end
104+
105+ # ============ Advanced Feature Extraction ============
106+
107+ # Entropy calculation (high entropy often indicates encoded attacks)
108+ entropy = calculate_entropy ( combined )
109+ parts << "ENTROPY:#{ entropy_bucket ( entropy ) } "
110+
111+ # URL encoding detection
112+ encoding_depth = detect_encoding_depth ( combined )
113+ parts << "ENCODING:#{ encoding_depth } " if encoding_depth > 0
114+
115+ # Special character density (use decoded for accuracy)
116+ special_density = special_char_density ( decoded_combined )
117+ parts << "SPECIAL_DENSITY:#{ density_bucket ( special_density ) } "
118+
119+ # ============ Attack Pattern Flags (use decoded_combined for detection) ============
120+
121+ # SQL Injection patterns
122+ if decoded_combined =~ /\b (SELECT|INSERT|UPDATE|DELETE|DROP|UNION|OR\s +\d |AND\s +\d |--|'|;|\b WHERE\b |\b FROM\b |\b SLEEP\s *\( |WAITFOR|BENCHMARK|PG_SLEEP|EXTRACTVALUE|UPDATEXML)/i
94123 parts << "FLAG:SQL_KEYWORDS"
95124 end
96125
97- if combined =~ /(<script|javascript:|onerror|onload|alert\( )/i
126+ # XSS patterns
127+ if decoded_combined =~ /(<script|javascript:|onerror|onload|onmouseover|onfocus|onclick|alert\s *\( |prompt\s *\( |confirm\s *\( |<svg|<img[^>]+on\w +=|<iframe|<body[^>]+on\w +=|expression\s *\( |eval\s *\( )/i
98128 parts << "FLAG:XSS_PATTERN"
99129 end
100130
101- if combined =~ /(\. \. |%2e%2e)/i
131+ # Path traversal
132+ if decoded_combined =~ /(\. \. [\/ \\ ])/i
102133 parts << "FLAG:PATH_TRAVERSAL"
103134 end
104135
105- if combined =~ /(\| |;|`|\$ \( |&&|\| \| )/
136+ # Command injection
137+ if decoded_combined =~ /(\| |;|`|\$ \( |&&|\| \| |>\s *\/ |<\s *\/ |\b cat\b |\b ls\b |\b whoami\b |\b id\b |\b ping\b |\b nc\b |\b curl\b |\b wget\b )/i
106138 parts << "FLAG:CMD_INJECTION"
107139 end
108140
109- # Include payload snippet (body, params, and suspicious headers)
141+ # SSRF patterns
142+ if decoded_combined =~ /(169\. 254\. 169\. 254|metadata\. google|127\. 0\. 0\. 1|localhost|0\. 0\. 0\. 0|\[ ::1\] |file:\/ \/ |gopher:\/ \/ |dict:\/ \/ |internal|\. internal)/i
143+ parts << "FLAG:SSRF_PATTERN"
144+ end
145+
146+ # XXE patterns
147+ if decoded_combined =~ /(<!DOCTYPE|<!ENTITY|SYSTEM\s *["']|PUBLIC\s *["']|%xxe|&xxe)/i
148+ parts << "FLAG:XXE_PATTERN"
149+ end
150+
151+ # NoSQL injection patterns (must have $ operator, not just JSON)
152+ if decoded_combined =~ /(\$ gt|\$ ne|\$ lt|\$ or|\$ and|\$ where|\$ regex|\$ exists|\$ in|\$ nin)["\s :}\] ]/i
153+ parts << "FLAG:NOSQL_PATTERN"
154+ end
155+
156+ # SSTI patterns
157+ if decoded_combined =~ /(\{ \{ .*\} \} |\$ \{ .*\} |<%.*%>|\# \{ .*\} |__class__|__mro__|__subclasses__|__globals__|__builtins__|config\. |request\. |self\. )/i
158+ parts << "FLAG:SSTI_PATTERN"
159+ end
160+
161+ # Log4Shell patterns
162+ if decoded_combined =~ /(\$ \{ jndi:|j\$ \{ |jn\$ \{ |\$ \{ lower:j\} |\$ \{ upper:j\} |ldap:\/ \/ |rmi:\/ \/ )/i
163+ parts << "FLAG:LOG4SHELL_PATTERN"
164+ end
165+
166+ # Open redirect patterns
167+ if decoded_combined =~ /(redirect|return|next|url|goto|dest|continue|rurl)=.*?(https?:\/ \/ |\/ \/ )[^\/ ]/i
168+ parts << "FLAG:REDIRECT_PATTERN"
169+ end
170+
171+ # LDAP injection patterns
172+ if decoded_combined =~ /(\* \) \( |\) \( |objectClass|\) \( &\) |\) \( \| )/i
173+ parts << "FLAG:LDAP_PATTERN"
174+ end
175+
176+ # Scanner fingerprints in paths
177+ if path =~ /(\. env|\. git|wp-config|phpinfo|\. aws|backup\. sql|\. htpasswd|web\. config|actuator|swagger|api-docs)/i
178+ parts << "FLAG:SCANNER_PATH"
179+ end
180+
181+ # ============ Payload ============
110182 payload_parts = [ ]
111183 payload_parts << body . to_s unless body . to_s . empty?
112184 payload_parts << params . to_s unless params . empty?
113185 # Include Referer if present (common attack vector)
114- if headers [ "Referer" ] || headers [ "referer" ]
115- referer = headers [ "Referer" ] || headers [ "referer" ]
116- payload_parts << "REFERER:#{ referer } " unless referer . empty?
117- end
186+ referer = headers [ "Referer" ] || headers [ "referer" ]
187+ payload_parts << "REFERER:#{ referer } " if referer && !referer . empty?
118188 payload = payload_parts . join ( " " )
119- parts << "PAYLOAD:#{ payload [ 0 , 300 ] } " unless payload . empty?
189+ parts << "PAYLOAD:#{ payload [ 0 , 500 ] } " unless payload . empty?
120190
121191 parts . join ( " " )
122192 end
123193
# Calculate the Shannon entropy of a string, in bits per character.
#
# @param str [String, nil] text to measure
# @return [Float] 0.0 for nil or empty input; otherwise the entropy of the
#   string's character-frequency distribution
def self.calculate_entropy(str)
  return 0.0 if str.nil? || str.empty?

  freq = Hash.new(0)
  str.each_char { |c| freq[c] += 1 }

  len = str.length.to_f
  # Every stored count is >= 1, so each probability is strictly positive and
  # Math.log2 is always defined; no zero-probability guard is required.
  freq.each_value.inject(0.0) do |entropy, count|
    prob = count / len
    entropy - prob * Math.log2(prob)
  end
end
209+
# Map an entropy value onto a coarse bucket name.
#
# The ranges are inclusive and checked in order, so a value sitting exactly
# on a boundary (2.5, 4.0, 5.5) lands in the lower bucket. Anything that
# matches no range (above 5.5, negative, or not comparable with a number)
# falls through to "very_high".
#
# @param entropy [Numeric] entropy value to classify
# @return [String] one of "low", "normal", "high", "very_high"
def self.entropy_bucket(entropy)
  case entropy
  when 0..2.5 then "low"
  when 2.5..4.0 then "normal"
  when 4.0..5.5 then "high"
  else "very_high"
  end
end
218+
# Detect how many nested layers of URL encoding a string carries
# (1 = encoded once, 2 = double-encoded, ...).
#
# The string is repeatedly passed through CGI.unescape until a pass leaves
# it unchanged or +max_depth+ passes have been made. Note that CGI.unescape
# also turns "+" into a space, so a string containing a literal "+" counts
# as one layer.
#
# @param str [String, nil] raw text to inspect
# @param max_depth [Integer] maximum number of decoding passes to attempt
# @return [Integer] number of passes that changed the string
#   (0..max_depth); 0 for nil or empty input
def self.detect_encoding_depth(str, max_depth: 3)
  return 0 if str.nil? || str.empty?

  depth = 0
  current = str
  max_depth.times do
    # Best effort: if decoding fails, treat this pass as a no-op so the
    # loop stops without raising.
    decoded = begin
      CGI.unescape(current)
    rescue StandardError
      current
    end
    break if decoded == current

    depth += 1
    current = decoded
  end
  depth
end
238+
# Calculate the fraction of characters in a string that are "special"
# (quotes, brackets, shell/SQL metacharacters, backslash, etc.).
#
# @param str [String, nil] text to measure
# @return [Float] special characters divided by total length; 0.0 for nil
#   or empty input
def self.special_char_density(str)
  return 0.0 if str.nil? || str.empty?

  # String#count parses its argument as a character selector in which a
  # backslash escapes the following character. A lone backslash before "!"
  # is therefore dropped and backslashes are never counted. The selector
  # needs an escaped backslash (two backslash characters, i.e. four in this
  # double-quoted literal) for "\" itself to be matched.
  special_set = "'\"<>(){}[];|&$`\\\\!@#%^*=+~"
  str.count(special_set).to_f / str.length
end
246+
# Map a special-character density onto a coarse bucket name.
#
# The ranges are inclusive and checked in order, so a value sitting exactly
# on a boundary (0.05, 0.15, 0.3) lands in the lower bucket. Anything that
# matches no range (above 0.3, negative, or not comparable with a number)
# falls through to "very_high".
#
# @param density [Numeric] density value to classify
# @return [String] one of "low", "normal", "high", "very_high"
def self.density_bucket(density)
  case density
  when 0..0.05 then "low"
  when 0.05..0.15 then "normal"
  when 0.15..0.3 then "high"
  else "very_high"
  end
end
255+
124256 private
125257
126258 def load_vectors
0 commit comments