1+ package io .sentrius .sso .core .services .documents .retrieval ;
2+
import lombok.extern.slf4j.Slf4j;

import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Locale;
import java.util.regex.Pattern;
8+
9+ /**
10+ * Validates URLs to prevent Server-Side Request Forgery (SSRF) attacks.
11+ * Blocks access to private networks, localhost, and non-HTTP(S) protocols.
12+ */
13+ @ Slf4j
14+ public class UrlValidator {
15+
16+ /**
17+ * Validates a URL to prevent SSRF attacks
18+ *
19+ * @param url The URL to validate
20+ * @throws DocumentRetrievalException if URL is invalid or potentially dangerous
21+ */
22+ public static void validateUrl (String url ) throws DocumentRetrievalException {
23+ if (url == null || url .trim ().isEmpty ()) {
24+ throw new DocumentRetrievalException ("URL cannot be null or empty" );
25+ }
26+
27+ URI uri ;
28+ try {
29+ uri = URI .create (url );
30+ } catch (IllegalArgumentException e ) {
31+ throw new DocumentRetrievalException ("Invalid URL format: " + e .getMessage ());
32+ }
33+
34+ // Validate scheme - only allow http and https
35+ String scheme = uri .getScheme ();
36+ if (scheme == null || (!scheme .equalsIgnoreCase ("http" ) && !scheme .equalsIgnoreCase ("https" ))) {
37+ throw new DocumentRetrievalException (
38+ "Invalid URL scheme. Only HTTP and HTTPS protocols are allowed. Found: " + scheme );
39+ }
40+
41+ // Get the host from the URI
42+ String host = uri .getHost ();
43+ if (host == null || host .trim ().isEmpty ()) {
44+ throw new DocumentRetrievalException ("URL must contain a valid host" );
45+ }
46+
47+ // Normalize host to lowercase for comparison
48+ host = host .toLowerCase ();
49+
50+ // Block localhost and localhost-like hostnames
51+ if (isLocalhost (host )) {
52+ throw new DocumentRetrievalException (
53+ "Access to localhost is not allowed for security reasons" );
54+ }
55+
56+ // Check if host is an IP address literal
57+ if (isIpAddressLiteral (host )) {
58+ // If it's an IP address, validate it directly
59+ try {
60+ InetAddress address = InetAddress .getByName (host );
61+ if (isPrivateOrReservedAddress (address )) {
62+ throw new DocumentRetrievalException (
63+ "Access to private or reserved IP addresses is not allowed for security reasons: " +
64+ address .getHostAddress ());
65+ }
66+ } catch (UnknownHostException e ) {
67+ throw new DocumentRetrievalException ("Invalid IP address: " + host );
68+ }
69+ } else {
70+ // For domain names, try to resolve but don't fail if DNS is unavailable
71+ // This allows the service to attempt the connection, where the actual HTTP client
72+ // will handle DNS resolution failures appropriately
73+ try {
74+ InetAddress address = InetAddress .getByName (host );
75+ if (isPrivateOrReservedAddress (address )) {
76+ throw new DocumentRetrievalException (
77+ "Access to private or reserved IP addresses is not allowed for security reasons: " +
78+ address .getHostAddress ());
79+ }
80+ } catch (UnknownHostException e ) {
81+ // For domain names that don't resolve (e.g., in test environments),
82+ // we allow the request to proceed and let the HTTP client handle it
83+ log .debug ("Could not pre-resolve host {}, will allow HTTP client to handle: {}" ,
84+ host , e .getMessage ());
85+ }
86+ }
87+
88+ log .debug ("URL validation passed for: {}" , url );
89+ }
90+
91+ /**
92+ * Checks if the string is an IP address literal (IPv4 or IPv6)
93+ */
94+ private static boolean isIpAddressLiteral (String host ) {
95+ // Check for IPv4 pattern with valid octet ranges (0-255)
96+ if (host .matches ("^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\ .){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$" )) {
97+ return true ;
98+ }
99+ // Check for IPv6 (contains colons)
100+ if (host .contains (":" )) {
101+ return true ;
102+ }
103+ return false ;
104+ }
105+
106+ /**
107+ * Checks if the hostname is localhost or a localhost variant
108+ */
109+ private static boolean isLocalhost (String host ) {
110+ return host .equals ("localhost" ) ||
111+ host .equals ("127.0.0.1" ) ||
112+ host .equals ("::1" ) ||
113+ host .equals ("0.0.0.0" ) ||
114+ host .startsWith ("localhost." ) ||
115+ host .endsWith (".localhost" );
116+ }
117+
118+ /**
119+ * Checks if an IP address is private, loopback, link-local, or reserved
120+ */
121+ private static boolean isPrivateOrReservedAddress (InetAddress address ) {
122+ // Check for loopback addresses (127.0.0.0/8, ::1)
123+ if (address .isLoopbackAddress ()) {
124+ return true ;
125+ }
126+
127+ // Check for link-local addresses (169.254.0.0/16, fe80::/10)
128+ if (address .isLinkLocalAddress ()) {
129+ return true ;
130+ }
131+
132+ // Check for site-local addresses (deprecated, but still blocked)
133+ // This covers 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, fec0::/10
134+ if (address .isSiteLocalAddress ()) {
135+ return true ;
136+ }
137+
138+ // Check for multicast addresses
139+ if (address .isMulticastAddress ()) {
140+ return true ;
141+ }
142+
143+ // Check for any local address (0.0.0.0, ::)
144+ if (address .isAnyLocalAddress ()) {
145+ return true ;
146+ }
147+
148+ // Additional check for IPv4 private ranges that might not be caught
149+ byte [] bytes = address .getAddress ();
150+ if (bytes .length == 4 ) {
151+ // Check 10.0.0.0/8
152+ if (bytes [0 ] == 10 ) {
153+ return true ;
154+ }
155+ // Check 172.16.0.0/12
156+ if (bytes [0 ] == (byte ) 172 && (bytes [1 ] & 0xF0 ) == 0x10 ) {
157+ return true ;
158+ }
159+ // Check 192.168.0.0/16
160+ if (bytes [0 ] == (byte ) 192 && bytes [1 ] == (byte ) 168 ) {
161+ return true ;
162+ }
163+ // Check 169.254.0.0/16 (AWS/Azure metadata service)
164+ if (bytes [0 ] == (byte ) 169 && bytes [1 ] == (byte ) 254 ) {
165+ return true ;
166+ }
167+ // Check 127.0.0.0/8 (loopback)
168+ if (bytes [0 ] == 127 ) {
169+ return true ;
170+ }
171+ // Check 0.0.0.0/8
172+ if (bytes [0 ] == 0 ) {
173+ return true ;
174+ }
175+ }
176+
177+ return false ;
178+ }
179+ }
0 commit comments