11# coding: utf-8
2+ # frozen_string_literal: true
23require 'cgi'
34
45module RDF
@@ -28,27 +29,27 @@ class URI
2829 include RDF ::Resource
2930
3031 # IRI components
31- UCSCHAR = Regexp . compile ( <<-EOS . gsub ( / \s +/ , '' ) )
32- [ \\ u00A0-\\ uD7FF]|[ \\ uF900-\\ uFDCF]|[ \\ uFDF0-\\ uFFEF]|
33- [ \\ u{10000}-\\ u{1FFFD}]|[ \\ u{20000}-\\ u{2FFFD}]|[ \\ u{30000}-\\ u{3FFFD}]|
34- [ \\ u{40000}-\\ u{4FFFD}]|[ \\ u{50000}-\\ u{5FFFD}]|[ \\ u{60000}-\\ u{6FFFD}]|
35- [ \\ u{70000}-\\ u{7FFFD}]|[ \\ u{80000}-\\ u{8FFFD}]|[ \\ u{90000}-\\ u{9FFFD}]|
36- [ \\ u{A0000}-\\ u{AFFFD}]|[ \\ u{B0000}-\\ u{BFFFD}]|[ \\ u{C0000}-\\ u{CFFFD}]|
37- [ \\ u{D0000}-\\ u{DFFFD}]|[ \\ u{E1000}-\\ u{EFFFD}]
38- EOS
39- IPRIVATE = Regexp . compile ( "[\\ uE000-\\ uF8FF]|[ \\ u{F0000}-\\ u{FFFFD}]|[ \\ u100000 -\\ u10FFFD ]" ) . freeze
32+ UCSCHAR = %(
33+ \\ u00A0-\\ uD7FF\\ uF900-\\ uFDCF\\ uFDF0-\\ uFFEF
34+ \\ u{10000}-\\ u{1FFFD}\\ u{20000}-\\ u{2FFFD}\\ u{30000}-\\ u{3FFFD}
35+ \\ u{40000}-\\ u{4FFFD}\\ u{50000}-\\ u{5FFFD}\\ u{60000}-\\ u{6FFFD}
36+ \\ u{70000}-\\ u{7FFFD}\\ u{80000}-\\ u{8FFFD}\\ u{90000}-\\ u{9FFFD}
37+ \\ u{A0000}-\\ u{AFFFD}\\ u{B0000}-\\ u{BFFFD}\\ u{C0000}-\\ u{CFFFD}
38+ \\ u{D0000}-\\ u{DFFFD}\\ u{E1000}-\\ u{EFFFD}
39+ ) . gsub ( / \s +/ , '' )
40+ IPRIVATE = Regexp . compile ( "[\\ uE000-\\ uF8FF\\ u{F0000}-\\ u{FFFFD}\\ u{100000} -\\ u{10FFFD} ]" ) . freeze
4041 SCHEME = Regexp . compile ( "[A-Za-z](?:[A-Za-z0-9+-\. ])*" ) . freeze
4142 PORT = Regexp . compile ( "[0-9]*" ) . freeze
4243 IP_literal = Regexp . compile ( "\\ [[0-9A-Fa-f:\\ .]*\\ ]" ) . freeze # Simplified, no IPvFuture
4344 PCT_ENCODED = Regexp . compile ( "%[0-9A-Fa-f][0-9A-Fa-f]" ) . freeze
44- GEN_DELIMS = Regexp . compile ( " [:/\\ ? \\ # \\ [ \\ ]@]" ) . freeze
45- SUB_DELIMS = Regexp . compile ( " [!\\ $&'\\ ( \\ ) \\ * \\ +,;=]" ) . freeze
46- RESERVED = Regexp . compile ( "(?: #{ GEN_DELIMS } | #{ SUB_DELIMS } )" ) . freeze
45+ GEN_DELIMS = Regexp . compile ( %q{ [:/\?\#\[\ ]@]} ) . freeze
46+ SUB_DELIMS = Regexp . compile ( %q{ [!\$&'\(\)\*\ +,;=]} ) . freeze
47+ RESERVED = Regexp . union ( GEN_DELIMS , SUB_DELIMS ) . freeze
4748 UNRESERVED = Regexp . compile ( "[A-Za-z0-9\. _~-]" ) . freeze
4849
49- IUNRESERVED = Regexp . compile ( "[A-Za-z0-9 \. _~-]| #{ UCSCHAR } " ) . freeze
50+ IUNRESERVED = Regexp . union ( UNRESERVED , Regexp . compile ( "[#{ UCSCHAR } ]" ) ) . freeze
5051
# ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
# The last alternative is a character class, so it lists only ":" and "@";
# a "|" written inside the brackets would be matched as a literal pipe.
IPCHAR = Regexp.union(IUNRESERVED, PCT_ENCODED, SUB_DELIMS, /[:@]/).freeze
5253
5354 IQUERY = Regexp . compile ( "(?:#{ IPCHAR } |#{ IPRIVATE } |/|\\ ?)*" ) . freeze
5455
@@ -65,7 +66,7 @@ class URI
6566 IPATH_EMPTY = Regexp . compile ( "" ) . freeze
6667
6768 IREG_NAME = Regexp . compile ( "(?:(?:#{ IUNRESERVED } )|(?:#{ PCT_ENCODED } )|(?:#{ SUB_DELIMS } ))*" ) . freeze
68- IHOST = Regexp . compile ( "(?: #{ IP_literal } )|(?: #{ IREG_NAME } )" ) . freeze
69+ IHOST = Regexp . union ( IP_literal , IREG_NAME ) . freeze
6970 IUSERINFO = Regexp . compile ( "(?:(?:#{ IUNRESERVED } )|(?:#{ PCT_ENCODED } )|(?:#{ SUB_DELIMS } )|:)*" ) . freeze
7071 IAUTHORITY = Regexp . compile ( "(?:#{ IUSERINFO } @)?#{ IHOST } (?::#{ PORT } )?" ) . freeze
7172
@@ -116,7 +117,21 @@ class URI
116117 # Note: not all reserved characters need to be escaped in SPARQL/Turtle, but they must be unescaped when encountered
117118 PN_ESCAPE_CHARS = /[~\. !\$ &'\( \) \* \+ ,;=\/ \? \# @%]/ . freeze
118119 PN_ESCAPES = /\\ #{ Regexp . union ( PN_ESCAPE_CHARS , /[\- _]/ ) } / . freeze
119-
120+
121+ # For URI encoding
122+ # iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )
123+ ENCODE_USER =
124+ ENCODE_PASSWORD = Regexp . compile ( "[^A-Za-z0-9\. _~#{ UCSCHAR } !$&'\( \) \* \+ ,;=:-]" ) . freeze
125+ # isegment = *ipchar
126+ # ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
127+ ENCODE_ISEGMENT = Regexp . compile ( "[^A-Za-z0-9\. _~#{ UCSCHAR } !$&'\( \) \* \+ ,;=:-]" ) . freeze
128+ # isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims / "@" )
129+ ENCODE_ISEGMENT_NC = Regexp . compile ( "[^A-Za-z0-9\. _~#{ UCSCHAR } !$&'\( \) \* \+ ,;=-]" ) . freeze
130+ # iquery = *( ipchar / iprivate / "/" / "?" )
131+ ENCODE_IQUERY = Regexp . compile ( "[^A-Za-z0-9\. _~#{ UCSCHAR } \\ uE000-\\ uF8FF\\ u{F0000}-\\ u{FFFFD}\\ u{100000}-\\ u{10FFFD}/?=]" ) . freeze
132+ # ifragment = *( ipchar / "/" / "?" )
133+ ENCODE_IFRAGMENT = Regexp . compile ( "[^A-Za-z0-9\. _~#{ UCSCHAR } /?]" ) . freeze
134+
120135 ##
121136 # Cache size may be set through {RDF.config} using `uri_cache_size`.
122137 #
@@ -170,7 +185,7 @@ def self.parse(str)
170185 # @return [String] normalized path
171186 # @see http://tools.ietf.org/html/rfc3986#section-5.2.4
172187 def self . normalize_path ( path )
173- output , input = "" , path . to_s
188+ output , input = String . new , path . to_s
174189 if input . encoding != Encoding ::ASCII_8BIT
175190 input = input . dup . force_encoding ( Encoding ::ASCII_8BIT )
176191 end
@@ -353,7 +368,7 @@ def length
353368 # @return [Boolean] `true` or `false`
354369 # @since 0.3.9
def valid?
  # Regexp#match? already yields true/false (never MatchData or nil),
  # so no `|| false` coercion is required.
  RDF::URI::IRI.match?(to_s)
end
358373
359374 ##
@@ -920,7 +935,7 @@ def scheme=(value)
# Normalized form of the scheme: surrounding whitespace stripped and
# letters lowercased.
# @return [String, nil] nil when no scheme is set
def normalized_scheme
  return unless scheme

  scheme.strip.downcase
end
925940
926941 ##
@@ -946,7 +961,7 @@ def user=(value)
# Normalized version of user: the value is unescaped with CGI.unescape and
# then re-encoded through URI.encode using the ENCODE_USER pattern.
# @return [String, nil] UTF-8 string, or nil when no user is set
def normalized_user
  return unless user

  decoded = CGI.unescape(user)
  URI.encode(decoded, ENCODE_USER).force_encoding(Encoding::UTF_8)
end
951966
952967 ##
@@ -972,7 +987,7 @@ def password=(value)
# Normalized version of password: the value is unescaped with CGI.unescape
# and then re-encoded through URI.encode using the ENCODE_PASSWORD pattern.
# @return [String, nil] UTF-8 string, or nil when no password is set
def normalized_password
  return unless password

  decoded = CGI.unescape(password)
  URI.encode(decoded, ENCODE_PASSWORD).force_encoding(Encoding::UTF_8)
end
977992
978993 HOST_FROM_AUTHORITY_RE = /(?:[^@]+@)?([^:]+)(?::.*)?$/ . freeze
@@ -1000,7 +1015,7 @@ def host=(value)
# @return [String, nil] lowercased host with trailing '.' characters removed;
#   nil when no host is set
def normalized_host
  return unless host

  # Remove trailing '.' characters, then fold case
  without_trailing_dots = host.sub(/\.*$/, '')
  without_trailing_dots.downcase
end
10051020
10061021 PORT_FROM_AUTHORITY_RE = /:(\d +)$/ . freeze
@@ -1028,12 +1043,8 @@ def port=(value)
# @return [Integer, nil] the port as an integer, or nil when no port is set
#   or when it equals the PORT_MAPPING entry for the normalized scheme
def normalized_port
  return unless port

  number = port.to_i
  number unless PORT_MAPPING[normalized_scheme] == number
end
10391050
@@ -1064,30 +1075,36 @@ def path=(value)
10641075 # Normalized version of path
10651076 # @return [String]
10661077 def normalized_path
1078+ if normalized_scheme == "urn"
1079+ # Special-case URI. Normalize the NID component only
1080+ nid , p = path . to_s . split ( ':' , 2 )
1081+ return "#{ nid . downcase } :#{ p } "
1082+ end
1083+
10671084 segments = path . to_s . split ( '/' , -1 ) # preserve null segments
10681085
10691086 norm_segs = case
10701087 when authority
10711088 # ipath-abempty
1072- segments . map { |s | normalize_segment ( s , ISEGMENT ) }
1089+ segments . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) }
10731090 when segments [ 0 ] . nil?
10741091 # ipath-absolute
10751092 res = [ nil ]
1076- res << normalize_segment ( segments [ 1 ] , ISEGMENT_NZ ) if segments . length > 1
1077- res += segments [ 2 ..-1 ] . map { |s | normalize_segment ( s , ISEGMENT ) } if segments . length > 2
1093+ res << normalize_segment ( segments [ 1 ] , ENCODE_ISEGMENT ) if segments . length > 1
1094+ res += segments [ 2 ..-1 ] . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) } if segments . length > 2
10781095 res
10791096 when segments [ 0 ] . to_s . index ( ':' )
10801097 # ipath-noscheme
10811098 res = [ ]
1082- res << normalize_segment ( segments [ 0 ] , ISEGMENT_NZ_NC )
1083- res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ISEGMENT ) } if segments . length > 1
1099+ res << normalize_segment ( segments [ 0 ] , ENCODE_ISEGMENT_NC )
1100+ res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) } if segments . length > 1
10841101 res
10851102 when segments [ 0 ]
10861103 # ipath-rootless
10871104 # ipath-noscheme
10881105 res = [ ]
1089- res << normalize_segment ( segments [ 0 ] , ISEGMENT_NZ )
1090- res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ISEGMENT ) } if segments . length > 1
1106+ res << normalize_segment ( segments [ 0 ] , ENCODE_ISEGMENT )
1107+ res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) } if segments . length > 1
10911108 res
10921109 else
10931110 # Should be empty
@@ -1096,7 +1113,7 @@ def normalized_path
10961113
10971114 res = self . class . normalize_path ( norm_segs . join ( "/" ) )
10981115 # Special rules for specific protocols having empty paths
1099- normalize_segment ( res . empty? ? ( %w( http https ftp tftp ) . include? ( normalized_scheme ) ? '/' : "" ) : res , IHIER_PART )
1116+ ( res . empty? && %w( http https ftp tftp ) . include? ( normalized_scheme ) ) ? '/' : res
11001117 end
11011118
11021119 ##
@@ -1120,7 +1137,7 @@ def query=(value)
# Normalized version of query, produced by normalize_segment with the
# ENCODE_IQUERY pattern.
# @return [String, nil] nil when no query is set
def normalized_query
  return unless query

  normalize_segment(query, ENCODE_IQUERY)
end
11251142
11261143 ##
@@ -1144,7 +1161,7 @@ def fragment=(value)
# Normalized version of fragment, produced by normalize_segment with the
# ENCODE_IFRAGMENT pattern.
# @return [String, nil] nil when no fragment is set
def normalized_fragment
  return unless fragment

  normalize_segment(fragment, ENCODE_IFRAGMENT)
end
11491166
11501167 ##
@@ -1274,15 +1291,15 @@ def query_values=(value)
12741291 self . query = case value
12751292 when Array , Hash
12761293 value . map do |( k , v ) |
1277- k = normalize_segment ( k . to_s , UNRESERVED )
1294+ k = normalize_segment ( k . to_s , /[^A-Za-z0-9 \. _~-]/ )
12781295 if v . nil?
12791296 k
12801297 else
12811298 Array ( v ) . map do |vv |
12821299 if vv === TrueClass
12831300 k
12841301 else
1285- "#{ k } =#{ normalize_segment ( vv . to_s , UNRESERVED ) } "
1302+ "#{ k } =#{ normalize_segment ( vv . to_s , /[^A-Za-z0-9 \. _~-]/ ) } "
12861303 end
12871304 end . join ( "&" )
12881305 end
@@ -1331,15 +1348,15 @@ def self._load(data)
# Normalize a segment: unescape it, optionally lowercase it, then
# re-encode every character matched by `expr`.
#
# @param [String] value
# @param [Regexp] expr matches characters to be encoded
# @param [Boolean] downcase
# @return [String, nil] UTF-8 string, or nil when `value` is nil/false
def normalize_segment(value, expr, downcase = false)
  return unless value

  # Work on a copy so the caller's string keeps its own encoding.
  utf8 = value.dup.force_encoding(Encoding::UTF_8)
  unescaped = CGI.unescape(utf8)
  unescaped = unescaped.downcase if downcase
  URI.encode(unescaped, expr).force_encoding(Encoding::UTF_8)
end
13451362
@@ -1364,7 +1381,7 @@ def format_authority
13641381 def self . encode ( str , expr )
13651382 str . gsub ( expr ) do
13661383 us = $&
1367- tmp = ''
1384+ tmp = String . new
13681385 us . each_byte do |uc |
13691386 tmp << sprintf ( '%%%02X' , uc )
13701387 end
0 commit comments