@@ -17,4 +17,160 @@ public static boolean isAlnum(char c) {
1717 public static boolean isNonAscii (char c ) {
1818 return c >= 0x80 ;
1919 }
20+
21+ public static final int findUrlEnd (CharSequence input , int beginIndex ) {
22+ int round = 0 ;
23+ int square = 0 ;
24+ int curly = 0 ;
25+ boolean doubleQuote = false ;
26+ boolean singleQuote = false ;
27+ int last = beginIndex ;
28+ loop :
29+ for (int i = beginIndex ; i < input .length (); i ++) {
30+ char c = input .charAt (i );
31+ switch (c ) {
32+ case '\u0000' :
33+ case '\u0001' :
34+ case '\u0002' :
35+ case '\u0003' :
36+ case '\u0004' :
37+ case '\u0005' :
38+ case '\u0006' :
39+ case '\u0007' :
40+ case '\u0008' :
41+ case '\t' :
42+ case '\n' :
43+ case '\u000B' :
44+ case '\f' :
45+ case '\r' :
46+ case '\u000E' :
47+ case '\u000F' :
48+ case '\u0010' :
49+ case '\u0011' :
50+ case '\u0012' :
51+ case '\u0013' :
52+ case '\u0014' :
53+ case '\u0015' :
54+ case '\u0016' :
55+ case '\u0017' :
56+ case '\u0018' :
57+ case '\u0019' :
58+ case '\u001A' :
59+ case '\u001B' :
60+ case '\u001C' :
61+ case '\u001D' :
62+ case '\u001E' :
63+ case '\u001F' :
64+ case ' ' :
65+ case '<' :
66+ case '>' :
67+ case '\u007F' :
68+ case '\u0080' :
69+ case '\u0081' :
70+ case '\u0082' :
71+ case '\u0083' :
72+ case '\u0084' :
73+ case '\u0085' :
74+ case '\u0086' :
75+ case '\u0087' :
76+ case '\u0088' :
77+ case '\u0089' :
78+ case '\u008A' :
79+ case '\u008B' :
80+ case '\u008C' :
81+ case '\u008D' :
82+ case '\u008E' :
83+ case '\u008F' :
84+ case '\u0090' :
85+ case '\u0091' :
86+ case '\u0092' :
87+ case '\u0093' :
88+ case '\u0094' :
89+ case '\u0095' :
90+ case '\u0096' :
91+ case '\u0097' :
92+ case '\u0098' :
93+ case '\u0099' :
94+ case '\u009A' :
95+ case '\u009B' :
96+ case '\u009C' :
97+ case '\u009D' :
98+ case '\u009E' :
99+ case '\u009F' :
100+ // These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987.
101+ // Some characters are not in the above list, even they are not in "unreserved" or "reserved":
102+ // '"', '\\', '^', '`', '{', '|', '}'
103+ // The reason for this is that other link detectors also allow them. Also see below, we require
104+ // the quote and the braces to be balanced.
105+ break loop ;
106+ case '?' :
107+ case '!' :
108+ case '.' :
109+ case ',' :
110+ case ':' :
111+ case ';' :
112+ // These may be part of an URL but not at the end
113+ break ;
114+ case '/' :
115+ // This may be part of an URL and at the end, but not if the previous character can't be the end of an URL
116+ if (last == i - 1 ) {
117+ last = i ;
118+ }
119+ break ;
120+ case '(' :
121+ round ++;
122+ break ;
123+ case ')' :
124+ round --;
125+ if (round >= 0 ) {
126+ last = i ;
127+ } else {
128+ // More closing than opening brackets, stop now
129+ break loop ;
130+ }
131+ break ;
132+ case '[' :
133+ // Allowed in IPv6 address host
134+ square ++;
135+ break ;
136+ case ']' :
137+ // Allowed in IPv6 address host
138+ square --;
139+ if (square >= 0 ) {
140+ last = i ;
141+ } else {
142+ // More closing than opening brackets, stop now
143+ break loop ;
144+ }
145+ break ;
146+ case '{' :
147+ curly ++;
148+ break ;
149+ case '}' :
150+ curly --;
151+ if (curly >= 0 ) {
152+ last = i ;
153+ } else {
154+ // More closing than opening brackets, stop now
155+ break loop ;
156+ }
157+ break ;
158+ case '"' :
159+ doubleQuote = !doubleQuote ;
160+ if (!doubleQuote ) {
161+ last = i ;
162+ }
163+ break ;
164+ case '\'' :
165+ singleQuote = !singleQuote ;
166+ if (!singleQuote ) {
167+ last = i ;
168+ }
169+ break ;
170+ default :
171+ last = i ;
172+ }
173+ }
174+ return last ;
175+ }
20176}
0 commit comments