Skip to content

Commit c50f45b

Browse files
authored
Merge pull request #139 from GateNLP/resolve-url-fix
Various fixes to Utils.resolveURL
2 parents 906bc37 + 7907761 commit c50f45b

File tree

3 files changed

+133
-17
lines changed

3 files changed

+133
-17
lines changed

src/main/java/gate/Utils.java

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,17 +1411,28 @@ public static AnnotationSet intersect(AnnotationSet origSet, Collection<Annotati
14111411
}
14121412
return new ImmutableAnnotationSetImpl(origSet.getDocument(),tmp);
14131413
}
1414-
1415-
public static URL resolveURL(String url) throws IOException {
1416-
while (true) {
1417-
// while we are still following redirects...
14181414

1419-
// create an actual URL instance from the string representation
1420-
URL resourceUrl = new URL(url);
1415+
public static URL resolveURL(String url) throws IOException {
1416+
return resolveURL(new URL(url));
1417+
}
14211418

1422-
if (!resourceUrl.getProtocol().startsWith("http"))
1423-
return resourceUrl;
1419+
public static URL resolveURL(URL url) throws IOException {
1420+
// if it's not http or https then there's no notion of redirection, so
1421+
// stick to the original URL object
1422+
if (!url.getProtocol().equalsIgnoreCase("http") &&
1423+
!url.getProtocol().equalsIgnoreCase("https")) {
1424+
return url;
1425+
}
14241426

1427+
URL resourceUrl = url;
1428+
Set<String> seenUrls = new HashSet<>();
1429+
int followedRedirects = 0;
1430+
// limit to 20 redirects, which is the most any of the major browsers will follow
1431+
while (followedRedirects++ < 20) {
1432+
// check for redirection loop
1433+
if(!seenUrls.add(resourceUrl.toExternalForm())) {
1434+
throw new IOException("Redirection loop detected for URL " + url);
1435+
}
14251436
// open a connection to the URL and...
14261437
HttpURLConnection conn = (HttpURLConnection) resourceUrl.openConnection();
14271438

@@ -1432,19 +1443,27 @@ public static URL resolveURL(String url) throws IOException {
14321443
conn.setInstanceFollowRedirects(false); // Don't follow redirects automatically, so the logic below can detect them itself
14331444

14341445
switch (conn.getResponseCode()) {
1435-
case HttpURLConnection.HTTP_MOVED_PERM:
1436-
case HttpURLConnection.HTTP_MOVED_TEMP:
1437-
// if we've hit a redirect then get the location from the header
1438-
String location = conn.getHeaderField("Location");
1439-
location = URLDecoder.decode(location, "UTF-8");
1440-
URL next = new URL(resourceUrl, location); // Deal with relative URLs
1441-
url = next.toExternalForm();
1442-
continue;
1446+
case 301: // moved permanently
1447+
case 302: // moved temporarily
1448+
case 303: // "see other"
1449+
case 307: // "temporary redirect"
1450+
// if we've hit a redirect then get the location from the header
1451+
String location = conn.getHeaderField("Location");
1452+
location = URLDecoder.decode(location, "UTF-8");
1453+
URL newUrl = new URL(resourceUrl, location); // Deal with relative URLs
1454+
// follow the redirect if (and only if) it goes to another http or https URL
1455+
if(newUrl.getProtocol().equalsIgnoreCase("http") ||
1456+
newUrl.getProtocol().equalsIgnoreCase("https")) {
1457+
resourceUrl = newUrl;
1458+
continue;
1459+
}
14431460
}
14441461

14451462
// we've found a URL without a redirect so at this point we can stop
14461463
return resourceUrl;
14471464
}
1465+
1466+
throw new IOException("Too many redirects for " + url);
14481467
}
14491468

14501469
}

src/main/java/gate/corpora/DocumentImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ public Resource init() throws ResourceInstantiationException {
252252
getFeatures().put("gate.SourceURL", "created from String");
253253
} else {
254254
try {
255-
URL resolved = gate.Utils.resolveURL(sourceUrl.toExternalForm());
255+
URL resolved = gate.Utils.resolveURL(sourceUrl);
256256
getFeatures().put("gate.OriginalURL", sourceUrl.toExternalForm());
257257
sourceUrl = resolved;
258258
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package gate.util;
2+
3+
import gate.Utils;
4+
import junit.framework.TestCase;
5+
import org.apache.http.HttpException;
6+
import org.apache.http.HttpRequest;
7+
import org.apache.http.HttpResponse;
8+
import org.apache.http.impl.bootstrap.HttpServer;
9+
import org.apache.http.impl.bootstrap.ServerBootstrap;
10+
import org.apache.http.protocol.HttpContext;
11+
import org.apache.http.protocol.HttpRequestHandler;
12+
13+
import java.io.IOException;
14+
import java.net.*;
15+
import java.util.concurrent.TimeUnit;
16+
17+
public class TestResolveUrl extends TestCase {
18+
19+
static class RedirectingHandler implements HttpRequestHandler {
20+
21+
InetAddress myAddress;
22+
int port;
23+
24+
@Override
25+
public void handle(HttpRequest request, HttpResponse response, HttpContext context) throws HttpException, IOException {
26+
response.setStatusCode(303);
27+
try {
28+
String path = request.getRequestLine().getUri();
29+
if(path.matches("/loop/[0-3]")) {
30+
// simulates a redirection loop
31+
int nextNum = (path.charAt(6) - '0' + 1) % 4;
32+
String nextPath = "/loop/" + nextNum;
33+
System.err.println("Redirecting to " + nextPath);
34+
response.addHeader("Location", new URI("http", null, myAddress.getHostAddress(), port, nextPath, null, null).toString());
35+
} else if(path.startsWith("/infinite/")) {
36+
// simulates an endless chain of redirects that never revisits the same URL
37+
String nextPath = path + "/x";
38+
System.err.println("Redirecting " + path + " to " + nextPath);
39+
response.addHeader("Location", new URI("http", null, myAddress.getHostAddress(), port, nextPath, null, null).toString());
40+
} else if(path.equals("/redirect-to-file")) {
41+
response.addHeader("Location", new URI("http", null, myAddress.getHostAddress(), port, "/file", null, null).toString());
42+
} else if(path.equals("/file")) {
43+
System.err.println("Attempting to redirect to a file: URL");
44+
response.addHeader("Location", "file:/etc/hosts");
45+
}
46+
} catch(URISyntaxException e) {
47+
throw new IOException(e);
48+
}
49+
}
50+
}
51+
52+
HttpServer server;
53+
URL baseUrl;
54+
55+
public void setUp() throws Exception {
56+
RedirectingHandler handler = new RedirectingHandler();
57+
server = ServerBootstrap.bootstrap()
58+
.setLocalAddress(InetAddress.getLoopbackAddress())
59+
.registerHandler("*", handler).create();
60+
server.start();
61+
handler.myAddress = server.getInetAddress();
62+
handler.port = server.getLocalPort();
63+
baseUrl = new URL("http", server.getInetAddress().getHostAddress(), server.getLocalPort(), "");
64+
System.err.println("Started server at " + baseUrl);
65+
}
66+
67+
public void testRedirectLoop() throws Exception {
68+
URL url = new URL(baseUrl, "/loop/0");
69+
try {
70+
URL newUrl = Utils.resolveURL(url);
71+
fail("resolveURL should have failed due to redirect loop");
72+
} catch(IOException e) {
73+
// exception expected
74+
}
75+
}
76+
77+
public void testTooManyRedirects() throws Exception {
78+
URL url = new URL(baseUrl, "/infinite/x");
79+
try {
80+
URL newUrl = Utils.resolveURL(url);
81+
fail("resolveURL should have failed due to too many redirects");
82+
} catch(IOException e) {
83+
// exception expected
84+
}
85+
}
86+
87+
public void testRedirectToFile() throws Exception {
88+
URL url = new URL(baseUrl, "/redirect-to-file");
89+
URL newUrl = Utils.resolveURL(url);
90+
assertEquals("Redirection should have stopped at http://.../file", new URL(baseUrl, "/file"), newUrl);
91+
}
92+
93+
public void tearDown() throws Exception {
94+
server.stop();
95+
server.awaitTermination(20, TimeUnit.SECONDS);
96+
}
97+
}

0 commit comments

Comments
 (0)