Skip to content

Commit 5604ff0

Browse files
committed
refactor in separate classes, add tests, fix recheck
1 parent 5c3a04c commit 5604ff0

19 files changed

+1522
-537
lines changed

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,7 @@ java -cp 'lib/*:bin/*.jar' nl.melp.linkchecker.LinkChecker
4040
| `--follow-local` | Only links to the local* domain are followed |
4141
| `--follow-from-local` | Only follow links that are mentioned on the local* domain. This means that the link checker only spans over multiple hosts *once*. |
4242
| `--no-follow` | No links are followed. This is typically useful in combination with the `--recheck` flag |
43-
| `--recheck` | Reset the status for each of the previously failed URLs, and try them again. |
44-
| `--recheck-only-errors` | Only recheck links that had an internal error state, i.e. all urls that had connection errors, timeouts, etc. |
43+
| `--recheck` | Reset the status for each of the previously failed URLs, and recheck the pages they are mentioned on. |
4544
| `--no-recheck` | Don't recheck, even if URLs are marked as "processing". |
4645
| `--reset` | Start with a clean slate |
4746
| `--resume` | Resume a previously stopped session. |
@@ -76,4 +75,4 @@ won't need further configuration.
7675
Please report them at github.com/drm/java-linkchecker
7776

7877
# Copyright
79-
(c) 2019 Gerard van Helden
78+
(c) 2019-2020 Gerard van Helden

lib/fetch.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ fetch() {
4646
'org.slf4j:slf4j-log4j12:1.7.7' \
4747
'org.jsoup:jsoup:1.11.3'
4848

49-
curl -sL https://github.com/drm/java-redis-client/releases/download/v2.0.2/java-redis-client-v2.0.2--javac-11.0.2.jar -o ./java-redis-client-v2.0.2.jar
49+
curl -sL https://github.com/drm/java-redis-client/releases/download/v2.1.0/java-redis-client-v2.1.0--javac-11.0.9.1.jar -o ./java-redis-client-v2.1.0.jar
5050
curl -sL https://github.com/drm/java-redis-collections/releases/download/v1.0.2/java-redis-collections-v1.0.2--javac-11.0.7.jar -o ./java-redis-collections-v1.0.2.jar
5151
}
5252

lib/java-redis-client-v2.0.2.jar

-20.7 KB
Binary file not shown.

lib/java-redis-client-v2.1.0.jar

24.4 KB
Binary file not shown.
8.67 KB
Binary file not shown.

src/log4j.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ log4j.appender.fileName.layout.ConversionPattern=%d [%t] %-5p %c - %m%n
1717
log4j.rootLogger=info, stdout, fileName
1818

1919
org.apache.http=info
20-
log4j.logger.nl.melp.linkchecker=debug
20+
log4j.logger.nl.melp.linkchecker=info
2121

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package nl.melp.linkchecker;
2+
3+
import org.apache.http.HttpEntity;
4+
import org.apache.http.client.methods.CloseableHttpResponse;
5+
6+
import java.io.IOException;
7+
import java.net.URI;
8+
import java.util.Set;
9+
10+
public interface Extractor {
11+
Set<String> extract(URI url, int statusCode, CloseableHttpResponse response, HttpEntity responseEntity) throws IOException;
12+
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package nl.melp.linkchecker;
2+
3+
import nl.melp.linkchecker.URIResolver.InvalidURIException;
4+
import org.apache.http.HttpEntity;
5+
import org.apache.http.client.config.RequestConfig;
6+
import org.apache.http.client.methods.CloseableHttpResponse;
7+
import org.apache.http.client.methods.HttpGet;
8+
import org.apache.http.impl.client.CloseableHttpClient;
9+
import org.slf4j.Logger;
10+
11+
import java.io.IOException;
12+
import java.net.URI;
13+
import java.util.LinkedHashSet;
14+
import java.util.Set;
15+
16+
public class Fetcher {
17+
private static final int timeout = 30;
18+
19+
private static final RequestConfig requestConfig = RequestConfig.custom()
20+
.setConnectTimeout(timeout * 1000)
21+
.setConnectionRequestTimeout(timeout * 1000)
22+
.setSocketTimeout(timeout * 1000)
23+
.build();
24+
25+
public static class Result {
26+
private final URI uri;
27+
private final int statusCode;
28+
private final Set<URI> referredLinks;
29+
private final Set<String> invalidLinks;
30+
31+
public Result(URI uri, int statusCode, Set<URI> referredLinks, Set<String> invalidLinks) {
32+
this.uri = uri;
33+
this.statusCode = statusCode;
34+
this.referredLinks = referredLinks;
35+
this.invalidLinks = invalidLinks;
36+
}
37+
38+
public URI getUri() {
39+
return uri;
40+
}
41+
42+
public int getStatusCode() {
43+
return statusCode;
44+
}
45+
46+
public Set<URI> getReferredLinks() {
47+
return referredLinks;
48+
}
49+
50+
public Set<String> getInvalidLinks() {
51+
return invalidLinks;
52+
}
53+
}
54+
55+
private final Logger logger;
56+
private final RunConfig config;
57+
private final Extractor extractor;
58+
private final URIResolver resolver;
59+
60+
public Fetcher(Logger logger, RunConfig config, Extractor extractor, URIResolver resolver) {
61+
this.logger = logger;
62+
this.config = config;
63+
this.extractor = extractor;
64+
this.resolver = resolver;
65+
}
66+
67+
public Result fetch(CloseableHttpClient httpClient, URI url) {
68+
var request = new HttpGet(url);
69+
request.setConfig(requestConfig);
70+
try (CloseableHttpResponse response = httpClient.execute(request)) {
71+
int statusCode = response.getStatusLine().getStatusCode();
72+
73+
logger.trace("Got status " + statusCode + " at " + url);
74+
75+
HttpEntity responseEntity = response.getEntity();
76+
Set<URI> links = new LinkedHashSet<>();
77+
Set<String> invalidLinks = new LinkedHashSet<>();
78+
79+
if (config.shouldExtractLinks(url)) {
80+
for (String link : extractor.extract(url, statusCode, response, responseEntity)) {
81+
try {
82+
final URI target = resolver.resolveUri(url, link);
83+
if (target != null) {
84+
links.add(target);
85+
}
86+
} catch (InvalidURIException e) {
87+
invalidLinks.add(link);
88+
}
89+
}
90+
}
91+
return new Result(url, statusCode, links, invalidLinks);
92+
} catch (IOException e) {
93+
return new Result(url, 0, null, null);
94+
}
95+
}
96+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package nl.melp.linkchecker;
2+
3+
import org.apache.http.Header;
4+
import org.apache.http.HttpEntity;
5+
import org.apache.http.client.methods.CloseableHttpResponse;
6+
import org.jsoup.Jsoup;
7+
import org.jsoup.nodes.Document;
8+
import org.jsoup.nodes.Element;
9+
import org.jsoup.select.Elements;
10+
import org.slf4j.Logger;
11+
12+
import java.io.IOException;
13+
import java.net.URI;
14+
import java.util.LinkedHashSet;
15+
import java.util.Set;
16+
17+
public class HtmlExtractor implements Extractor {
18+
private final Logger logger;
19+
20+
public HtmlExtractor(Logger logger) {
21+
this.logger = logger;
22+
}
23+
24+
@Override
25+
public Set<String> extract(URI url, int statusCode, CloseableHttpResponse response, HttpEntity responseEntity) throws IOException {
26+
Set<String> referred = new LinkedHashSet<>();
27+
Header contentTypeHeader = response.getFirstHeader("Content-Type");
28+
String contentType = contentTypeHeader == null ? "UNKNOWN" : contentTypeHeader.getValue();
29+
30+
if (statusCode == 200) {
31+
if (contentType.startsWith("text/html")) {
32+
Document d = Jsoup.parse(responseEntity.getContent(), "UTF-8", url.toString());
33+
Elements links = d.select("a[href]");
34+
logger.trace("Found " + links.size() + " on " + url);
35+
for (Element link : links) {
36+
referred.add(link.attr("href"));
37+
}
38+
} else {
39+
logger.trace("Not following links in content type " + contentType);
40+
}
41+
} else if (response.getFirstHeader("Location") != null) {
42+
String location = response.getFirstHeader("Location").getValue();
43+
if (referred.add(location)) {
44+
logger.trace("Following redirect (" + statusCode + ") [" + url + " => " + location + "]");
45+
}
46+
} else {
47+
logger.debug("Skipping {}, content-type: {}", url, contentType);
48+
}
49+
return referred;
50+
51+
}
52+
}

0 commit comments

Comments
 (0)