Skip to content

Commit fbf0391

Browse files
authored
Merge pull request #701 from bnfleb/extractor
Add ExtractorJson from BL
2 parents 4ea67a3 + 4d6735f commit fbf0391

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

modules/pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@
111111
<artifactId>junit-jupiter</artifactId>
112112
<optional>true</optional>
113113
</dependency>
114+
<dependency>
115+
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
116+
<groupId>com.fasterxml.jackson.core</groupId>
117+
<artifactId>jackson-databind</artifactId>
118+
<version>2.20.0</version>
119+
</dependency>
114120
</dependencies>
115121
<build>
116122
<plugins>
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package org.archive.modules.extractor;
2+
3+
import com.fasterxml.jackson.core.JsonFactory;
4+
import com.fasterxml.jackson.databind.JsonNode;
5+
import com.fasterxml.jackson.databind.ObjectMapper;
6+
import org.archive.modules.CrawlURI;
7+
import org.archive.util.UriUtils;
8+
9+
import java.util.ArrayList;
10+
import java.util.List;
11+
import java.util.Map;
12+
import java.util.logging.Level;
13+
import java.util.logging.Logger;
14+
15+
/**
16+
* Extracts URIs from JSON resources.
17+
* <p>
18+
* n.b. chokes on JSONP, e.g.
19+
* <p>
20+
* breakingNews({"pollPeriod":30000,"isError":false,"html":""})
21+
*
22+
* @author rcoram
23+
*/
24+
public class ExtractorJson extends ContentExtractor {
25+
public final static String JSON_URI = "^https?://[^/]+/.+\\.json\\b.*$";
26+
private static final Logger LOGGER = Logger.getLogger(ExtractorJson.class.getName());
27+
private final JsonFactory factory = new JsonFactory();
28+
private final ObjectMapper mapper = new ObjectMapper(factory);
29+
30+
@Override
31+
protected boolean innerExtract(CrawlURI curi) {
32+
try {
33+
List<String> links = new ArrayList<>();
34+
JsonNode rootNode = mapper.readTree(curi.getRecorder().getContentReplayInputStream());
35+
parse(rootNode, links);
36+
for (String link : links) {
37+
try {
38+
int max = getExtractorParameters().getMaxOutlinks();
39+
addRelativeToBase(curi, max, link, LinkContext.INFERRED_MISC, Hop.INFERRED);
40+
numberOfLinksExtracted.incrementAndGet();
41+
} catch (org.archive.url.URIException e) {
42+
logUriError(e, curi.getUURI(), link);
43+
}
44+
}
45+
} catch (Exception e) {
46+
// Only record this as INFO, as malformed JSON is fairly common.
47+
LOGGER.log(Level.INFO, curi.getURI() + " : " + e.getMessage());
48+
}
49+
return false;
50+
}
51+
52+
@Override
53+
protected boolean shouldExtract(CrawlURI curi) {
54+
String contentType = curi.getContentType();
55+
if (contentType != null && contentType.contains("json")) {
56+
return true;
57+
}
58+
return curi.isSuccess() && curi.toString().matches(JSON_URI);
59+
}
60+
61+
protected List<String> parse(JsonNode rootNode, List<String> links) {
62+
for (Map.Entry<String, JsonNode> field : rootNode.properties()) {
63+
if (field.getValue().textValue() != null
64+
&& UriUtils.isVeryLikelyUri(field.getValue().textValue())) {
65+
links.add(field.getValue().textValue());
66+
} else if (field.getValue().isObject()) {
67+
parse(field.getValue(), links);
68+
} else if (field.getValue().isArray()) {
69+
field.getValue()
70+
.propertyStream()
71+
.forEach(fieldValue -> parse(fieldValue.getValue(), links));
72+
}
73+
}
74+
return links;
75+
}
76+
}

0 commit comments

Comments
 (0)