1+ package org .archive .modules .extractor ;
2+
3+ import com .fasterxml .jackson .core .JsonFactory ;
4+ import com .fasterxml .jackson .databind .JsonNode ;
5+ import com .fasterxml .jackson .databind .ObjectMapper ;
6+ import org .archive .modules .CrawlURI ;
7+ import org .archive .util .UriUtils ;
8+
9+ import java .util .ArrayList ;
10+ import java .util .List ;
11+ import java .util .Map ;
12+ import java .util .logging .Level ;
13+ import java .util .logging .Logger ;
14+
15+ /**
16+ * Extracts URIs from JSON resources.
17+ * <p>
18+ * n.b. chokes on JSONP, e.g.
19+ * <p>
20+ * breakingNews({"pollPeriod":30000,"isError":false,"html":""})
21+ *
22+ * @author rcoram
23+ */
24+ public class ExtractorJson extends ContentExtractor {
25+ public final static String JSON_URI = "^https?://[^/]+/.+\\ .json\\ b.*$" ;
26+ private static final Logger LOGGER = Logger .getLogger (ExtractorJson .class .getName ());
27+ private final JsonFactory factory = new JsonFactory ();
28+ private final ObjectMapper mapper = new ObjectMapper (factory );
29+
30+ @ Override
31+ protected boolean innerExtract (CrawlURI curi ) {
32+ try {
33+ List <String > links = new ArrayList <>();
34+ JsonNode rootNode = mapper .readTree (curi .getRecorder ().getContentReplayInputStream ());
35+ parse (rootNode , links );
36+ for (String link : links ) {
37+ try {
38+ int max = getExtractorParameters ().getMaxOutlinks ();
39+ addRelativeToBase (curi , max , link , LinkContext .INFERRED_MISC , Hop .INFERRED );
40+ numberOfLinksExtracted .incrementAndGet ();
41+ } catch (org .archive .url .URIException e ) {
42+ logUriError (e , curi .getUURI (), link );
43+ }
44+ }
45+ } catch (Exception e ) {
46+ // Only record this as INFO, as malformed JSON is fairly common.
47+ LOGGER .log (Level .INFO , curi .getURI () + " : " + e .getMessage ());
48+ }
49+ return false ;
50+ }
51+
52+ @ Override
53+ protected boolean shouldExtract (CrawlURI curi ) {
54+ String contentType = curi .getContentType ();
55+ if (contentType != null && contentType .contains ("json" )) {
56+ return true ;
57+ }
58+ return curi .isSuccess () && curi .toString ().matches (JSON_URI );
59+ }
60+
61+ protected List <String > parse (JsonNode rootNode , List <String > links ) {
62+ for (Map .Entry <String , JsonNode > field : rootNode .properties ()) {
63+ if (field .getValue ().textValue () != null
64+ && UriUtils .isVeryLikelyUri (field .getValue ().textValue ())) {
65+ links .add (field .getValue ().textValue ());
66+ } else if (field .getValue ().isObject ()) {
67+ parse (field .getValue (), links );
68+ } else if (field .getValue ().isArray ()) {
69+ field .getValue ()
70+ .propertyStream ()
71+ .forEach (fieldValue -> parse (fieldValue .getValue (), links ));
72+ }
73+ }
74+ return links ;
75+ }
76+ }
0 commit comments