Skip to content

Commit 1161d87

Browse files
committed
Browser: Add processor report
1 parent 2210cb7 commit 1161d87

File tree

3 files changed

+40
-0
lines changed

3 files changed

+40
-0
lines changed

engine/src/main/java/org/archive/crawler/processor/Browser.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
import org.archive.util.IdleBarrier;
4444
import org.archive.util.Recorder;
4545
import org.eclipse.jetty.client.Result;
46+
import org.json.JSONException;
47+
import org.json.JSONObject;
4648
import org.springframework.context.ApplicationEventPublisher;
4749

4850
import java.io.IOException;
@@ -57,6 +59,7 @@
5759
import java.util.*;
5860
import java.util.concurrent.ConcurrentHashMap;
5961
import java.util.concurrent.Semaphore;
62+
import java.util.concurrent.atomic.AtomicLong;
6063
import java.util.stream.Stream;
6164

6265
import static java.lang.System.Logger.Level.ERROR;
@@ -80,6 +83,7 @@ public class Browser extends Processor {
8083
protected final Map<String, BrowserPage> pages = new ConcurrentHashMap<>();
8184
protected final Map<BrowsingContext.Context, String> pageIdsByContext = new ConcurrentHashMap<>();
8285
protected final ProcessorChain extractorChain = new ProcessorChain();
86+
protected final AtomicLong subresourcesRecorded = new AtomicLong();
8387
protected List<Behavior> behaviors;
8488
protected String executable;
8589
protected List<String> options = List.of("--headless");
@@ -137,6 +141,29 @@ public void innerProcess(CrawlURI curi) {
137141
}
138142
}
139143

144+
@Override
145+
protected JSONObject toCheckpointJson() throws JSONException {
146+
return super.toCheckpointJson().put("subresourcesRecorded", subresourcesRecorded.get());
147+
}
148+
149+
@Override
150+
protected void fromCheckpointJson(JSONObject json) throws JSONException {
151+
super.fromCheckpointJson(json);
152+
subresourcesRecorded.set(json.getLong("subresourcesRecorded"));
153+
}
154+
155+
@Override
156+
public String report() {
157+
StringBuilder builder = new StringBuilder();
158+
builder.append(super.report());
159+
builder.append(" Pages visited: ").append(getURICount()).append("\n");
160+
builder.append(" Subresources recorded: ").append(subresourcesRecorded.get()).append("\n");
161+
for (var behavior : behaviors) {
162+
builder.append(behavior.report());
163+
}
164+
return builder.toString();
165+
}
166+
140167
private void visit(CrawlURI curi) {
141168
String pageId = UUID.randomUUID().toString();
142169
var tab = webdriver.browsingContext().create(BrowsingContext.CreateType.tab).context();
@@ -398,6 +425,7 @@ public void onComplete(Result result) {
398425
if (recordingFailed) {
399426
curi.setFetchStatus(FetchStatusCodes.S_RUNTIME_EXCEPTION);
400427
} else {
428+
subresourcesRecorded.incrementAndGet();
401429
curi.getOverlayNames(); // for sideeffect of creating the overlayNames list
402430

403431
Frontier frontier = crawlController.getFrontier();

modules/src/main/java/org/archive/modules/behaviors/Behavior.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,8 @@
2121

2222
public interface Behavior {
2323
void run(Page page);
24+
25+
default String report() {
26+
return " Behavior: " + getClass().getName() + "\n";
27+
}
2428
}

modules/src/main/java/org/archive/modules/behaviors/ExtractLinks.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@
2828
import org.archive.net.UURIFactory;
2929

3030
import java.util.List;
31+
import java.util.concurrent.atomic.AtomicLong;
3132

3233
public class ExtractLinks implements Behavior {
34+
private final AtomicLong numberOfLinksExtracted = new AtomicLong(0);
3335
private final UriErrorLoggerModule loggerModule;
3436

3537
public ExtractLinks(UriErrorLoggerModule loggerModule) {
@@ -56,9 +58,15 @@ public void run(Page page) {
5658
UURI dest = UURIFactory.getInstance(page.curi().getUURI(), url);
5759
CrawlURI link = page.curi().createCrawlURI(dest, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
5860
page.curi().getOutLinks().add(link);
61+
numberOfLinksExtracted.incrementAndGet();
5962
} catch (URIException e) {
6063
loggerModule.logUriError(e, page.curi().getUURI(), url);
6164
}
6265
}
6366
}
67+
68+
@Override
69+
public String report() {
70+
return Behavior.super.report() + " Links extracted: " + numberOfLinksExtracted.get() + "\n";
71+
}
6472
}

0 commit comments

Comments
 (0)