Skip to content

Commit a342884

Browse files
authored
Merge pull request #1083 from code4craft/revert-1082-common-downloader-status-process
Revert "Common the downloader status process and pass error information when …"
2 parents ee5a058 + acfbd7b commit a342884

File tree

4 files changed: 183 additions and 168 deletions.

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ public Html download(String url) {
2626
/**
2727
* A simple method to download a url.
2828
*
29-
* @param url url
29+
* @param url url
3030
* @param charset charset
3131
* @return html
3232
*/
@@ -38,7 +38,7 @@ public Html download(String url, String charset) {
3838
protected void onSuccess(Request request) {
3939
}
4040

41-
protected void onError(Request request, Throwable e) {
41+
protected void onError(Request request) {
4242
}
4343

4444
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,7 @@ public Page download(Request request, Task task) {
8787
return page;
8888
} catch (IOException e) {
8989
logger.warn("download page {} error", request.getUrl(), e);
90-
onError(request, e);
90+
onError(request);
9191
return page;
9292
} finally {
9393
if (httpResponse != null) {
@@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
110110
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
111111
Page page = new Page();
112112
page.setBytes(bytes);
113-
if (!request.isBinaryContent()) {
113+
if (!request.isBinaryContent()){
114114
if (charset == null) {
115115
charset = getHtmlCharset(contentType, bytes);
116116
}

webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java

Lines changed: 71 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -16,112 +16,135 @@
1616
* @version 0.5.3
1717
*/
1818
public class PhantomJSDownloader extends AbstractDownloader {
19-
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
19+
20+
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
2021
private static String crawlJsPath;
2122
private static String phantomJsCommand = "phantomjs"; // default
2223

24+
private int retryNum;
25+
private int threadNum;
26+
2327
public PhantomJSDownloader() {
2428
this.initPhantomjsCrawlPath();
2529
}
26-
30+
2731
/**
2832
* 添加新的构造函数,支持phantomjs自定义命令
29-
* <p>
30-
* example:
31-
* phantomjs.exe 支持windows环境
32-
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
33-
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
34-
*
33+
*
34+
* example:
35+
* phantomjs.exe 支持windows环境
36+
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
37+
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
38+
*
3539
* @param phantomJsCommand phantomJsCommand
3640
*/
3741
public PhantomJSDownloader(String phantomJsCommand) {
3842
this.initPhantomjsCrawlPath();
3943
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
4044
}
41-
45+
4246
/**
4347
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
4448
* <pre>
4549
* crawl.js start --
46-
*
50+
*
4751
* var system = require('system');
4852
* var url = system.args[1];
49-
*
53+
*
5054
* var page = require('webpage').create();
5155
* page.settings.loadImages = false;
5256
* page.settings.resourceTimeout = 5000;
53-
*
57+
*
5458
* page.open(url, function (status) {
5559
* if (status != 'success') {
5660
* console.log("HTTP request failed!");
5761
* } else {
5862
* console.log(page.content);
5963
* }
60-
*
64+
*
6165
* page.close();
6266
* phantom.exit();
6367
* });
64-
*
68+
*
6569
* -- crawl.js end
6670
* </pre>
6771
* 具体项目时可以将以上js代码复制下来使用
68-
* <p>
72+
*
6973
* example:
70-
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
71-
*
74+
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
75+
*
7276
* @param phantomJsCommand phantomJsCommand
73-
* @param crawlJsPath crawlJsPath
77+
* @param crawlJsPath crawlJsPath
7478
*/
7579
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
76-
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
77-
PhantomJSDownloader.crawlJsPath = crawlJsPath;
80+
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
81+
PhantomJSDownloader.crawlJsPath = crawlJsPath;
7882
}
79-
83+
8084
private void initPhantomjsCrawlPath() {
81-
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
82-
+ System.getProperty("file.separator") + "crawl.js ";
85+
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
8386
}
8487

8588
@Override
8689
public Page download(Request request, Task task) {
8790
if (logger.isInfoEnabled()) {
8891
logger.info("downloading page: " + request.getUrl());
8992
}
90-
91-
Page page = Page.fail();
92-
try {
93-
String content = getPage(request);
94-
if (!content.contains("HTTP request failed")) {
95-
page.setDownloadSuccess(true);
96-
page.setRawText(content);
97-
page.setUrl(new PlainText(request.getUrl()));
93+
String content = getPage(request);
94+
if (content.contains("HTTP request failed")) {
95+
for (int i = 1; i <= getRetryNum(); i++) {
96+
content = getPage(request);
97+
if (!content.contains("HTTP request failed")) {
98+
break;
99+
}
100+
}
101+
if (content.contains("HTTP request failed")) {
102+
//when failed
103+
Page page = new Page();
98104
page.setRequest(request);
99-
page.setStatusCode(200);
105+
return page;
100106
}
101-
onSuccess(request);
102-
} catch (Exception e) {
103-
onError(request, e);
104-
logger.warn("download page {} error", request.getUrl(), e);
105107
}
108+
109+
Page page = new Page();
110+
page.setRawText(content);
111+
page.setUrl(new PlainText(request.getUrl()));
112+
page.setRequest(request);
113+
page.setStatusCode(200);
106114
return page;
107115
}
108116

109117
@Override
110118
public void setThread(int threadNum) {
111-
// ignore
119+
this.threadNum = threadNum;
112120
}
113121

114-
protected String getPage(Request request) throws Exception {
115-
String url = request.getUrl();
116-
Runtime runtime = Runtime.getRuntime();
117-
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
118-
InputStream is = process.getInputStream();
119-
BufferedReader br = new BufferedReader(new InputStreamReader(is));
120-
StringBuilder builder = new StringBuilder();
121-
String line;
122-
while ((line = br.readLine()) != null) {
123-
builder.append(line).append("\n");
122+
protected String getPage(Request request) {
123+
try {
124+
String url = request.getUrl();
125+
Runtime runtime = Runtime.getRuntime();
126+
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
127+
InputStream is = process.getInputStream();
128+
BufferedReader br = new BufferedReader(new InputStreamReader(is));
129+
StringBuffer stringBuffer = new StringBuffer();
130+
String line;
131+
while ((line = br.readLine()) != null) {
132+
stringBuffer.append(line).append("\n");
133+
}
134+
return stringBuffer.toString();
135+
} catch (IOException e) {
136+
e.printStackTrace();
124137
}
125-
return builder.toString();
138+
139+
return null;
140+
}
141+
142+
public int getRetryNum() {
143+
return retryNum;
144+
}
145+
146+
public PhantomJSDownloader setRetryNum(int retryNum) {
147+
this.retryNum = retryNum;
148+
return this;
126149
}
127150
}

0 commit comments

Comments
 (0)