|
16 | 16 | * @version 0.5.3 |
17 | 17 | */ |
18 | 18 | public class PhantomJSDownloader extends AbstractDownloader { |
19 | | - private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); |
| 19 | + |
| 20 | + private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); |
20 | 21 | private static String crawlJsPath; |
21 | 22 | private static String phantomJsCommand = "phantomjs"; // default |
22 | 23 |
|
| 24 | + private int retryNum; |
| 25 | + private int threadNum; |
| 26 | + |
23 | 27 | public PhantomJSDownloader() { |
24 | 28 | this.initPhantomjsCrawlPath(); |
25 | 29 | } |
26 | | - |
| 30 | + |
27 | 31 | /** |
28 | 32 | * 添加新的构造函数,支持phantomjs自定义命令 |
29 | | - * <p> |
30 | | - * example: |
31 | | - * phantomjs.exe 支持windows环境 |
32 | | - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 |
33 | | - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException |
34 | | - * |
| 33 | + * |
| 34 | + * example: |
| 35 | + * phantomjs.exe 支持windows环境 |
| 36 | + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 |
| 37 | + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException |
| 38 | + * |
35 | 39 | * @param phantomJsCommand phantomJsCommand |
36 | 40 | */ |
37 | 41 | public PhantomJSDownloader(String phantomJsCommand) { |
38 | 42 | this.initPhantomjsCrawlPath(); |
39 | 43 | PhantomJSDownloader.phantomJsCommand = phantomJsCommand; |
40 | 44 | } |
41 | | - |
| 45 | + |
42 | 46 | /** |
43 | 47 | * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js |
44 | 48 | * <pre> |
45 | 49 | * crawl.js start -- |
46 | | - * |
| 50 | + * |
47 | 51 | * var system = require('system'); |
48 | 52 | * var url = system.args[1]; |
49 | | - * |
| 53 | + * |
50 | 54 | * var page = require('webpage').create(); |
51 | 55 | * page.settings.loadImages = false; |
52 | 56 | * page.settings.resourceTimeout = 5000; |
53 | | - * |
| 57 | + * |
54 | 58 | * page.open(url, function (status) { |
55 | 59 | * if (status != 'success') { |
56 | 60 | * console.log("HTTP request failed!"); |
57 | 61 | * } else { |
58 | 62 | * console.log(page.content); |
59 | 63 | * } |
60 | | - * |
| 64 | + * |
61 | 65 | * page.close(); |
62 | 66 | * phantom.exit(); |
63 | 67 | * }); |
64 | | - * |
| 68 | + * |
65 | 69 | * -- crawl.js end |
66 | 70 | * </pre> |
67 | 71 | * 具体项目时可以将以上js代码复制下来使用 |
68 | | - * <p> |
| 72 | + * |
69 | 73 | * example: |
70 | | - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); |
71 | | - * |
| 74 | + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); |
| 75 | + * |
72 | 76 | * @param phantomJsCommand phantomJsCommand |
73 | | - * @param crawlJsPath crawlJsPath |
| 77 | + * @param crawlJsPath crawlJsPath |
74 | 78 | */ |
75 | 79 | public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { |
76 | | - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; |
77 | | - PhantomJSDownloader.crawlJsPath = crawlJsPath; |
| 80 | + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; |
| 81 | + PhantomJSDownloader.crawlJsPath = crawlJsPath; |
78 | 82 | } |
79 | | - |
| 83 | + |
80 | 84 | private void initPhantomjsCrawlPath() { |
81 | | - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() |
82 | | - + System.getProperty("file.separator") + "crawl.js "; |
| 85 | + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; |
83 | 86 | } |
84 | 87 |
|
85 | 88 | @Override |
86 | 89 | public Page download(Request request, Task task) { |
87 | 90 | if (logger.isInfoEnabled()) { |
88 | 91 | logger.info("downloading page: " + request.getUrl()); |
89 | 92 | } |
90 | | - |
91 | | - Page page = Page.fail(); |
92 | | - try { |
93 | | - String content = getPage(request); |
94 | | - if (!content.contains("HTTP request failed")) { |
95 | | - page.setDownloadSuccess(true); |
96 | | - page.setRawText(content); |
97 | | - page.setUrl(new PlainText(request.getUrl())); |
| 93 | + String content = getPage(request); |
| 94 | + if (content.contains("HTTP request failed")) { |
| 95 | + for (int i = 1; i <= getRetryNum(); i++) { |
| 96 | + content = getPage(request); |
| 97 | + if (!content.contains("HTTP request failed")) { |
| 98 | + break; |
| 99 | + } |
| 100 | + } |
| 101 | + if (content.contains("HTTP request failed")) { |
| 102 | + //when failed |
| 103 | + Page page = new Page(); |
98 | 104 | page.setRequest(request); |
99 | | - page.setStatusCode(200); |
| 105 | + return page; |
100 | 106 | } |
101 | | - onSuccess(request); |
102 | | - } catch (Exception e) { |
103 | | - onError(request, e); |
104 | | - logger.warn("download page {} error", request.getUrl(), e); |
105 | 107 | } |
| 108 | + |
| 109 | + Page page = new Page(); |
| 110 | + page.setRawText(content); |
| 111 | + page.setUrl(new PlainText(request.getUrl())); |
| 112 | + page.setRequest(request); |
| 113 | + page.setStatusCode(200); |
106 | 114 | return page; |
107 | 115 | } |
108 | 116 |
|
109 | 117 | @Override |
110 | 118 | public void setThread(int threadNum) { |
111 | | - // ignore |
| 119 | + this.threadNum = threadNum; |
112 | 120 | } |
113 | 121 |
|
114 | | - protected String getPage(Request request) throws Exception { |
115 | | - String url = request.getUrl(); |
116 | | - Runtime runtime = Runtime.getRuntime(); |
117 | | - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); |
118 | | - InputStream is = process.getInputStream(); |
119 | | - BufferedReader br = new BufferedReader(new InputStreamReader(is)); |
120 | | - StringBuilder builder = new StringBuilder(); |
121 | | - String line; |
122 | | - while ((line = br.readLine()) != null) { |
123 | | - builder.append(line).append("\n"); |
| 122 | + protected String getPage(Request request) { |
| 123 | + try { |
| 124 | + String url = request.getUrl(); |
| 125 | + Runtime runtime = Runtime.getRuntime(); |
| 126 | + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); |
| 127 | + InputStream is = process.getInputStream(); |
| 128 | + BufferedReader br = new BufferedReader(new InputStreamReader(is)); |
| 129 | + StringBuffer stringBuffer = new StringBuffer(); |
| 130 | + String line; |
| 131 | + while ((line = br.readLine()) != null) { |
| 132 | + stringBuffer.append(line).append("\n"); |
| 133 | + } |
| 134 | + return stringBuffer.toString(); |
| 135 | + } catch (IOException e) { |
| 136 | + e.printStackTrace(); |
124 | 137 | } |
125 | | - return builder.toString(); |
| 138 | + |
| 139 | + return null; |
| 140 | + } |
| 141 | + |
| 142 | + public int getRetryNum() { |
| 143 | + return retryNum; |
| 144 | + } |
| 145 | + |
| 146 | + public PhantomJSDownloader setRetryNum(int retryNum) { |
| 147 | + this.retryNum = retryNum; |
| 148 | + return this; |
126 | 149 | } |
127 | 150 | } |
0 commit comments