This repository was archived by the owner on Dec 30, 2021. It is now read-only.
File tree Expand file tree Collapse file tree 5 files changed +53
-25
lines changed Expand file tree Collapse file tree 5 files changed +53
-25
lines changed Original file line number Diff line number Diff line change 1- WORK IN PROGRESS, PLUGIN IS NOT READY YET!
2- ========================
1+ ## Introduction
2+ Plugin for [website-scraper](https://github.com/s0ph1e/node-website-scraper) which returns HTML for dynamic websites using PhantomJS.
3+
4+ ## Installation
5+ ``` sh
6+ npm install website-scraper website-scraper-phantom
7+ ```
8+
9+ ## Usage
10+ ``` javascript
11+ const scrape = require (' website-scraper' );
12+ const phantomHtml = require (' website-scraper-phantom' );
13+
14+ scrape ({
15+ urls: [' https://www.instagram.com/gopro/' ],
16+ directory: ' /path/to/save' ,
17+ httpResponseHandler: phantomHtml
18+ }).then (console .log ).catch (console .log );
19+ ```
20+
21+ ## How it works
22+ It starts PhantomJS, which just opens the page and waits until the page is loaded.
23+ This is far from ideal, because you will probably need to wait until some resource is loaded, click a button, or log in. Currently this module doesn't support such functionality.
Original file line number Diff line number Diff line change 1- var Promise = require ( 'bluebird' ) ;
2- var getPhantomHtml = require ( './get-phantom-html.js' ) ;
1+ 'use strict' ;
2+
3+ const Promise = require ( 'bluebird' ) ;
4+ const getPhantomHtml = require ( './src/get-phantom-html.js' ) ;
35
46/**
57 * Makes phantom request if response contains html, returns original response body otherwise
68 * @param {Object } response - response object from `request` module
79 * @return {Promise } - resolved with body if success, rejected if error
810 */
911module . exports = ( response ) => {
10- var contentType = response . headers [ 'content-type' ] ;
11- var isHtml = contentType && contentType . split ( ';' ) [ 0 ] === 'text/html' ;
12+ const contentType = response . headers [ 'content-type' ] ;
13+ const isHtml = contentType && contentType . split ( ';' ) [ 0 ] === 'text/html' ;
1214 if ( isHtml ) {
1315 return getPhantomHtml ( response . request . href ) ;
1416 } else {
Original file line number Diff line number Diff line change 11{
2- "name" : " website-scraper-phantom-html" ,
3- "version" : " 0.0.0" ,
4- "description" : " " ,
2+ "name" : " website-scraper-phantom" ,
3+ "version" : " 0.1.0" ,
4+ "description" : " Plugin for website-scraper which receives html for dynamic websites using PhantomJS" ,
5+ "readmeFilename" : " README.md" ,
56 "main" : " index.js" ,
7+ "keywords" : [
8+ " website-scraper" ,
9+ " phantomjs" ,
10+ " html"
11+ ],
612 "dependencies" : {
713 "phantomjs-prebuilt" : " ^2.1.14" ,
814 "system" : " ^1.2.0" ,
1521 },
1622 "repository" : {
1723 "type" : " git" ,
18- "url" : " git+https://github.com/s0ph1e/node-website-scraper-phantom-html .git"
24+ "url" : " git+https://github.com/s0ph1e/node-website-scraper-phantom.git"
1925 },
20- "author" : " s0ph1e " ,
26+ "author" :
" Sophia Antipenko <[email protected] > " ,
2127 "license" : " MIT" ,
2228 "bugs" : {
23- "url" : " https://github.com/s0ph1e/node-website-scraper-phantom-html /issues"
29+ "url" : " https://github.com/s0ph1e/node-website-scraper-phantom/issues"
2430 },
25- "homepage" : " https://github.com/s0ph1e/node-website-scraper-phantom-html#readme "
31+ "homepage" : " https://github.com/s0ph1e/node-website-scraper-phantom"
2632}
Original file line number Diff line number Diff line change 1- var path = require ( 'path' ) ;
2- var phantomjs = require ( 'phantomjs-prebuilt' ) ;
3- var Promise = require ( 'bluebird' ) ;
1+ 'use strict' ;
42
5- var scriptPath = path . join ( __dirname , 'script.js' ) ;
3+ const path = require ( 'path' ) ;
4+ const phantomjs = require ( 'phantomjs-prebuilt' ) ;
5+ const Promise = require ( 'bluebird' ) ;
66
7- module . exports = function ( url ) {
7+ const scriptPath = path . join ( __dirname , 'script.js' ) ;
8+
9+ module . exports = ( url ) => {
810 return new Promise ( ( resolve , reject ) => {
9- var program = phantomjs . exec ( scriptPath , url ) ;
10- var stdout = '' ;
11- var stderr = '' ;
11+ const program = phantomjs . exec ( scriptPath , url ) ;
12+ let stdout = '' ;
13+ let stderr = '' ;
1214
1315 program . stdout . on ( 'data' , ( data ) => {
1416 stdout += data ;
Original file line number Diff line number Diff line change @@ -13,13 +13,10 @@ function done() {
1313 phantom . exit ( 0 ) ;
1414}
1515
16- page . onLoadFinished = function ( ) {
17- setTimeout ( done , 1000 ) ;
18- } ;
19-
2016page . open ( url , function ( status ) {
2117 if ( status !== 'success' ) {
2218 system . stderr . write ( 'Can\'t open page' ) ;
2319 phantom . exit ( 1 ) ;
2420 }
21+ setTimeout ( done , 1000 ) ;
2522} ) ;
You can’t perform that action at this time.
0 commit comments