Skip to content
This repository was archived by the owner on Dec 30, 2021. It is now read-only.

Commit 940d3c7

Browse files
committed
Add readme, update code, update package.json
1 parent ed29119 commit 940d3c7

File tree

5 files changed

+53
-25
lines changed

5 files changed

+53
-25
lines changed

README.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,23 @@
1-
WORK IN PROGRESS, PLUGIN IS NOT READY YET!
2-
========================
1+
## Introduction
2+
A plugin for [website-scraper](https://github.com/s0ph1e/node-website-scraper) that returns the HTML of dynamic websites using PhantomJS.
3+
4+
## Installation
5+
```sh
6+
npm install website-scraper website-scraper-phantom
7+
```
8+
9+
## Usage
10+
```javascript
11+
const scrape = require('website-scraper');
12+
const phantomHtml = require('website-scraper-phantom');
13+
14+
scrape({
15+
urls: ['https://www.instagram.com/gopro/'],
16+
directory: '/path/to/save',
17+
httpResponseHandler: phantomHtml
18+
}).then(console.log).catch(console.log);
19+
```
20+
21+
## How it works
22+
It starts PhantomJS, which simply opens the page and waits until the page has loaded.
23+
This is far from ideal, because you will probably need to wait until some resource is loaded, click a button, or log in. Currently this module doesn't support such functionality.

index.js

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
var Promise = require('bluebird');
2-
var getPhantomHtml = require('./get-phantom-html.js');
1+
'use strict';
2+
3+
const Promise = require('bluebird');
4+
const getPhantomHtml = require('./src/get-phantom-html.js');
35

46
/**
57
* Makes phantom request if response contains html, returns original response body otherwise
68
* @param {Object} response - response object from `request` module
79
* @return {Promise} - resolved with body if success, rejected if error
810
*/
911
module.exports = (response) => {
10-
var contentType = response.headers['content-type'];
11-
var isHtml = contentType && contentType.split(';')[0] === 'text/html';
12+
const contentType = response.headers['content-type'];
13+
const isHtml = contentType && contentType.split(';')[0] === 'text/html';
1214
if (isHtml) {
1315
return getPhantomHtml(response.request.href);
1416
} else {

package.json

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
{
2-
"name": "website-scraper-phantom-html",
3-
"version": "0.0.0",
4-
"description": "",
2+
"name": "website-scraper-phantom",
3+
"version": "0.1.0",
4+
"description": "Plugin for website-scraper which receives html for dynamic websites using PhantomJS",
5+
"readmeFilename": "README.md",
56
"main": "index.js",
7+
"keywords": [
8+
"website-scraper",
9+
"phantomjs",
10+
"html"
11+
],
612
"dependencies": {
713
"phantomjs-prebuilt": "^2.1.14",
814
"system": "^1.2.0",
@@ -15,12 +21,12 @@
1521
},
1622
"repository": {
1723
"type": "git",
18-
"url": "git+https://github.com/s0ph1e/node-website-scraper-phantom-html.git"
24+
"url": "git+https://github.com/s0ph1e/node-website-scraper-phantom.git"
1925
},
20-
"author": "s0ph1e",
26+
"author": "Sophia Antipenko <[email protected]>",
2127
"license": "MIT",
2228
"bugs": {
23-
"url": "https://github.com/s0ph1e/node-website-scraper-phantom-html/issues"
29+
"url": "https://github.com/s0ph1e/node-website-scraper-phantom/issues"
2430
},
25-
"homepage": "https://github.com/s0ph1e/node-website-scraper-phantom-html#readme"
31+
"homepage": "https://github.com/s0ph1e/node-website-scraper-phantom"
2632
}

get-phantom-html.js renamed to src/get-phantom-html.js

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
var path = require('path');
2-
var phantomjs = require('phantomjs-prebuilt');
3-
var Promise = require('bluebird');
1+
'use strict';
42

5-
var scriptPath = path.join(__dirname, 'script.js');
3+
const path = require('path');
4+
const phantomjs = require('phantomjs-prebuilt');
5+
const Promise = require('bluebird');
66

7-
module.exports = function (url) {
7+
const scriptPath = path.join(__dirname, 'script.js');
8+
9+
module.exports = (url) => {
810
return new Promise((resolve, reject) => {
9-
var program = phantomjs.exec(scriptPath, url);
10-
var stdout = '';
11-
var stderr = '';
11+
const program = phantomjs.exec(scriptPath, url);
12+
let stdout = '';
13+
let stderr = '';
1214

1315
program.stdout.on('data', (data) => {
1416
stdout += data;

script.js renamed to src/script.js

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,10 @@ function done() {
1313
phantom.exit(0);
1414
}
1515

16-
page.onLoadFinished = function() {
17-
setTimeout(done, 1000);
18-
};
19-
2016
page.open(url, function (status) {
2117
if (status !== 'success') {
2218
system.stderr.write('Can\'t open page');
2319
phantom.exit(1);
2420
}
21+
setTimeout(done, 1000);
2522
});

0 commit comments

Comments
 (0)