-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebScraper.js
More file actions
120 lines (115 loc) · 3.82 KB
/
WebScraper.js
File metadata and controls
120 lines (115 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
var XLSX = require('xlsx');
var cheerio = require('cheerio');
var fs = require('fs');
var URL = require('url');
var request = require('request');
var scraper = require('google-search-scraper');
// Load the input workbook; only its first sheet is read.
var workbook = XLSX.readFileSync('web.xlsx');
var sheetName = workbook.SheetNames[0];
// processedCount and givenSites are not referenced anywhere else in the visible code.
var processedCount = 0;
// URLs reported by the Google scraper, in arrival order.
var arr = [];
var givenSites = [];
// Cheerio handle for the page currently being inspected; assigned in
// ProcessURLS and read by getTheChildren.
var $;
var sheet = workbook.Sheets[sheetName];
// result holds EVERY row of the sheet; each row is an array of formatted cell
// texts (undefined for empty cells). The rest of the script only reads
// result[0]: column 1 = search string, column 2 = site, columns 3+ = attributes.
var result = [];
var row;
var rowNum;
var colNum;
// A workbook without sheets leaves `sheet` undefined and an empty sheet has no
// '!ref' entry; in both cases there is nothing to decode, so leave result empty
// instead of throwing.
var range = (sheet && sheet['!ref']) ? XLSX.utils.decode_range(sheet['!ref']) : null;
if (range) {
  for (rowNum = range.s.r; rowNum <= range.e.r; rowNum++) {
    row = [];
    for (colNum = range.s.c; colNum <= range.e.c; colNum++) {
      var nextCell = sheet[
        XLSX.utils.encode_cell({r: rowNum, c: colNum})
      ];
      if (typeof nextCell === 'undefined') {
        row.push(void 0);
      } else {
        // .w is the cell's formatted text
        row.push(nextCell.w);
      }
    }
    result.push(row);
  }
}
if (result.length > 0) {
  // Once the sheet is read, hand its rows to the Google scraper to find the
  // URLs on the given site that match the given search terms.
  Googlescraper(result);
} else {
  console.error("web.xlsx contains no rows to process");
}
/**
 * Builds a site-restricted Google query from the first spreadsheet row and
 * passes every URL Google reports on to ProcessURLS.
 *
 * @param {Array<Array<string|undefined>>} result - Rows read from the sheet.
 *   result[0][0] must be the search string and result[0][1] the site to search.
 * @throws Rethrows any error reported by the scraper (from inside its callback).
 */
function Googlescraper(result){
  var site = "site:";
  // The first row, first column in the sheet is the search string
  var searchString = result[0][0];
  // The first row, second column in the sheet is the search site
  var searchSite = result[0][1];
  console.log("search site is "+searchSite);
  var queryforGoogle = site.concat(searchSite).concat(" ").concat(searchString);
  var options = {
    query: queryforGoogle,
    limit: 5
    //solver: dbc
  };
  scraper.search(options, function(err, url) {
    // This is called once for each result
    if (err) throw err;
    console.log(url);
    // Keep a record of every URL Google returned.
    arr.push(url);
    // Process only the URL that just arrived. Passing the whole accumulated
    // array here would re-fetch every earlier URL on each new result.
    ProcessURLS([url]);
  });
}
/**
 * Fetches every URL in `arr` and, for pages whose <title> contains the search
 * string from the spreadsheet (whitespace ignored on both sides), extracts the
 * requested attributes via getTheChildren.
 *
 * Relies on module-level state: `request` (called as
 * request(url, function (error, response, body))), `cheerio`, `result`, and
 * the shared `$` handle, which is reassigned for every page that loads.
 *
 * @param {string[]} arr - URLs to download and inspect.
 */
function ProcessURLS(arr){
  console.log("Array Length is"+arr.length);
  // forEach gives each callback its own `pageUrl`; indexing the array with a
  // shared `var` loop counter from inside the async callback reads past the end.
  arr.forEach(function (pageUrl) {
    request(pageUrl, function (error, response, body) {
      if (error) {
        console.error("Failed to fetch "+pageUrl+": "+error);
        return;
      }
      if (!response || response.statusCode !== 200) {
        console.error("Skipping "+pageUrl+": HTTP status "+(response && response.statusCode));
        return;
      }
      console.log(pageUrl);
      $ = cheerio.load(body);
      $('script').remove();
      $('noscript').remove();
      var title = $("title").text().replace(/\s/g,'');
      console.log("the title is "+title);
      var finalTitle = result[0][0].replace(/\s/g,'');
      // Make sure the scraper is on the right page by checking for the search
      // term in the page title. indexOf does a literal comparison, so search
      // terms containing regex metacharacters (e.g. "C++") are safe.
      if (title.indexOf(finalTitle) !== -1) {
        getTheChildren();
      }
    });
  });
}
/**
 * For every attribute named in the first spreadsheet row (third column
 * onwards), finds the table rows of the current page (`$`) whose text contains
 * the attribute and appends the text of those rows' children to
 * resultsOfScraper.txt. Blank cells are skipped.
 *
 * If the full attribute name matches nothing and it consists of several
 * space-separated words, each word is tried in turn and the first one that
 * yields text is used.
 *
 * Reads the module-level `result`, `$` and `fs`.
 */
function getTheChildren(){
  // Please go through the site before hardcoding the tag below. The default is
  // tr - this may differ between sites. E.g. if the content you are looking
  // for sits in paragraph tags, replace 'tr' with 'p'.
  var ROW_TAG = 'tr';

  // Text of the children of every row whose text contains `term`. The match is
  // done in a filter callback rather than a ":contains('...')" selector so that
  // quotes or other selector syntax in the term cannot break the query.
  function findRowText(term) {
    return $(ROW_TAG).filter(function (index, rowElement) {
      return $(rowElement).text().indexOf(term) !== -1;
    }).children().text();
  }

  result[0].slice(2).forEach(function (value) {
    if (value === undefined || value === null || value === '') {
      // Empty spreadsheet cell: nothing to look up.
      return;
    }
    var found = findRowText(value);
    if (found === "") {
      // The exact keyword does not occur on the page; fall back to its
      // individual words and take the first one that produces a hit.
      var inarr = String(value).split(" ");
      console.log("The value of inarr is "+inarr.length);
      if (inarr.length > 1) {
        for (var f = 0; f < inarr.length; f++) {
          if (inarr[f] === "") {
            // Produced by consecutive spaces; an empty word would match every row.
            continue;
          }
          found = findRowText(inarr[f]);
          if (found !== "") {
            break;
          }
        }
      }
    }
    var attribute = "The value of attibute"+value+" is "+found+"\n";
    // The results of the scraper are written into the file resultsOfScraper.txt
    fs.appendFile("resultsOfScraper.txt", attribute, function (err) {
      if (err) {
        console.log(err);
      }
    });
  });
}