Skip to content

Commit 58d10b7

Browse files
committed
- fixed randomness interval
- added dynamic columns for table selector
1 parent 30c2703 commit 58d10b7

File tree

10 files changed

+131
-48
lines changed

10 files changed

+131
-48
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,9 @@
22
projectFilesBackup
33
extension.zip
44

5+
/.vs/web-scraper-chrome-extension/v15/.suo
6+
/.vs/web-scraper-chrome-extension/v15
7+
/.vs/VSWorkspaceState.json
8+
/.vs/slnx.sqlite
9+
/.vs/ProjectSettings.json
10+
/.vs/config/applicationhost.config

extension/background_page/background_script.js

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ chrome.runtime.onMessage.addListener(
6868
sitemap: sitemap,
6969
browser: browser,
7070
store: store,
71-
requestInterval: request.requestInterval
71+
requestInterval: request.requestInterval,
72+
requestIntervalRandomness: request.requestIntervalRandomness
7273
});
7374

7475
try {
@@ -81,8 +82,10 @@ chrome.runtime.onMessage.addListener(
8182
message: 'Finished scraping ' + sitemap._id
8283
}, function(id) {
8384
// notification showed
84-
});
85-
sendResponse();
85+
});
86+
// table selector can dynamically add columns (addMissingColumns Feature)
87+
var selectors = sitemap.selectors;
88+
sendResponse(selectors);
8689
});
8790
}
8891
catch (e) {

extension/content_script/content_script.js

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ chrome.runtime.onMessage.addListener(
88
var extractor = new DataExtractor(request);
99
var deferredData = extractor.getData();
1010
deferredData.done(function(data){
11-
console.log("dataextractor data", data);
12-
sendResponse(data);
11+
console.log("dataextractor data", data);
12+
var selectors = extractor.sitemap.selectors;
13+
sendResponse(data, selectors);
1314
});
1415
return true;
1516
}
@@ -18,8 +19,9 @@ chrome.runtime.onMessage.addListener(
1819
var extractor = new DataExtractor(request);
1920
var deferredData = extractor.getSingleSelectorData(request.parentSelectorIds, request.selectorId);
2021
deferredData.done(function(data){
21-
console.log("dataextractor data", data);
22-
sendResponse(data);
22+
console.log("dataextractor data", data);
23+
var selectors = extractor.sitemap.selectors;
24+
sendResponse(data, selectors);
2325
});
2426
return true;
2527
}
@@ -31,8 +33,8 @@ chrome.runtime.onMessage.addListener(
3133
console.log("received ContentScript request", request);
3234

3335
var deferredResponse = contentScript[request.fn](request.request);
34-
deferredResponse.done(function(response) {
35-
sendResponse(response);
36+
deferredResponse.done(function (response) {
37+
sendResponse(response, null);
3638
});
3739

3840
return true;

extension/devtools/views/SelectorEdit.html

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@
7878
</div>
7979
</div>
8080

81+
<!-- tableAddMissingColumns -->
82+
<div class="form-group feature feature-tableAddMissingColumns">
83+
<label for="tableAddMissingColumns" class="col-sm-2 control-label">Extract missing columns</label>
84+
<div class="col-sm-8">
85+
<div class="checkbox">
86+
<label>
87+
<input type="checkbox" name="tableAddMissingColumns" {{#selector.tableAddMissingColumns}} checked="checked" {{/selector.tableAddMissingColumns}}>
88+
</label>
89+
</div>
90+
</div>
91+
</div>
92+
8193
<!-- ClickType -->
8294
<div class="form-group feature feature-clickType">
8395
<label for="clickType" class="col-sm-2 control-label">Click type</label>

extension/scripts/ChromePopupBrowser.js

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,14 @@ ChromePopupBrowser.prototype = {
7575
parentSelectorId: parentSelectorId
7676
};
7777

78-
chrome.tabs.sendMessage(tab.id, message, function (data) {
79-
console.log("extracted data from web page", data);
78+
chrome.tabs.sendMessage(tab.id, message, function (data, selectors) {
79+
console.log("extracted data from web page", data);
80+
81+
if (selectors) {
82+
// table selector can dynamically add columns (addMissingColumns Feature)
83+
scope.scraper.sitemap.selectors = selectors;
84+
}
85+
8086
callback.call(scope, data);
8187
});
8288
}.bind(this));

extension/scripts/Controller.js

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -888,7 +888,8 @@ SitemapController.prototype = {
888888
var id = $("#edit-selector [name=id]").val();
889889
var selectorsSelector = $("#edit-selector [name=selector]").val();
890890
var tableDataRowSelector = $("#edit-selector [name=tableDataRowSelector]").val();
891-
var tableHeaderRowSelector = $("#edit-selector [name=tableHeaderRowSelector]").val();
891+
var tableHeaderRowSelector = $("#edit-selector [name=tableHeaderRowSelector]").val();
892+
var tableAddMissingColumns = $("#edit-selector [name=tableAddMissingColumns]").is(":checked");
892893
var clickElementSelector = $("#edit-selector [name=clickElementSelector]").val();
893894
var type = $("#edit-selector [name=type]").val();
894895
var clickElementUniquenessType = $("#edit-selector [name=clickElementUniquenessType]").val();
@@ -935,7 +936,8 @@ SitemapController.prototype = {
935936
var newSelector = new Selector({
936937
id: id,
937938
selector: selectorsSelector,
938-
tableHeaderRowSelector: tableHeaderRowSelector,
939+
tableHeaderRowSelector: tableHeaderRowSelector,
940+
tableAddMissingColumns: tableAddMissingColumns,
939941
tableDataRowSelector: tableDataRowSelector,
940942
clickElementSelector: clickElementSelector,
941943
clickElementUniquenessType: clickElementUniquenessType,
@@ -1067,14 +1069,17 @@ SitemapController.prototype = {
10671069
}
10681070

10691071
var requestInterval = $("input[name=requestInterval]").val();
1070-
var pageLoadDelay = $("input[name=pageLoadDelay]").val();
1072+
var pageLoadDelay = $("input[name=pageLoadDelay]").val();
1073+
var intervalRandomness = $("input[name=requestIntervalRandomness]").val();
1074+
10711075

10721076
var sitemap = this.state.currentSitemap;
10731077
var request = {
10741078
scrapeSitemap: true,
10751079
sitemap: JSON.parse(JSON.stringify(sitemap)),
10761080
requestInterval: requestInterval,
1077-
pageLoadDelay: pageLoadDelay
1081+
pageLoadDelay: pageLoadDelay,
1082+
requestIntervalRandomness: intervalRandomness
10781083
};
10791084

10801085
// show sitemap scraping panel
@@ -1083,7 +1088,9 @@ SitemapController.prototype = {
10831088
$("#submit-scrape-sitemap").closest(".form-group").hide();
10841089
$("#scrape-sitemap-config input").prop('disabled', true);
10851090

1086-
chrome.runtime.sendMessage(request, function (response) {
1091+
chrome.runtime.sendMessage(request, function (selectors) {
1092+
// table selector can dynamically add columns (addMissingColumns Feature)
1093+
sitemap.selectors = selectors;
10871094
this.browseSitemapData();
10881095
}.bind(this));
10891096
return false;
@@ -1093,7 +1100,8 @@ SitemapController.prototype = {
10931100
this.setStateEditSitemap(sitemap);
10941101
this.browseSitemapData();
10951102
},
1096-
browseSitemapData: function () {
1103+
browseSitemapData: function () {
1104+
10971105
this.setActiveNavigationButton('sitemap-browse');
10981106
var sitemap = this.state.currentSitemap;
10991107
this.store.getSitemapData(sitemap, function (data) {
@@ -1437,8 +1445,8 @@ SitemapController.prototype = {
14371445

14381446
if (response.length === 0) {
14391447
return;
1440-
}
1441-
var dataColumns = Object.keys(response[0]);
1448+
}
1449+
var dataColumns = Object.keys(response[0]);
14421450

14431451
console.log(dataColumns);
14441452

extension/scripts/Job.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ Job.prototype = {
6767
}
6868
}
6969
this.dataItems.push(result);
70-
}
70+
}
71+
// table selector can dynamically add columns (addMissingColumns Feature)
72+
sitemap.selectors = this.scraper.sitemap.selectors;
7173
console.log(job);
7274
callback(job);
7375
}.bind(this), this);

extension/scripts/Scraper.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,8 @@ Scraper.prototype = {
183183

184184
}.bind(this));
185185

186-
$.whenCallSequentially(deferredDatamanipulations).done(function() {
186+
$.whenCallSequentially(deferredDatamanipulations).done(function () {
187+
this.store.saveSitemap(this.sitemap, function () { });
187188
this.resultWriter.writeDocs(scrapedRecords, function () {
188189

189190
var now = (new Date()).getTime();

extension/scripts/Selector/SelectorTable.js

Lines changed: 49 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -33,24 +33,45 @@ var SelectorTable = {
3333
}
3434
}
3535
isVerticalRow = true;
36-
36+
3737
} else if ($headerRow.find("th").length) {
3838
$headerRow = $headerRow.find("th");
3939
} else if ($headerRow.find("td").length) {
4040
$headerRow = $headerRow.find("td");
4141
}
4242

43-
$headerRow.each(function (i) {
44-
var header = $(this).text().trim();
43+
$headerRow.each(function (i, value) {
44+
var header = $(value).text().trim();
4545
columns[header] = {
4646
index: i + 1,
4747
isVerticalHeader: isVerticalRow
4848
};
49-
});
49+
}.bind(this));
50+
51+
this.addMissingColumns($headerRow);
5052
}
5153
return columns;
5254
},
5355

56+
addMissingColumns(headerRow) {
57+
headerRow.each(function (i, value) {
58+
if (this.tableAddMissingColumns) {
59+
var header = $(value).text().trim();
60+
var column = $.grep(this.columns, function (h) {
61+
return h.name === header;
62+
});
63+
64+
if (column.length !== 1) {
65+
this.columns.push({
66+
header: header,
67+
name: header,
68+
extract: true
69+
});
70+
}
71+
}
72+
}.bind(this));
73+
},
74+
5475
getVerticalDataCells: function (table, dataSelector) {
5576
var selectors = $(table).find(dataSelector),
5677
isRow = selectors[0].nodeName === "TR",
@@ -70,13 +91,14 @@ var SelectorTable = {
7091
var index = (dataCell.cellIndex - 1 | dataCell.rowIndex);
7192
var headerCellName = $(dataCell).closest('tr').find("th:first-child").text().trim();
7293
var dataCellvalue = $(dataCell).text().trim();
94+
7395
var extractData = $.grep(this.columns, function (h) {
7496
return h.name === headerCellName && h.extract;
7597
}).length == 1;
7698

7799
if (extractData) {
78100
result[index][headerCellName] = dataCellvalue;
79-
}
101+
}
80102
}
81103
}.bind(this));
82104
}
@@ -98,27 +120,28 @@ var SelectorTable = {
98120
headerCellCount = objKeys.length,
99121
isVerticalHeader = headerCellCount && headerCells[Object.keys(headerCells)[0]].isVerticalHeader;
100122

101-
if (isVerticalHeader) {
102-
var results = this.getVerticalDataCells(table, dataSelector);
103-
result.push.apply(result, results);
104-
} else {
105-
$(table).find(dataSelector).each(function (i, dataCell) {
106-
var data = {};
107-
this.columns.forEach(function (headerCell) {
108-
if (headerCell.extract === true) {
109-
if (headerCells[headerCell.header] === undefined) {
110-
data[headerCell.name] = null;
111-
}
112-
else {
113-
var header = headerCells[headerCell.header];
114-
var rowText = $(dataCell).find(">:nth-child(" + header.index + ")").text().trim();
115-
data[headerCell.name] = rowText;
116-
}
123+
if (isVerticalHeader) {
124+
var results = this.getVerticalDataCells(table, dataSelector);
125+
result.push.apply(result, results);
126+
} else {
127+
$(table).find(dataSelector).each(function (i, dataCell) {
128+
var data = {};
129+
130+
this.columns.forEach(function (headerCell) {
131+
if (headerCell.extract === true) {
132+
if (headerCells[headerCell.header] === undefined) {
133+
data[headerCell.name] = null;
117134
}
118-
});
119-
result.push(data);
120-
}.bind(this));
121-
}
135+
else {
136+
var header = headerCells[headerCell.header];
137+
var rowText = $(dataCell).find(">:nth-child(" + header.index + ")").text().trim();
138+
data[headerCell.name] = rowText;
139+
}
140+
}
141+
});
142+
result.push(data);
143+
}.bind(this));
144+
}
122145
}.bind(this));
123146

124147
dfd.resolve(result);
@@ -137,7 +160,7 @@ var SelectorTable = {
137160
},
138161

139162
getFeatures: function () {
140-
return ['multiple', 'columns', 'delay', 'tableDataRowSelector', 'tableHeaderRowSelector']
163+
return ['multiple', 'columns', 'delay', 'tableDataRowSelector', 'tableHeaderRowSelector', 'tableAddMissingColumns']
141164
},
142165

143166
getItemCSSSelector: function () {

extension/scripts/Store.js

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,27 @@ StoreScrapeResultWriter.prototype = {
1414
if(docs.length === 0) {
1515
callback();
1616
}
17-
else {
17+
else {
18+
19+
// get all keys of the objects
20+
var keys = [];
21+
docs.forEach(function (doc) {
22+
for (var key in doc) {
23+
if (doc.hasOwnProperty(key) && keys.indexOf(key) === -1) { keys.push(key); }
24+
};
25+
});
26+
27+
// add missing keys to objects
28+
// This can happen if same element containing different properties <table>
29+
docs.forEach(function (doc) {
30+
var objKeys = Object.keys(doc)
31+
keys.forEach(function (key) {
32+
if (!(key in doc)) {
33+
doc[key] = "";
34+
}
35+
});
36+
});
37+
1838
this.db.bulkDocs({docs:docs}, function(err, response) {
1939
if(err !== null) {
2040
console.log("Error while persisting scraped data to db", err);

0 commit comments

Comments
 (0)