Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ LGPLv3

## Changelog

### v0.3
* Added date pattern support in start URLs
* Added a pagination limit to the Element click selector and Element scroll down selector
* Fixed a minor bug

### v0.2
* Added Element click selector
* Added Element scroll down selector
Expand Down
2 changes: 1 addition & 1 deletion extension/background_page/background_script.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ chrome.runtime.onMessage.addListener(
});
}
catch (e) {
console.log("Scraper execution cancelled".e);
console.log("Scraper execution cancelled", e);
}

return true;
Expand Down
4 changes: 4 additions & 0 deletions extension/devtools/devtools_scraper_panel.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
<script src="../scripts/Controller.js"></script>

<script src="../scripts/App.js"></script>

<script src="../scripts/DateUtils/SimpleDateFormatter.js"></script>
<script src="../scripts/DateUtils/DateRoller.js"></script>
<script src="../scripts/DateUtils/DatePatternSupport.js"></script>
</head>
<body></body>
</html>
26 changes: 18 additions & 8 deletions extension/devtools/views/SelectorEdit.html
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,25 @@
</div>
</div>

<!-- ClickType -->
<div class="form-group feature feature-clickType">
<label for="clickType" class="col-lg-1 control-label">Click type</label>
<!-- ClickType -->
<div class="form-group feature feature-clickType">
<label for="clickType" class="col-lg-1 control-label">Click type</label>

<div class="input-group col-lg-10">
<select class="form-control" id="clickType" name="clickType">
<option value="clickOnce">Click once (pagination, tabs)</option>
<option value="clickMore">Click more (click to load more elements. Stops when no new elements with unique text content are found.)</option>
</select>
</div>
</div>

<!-- Pagination Limit -->
<div class="form-group feature feature-paginationLimit">
<label for="paginationLimit" class="col-lg-1 control-label">Pagination limit</label>

<div class="input-group col-lg-10">
<select class="form-control" id="clickType" name="clickType">
<option value="clickOnce">Click once (pagination, tabs)</option>
<option value="clickMore">Click more (click to load more elements. Stops when no new elements with unique text content are found.)</option>
</select>
<div class="col-lg-10">
<input type="text" class="form-control" name="paginationLimit" id="paginationLimit"
placeholder="empty = unlimited" value="{{selector.paginationLimit}}">
</div>
</div>

Expand Down
57 changes: 57 additions & 0 deletions extension/devtools/views/SitemapEditMetadata.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,63 @@
</div>
</div>
{{/startUrl.push}}

<div class="col-lg-10 col-lg-offset-1">
Supported URL patterns:<br>
1. <b>Numeric</b> with optional step and zero padding – [<i>START</i>-<i>END</i>:<i>STEP</i>] – [001-010:10]<br>
2. <b>Date interval</b> – [date&lt;<i>PATTERN</i>&gt;&lt;<i>START</i>&gt;&lt;<i>END</i>&gt;] – [date&lt;dd.MM.yyyy&gt;&lt;01.01.2017&gt;&lt;now&gt;]<br>
<ul>
date placeholder may be <b>yesterday</b> / <b>now</b> / <b>tomorrow</b><br>
other template components (in Java style)
<ul>
<table border="0">
<tbody>
<tr>
<td>yyyy</td>
<td>full year (4 digits)</td>
</tr>
<tr>
<td>yy</td>
<td>last 2 digits of year</td>
</tr>
<tr>
<td>MMM &emsp;</td>
<td>Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec</td>
</tr>
<tr>
<td>MM</td>
<td>month number (01-12)</td>
</tr>
<tr>
<td>dd</td>
<td>day of month</td>
</tr>
<!-- <tr>
<td></td>
<td></td>
</tr>
<tr>
<td>hh</td>
<td>hours</td>
</tr>
<tr>
<td>mm</td>
<td>minutes</td>
</tr>
<tr>
<td>ss</td>
<td>seconds</td>
</tr>
<tr>
<td>sss</td>
<td>milliseconds</td>
</tr>-->
</tbody>
</table>
</ul>
</ul>
</div>

<div class="form-group">
<div class="col-lg-offset-1 col-lg-10">
<button type="submit" class="btn btn-primary" id="submit-edit-sitemap">Save Sitemap</button>
Expand Down
12 changes: 8 additions & 4 deletions extension/manifest.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"manifest_version": 2,
"version": "0.2.0.10",
"name": "Web Scraper",
"short_name": "Web Scraper",
"version": "0.3.0.10",
"name": "Web Scraper (Codoff's mod)",
"short_name": "Web Scraper (Codoff's mod)",
"description": "Tool for data extraction from websites",
"permissions": ["<all_urls>", "tabs", "notifications", "storage", "unlimitedStorage", "downloads"],
"icons": {
Expand Down Expand Up @@ -49,7 +49,11 @@
"scripts/Store.js",
"scripts/ContentScript.js",
"scripts/BackgroundScript.js",
"background_page/background_script.js"
"background_page/background_script.js",

"scripts/DateUtils/SimpleDateFormatter.js",
"scripts/DateUtils/DateRoller.js",
"scripts/DateUtils/DatePatternSupport.js"
]
},
"web_accessible_resources": [
Expand Down
20 changes: 19 additions & 1 deletion extension/scripts/Controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,22 @@ SitemapController.prototype = {
}
}
},
// Validation rules for the "paginationLimit" form field.
// The field is optional: an empty value passes, anything else must be numeric and at least 1.
paginationLimit: {
validators: {
numeric: {
message: 'Pagination limit must be numeric or empty'
},
callback: {
message: 'Pagination limit must be 1 at least',
callback: function(value, validator) {
// An empty (falsy) value is accepted without further checks.
if(!value) {
return true;
}
// NOTE(review): value is presumably the raw input string; `>=` coerces it to a number — confirm.
return value >= 1;
}
}
}
},
parentSelectors: {
validators: {
notEmpty: {
Expand Down Expand Up @@ -857,6 +873,7 @@ SitemapController.prototype = {
var type = $("#edit-selector [name=type]").val();
var clickElementUniquenessType = $("#edit-selector [name=clickElementUniquenessType]").val();
var clickType = $("#edit-selector [name=clickType]").val();
var paginationLimit = $("#edit-selector [name=paginationLimit]").val();
var discardInitialElements = $("#edit-selector [name=discardInitialElements]").is(":checked");
var multiple = $("#edit-selector [name=multiple]").is(":checked");
var downloadImage = $("#edit-selector [name=downloadImage]").is(":checked");
Expand Down Expand Up @@ -889,6 +906,7 @@ SitemapController.prototype = {
clickElementSelector: clickElementSelector,
clickElementUniquenessType: clickElementUniquenessType,
clickType: clickType,
paginationLimit: paginationLimit,
discardInitialElements: discardInitialElements,
type: type,
multiple: multiple,
Expand Down Expand Up @@ -1345,7 +1363,7 @@ SitemapController.prototype = {
chrome.runtime.sendMessage(request, function (response) {

if (response.length === 0) {
return
return;
}
var dataColumns = Object.keys(response[0]);

Expand Down
64 changes: 64 additions & 0 deletions extension/scripts/DateUtils/DatePatternSupport.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Support for "[date<dd.MM.yyyy><01.01.2016><now>]" pattern
*
* @author © Denis Bakhtenkov [email protected]
* @version 2016
*/

/* global DateRoller */

var DatePatternSupport = {
    /**
     * Expands start URLs that contain a date-interval placeholder of the form
     * "[date<PATTERN><START><END>]" into one URL per date of the interval.
     *
     * PATTERN is handed to SimpleDateFormatter (defined outside this file);
     * START and END are either dates written in that pattern or one of the
     * keywords "now", "yesterday", "tomorrow". The dates of the interval are
     * produced by DateRoller.days(). URLs without a placeholder are returned
     * unchanged. Only the first placeholder of a URL is expanded.
     *
     * @param {String|Array} startUrl a single start URL or an array of them
     * @returns {Array} the expanded list of URLs, in input order
     */
    expandUrl: function (startUrl) {

        // prefix, PATTERN, START, END, suffix. The three <...> groups exclude
        // angle brackets so that "<", ">" or ">]" occurring later in the URL
        // cannot be swallowed into a capture by greedy matching.
        var placeholderRe = /^(.*?)\[date<([^<>]*)><([^<>]*)><([^<>]*)>\](.*)$/;

        /**
         * Replaces the keywords "now" / "yesterday" / "tomorrow" with the
         * corresponding date rendered by the given formatter; any other value
         * is returned untouched.
         *
         * @param {String} value keyword or already formatted date
         * @param {Object} formatter object exposing format(Date)
         * @returns {String}
         */
        function resolveKeyword(value, formatter) {
            var date = new Date();
            switch (value) {
                case "now":
                    break;
                case "yesterday":
                    date.setDate(date.getDate() - 1);
                    break;
                case "tomorrow":
                    date.setDate(date.getDate() + 1);
                    break;
                default:
                    return value;
            }
            return formatter.format(date);
        }

        // accept both a single start url and a list of them
        var startUrls = Array.isArray(startUrl) ? startUrl : [startUrl];

        var urls = [];
        startUrls.forEach(function (url) {
            var matches = url.match(placeholderRe);
            if (!matches) {
                urls.push(url);
                return;
            }

            var formatter = new SimpleDateFormatter(matches[2]);
            // Keywords are formatted and parsed back, so they go through the
            // same pattern round trip as explicitly written dates.
            var startDate = formatter.parse(resolveKeyword(matches[3], formatter));
            var endDate = formatter.parse(resolveKeyword(matches[4], formatter));

            DateRoller.days(startDate, endDate).forEach(function (date) {
                urls.push(matches[1] + formatter.format(date) + matches[5]);
            });
        });

        return urls;
    }

};
41 changes: 41 additions & 0 deletions extension/scripts/DateUtils/DateRoller.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Builds the list of days from a first date to a second one
*
* @author © Denis Bakhtenkov [email protected]
* @version 2016
*/

var DateRoller = {

    /**
     * Lists the dates from `from` towards `to`, one calendar-day step at a
     * time. The walk goes forward when `from <= to` and backward otherwise.
     * `from` itself is always the first element, so the result is never empty.
     * Neither argument is modified.
     *
     * @param {Date} from first date of the list
     * @param {Date} to date the walk stops at
     * @returns {Array} Date objects, starting with a copy of `from`
     */
    days: function (from, to) {
        var MS_PER_DAY = 24 * 60 * 60 * 1000;

        // +1 walks forward in time, -1 backward
        var direction = (from <= to) ? 1 : -1;

        // whole days since the epoch for the target; it never changes in the loop
        var targetDayIndex = Math.floor(to / MS_PER_DAY);

        var cursor = new Date(from);
        var collected = [new Date(cursor)];
        cursor.setDate(cursor.getDate() + direction);

        // keep going while the cursor has not passed the target day
        while ((Math.floor(cursor / MS_PER_DAY) - targetDayIndex) * direction <= 0) {
            collected.push(new Date(cursor));
            cursor.setDate(cursor.getDate() + direction);
        }

        return collected;
    }

};
Loading