Skip to content

Commit 2f9decb

Browse files
author
Codoff
committed
Support date pattern in URLs
1 parent 6b6c188 commit 2f9decb

File tree

7 files changed

+286
-5
lines changed

7 files changed

+286
-5
lines changed

extension/background_page/background_script.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ chrome.runtime.onMessage.addListener(
8686
});
8787
}
8888
catch (e) {
89-
console.log("Scraper execution cancelled".e);
89+
console.log("Scraper execution cancelled", e);
9090
}
9191

9292
return true;

extension/devtools/views/SitemapEditMetadata.html

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,63 @@
3737
</div>
3838
</div>
3939
{{/startUrl.push}}
40+
41+
<div class="col-lg-10 col-lg-offset-1">
42+
Supported URL patterns:<br>
43+
1. <b>Numeric</b> with optional step and zero padding – [<i>START</i>-<i>END</i>:<i>STEP</i>] – [001-010:10]<br>
44+
2. <b>Date interval</b> – [date&lt;<i>PATTERN</i>&gt;&lt;<i>START</i>&gt;&lt;<i>END</i>&gt;] – [date&lt;dd.MM.yyyy&gt;&lt;01.01.2017&gt;&lt;now&gt;]<br>
45+
<ul>
46+
date placeholder may be <b>yesterday</b> / <b>now</b> / <b>tomorrow</b><br>
47+
other template components (in Java style)
48+
<ul>
49+
<table border="0">
50+
<tbody>
51+
<tr>
52+
<td>yyyy</td>
53+
<td>full year (4 digits)</td>
54+
</tr>
55+
<tr>
56+
<td>yy</td>
57+
<td>last 2 digits of year</td>
58+
</tr>
59+
<tr>
60+
<td>MMM &emsp;</td>
61+
<td>Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec</td>
62+
</tr>
63+
<tr>
64+
<td>MM</td>
65+
<td>month number (01-12)</td>
66+
</tr>
67+
<tr>
68+
<td>dd</td>
69+
<td>day of month</td>
70+
</tr>
71+
<!-- <tr>
72+
<td></td>
73+
<td></td>
74+
</tr>
75+
<tr>
76+
<td>hh</td>
77+
<td>hours</td>
78+
</tr>
79+
<tr>
80+
<td>mm</td>
81+
<td>minutes</td>
82+
</tr>
83+
<tr>
84+
<td>ss</td>
85+
<td>seconds</td>
86+
</tr>
87+
<tr>
88+
<td>sss</td>
89+
<td>milliseconds</td>
90+
</tr>-->
91+
</tbody>
92+
</table>
93+
</ul>
94+
</ul>
95+
</div>
96+
4097
<div class="form-group">
4198
<div class="col-lg-offset-1 col-lg-10">
4299
<button type="submit" class="btn btn-primary" id="submit-edit-sitemap">Save Sitemap</button>

extension/manifest.json

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"manifest_version": 2,
3-
"version": "0.2.0.10",
4-
"name": "Web Scraper",
5-
"short_name": "Web Scraper",
3+
"version": "0.3.0.00",
4+
"name": "Web Scraper (Codoff's mod)",
5+
"short_name": "Web Scraper (Codoff's mod)",
66
"description": "Tool for data extraction from websites",
77
"permissions": ["<all_urls>", "tabs", "notifications", "storage", "unlimitedStorage", "downloads"],
88
"icons": {
@@ -49,7 +49,11 @@
4949
"scripts/Store.js",
5050
"scripts/ContentScript.js",
5151
"scripts/BackgroundScript.js",
52-
"background_page/background_script.js"
52+
"background_page/background_script.js",
53+
54+
"scripts/DateUtils/SimpleDateFormatter.js",
55+
"scripts/DateUtils/DateRoller.js",
56+
"scripts/DateUtils/DatePatternSupport.js"
5357
]
5458
},
5559
"web_accessible_resources": [
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Support for "[date<dd.MM.yyyy><01.01.2016><now>]" pattern
3+
*
4+
* @author © Denis Bakhtenkov [email protected]
5+
* @version 2016
6+
*/
7+
8+
/* global DateRoller */
9+
10+
var DatePatternSupport = {
    /**
     * Expands "[date<PATTERN><START><END>]" placeholders in start URLs into
     * one URL per day of the interval (both ends included).
     *
     * START and END are dates written in PATTERN, or one of the keywords
     * "now", "yesterday", "tomorrow". A URL may contain several placeholders;
     * every combination of their days is produced. URLs without a placeholder
     * are returned unchanged.
     *
     * @param {String|Array} startUrl a single URL or an array of URLs
     * @returns {Array} expanded list of URLs, in input order
     */
    expandUrl: function (startUrl) {

        // The three components must not contain angle brackets. Greedy ".*"
        // groups would swallow a second placeholder (or any later ">]") into
        // the first one.
        var placeholderRe = /^(.*?)\[date<([^<>]*)><([^<>]*)><([^<>]*)>\](.*)$/;

        /**
         * Turns the keywords now / yesterday / tomorrow into a date string
         * written in the formatter's pattern; anything else is returned as is.
         *
         * @param {String} value
         * @param {SimpleDateFormatter} formatter
         * @returns {String}
         */
        function resolveKeyword(value, formatter) {
            var date = new Date();
            switch (value) {
                case "now":
                    break;
                case "yesterday":
                    date.setDate(date.getDate() - 1);
                    break;
                case "tomorrow":
                    date.setDate(date.getDate() + 1);
                    break;
                default:
                    return value;
            }
            return formatter.format(date);
        }

        /**
         * Expands the first placeholder of the url and, recursively, the
         * placeholders that follow it.
         *
         * @param {String} url
         * @returns {Array}
         */
        function expandOne(url) {
            var matches = url.match(placeholderRe);
            if (!matches) {
                return [url];
            }

            // one formatter per placeholder - no state shared between URLs
            var formatter = new SimpleDateFormatter(matches[2]);
            var startDate = formatter.parse(resolveKeyword(matches[3], formatter));
            var endDate = formatter.parse(resolveKeyword(matches[4], formatter));

            // The lazy prefix never contains a complete placeholder, so only
            // the suffix can hold further ones.
            var tails = expandOne(matches[5]);

            var expanded = [];
            DateRoller.days(startDate, endDate).forEach(function (date) {
                var head = matches[1] + formatter.format(date);
                tails.forEach(function (tail) {
                    expanded.push(head + tail);
                });
            });
            return expanded;
        }

        var startUrls = startUrl;
        // single start url
        if (startUrl.push === undefined) {
            startUrls = [startUrls];
        }

        var urls = [];
        startUrls.forEach(function (url) {
            expandOne(url).forEach(function (expandedUrl) {
                urls.push(expandedUrl);
            });
        });

        return urls;
    }

};
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Iterator from first day to second
3+
*
4+
* @author © Denis Bakhtenkov [email protected]
5+
* @version 2016
6+
*/
7+
8+
var DateRoller = {

    /**
     * Lists every calendar day from one date to another, both ends included.
     * Works in either direction: when `from` is later than `to` the days are
     * returned in descending order.
     *
     * Each returned Date keeps the time of day of `from`.
     *
     * @param {Date} from
     * @param {Date} to
     * @returns {Array} all days between From and To
     */
    days: function (from, to) {

        var MS_PER_DAY = 24 * 60 * 60 * 1000;

        /**
         * Serial number of the LOCAL calendar day of a date.
         *
         * The loop below steps with the local-time setDate(), so the days
         * compared here must be local days too. Flooring the raw UTC
         * timestamp would make the result depend on the time of day, the
         * timezone offset and DST shifts.
         *
         * @param {Date} date
         * @returns {Number}
         */
        function localDayNumber(date) {
            return Date.UTC(date.getFullYear(), date.getMonth(), date.getDate()) / MS_PER_DAY;
        }

        /**
         * @param {Date} first
         * @param {Date} second
         * @returns {Number} whole calendar days from second to first
         */
        function compareDays(first, second) {
            return localDayNumber(first) - localDayNumber(second);
        }

        var res = [];
        var curDate = new Date(from);
        var step = from <= to ? 1 : -1;

        // `from` is always emitted, even when either date is invalid
        // (the NaN comparison then ends the loop after the first pass).
        do {
            res.push(new Date(curDate));
            curDate.setDate(curDate.getDate() + step);
        } while (compareDays(curDate, to) * step <= 0);

        return res;
    }

};
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
/**
 * Formatter for Date, parse and format with pattern
 *
 * Supported pattern tokens: yyyy, yy, MMM, MM, dd, hh, mm, ss, sss.
 * Any other text in the pattern is treated as a literal.
 *
 * @author © Denis Bakhtenkov [email protected]
 * @version 2016
 * @param {String} pattern
 * default is dd.MM.yyyy
 * @returns {SimpleDateFormatter}
 */
var SimpleDateFormatter = function (pattern) {
    this.pattern = pattern || "dd.MM.yyyy";
    this.months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"];
};

/**
 * Return pattern
 * @returns {String}
 */
SimpleDateFormatter.prototype.getPattern = function () {
    return this.pattern;
};

/**
 * Renders a date using the formatter's pattern, e.g. 'dd.MM.yyyy hh:mm:ss'.
 * Every occurrence of every token is substituted; all values are taken in
 * local time and hh is the 0-23 hour.
 *
 * @param {Date} date
 * @returns {String}
 */
SimpleDateFormatter.prototype.format = function (date) {

    /**
     * Adding left 'zero' if value's length less than digits
     * @param {Number} value
     * @param {Number} digits
     * @returns {String}
     */
    function lzero(value, digits) {
        digits = digits || 2;
        var result = value.toString();
        while (result.length < digits) {
            result = "0" + result;
        }
        return result;
    }

    var values = {
        yyyy: String(date.getFullYear()),
        yy: lzero(date.getFullYear() % 100),
        MMM: this.months[date.getMonth()],
        MM: lzero(date.getMonth() + 1),
        dd: lzero(date.getDate()),
        hh: lzero(date.getHours()),
        mm: lzero(date.getMinutes()),
        sss: lzero(date.getMilliseconds(), 3),
        ss: lzero(date.getSeconds())
    };

    // Single pass over the pattern: longer tokens are listed before their
    // prefixes (yyyy before yy, MMM before MM, sss before ss), and text that
    // was already substituted is never scanned again.
    return this.pattern.replace(/yyyy|yy|MMM|MM|dd|hh|mm|sss|ss/g, function (token) {
        return values[token];
    });
};

/**
 * Parses a string written in the formatter's pattern, e.g.
 * 16.06.2016
 * dd.MM.yyyy
 *
 * Each value is read from the same position and with the same length as its
 * token in the pattern. Components missing from the pattern default to
 * 1 January 1970, 00:00:00.000 local time. A non-numeric value yields an
 * Invalid Date; an unknown MMM name leaves the month at its default.
 *
 * @param {String} string
 * @returns {Date}
 */
SimpleDateFormatter.prototype.parse = function (string) {

    var months = this.months;
    var parts = {year: 1970, month: 0, day: 1, hours: 0, minutes: 0, seconds: 0, millis: 0};

    // Plain functions instead of eval()'d code strings: eval is blocked by
    // the default content security policy of extension pages.
    var readers = {
        yyyy: function (value) {
            parts.year = parseInt(value, 10);
        },
        yy: function (value) {
            parts.year = parseInt(value, 10) + 2000;
        },
        MMM: function (value) {
            var index = months.indexOf(value);
            if (index !== -1) {
                parts.month = index;
            }
        },
        MM: function (value) {
            parts.month = parseInt(value, 10) - 1;
        },
        dd: function (value) {
            parts.day = parseInt(value, 10);
        },
        hh: function (value) {
            parts.hours = parseInt(value, 10);
        },
        mm: function (value) {
            parts.minutes = parseInt(value, 10);
        },
        sss: function (value) {
            parts.millis = parseInt(value, 10);
        },
        ss: function (value) {
            parts.seconds = parseInt(value, 10);
        }
    };

    var tokenRe = /yyyy|yy|MMM|MM|dd|hh|mm|sss|ss/g;
    var seen = {};
    var match;
    while ((match = tokenRe.exec(this.pattern)) !== null) {
        var token = match[0];
        // only the first occurrence of a token carries the value
        if (!seen[token]) {
            seen[token] = true;
            readers[token](string.substr(match.index, token.length));
        }
    }

    // Year, month and day are applied in ONE call. Setting them one by one
    // on an existing date overflows whenever the intermediate date does not
    // exist (e.g. day 31 + February), which silently shifted the month in
    // timezones west of UTC. setFullYear() is used rather than the Date
    // constructor because the constructor remaps years 0-99 to 1900-1999.
    var date = new Date(0);
    date.setFullYear(parts.year, parts.month, parts.day);
    date.setHours(parts.hours, parts.minutes, parts.seconds, parts.millis);

    return date;
};

extension/scripts/Sitemap.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
/* global DatePatternSupport */
2+
13
var Sitemap = function (sitemapObj) {
24
this.initData(sitemapObj);
35
};
@@ -66,6 +68,8 @@ Sitemap.prototype = {
6668
startUrls = [startUrls];
6769
}
6870

71+
startUrls = DatePatternSupport.expandUrl(startUrls);
72+
6973
var urls = [];
7074
startUrls.forEach(function(startUrl) {
7175

0 commit comments

Comments
 (0)