diff --git a/README.md b/README.md index 5a64886a..52fd95a0 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ LGPLv3 ## Changelog +### v0.3 + * Added date pattern in start urls + * Added limit pagination in Element click selector and Element scroll down selector + * Fixed little bug + ### v0.2 * Added Element click selector * Added Element scroll down selector diff --git a/extension/background_page/background_script.js b/extension/background_page/background_script.js index 480287e8..9d3e73b8 100644 --- a/extension/background_page/background_script.js +++ b/extension/background_page/background_script.js @@ -86,7 +86,7 @@ chrome.runtime.onMessage.addListener( }); } catch (e) { - console.log("Scraper execution cancelled".e); + console.log("Scraper execution cancelled", e); } return true; diff --git a/extension/devtools/devtools_scraper_panel.html b/extension/devtools/devtools_scraper_panel.html index 6f7c4e30..9c3cbcf1 100644 --- a/extension/devtools/devtools_scraper_panel.html +++ b/extension/devtools/devtools_scraper_panel.html @@ -34,6 +34,10 @@ + + + + \ No newline at end of file diff --git a/extension/devtools/views/SelectorEdit.html b/extension/devtools/views/SelectorEdit.html index 04928fc9..11fded0e 100644 --- a/extension/devtools/views/SelectorEdit.html +++ b/extension/devtools/views/SelectorEdit.html @@ -77,15 +77,25 @@ - -
- + +
+ + +
+ +
+
+ + +
+ -
- +
+
diff --git a/extension/devtools/views/SitemapEditMetadata.html b/extension/devtools/views/SitemapEditMetadata.html index 544df6dd..5cd78c4d 100644 --- a/extension/devtools/views/SitemapEditMetadata.html +++ b/extension/devtools/views/SitemapEditMetadata.html @@ -37,6 +37,63 @@
{{/startUrl.push}} + +
+ Supported URL patterns:
+ 1. Numeric with optional step and zero padding – [START-END:STEP] – [001-010:10]
+ 2. Date interval – [date<PATTERN><START><END>] – [date<dd.MM.yyyy><01.01.2017><now>]
+ +
+
diff --git a/extension/manifest.json b/extension/manifest.json index 8080fcdc..c85acc2f 100644 --- a/extension/manifest.json +++ b/extension/manifest.json @@ -1,8 +1,8 @@ { "manifest_version": 2, - "version": "0.2.0.10", - "name": "Web Scraper", - "short_name": "Web Scraper", + "version": "0.3.0.10", + "name": "Web Scraper (Codoff's mod)", + "short_name": "Web Scraper (Codoff's mod)", "description": "Tool for data extraction from websites", "permissions": ["", "tabs", "notifications", "storage", "unlimitedStorage", "downloads"], "icons": { @@ -49,7 +49,11 @@ "scripts/Store.js", "scripts/ContentScript.js", "scripts/BackgroundScript.js", - "background_page/background_script.js" + "background_page/background_script.js", + + "scripts/DateUtils/SimpleDateFormatter.js", + "scripts/DateUtils/DateRoller.js", + "scripts/DateUtils/DatePatternSupport.js" ] }, "web_accessible_resources": [ diff --git a/extension/scripts/Controller.js b/extension/scripts/Controller.js index 3caa4cab..68b93b11 100644 --- a/extension/scripts/Controller.js +++ b/extension/scripts/Controller.js @@ -693,6 +693,22 @@ SitemapController.prototype = { } } }, + paginationLimit: { + validators: { + numeric: { + message: 'Pagination limit must be numeric or empty' + }, + callback: { + message: 'Pagination limit must be 1 at least', + callback: function(value, validator) { + if(!value) { + return true; + } + return value >= 1; + } + } + } + }, parentSelectors: { validators: { notEmpty: { @@ -857,6 +873,7 @@ SitemapController.prototype = { var type = $("#edit-selector [name=type]").val(); var clickElementUniquenessType = $("#edit-selector [name=clickElementUniquenessType]").val(); var clickType = $("#edit-selector [name=clickType]").val(); + var paginationLimit = $("#edit-selector [name=paginationLimit]").val(); var discardInitialElements = $("#edit-selector [name=discardInitialElements]").is(":checked"); var multiple = $("#edit-selector [name=multiple]").is(":checked"); var downloadImage = $("#edit-selector [name=downloadImage]").is(":checked"); @@ -889,6 +906,7 @@ SitemapController.prototype = { clickElementSelector: clickElementSelector, clickElementUniquenessType: clickElementUniquenessType, clickType: clickType, + paginationLimit: paginationLimit, discardInitialElements: discardInitialElements, type: type, multiple: multiple, @@ -1345,7 +1363,7 @@ SitemapController.prototype = { chrome.runtime.sendMessage(request, function (response) { if (response.length === 0) { - return + return; } var dataColumns = Object.keys(response[0]); diff --git a/extension/scripts/DateUtils/DatePatternSupport.js b/extension/scripts/DateUtils/DatePatternSupport.js new file mode 100644 index 00000000..ad3192b8 --- /dev/null +++ b/extension/scripts/DateUtils/DatePatternSupport.js @@ -0,0 +1,64 @@ +/* + * Support for "[date<01.01.2016>]" pattern + * + * @author © Denis Bakhtenkov denis.bakhtenkov@gmail.com + * @version 2016 + */ + +/* global DateRoller */ + +var DatePatternSupport = { + /** + * + * @param {String} startUrl + * @returns {Array} + */ + expandUrl: function (startUrl) { + + function nowSupport(d) { + switch (d) { + case "now": + return df.format(new Date()); + case "yesterday": + var date = new Date(); + date.setDate(date.getDate() - 1); + return df.format(new Date(date)); + case "tomorrow": + var date = new Date(); + date.setDate(date.getDate() + 1); + return df.format(new Date(date)); + default: + return d; + } + } + + var startUrls = startUrl; + // single start url + if (startUrl.push === undefined) { + startUrls = [startUrls]; + } + + var df; + var urls = []; + startUrls.forEach(function (startUrl) { + var re = /^(.*?)\[date<(.*)><(.*)><(.*)>\](.*)$/; + var matches = startUrl.match(re); + if (matches) { + df = new SimpleDateFormatter(matches[2]); + var startDate = df.parse(nowSupport(matches[3])); + var endDate = df.parse(nowSupport(matches[4])); + + var roller = DateRoller.days(startDate, endDate); + roller.forEach(function (date) { + urls.push(matches[1] + df.format(date) + matches[5]); + }); + + } else { + urls.push(startUrl); + } + }); + + return urls; + } + +}; \ No newline at end of file diff --git a/extension/scripts/DateUtils/DateRoller.js b/extension/scripts/DateUtils/DateRoller.js new file mode 100644 index 00000000..ff22df9e --- /dev/null +++ b/extension/scripts/DateUtils/DateRoller.js @@ -0,0 +1,41 @@ +/* + * Iterator from first day to second + * + * @author © Denis Bakhtenkov denis.bakhtenkov@gmail.com + * @version 2016 + */ + +var DateRoller = { + + /** + * + * @param {Date} from + * @param {Date} to + * @returns {Array} all days between From and To + */ + days: function (from, to) { + + /** + * + * @param {Date} first + * @param {Date} second + * @returns {Number} + */ + function compareDays(first, second) { + var day = 24 * 60 * 60 * 1000; + return Math.floor(first / day) - Math.floor(second / day); + } + + var res = []; + var curDate = new Date(from); + var step = from <= to ? 1 : -1; + + do { + res.push(new Date(curDate)); + curDate.setDate(curDate.getDate() + step); + } while (compareDays(curDate, to) * step <= 0); + + return res; + } + +}; \ No newline at end of file diff --git a/extension/scripts/DateUtils/SimpleDateFormatter.js b/extension/scripts/DateUtils/SimpleDateFormatter.js new file mode 100644 index 00000000..93a0adc3 --- /dev/null +++ b/extension/scripts/DateUtils/SimpleDateFormatter.js @@ -0,0 +1,111 @@ +/** + * Formatter for Date, parse and format with pattern + * + * @author © Denis Bakhtenkov denis.bakhtenkov@gmail.com + * @version 2016 + * @param {String} pattern + * default is dd.MM.yyyy + * @returns {SimpleDateFormatter} + */ +var SimpleDateFormatter = function (pattern) { + this.pattern = pattern || "dd.MM.yyyy"; + this.months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]; +}; + +/** + * Return pattern + * @returns {String} + */ +SimpleDateFormatter.prototype.getPattern = function () { + return this.pattern; +}; + +/** + * 'dd.MM.yyyy hh:mm:ss' + * @param {Date} date + * @returns {String} + */ +SimpleDateFormatter.prototype.format = function (date) { + + /** + * Adding left 'zero' if value's length less than digits + * @param {Number} value + * @param {Number} digits + * @returns {String} + */ + function lzero(value, digits) { + digits = digits || 2; + var result = value.toString(); + while (result.length < digits) { + result = "0" + result; + } + return result; + } + + var variants = { + yyyy: date.getFullYear(), + yy: lzero(date.getFullYear() % 100), + MMM: this.months[date.getMonth()], + MM: lzero(date.getMonth() + 1), + dd: lzero(date.getDate()), + hh: lzero(date.getHours()), + mm: lzero(date.getMinutes()), + sss: lzero(date.getMilliseconds(), 3), + ss: lzero(date.getSeconds()) + }; + + var format = this.pattern; + + for (var i in variants) { + format = format.replace(i, variants[i]); + } + + return format; +}; + +/** + * 16.06.2016 + * dd.MM.yyyy + * + * @param {String} string + * @returns {Date} + */ +SimpleDateFormatter.prototype.parse = function (string) { + + var date = new Date(0); + var pat = this.pattern; + var input = string; + var variants = { + yyyy: "date.setFullYear(parseInt(value));", + yy: "date.setYear(parseInt(value) + 2000);", + MMM: "date.setMonth(parseInt(value));", + MM: "date.setMonth(parseInt(value) - 1);", + dd: "date.setDate(parseInt(value));", + hh: "date.setHours(parseInt(value));", + mm: "date.setMinutes(parseInt(value));", + sss: "date.setMilliseconds(parseInt(value));", + ss: "date.setSeconds(parseInt(value));" + }; + + for (var i in variants) { + var pos = pat.search(i); + if (pos !== -1) { + var value = input.substr(pos, i.length); + input = input.substring(0, pos) + input.substring(pos + i.length); + pat = pat.substring(0, pos) + pat.substring(pos + i.length); + if (i === "MMM") { + for (var j in this.months) { + if (value === this.months[j]) { + value = j; + eval(variants[i]); + break; + } + } + } else { + eval(variants[i]); + } + } + } + + return date; +}; \ No newline at end of file diff --git a/extension/scripts/Selector/SelectorElementClick.js b/extension/scripts/Selector/SelectorElementClick.js index 87592803..113532c0 100644 --- a/extension/scripts/Selector/SelectorElementClick.js +++ b/extension/scripts/Selector/SelectorElementClick.js @@ -81,6 +81,8 @@ var SelectorElementClick = { _getData: function(parentElement) { + var paginationLimit = parseInt(this.paginationLimit); + var paginationCount = 1; var delay = parseInt(this.delay) || 0; var deferredResponse = $.Deferred(); var foundElements = new UniqueElementList('uniqueHTMLText'); @@ -106,7 +108,7 @@ var SelectorElementClick = { var currentClickElement = clickElements[0]; this.triggerButtonClick(currentClickElement); var nextElementSelection = (new Date()).getTime()+delay; - + // infinitely scroll down and find all items var interval = setInterval(function() { @@ -145,11 +147,12 @@ var SelectorElementClick = { // continue clicking and add delay, but if there is nothing // more to click the finish //console.log("total buttons", clickElements.length) - if(clickElements.length === 0) { + if(clickElements.length === 0 || paginationCount >= paginationLimit) { clearInterval(interval); deferredResponse.resolve(foundElements); } else { + paginationCount++; //console.log("click"); currentClickElement = clickElements[0]; // click on elements only once if the type is clickonce @@ -168,7 +171,9 @@ var SelectorElementClick = { return []; }, - getFeatures: function () { - return ['multiple', 'delay', 'clickElementSelector', 'clickType', 'discardInitialElements', 'clickElementUniquenessType'] - } + getFeatures: function () { + return ['multiple', 'delay', 'clickElementSelector', 'clickType', + 'discardInitialElements', 'clickElementUniquenessType', + 'paginationLimit']; + } }; diff --git a/extension/scripts/Selector/SelectorElementScroll.js b/extension/scripts/Selector/SelectorElementScroll.js index faef891d..cf9084b2 100644 --- a/extension/scripts/Selector/SelectorElementScroll.js +++ b/extension/scripts/Selector/SelectorElementScroll.js @@ -22,8 +22,10 @@ var SelectorElementScroll = { window.scrollTo(0,document.body.scrollHeight); }, _getData: function (parentElement) { - - var delay = parseInt(this.delay) || 0; + + var paginationLimit = parseInt(this.paginationLimit); + var paginationCount = 1; + var delay = parseInt(this.delay) || 0; var deferredResponse = $.Deferred(); var foundElements = []; @@ -41,12 +43,13 @@ var SelectorElementScroll = { } var elements = this.getDataElements(parentElement); - // no new elements found - if(elements.length === foundElements.length) { + // no new elements found or pagination limit + if(elements.length === foundElements.length || paginationCount >= paginationLimit) { clearInterval(interval); deferredResponse.resolve(jQuery.makeArray(elements)); } else { + paginationCount++; // continue scrolling and add delay foundElements = elements; this.scrollToBottom(); @@ -63,6 +66,6 @@ var SelectorElementScroll = { }, getFeatures: function () { - return ['multiple', 'delay'] + return ['multiple', 'delay', 'paginationLimit']; } }; diff --git a/extension/scripts/Sitemap.js b/extension/scripts/Sitemap.js index 0cfaf47a..1a958c8d 100644 --- a/extension/scripts/Sitemap.js +++ b/extension/scripts/Sitemap.js @@ -1,3 +1,5 @@ +/* global DatePatternSupport */ + var Sitemap = function (sitemapObj) { this.initData(sitemapObj); }; @@ -66,6 +68,8 @@ Sitemap.prototype = { startUrls = [startUrls]; } + startUrls = DatePatternSupport.expandUrl(startUrls); + var urls = []; startUrls.forEach(function(startUrl) { diff --git a/tests/SpecRunner.html b/tests/SpecRunner.html index 0c1d3e4b..02d8b369 100644 --- a/tests/SpecRunner.html +++ b/tests/SpecRunner.html @@ -46,6 +46,9 @@ + + + @@ -75,6 +78,7 @@ + diff --git a/tests/spec/DateUtilsSpec.js b/tests/spec/DateUtilsSpec.js new file mode 100644 index 00000000..2008a15d --- /dev/null +++ b/tests/spec/DateUtilsSpec.js @@ -0,0 +1,117 @@ +describe("DateUtils", function () { + + beforeEach(function () { + this.addMatchers(selectorMatchers); + }); + + it("'SimpleDateFormatter.format' pattern 'yyyy/MM/dd'", function(){ + var pattern = "yyyy/MM/dd"; + var date = new Date("1979-02-01T00:00:00.000Z"); + var expected = "1979/02/01"; + var df = new SimpleDateFormatter(pattern); + expect(df.format(date)).toEqual(expected); + }); + + it("'SimpleDateFormatter.format' pattern 'dd/MM/yy'", function(){ + var pattern = "dd/MM/yy"; + var date = new Date("1979-02-01T00:00:00.000Z"); + var expected = "01/02/79"; + var df = new SimpleDateFormatter(pattern); + expect(df.format(date)).toEqual(expected); + }); + + it("'SimpleDateFormatter.format' pattern 'dd MMM yy'", function(){ + var pattern = "dd MMM yy"; + var date = new Date("1979-02-01T00:00:00.000Z"); + var expected = "01 Feb 79"; + var df = new SimpleDateFormatter(pattern); + expect(df.format(date)).toEqual(expected); + }); + + it("'SimpleDateFormatter.format' pattern 'dd MMM yyyy'", function(){ + var pattern = "dd MMM yyyy"; + var date = new Date("1979-02-01T00:00:00.000Z"); + var expected = "01 Feb 1979"; + var df = new SimpleDateFormatter(pattern); + expect(df.format(date)).toEqual(expected); + }); + + it("'SimpleDateFormatter.parse' pattern 'dd.MMM.yy'", function(){ + var pattern = "dd.MMM.yy"; + var date = "15.Aug.16"; + var expected = new Date("2016-08-15T00:00:00.000Z"); + var df = new SimpleDateFormatter(pattern); + expect(df.parse(date)).toEqual(expected); + }); + + it("'SimpleDateFormatter.parse' pattern 'MM/dd/yyyy'", function(){ + var pattern = "MM/dd/yyyy"; + var date = "02.29.2016"; + var expected = new Date("2016-02-29T00:00:00.000Z"); + var df = new SimpleDateFormatter(pattern); + expect(df.parse(date)).toEqual(expected); + }); + + it("'SimpleDateFormatter.parse' pattern 'dd.MM.yyyy'", function(){ + var pattern = "dd.MM.yyyy"; + var date = "16.06.2016"; + var expected = new Date("2016-06-16T00:00:00.000Z"); + var df = new SimpleDateFormatter(pattern); + expect(df.parse(date)).toEqual(expected); + }); + + it("'DateRoller.days' should return one day", function(){ + var from = new Date("1979-02-01T00:00:00.000Z"); + var to = new Date("1979-02-01T00:00:00.000Z"); + var roller = DateRoller.days(from, to); + var expectedDays = 1; + expect(roller.length).toEqual(expectedDays); + }); + + it("'DateRoller.days' should return 366 days", function(){ + var from = new Date("2016-12-31T00:00:00.000Z"); + var to = new Date("2016-01-01T00:00:00.000Z"); + var roller = DateRoller.days(from, to); + var expectedDays = 366; + expect(roller.length).toEqual(expectedDays); + }); + + it("'DateRoller.days' should return multiple days increasing", function(){ + var from = new Date("1979-02-01T00:00:00.000Z"); + var to = new Date("1979-02-03T00:00:00.000Z"); + var roller = DateRoller.days(from, to); + var expectedDays = [ + new Date("1979-02-01T00:00:00.000Z"), + new Date("1979-02-02T00:00:00.000Z"), + new Date("1979-02-03T00:00:00.000Z") + ]; + expect(roller).toEqual(expectedDays); + }); + + it("'DateRoller.days' should return multiple days decreasing", function(){ + var from = new Date("2017-05-02T00:00:00.000Z"); + var to = new Date("2017-04-30T00:00:00.000Z"); + var roller = DateRoller.days(from, to); + var expectedDays = [ + new Date("2017-05-02T00:00:00.000Z"), + new Date("2017-05-01T00:00:00.000Z"), + new Date("2017-04-30T00:00:00.000Z") + ]; + expect(roller).toEqual(expectedDays); + }); + + it("'DatePatternSupport.expandUrl' should return one url", function(){ + var url = "http://example.com/[date
<24/05/2017><24/05/2017>]/index.html"; + var expandedUrls = DatePatternSupport.expandUrl(url); + var expectedDays = ["http://example.com/24/05/2017/index.html"]; + expect(expandedUrls).toEqual(expectedDays); + }); + + it("'DatePatternSupport.expandUrl' should return three urls from 'yesterday' to 'tomorrow'", function(){ + var url = "http://example.com/[date
]/index.html"; + var expandedUrls = DatePatternSupport.expandUrl(url); + var expectedDays = 3; + expect(expandedUrls.length).toEqual(expectedDays); + }); + +}); \ No newline at end of file