-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathAbstractExtractor.js
More file actions
125 lines (108 loc) · 3.19 KB
/
AbstractExtractor.js
File metadata and controls
125 lines (108 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Cross-browser compatibility: use the `browser` global when it is defined,
// otherwise fall back to `chrome`. Both lookups are guarded with `typeof` so
// that loading this module where neither global exists (unit tests, plain
// pages) yields `storageAPI === undefined` instead of a ReferenceError.
const storageAPI = (
  typeof browser !== 'undefined' ? browser
    : typeof chrome !== 'undefined' ? chrome
      : undefined
)?.storage;
/**
 * Abstract Scraper class
 *
 * Base class for site extractors. Subclasses override `_name`, fill
 * `_sitePatterns` and implement `getDetails()`. Every instance keeps a
 * `_state` value that is loaded from, and kept in sync with, the `local`
 * storage area under the key returned by `_storage_name`.
 *
 * @class Extractor
 */
class Extractor {
  /** the name of the extractor */
  get _name() {
    return "<Abstract Extractor>";
  }

  /**
   * the list of regexes that the extractor supports and will work on
   * @type {RegExp[]}
   */
  _sitePatterns = [];

  /** sets if the extractor requires a page reload before scraping */
  needsReload = true;

  /** last state read from storage (or pushed by a storage change event) */
  _state = {};

  /** true once the initial read from storage has completed */
  _storageInitialized = false;

  /** true once the storage change listener is attached; prevents duplicates */
  _storageListenerAttached = false;

  /** storage key under which this extractor's state is persisted */
  get _storage_name() {
    return `scraper.${this._name}`;
  }

  /**
   * Starts loading the persisted state in the background.
   *
   * @throws {Error} when `Extractor` itself (not a subclass) is instantiated
   */
  constructor() {
    if (new.target === Extractor) {
      throw new Error("Abstract classes can't be instantiated.");
    }
    // A constructor cannot await, so attach a handler here; otherwise a
    // storage failure would surface as an unhandled promise rejection.
    Promise.resolve(this._init()).catch((err) => {
      console.error(`${this._name}: failed to load stored state`, err);
    });
  }

  /**
   * Attaches the storage change listener (once) and loads the stored state.
   * Safe to call repeatedly: after the first successful load it is a no-op.
   *
   * @returns {Promise<void>}
   */
  async _init() {
    if (this._storageInitialized) return;

    if (!this._storageListenerAttached) {
      // Set before the first await so overlapping calls cannot attach twice.
      this._storageListenerAttached = true;
      storageAPI?.onChanged?.addListener((changes, areaName) => {
        const key = this._storage_name;
        if (areaName !== "local" || !(key in changes)) return;
        // `newValue` is absent when the key was removed; fall back to an
        // empty state so handlers never receive `undefined`.
        this._state = changes[key].newValue ?? {};
        this._handleStateUpdate(this._state);
      });
    }

    // get data
    const name = this._storage_name;
    const data = await storageAPI?.local?.get(name);
    // The read finished (even if nothing was stored), so later calls can
    // return early. A rejected read leaves the flag false and can be retried.
    this._storageInitialized = true;
    if (data == null || !(name in data)) return;
    this._state = data[name];
    this._handleStateUpdate(this._state);
  }

  /**
   * Check if the provided url is supported for scraping
   *
   * @param {string} url
   * @returns {boolean} if the provided url is supported
   */
  isSupported(url) {
    return this._sitePatterns.some((pattern) => {
      // Global/sticky regexes remember `lastIndex` between calls; reset it so
      // repeated checks of the same url always give the same answer.
      pattern.lastIndex = 0;
      return pattern.test(url);
    });
  }

  /**
   * Extract info from the current page, return a object with the details
   *
   * @returns {Promise<Record<string, any>>}
   * @abstract
   */
  async getDetails() {
    throw new Error("Method 'getDetails()' must be implemented.");
  }

  /** @returns {string} the extractor name */
  toString() {
    return this._name;
  }

  /**
   * implement this to get updates to the state; called after the initial
   * load finds stored data and on every later storage change for this key
   *
   * @param {any} state - the updated state
   * @abstract
   */
  _handleStateUpdate(state) { }

  /**
   * Persist a state value to the `local` storage area under `_storage_name`.
   *
   * @param {any} [state] - value to store; when omitted (null/undefined) the
   *   current `this._state` is written instead
   * @returns {Promise<void>}
   */
  async _saveState(state) {
    const value = state ?? this._state;
    await storageAPI?.local?.set({ [this._storage_name]: value });
  }

  /**
   * Takes in a url and returns a url with only the components needed to identify the book
   *
   * Keeps origin and path, drops the fragment and every query parameter not
   * listed in `keepParams`. Input that cannot be parsed as a URL is returned
   * unchanged (or `''` when falsy).
   *
   * @param {string} url
   * @param {string[]} [keepParams=['ean', 'isbn', 'upc']] url params to not discard
   *
   * @returns {string}
   */
  normalizeUrl(url, keepParams = ['ean', 'isbn', 'upc']) {
    try {
      const parsed = new URL(url);
      // Preserve key product identifiers when the format is encoded in query params.
      // `searchParams.get` returns the decoded value, so re-encode it to keep
      // characters such as `&` or `#` from corrupting the rebuilt query.
      const kept = keepParams
        .filter((key) => parsed.searchParams.has(key))
        .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(parsed.searchParams.get(key))}`);
      const suffix = kept.length ? `?${kept.join('&')}` : '';
      return `${parsed.origin}${parsed.pathname}${suffix}`;
    } catch {
      // Deliberate best effort: unparseable input is passed through as-is.
      return url || '';
    }
  }
  // NOTE: maybe add extraction functions into methods on the class
}
export { Extractor };