-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathscraper.js
More file actions
210 lines (188 loc) · 6.07 KB
/
scraper.js
File metadata and controls
210 lines (188 loc) · 6.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import { load } from 'cheerio';
import cache from '../data/patchlogs.json' with { type: 'json' };
import ProgressBar from './progress.js';
import sleep from './sleep.js';
import title from './title.js';
// Forum listing URL that Scraper#getPageNumbers fetches to read the pagination control.
const baseUrl = 'https://forums.warframe.com/forum/3-pc-update-notes/';
/**
 * Scraper to get patch logs from forums.
 *
 * Intended flow: `getPageNumbers()` reads the pagination control and creates
 * the page progress bar, `scrape(url)` collects the post stubs of one listing
 * page, and `parsePosts()` downloads the body of every collected post whose
 * name is not already present in the local cache.
 * @property {Array<PatchData>} posts cached posts followed by newly scraped posts
 * @property {Promise<*>} setup pending promise that settles when `resolve` is called
 * @property {Function} resolve resolver of the `setup` promise
 */
class Scraper {
  /** Progress bar for listing pages; created by getPageNumbers(). */
  #pagesBar;

  /** Total number of listing pages reported by the forum pagination. */
  #numPages;

  /** Progress bar for individual posts; created by parsePosts(). */
  #postsBar;

  /** Number of post stubs collected by scrape() so far. */
  #numPosts = 0;

  #numCached = 0;

  #numUncached = 0;

  /**
   * Array of fetched pages' posts to parse
   * @type {Array<Array<PatchData>>}
   */
  #fetchedPages = [];

  /**
   * Names of every post in the cache, for constant-time "already scraped?" checks.
   * @type {Set<string>}
   */
  #cachedNames = new Set(cache.map((post) => post.name));

  /** Whether the cached posts have already been copied into `posts`. */
  #cacheSeeded = false;

  constructor() {
    this.setup = new Promise((resolve) => {
      this.resolve = resolve;
    });
    this.posts = [];
  }

  /**
   * Report that no pages were found and terminate the process with exit code 1.
   * @returns {void}
   */
  interrupt() {
    console.error('No pages found');
    process.exit(1);
  }

  /**
   * Download a page and load its HTML into cheerio.
   * The HTTP status is not inspected; whatever body is returned gets parsed.
   * @param {string} url page to download
   * @returns {Promise<import('cheerio').CheerioAPI>} query function for the page
   */
  async #loadPage(url) {
    const response = await fetch(url);
    return load(await response.text());
  }

  /**
   * Retrieve the number of listing pages from the forum index and create the
   * page progress bar sized to it.
   * @returns {Promise<number>} total number of pages
   * @throws {Error} when no pagination text is found (reported as a Cloudflare
   *   block) or when its last word is not a number
   */
  async getPageNumbers() {
    const $ = await this.#loadPage(baseUrl);
    const words = $('a[id^="elPagination"]').text().trim().split(' ');
    if (words.length < 2) {
      throw new Error('Connection blocked by Cloudflare.');
    }

    // The page count is the last word of the pagination label.
    const total = Number.parseInt(words.at(-1), 10);
    if (Number.isNaN(total)) {
      throw new Error(`Unable to read the page count from pagination text "${words.join(' ')}".`);
    }

    this.#numPages = total;
    this.#pagesBar = new ProgressBar('Scraping Page', total);
    return total;
  }

  /**
   * Scrape a single listing page and queue its post stubs for parsePosts().
   * getPageNumbers() must have been called first so the page progress bar exists.
   * @param {string} url to fetch content from
   * @returns {Promise<boolean>} true when at least one post on the page is
   *   already in the cache
   */
  async scrape(url) {
    const $ = await this.#loadPage(url);
    /** @type {PatchData[]} */
    const page = [];
    let isCached = false;

    $('ol[id^="elTable"] .ipsDataItem').each((_, item) => {
      const row = $(item);
      /** @type {PatchData} */
      const post = {
        name: row
          .find('h4 a span')
          .text()
          .trim()
          .replace(/[\t\n]/g, '')
          .replace(/\[(.*?)]/g, ''),
        url: row.find('h4 a').attr('href'),
        date: row.find('time').attr('datetime'),
        imgUrl: '',
        additions: '',
        changes: '',
        fixes: '',
      };
      if (this.#cachedNames.has(post.name)) {
        isCached = true;
      }
      page.push(post);
    });

    this.#numPosts += page.length;
    this.#fetchedPages.push(page);
    this.#pagesBar.tick();

    if (isCached) {
      // Presumably the caller stops scraping once a cached post shows up, so
      // fill the bar for the pages that will not be visited. Every scraped page
      // has ticked once already, so only the remainder is added.
      const remaining = this.#numPages - this.#fetchedPages.length;
      for (let i = 0; i < remaining; i += 1) {
        this.#pagesBar.tick();
      }
      if (remaining > 0) {
        await sleep(10);
      }
    }
    return isCached;
  }

  /**
   * Parse the posts of every page collected by scrape(), one page at a time,
   * pausing one second between pages.
   * @param {function(Array<PatchData>): (Promise<void>|void)} [afterEachPage]
   *   optional callback invoked with `posts` after each page is parsed
   * @returns {Promise<void>}
   */
  async parsePosts(afterEachPage) {
    this.#postsBar = new ProgressBar('Parsing Posts', this.#numPosts, true);
    // Pages are handled strictly one after another.
    // eslint-disable-next-line no-restricted-syntax
    for await (const [index, posts] of this.#fetchedPages.entries()) {
      await this.#parsePage(posts);
      if (afterEachPage) {
        await afterEachPage(this.posts);
      }
      if (index !== this.#fetchedPages.length - 1) {
        await sleep(1000);
      }
    }
  }

  /**
   * Parse the posts of one listing page. Posts without a url are skipped,
   * cached posts are only counted, and every other post is downloaded and
   * appended to `posts`.
   * @param {Array<PatchData>} posts post stubs of a single page
   * @returns {Promise<void>}
   */
  async #parsePage(posts) {
    // preserve prior cached posts, don't wait for them to be discovered again.
    // Done once only: repeating it per page would duplicate the whole cache.
    if (!this.#cacheSeeded) {
      this.posts.push(...cache);
      this.#cacheSeeded = true;
    }

    // eslint-disable-next-line no-restricted-syntax
    for await (const post of posts) {
      if (post.url) {
        if (this.#cachedNames.has(post.name)) {
          this.#numCached += 1;
        } else {
          await sleep(100);
          await this.#scrapePost(post.url, post);
          this.posts.push(post);
          this.#numUncached += 1;
        }
        this.#postsBar.tick({ cached: this.#numCached, uncached: this.#numUncached });
      }
    }
  }

  /**
   * Retrieve logs from a single post and store them on `data`
   * (`imgUrl`, `description`, `additions`, `changes`, `fixes`, `type`).
   * @param {string} url url to fetch
   * @param {PatchData} data post data, mutated in place
   * @returns {Promise<void>}
   */
  async #scrapePost(url, data) {
    const $ = await this.#loadPage(url);
    const article = $('article').first();
    const post = article.find('div[data-role="commentContent"]');

    data.imgUrl = article.find('img.ipsImage').first().attr('data-imageproxy-source');

    // Text with no recognised header above it is filed under fixes.
    let previousCategory = 'fixes';

    /**
     * Add changes, fixes, additions
     */
    post.children().each((i, el) => {
      const node = $(el);
      const strong = title(node.find('strong').text().trim()).replace(/- /g, '\n');
      const em = node.find('em').text().trim().replace(/- /g, '\n');

      if (i === 1 && em) {
        // Description: emphasised text in the second child
        data.description = em;
      } else if (i && strong) {
        // Detect category: bold text in any child after the first only
        // switches the active category; the header text itself is not stored.
        ['Fixes', 'Additions', 'Changes'].forEach((type) => {
          if (strong.includes(type)) {
            previousCategory = type.toLowerCase();
          }
        });
      } else if (strong && !strong.includes('Edited ') && !strong.includes(' by ')) {
        // Fixes or changes: only reachable for the first child (i === 0),
        // whose bold text is stored as a header line ending in ':'.
        const header = `${strong}${strong.endsWith(':') ? '' : ':'}\n`;
        if (strong.includes('Fix')) {
          data.fixes += header;
          previousCategory = 'fixes';
        } else {
          data.changes += header;
          previousCategory = 'changes';
        }
      } else {
        // Add to last category if none could be found
        // Regex removes tabs and more than one newline in a row.
        const text = node.text().trim().replace(/\t/g, '').replace(/[\n]+/g, '\n').replace(/- /g, '\n');
        data[previousCategory] += `${text}\n`;
      }
    });

    data.type = data.name.includes('Hotfix') ? 'Hotfix' : 'Update';
  }
}
// The default export is a Scraper instance created when this module loads, not the class.
export default new Scraper();