Skip to content

Commit 212b909

Browse files
committed
fix: collect scook book page urls [fixes #15]
1 parent 5ea1c05 commit 212b909

File tree

1 file changed

+99
-71
lines changed

1 file changed

+99
-71
lines changed

src/item/ScookBook.ts

Lines changed: 99 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { Page } from 'puppeteer';
12
import { ScrapeError } from '../error/ScrapeError';
23
import { delay, promisePool } from '../util/promise';
34
import { Book } from './Book';
@@ -6,7 +7,7 @@ import { getPdfOptions } from './get-pdf-options';
67

78
export class ScookBook extends Book {
89
async download(outDir: string, _options?: DownloadOptions) {
9-
const dir = await this.mkSubDir(outDir);
10+
const saveDir = await this.mkSubDir(outDir);
1011
const options = defDownloadOptions(_options);
1112

1213
// Get book frame url
@@ -27,88 +28,115 @@ export class ScookBook extends Book {
2728
await userPage.close();
2829
}
2930

30-
// Get page count, first page url
31-
let pageCount: number;
32-
let pageXUrl: string;
33-
34-
const page = await this.shelf.browser.newPage();
31+
const framePage = await this.shelf.browser.newPage();
3532
try {
36-
await page.goto(bookFrameUrl, {
33+
await framePage.goto(bookFrameUrl, {
3734
waitUntil: 'load',
3835
timeout: this.shelf.options.timeout,
3936
});
4037

41-
while (true) {
42-
try {
43-
pageCount = parseInt(
44-
await page.$eval(
45-
'#total-pages',
46-
(totalPages) => (totalPages as HTMLSpanElement).innerText
47-
)
48-
);
49-
} catch (e) {
50-
await delay(1000);
51-
continue;
52-
}
53-
if (isNaN(pageCount)) continue;
54-
break;
38+
const pageUrls = await this.getPageUrls(framePage);
39+
40+
let downloadedPages = 0;
41+
const getProgress = () => ({
42+
item: this,
43+
percentage: downloadedPages / pageUrls.length,
44+
downloadedPages,
45+
pageCount: pageUrls.length,
46+
});
47+
options.onStart(getProgress());
48+
49+
await promisePool(
50+
async (i) => {
51+
const pageNo = i + 1;
52+
await this.savePage(pageUrls[i], saveDir, pageNo, options);
53+
54+
downloadedPages++;
55+
options.onProgress(getProgress());
56+
},
57+
options.concurrency,
58+
pageUrls.length
59+
);
60+
61+
// Merge pdf pages
62+
options.mergePdfs && (await this.mergePdfPages(saveDir, pageUrls.length));
63+
} finally {
64+
await framePage.close();
65+
}
66+
}
67+
68+
/**
 * Collects the image url of every page of the book shown in the reader frame.
 *
 * Polls `#total-pages` once per second until its text parses to a number,
 * then enters each page number into the reader's current-page input, presses
 * Enter and records the `src` of `.image-div > img`.
 *
 * The poll has no upper bound: it keeps retrying for as long as the page
 * count is unavailable.
 *
 * @param framePage Puppeteer page on which the book frame url was opened.
 * @returns The collected image urls in page order (index 0 is page 1).
 * @throws ScrapeError If the go-page form, the current-page input or the
 *   page image cannot be located.
 */
private async getPageUrls(framePage: Page) {
  // get count
  let pageCount = NaN;
  while (isNaN(pageCount)) {
    try {
      pageCount = parseInt(
        await framePage.$eval(
          '#total-pages',
          (totalPages) => (totalPages as HTMLSpanElement).innerText
        ),
        10
      );
    } catch (e) {
      // The element is not in the DOM yet; treat it like an unparsable count.
      pageCount = NaN;
    }

    // Back off before retrying in both cases (missing element or text that
    // is not a number yet) so the poll never turns into a tight loop.
    if (isNaN(pageCount)) {
      await delay(1000);
    }
  }

  const goPageForm = await framePage.$('form.go-page');
  if (!goPageForm) {
    throw new ScrapeError('Could not locate scooks go page form.');
  }
  const curPageInput = await framePage.$('input.current-page');
  if (!curPageInput) {
    throw new ScrapeError('Could not locate scooks current page input.');
  }

  const pageUrls: string[] = [];
  for (let pageNo = 1; pageNo <= pageCount; pageNo++) {
    // nav to page
    // NOTE(review): the input is not cleared before typing and nothing waits
    // for the image to change after Enter - confirm that the reader replaces
    // the previous input value and has swapped the image by the time `src`
    // is read below.
    await curPageInput.type(pageNo.toString());
    await curPageInput.press('Enter');

    // get page
    const img = await framePage.$('.image-div > img');
    if (!img) {
      throw new ScrapeError('Could not locate scook book page image.');
    }
    const pageUrl = await img.evaluate(
      (img) => (img as HTMLImageElement).src
    );
    pageUrls.push(pageUrl);
  }

  return pageUrls;
}
100117

101-
downloadedPages++;
102-
options.onProgress(getProgress());
103-
} finally {
104-
await page.close();
105-
}
106-
},
107-
options.concurrency,
108-
pageCount
109-
);
110-
111-
// Merge pdf pages
112-
options.mergePdfs && (await this.mergePdfPages(dir, pageCount));
118+
/**
 * Renders one book page to a pdf file.
 *
 * Opens `pageUrl` in a fresh browser page, prints that page as pdf to the
 * path returned by `getPdfPath(saveDir, pageNo)` and closes the browser page
 * again, whether or not navigation or printing succeeded.
 *
 * @param pageUrl Url to open and print.
 * @param saveDir Directory passed to `getPdfPath` to build the output path.
 * @param pageNo Page number passed to `getPdfPath` to build the output path.
 * @param options Download options forwarded to `getPdfOptions`.
 */
private async savePage(
  pageUrl: string,
  saveDir: string,
  pageNo: number,
  options: DownloadOptions
) {
  const tab = await this.shelf.browser.newPage();
  try {
    await tab.goto(pageUrl, {
      waitUntil: 'domcontentloaded',
      timeout: this.shelf.options.timeout,
    });

    // Save as pdf
    const targetFile = this.getPdfPath(saveDir, pageNo);
    const basePdfOptions = await getPdfOptions(tab, options);

    await tab.pdf({ ...basePdfOptions, path: targetFile });
  } finally {
    // Always release the browser page, even when goto/pdf throws.
    await tab.close();
  }
}
114142
}

0 commit comments

Comments
 (0)