1+ import { Page } from 'puppeteer' ;
12import { ScrapeError } from '../error/ScrapeError' ;
23import { delay , promisePool } from '../util/promise' ;
34import { Book } from './Book' ;
@@ -6,7 +7,7 @@ import { getPdfOptions } from './get-pdf-options';
67
78export class ScookBook extends Book {
89 async download ( outDir : string , _options ?: DownloadOptions ) {
9- const dir = await this . mkSubDir ( outDir ) ;
10+ const saveDir = await this . mkSubDir ( outDir ) ;
1011 const options = defDownloadOptions ( _options ) ;
1112
1213 // Get book frame url
@@ -27,88 +28,115 @@ export class ScookBook extends Book {
2728 await userPage . close ( ) ;
2829 }
2930
30- // Get page count, first page url
31- let pageCount : number ;
32- let pageXUrl : string ;
33-
34- const page = await this . shelf . browser . newPage ( ) ;
31+ const framePage = await this . shelf . browser . newPage ( ) ;
3532 try {
36- await page . goto ( bookFrameUrl , {
33+ await framePage . goto ( bookFrameUrl , {
3734 waitUntil : 'load' ,
3835 timeout : this . shelf . options . timeout ,
3936 } ) ;
4037
41- while ( true ) {
42- try {
43- pageCount = parseInt (
44- await page . $eval (
45- '#total-pages' ,
46- ( totalPages ) => ( totalPages as HTMLSpanElement ) . innerText
47- )
48- ) ;
49- } catch ( e ) {
50- await delay ( 1000 ) ;
51- continue ;
52- }
53- if ( isNaN ( pageCount ) ) continue ;
54- break ;
38+ const pageUrls = await this . getPageUrls ( framePage ) ;
39+
40+ let downloadedPages = 0 ;
41+ const getProgress = ( ) => ( {
42+ item : this ,
43+ percentage : downloadedPages / pageUrls . length ,
44+ downloadedPages,
45+ pageCount : pageUrls . length ,
46+ } ) ;
47+ options . onStart ( getProgress ( ) ) ;
48+
49+ await promisePool (
50+ async ( i ) => {
51+ const pageNo = i + 1 ;
52+ await this . savePage ( pageUrls [ i ] , saveDir , pageNo , options ) ;
53+
54+ downloadedPages ++ ;
55+ options . onProgress ( getProgress ( ) ) ;
56+ } ,
57+ options . concurrency ,
58+ pageUrls . length
59+ ) ;
60+
61+ // Merge pdf pages
62+ options . mergePdfs && ( await this . mergePdfPages ( saveDir , pageUrls . length ) ) ;
63+ } finally {
64+ await framePage . close ( ) ;
65+ }
66+ }
67+
/**
 * Collects the image URL of every page of the book shown in the given frame.
 *
 * Polls `#total-pages` until it yields a number, then steps through the
 * pages by entering each page number into `input.current-page` and records
 * the `src` of `.image-div > img` after each step.
 *
 * @param framePage Puppeteer page on which the book frame has been opened.
 * @returns The image URLs in page order (index 0 belongs to page 1).
 * @throws ScrapeError if the go-page form, the current-page input or the
 *   page image cannot be located.
 */
private async getPageUrls(framePage: Page): Promise<string[]> {
  // Poll until the page counter can be read and parses as a number.
  let pageCount = NaN;
  while (isNaN(pageCount)) {
    try {
      pageCount = parseInt(
        await framePage.$eval(
          '#total-pages',
          (totalPages) => (totalPages as HTMLSpanElement).innerText
        ),
        10
      );
    } catch (e) {
      // The element could not be queried yet; retry after the delay below.
    }
    // Back off on BOTH failure modes (query error and non-numeric text) so
    // the loop never re-queries the page without pausing.
    if (isNaN(pageCount)) await delay(1000);
  }

  const goPageForm = await framePage.$('form.go-page');
  if (!goPageForm) {
    throw new ScrapeError('Could not locate scooks go page form.');
  }
  const curPageInput = await framePage.$('input.current-page');
  if (!curPageInput) {
    throw new ScrapeError('Could not locate scooks current page input.');
  }

  const pageUrls: string[] = [];
  for (let pageNo = 1; pageNo <= pageCount; pageNo++) {
    // Navigate to the page. `type` only sends key events and does not clear
    // the field, so select the existing content first; otherwise the new
    // digits would be appended to the previous page number.
    await curPageInput.click({ clickCount: 3 });
    await curPageInput.type(pageNo.toString());
    await curPageInput.press('Enter');

    // NOTE(review): the image is read immediately after pressing Enter. If
    // the reader swaps the image asynchronously this may still be the
    // previous page's URL - confirm whether an explicit wait is needed.
    const img = await framePage.$('.image-div > img');
    if (!img) {
      throw new ScrapeError('Could not locate scook book page image.');
    }
    const pageUrl = await img.evaluate(
      (img) => (img as HTMLImageElement).src
    );
    pageUrls.push(pageUrl);
  }

  return pageUrls;
}
100117
101- downloadedPages ++ ;
102- options . onProgress ( getProgress ( ) ) ;
103- } finally {
104- await page . close ( ) ;
105- }
106- } ,
107- options . concurrency ,
108- pageCount
109- ) ;
110-
111- // Merge pdf pages
112- options . mergePdfs && ( await this . mergePdfPages ( dir , pageCount ) ) ;
/**
 * Renders one book page to a PDF file.
 *
 * Opens `pageUrl` in a fresh browser page, prints it to the path returned by
 * `getPdfPath(saveDir, pageNo)` and always closes the browser page again,
 * whether or not loading or printing succeeded.
 *
 * @param pageUrl URL to open for this page.
 * @param saveDir Directory passed to `getPdfPath` for the output file.
 * @param pageNo Page number passed to `getPdfPath` for the output file.
 * @param options Download options forwarded to `getPdfOptions`.
 */
private async savePage(
  pageUrl: string,
  saveDir: string,
  pageNo: number,
  options: DownloadOptions
) {
  const tab = await this.shelf.browser.newPage();
  try {
    const { timeout } = this.shelf.options;
    await tab.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout });

    // Resolve the output path first, then the print settings, then print.
    const targetPath = this.getPdfPath(saveDir, pageNo);
    const pdfSettings = await getPdfOptions(tab, options);
    await tab.pdf({ ...pdfSettings, path: targetPath });
  } finally {
    await tab.close();
  }
}
114142}
0 commit comments