@@ -1,12 +1,14 @@
 #!/usr/bin/env node
 /* eslint-disable no-console */
+import {ListObjectsV2Command, PutObjectCommand, S3Client} from '@aws-sdk/client-s3';
 import {selectAll} from 'hast-util-select';
 import {createHash} from 'node:crypto';
 import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm} from 'node:fs/promises';
+import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
 import {cpus} from 'node:os';
 import * as path from 'node:path';
-import {Readable} from 'node:stream';
+import {compose, Readable} from 'node:stream';
+import {text} from 'node:stream/consumers';
 import {pipeline} from 'node:stream/promises';
 import {fileURLToPath} from 'node:url';
 import {isMainThread, parentPort, Worker, workerData} from 'node:worker_threads';
@@ -23,17 +25,40 @@ import {unified} from 'unified';
 import {remove} from 'unist-util-remove';
 
 const CACHE_COMPRESS_LEVEL = 4;
+const R2_BUCKET = 'sentry-docs';
+const accessKeyId = process.env.R2_ACCESS_KEY_ID;
+const secretAccessKey = process.env.R2_SECRET_ACCESS_KEY;
 
-function taskFinishHandler(data) {
-  if (data.failedTasks.length === 0) {
-    console.log(
-      `💰 Worker[${data.id}]: Cache hits: ${data.cacheHits} (${Math.round((data.cacheHits / data.success) * 100)}%)`
-    );
-    console.log(`✅ Worker[${data.id}]: converted ${data.success} files successfully.`);
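+// Cloudflare R2 exposes an S3-compatible API, so the stock AWS SDK v3 client
+// works against the account-scoped R2 endpoint below.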
+function getS3Client() {
+  return new S3Client({
+    endpoint: 'https://773afa1f62ff86c80db4f24f7ff1e9c8.r2.cloudflarestorage.com',
+    region: 'auto',
+    credentials: {
+      accessKeyId,
+      secretAccessKey,
+    },
+    // Adaptive retry adds client-side rate limiting on top of the default exponential backoff.
+    retryMode: 'adaptive',
+  });
+}
+
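+// Upload a single converted Markdown file to the R2 bucket, keyed by its path
+// relative to the output directory.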
+async function uploadToCFR2(s3Client, relativePath, data) {
+  const command = new PutObjectCommand({
+    Bucket: R2_BUCKET,
+    Key: relativePath,
+    Body: data,
+    ContentType: 'text/markdown',
+  });
+  await s3Client.send(command);
+}
+
+function taskFinishHandler({id, success, failedTasks}) {
+  if (failedTasks.length === 0) {
+    console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
     return false;
   }
-  console.error(`❌ Worker[${data.id}]: ${data.failedTasks.length} files failed:`);
-  console.error(data.failedTasks);
+  console.error(`❌ Worker[${id}]: ${failedTasks.length} files failed:`);
+  console.error(failedTasks);
   return true;
 }
 
@@ -68,13 +93,34 @@ async function createWork() {
   const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
   const workerTasks = new Array(numWorkers).fill(null).map(() => []);
 
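+  // Map of object key -> MD5 hash for everything already uploaded to R2,
+  // so files whose content is unchanged can skip the upload later on.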
+  let existingFilesOnR2 = null;
+  if (accessKeyId && secretAccessKey) {
+    existingFilesOnR2 = new Map();
+    console.log(`☁️ Getting existing hashes from R2...`);
+    const s3Client = getS3Client();
+    let continuationToken = undefined;
+    do {
+      const response = await s3Client.send(
+        new ListObjectsV2Command({
+          Bucket: R2_BUCKET,
+          ContinuationToken: continuationToken,
+        })
+      );
+      continuationToken = response.NextContinuationToken;
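+      // For single-part uploads, the S3-style ETag is the MD5 of the object body
+      // (wrapped in quotes), matching what md5() computes for the local file.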
+      for (const {Key, ETag} of response.Contents ?? []) {
+        existingFilesOnR2.set(Key, ETag.slice(1, -1)); // Remove quotes from ETag
+      }
+    } while (continuationToken);
+    console.log(`✅ Found ${existingFilesOnR2.size} existing files on R2.`);
+  }
+
   console.log(`🔎 Discovering files to convert...`);
 
   let numFiles = 0;
   let workerIdx = 0;
   // Need a high buffer size here otherwise Node skips some subdirectories!
   // See https://github.com/nodejs/node/issues/48820
   const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024});
+
   for await (const dirent of dir) {
     if (dirent.name.endsWith('.html') && dirent.isFile()) {
       const sourcePath = path.join(dirent.parentPath || dirent.path, dirent.name);
@@ -84,7 +130,13 @@ async function createWork() {
       );
       await mkdir(targetDir, {recursive: true});
       const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md');
-      workerTasks[workerIdx].push({sourcePath, targetPath});
+      const relativePath = path.relative(OUTPUT_DIR, targetPath);
+      workerTasks[workerIdx].push({
+        sourcePath,
+        targetPath,
+        relativePath,
+        r2Hash: existingFilesOnR2 ? existingFilesOnR2.get(relativePath) : null,
+      });
       workerIdx = (workerIdx + 1) % numWorkers;
       numFiles++;
     }
@@ -96,7 +148,12 @@ async function createWork() {
   const workerPromises = new Array(numWorkers - 1).fill(null).map((_, id) => {
     return new Promise((resolve, reject) => {
       const worker = new Worker(selfPath, {
-        workerData: {id, noCache, cacheDir: CACHE_DIR, tasks: workerTasks[id]},
+        workerData: {
+          id,
+          noCache,
+          cacheDir: CACHE_DIR,
+          tasks: workerTasks[id],
+        },
       });
       let hasErrors = false;
       worker.on('message', data => (hasErrors = taskFinishHandler(data)));
@@ -113,10 +170,10 @@ async function createWork() {
   // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
   workerPromises.push(
     processTaskList({
-      noCache,
-      cacheDir: CACHE_DIR,
-      tasks: workerTasks[workerTasks.length - 1],
       id: workerTasks.length - 1,
+      tasks: workerTasks[workerTasks.length - 1],
+      cacheDir: CACHE_DIR,
+      noCache,
     }).then(data => {
       if (taskFinishHandler(data)) {
         throw new Error(`Worker[${data.id}] had some errors.`);
@@ -133,25 +190,24 @@ async function createWork() {
 const md5 = data => createHash('md5').update(data).digest('hex');
 
 async function genMDFromHTML(source, target, {cacheDir, noCache}) {
-  const text = (await readFile(source, {encoding: 'utf8'}))
+  const leanHTML = (await readFile(source, {encoding: 'utf8'}))
     // Remove all script tags, as they are not needed in markdown
     // and they are not stable across builds, causing cache misses
     .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
-  const hash = md5(text);
-  const cacheFile = path.join(cacheDir, hash);
+  const cacheKey = md5(leanHTML);
+  const cacheFile = path.join(cacheDir, cacheKey);
   if (!noCache) {
     try {
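+      // Decompress the cached Markdown into memory instead of streaming straight
+      // to disk: the content is also needed for the R2 hash comparison.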
-      await pipeline(
-        createReadStream(cacheFile),
-        createBrotliDecompress(),
-        createWriteStream(target, {
-          encoding: 'utf8',
-        })
+      const data = await text(
+        compose(createReadStream(cacheFile), createBrotliDecompress())
       );
+      await writeFile(target, data, {encoding: 'utf8'});
 
-      return true;
-    } catch {
-      // pass
+      return {cacheHit: true, data};
+    } catch (err) {
+      if (err.code !== 'ENOENT') {
+        console.warn(`Error using cache file ${cacheFile}:`, err);
+      }
     }
   }
 
@@ -178,7 +234,7 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     .use(() => tree => remove(tree, {type: 'inlineCode', value: ''}))
     .use(remarkGfm)
     .use(remarkStringify)
-    .process(text)
+    .process(leanHTML)
   );
   const reader = Readable.from(data);
 
@@ -203,23 +259,62 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     ).catch(err => console.warn('Error writing cache file:', err)),
   ]);
 
-  return false;
+  return {cacheHit: false, data};
 }
 
 async function processTaskList({id, tasks, cacheDir, noCache}) {
+  const s3Client = getS3Client();
   const failedTasks = [];
-  let cacheHits = 0;
-  for (const {sourcePath, targetPath} of tasks) {
+  let cacheMisses = [];
+  let r2CacheMisses = [];
+  console.log(`🤖 Worker[${id}]: Starting to process ${tasks.length} files...`);
+  for (const {sourcePath, targetPath, relativePath, r2Hash} of tasks) {
     try {
-      cacheHits += await genMDFromHTML(sourcePath, targetPath, {
+      const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
         cacheDir,
         noCache,
       });
+      if (!cacheHit) {
+        cacheMisses.push(relativePath);
+      }
+
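+      // r2Hash is null when R2 credentials are missing (sync disabled) and
+      // undefined when the file is not on R2 yet, so new files are uploaded too.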
+      if (r2Hash !== null) {
+        const fileHash = md5(data);
+        if (r2Hash !== fileHash) {
+          r2CacheMisses.push(relativePath);
+          console.log(
+            `📤 Worker[${id}]: Uploading ${relativePath} to R2, hash mismatch: ${r2Hash} !== ${fileHash}`
+          );
+          await uploadToCFR2(s3Client, relativePath, data);
+        }
+      }
     } catch (error) {
       failedTasks.push({sourcePath, targetPath, error});
     }
   }
-  return {id, success: tasks.length - failedTasks.length, failedTasks, cacheHits};
+  const success = tasks.length - failedTasks.length;
+  if (r2CacheMisses.length / tasks.length > 0.1) {
+    console.warn(
+      `⚠️ Worker[${id}]: More than 10% of files had a different hash on R2; this might indicate a problem with the cache or the generation process.`
+    );
+  } else if (r2CacheMisses.length > 0) {
+    console.log(
+      `📤 Worker[${id}]: Updated the following files on R2:\n${r2CacheMisses.map(n => ` - ${n}`).join('\n')}`
+    );
+  }
+  if (cacheMisses.length / tasks.length > 0.1) {
+    console.warn(`⚠️ Worker[${id}]: More than 10% cache miss rate during build.`);
+  } else if (cacheMisses.length > 0) {
+    console.log(
+      `❇️ Worker[${id}]: Updated cache for the following files:\n${cacheMisses.map(n => ` - ${n}`).join('\n')}`
+    );
+  }
+
+  return {
+    id,
+    success,
+    failedTasks,
+  };
 }
 
 async function doWork(work) {