@@ -29,15 +29,31 @@ const skip = new Set([
29
29
// Not in HTML
30
30
25 , 54 ,
31
31
] ) ;
32
// Limit handed to async.eachLimit: how many reports are processed at the same time.
const MAX_CONCURRENCY = 10;
// When true, recurseStandard keeps following "Previous Version" links even for
// revisions that are already recorded.
const REFETCH_OLD_VERSIONS = false;

/*
 * Walk every report number from 1 to MAX_REPORT. Numbers listed in `skip` are
 * only logged; every other number starts a recurseStandard() chain at the
 * report's landing page with no latest id known yet.
 */
async.eachLimit(range(1, MAX_REPORT), MAX_CONCURRENCY, (num, cb) => {
  if (!skip.has(num)) {
    recurseStandard(num, `https://www.unicode.org/reports/tr${num}/`, null, cb);
    return;
  }

  console.log('Skipping report #' + num);
  cb();
}, (err) => {
  if (err) {
    console.log('there was an error');
    console.error(err);
    return;
  }

  // Copy the entries into a fresh object in sorted key order before writing.
  const output = {};
  Object.keys(current).sort().forEach((key) => {
    output[key] = current[key];
  });
  helper.writeBiblio(FILENAME, output);
});
55
+
56
+ function recurseStandard ( num , url , latestId , cb ) {
41
57
console . log ( 'Fetching' , url , '...' ) ;
42
58
request ( {
43
59
url,
@@ -53,13 +69,15 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
53
69
console . log ( 'Parsing' , url , '...' ) ;
54
70
const dom = new JSDOM ( body , { url } ) ;
55
71
const { document } = dom . window ;
56
- const type = document . title . slice ( 0 , 3 ) ;
57
- if ( type !== 'UTS' && type !== 'UTR' && type !== 'UAX' ) {
58
- console . log ( 'Unable to parse title' , document . title ) ;
72
+
73
+ const infoTableEl = document . querySelector ( '.body > table' ) ;
74
+ const infoTable = infoTableEl && parseTable ( infoTableEl ) ;
75
+ if ( ! infoTable ) {
76
+ console . log ( 'Unable to find information table' ) ;
59
77
cb ( ) ;
60
78
return ;
61
79
}
62
- const id = type + num ;
80
+
63
81
const statusEl = document . querySelector ( '.body > h2' ) ;
64
82
if ( ! statusEl ) {
65
83
console . log ( 'Unable to find status' ) ;
@@ -68,6 +86,24 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
68
86
}
69
87
const status = trimText ( statusEl . textContent ) ;
70
88
89
+ let type = document . title . match ( / \b ( U T S | U T R | U A X ) / ) ;
90
+ if ( type !== 'UTS' && type !== 'UTR' && type !== 'UAX' ) {
91
+ // Fallback for https://www.unicode.org/reports/tr35/
92
+ const lowerStatus = status . toLowerCase ( ) ;
93
+ if ( lowerStatus . indexOf ( 'technical standard' ) != - 1 ) {
94
+ type = 'UTS' ;
95
+ } else if ( lowerStatus . indexOf ( 'standard annex' ) != - 1 ) {
96
+ type = 'UAX' ;
97
+ } else if ( lowerStatus . indexOf ( 'technical report' ) != - 1 ) {
98
+ type = 'UTR' ;
99
+ } else {
100
+ console . log ( 'Unable to parse document type' ) ;
101
+ cb ( ) ;
102
+ return ;
103
+ }
104
+ }
105
+ const thisId = type + num ;
106
+
71
107
const titleEl = statusEl . nextElementSibling ;
72
108
if ( ! titleEl || titleEl . tagName !== 'H1' ) {
73
109
console . log ( 'Unable to find title' ) ;
@@ -78,69 +114,101 @@ async.each(range(1, MAX_REPORT), (num, cb) => {
78
114
if ( ! / [ a - z ] / . test ( title ) )
79
115
title = titleCase ( title ) ;
80
116
81
- const infoTableEl = document . querySelector ( '.body > table' ) ;
82
- const infoTable = infoTableEl && parseTable ( infoTableEl ) ;
83
- if ( ! infoTable ) {
84
- console . log ( 'Unable to find information table' ) ;
117
+ if ( latestId == null ) {
118
+ // This is first scanned document, so the latest version.
119
+ latestId = thisId ;
120
+
121
+ const authors = infoTable . Editor && parseEditor ( infoTable . Editor ) ;
122
+ if ( ! authors ) {
123
+ console . log ( 'Unable to find/parse editors in table' ) ;
124
+ cb ( ) ;
125
+ return ;
126
+ }
127
+
128
+ current [ thisId ] = {
129
+ href : url ,
130
+ authors,
131
+ etAl : authors . etAl ,
132
+ title,
133
+ status,
134
+ publisher : 'Unicode Consortium' ,
135
+ versions : current [ latestId ] ?. versions ?? { }
136
+ } ;
137
+ } else if ( thisId != latestId ) {
138
+ // The document was renamed at some point - create link
139
+ current [ thisId ] = { aliasOf : latestId } ;
140
+ }
141
+
142
+ const href = processURL ( infoTable [ 'This Version' ] ) ;
143
+ if ( ! href ) {
144
+ console . log ( 'Failed to extract version URL' ) ;
85
145
cb ( ) ;
86
146
return ;
87
147
}
88
148
89
- const date = trimText ( infoTable . Date ) ;
90
- if ( ! date ) {
91
- console . log ( 'Unable to find date in table ' ) ;
149
+ const revision = parseRevision ( href ) ;
150
+ if ( ! revision ) {
151
+ console . log ( 'Failed to extract revision ' ) ;
92
152
cb ( ) ;
93
153
return ;
94
154
}
95
- let isRawDate = / \d { 4 } - \d { 2 } - \d { 2 } / . test ( date ) ;
96
-
97
- const href = processURL ( infoTable [ 'This Version' ] || url ) ;
98
155
99
- const authors = infoTable . Editor && parseEditor ( infoTable . Editor ) ;
100
- if ( ! authors ) {
101
- console . log ( 'Unable to find/parse editors in table' ) ;
156
+ if ( ! infoTable . Date ) {
157
+ console . log ( 'Unable to find date in table' ) ;
158
+ cb ( ) ;
159
+ return ;
160
+ }
161
+ /*
162
+ * Replace all spaces. We cannot simply trim as https://www.unicode.org/reports/tr57/tr57-2.html
163
+ * contains "2024- 07-01" due to the coloring.
164
+ */
165
+ const rawDate = infoTable . Date . replace ( / \s / g, '' ) ;
166
+ if ( ! / \d { 4 } - \d { 2 } - \d { 2 } / . test ( rawDate ) ) {
167
+ console . log ( 'Unable to parse data in table' ) ;
102
168
cb ( ) ;
103
169
return ;
104
170
}
105
171
106
- if ( type !== 'UAX' && current [ `UAX${ num } ` ] )
107
- current [ `UAX${ num } ` ] = { aliasOf : id } ;
108
- if ( type !== 'UTR' && current [ `UTR${ num } ` ] )
109
- current [ `UTR${ num } ` ] = { aliasOf : id } ;
110
- if ( type !== 'UTS' && current [ `UTS${ num } ` ] )
111
- current [ `UTS${ num } ` ] = { aliasOf : id } ;
172
+ const version = parseVersion ( infoTable . Version ) ;
173
+ if ( version )
174
+ title = `${ title } version ${ version } ` ;
175
+ else
176
+ title = `${ title } revision ${ revision } ` ;
112
177
113
- current [ id ] = {
114
- authors,
115
- etAl : authors . etAl ,
178
+ const wasAlreadyDefined = revision in current [ latestId ] . versions ;
179
+ current [ latestId ] . versions [ revision ] = {
116
180
href,
181
+ rawDate,
117
182
title,
118
- date : isRawDate ? undefined : date ,
119
- rawDate : isRawDate ? date : undefined ,
120
- status,
121
- publisher : 'Unicode Consortium'
183
+ status : current [ latestId ] . status != status ? status : undefined ,
122
184
} ;
185
+
186
+ /*
187
+ * If this revision was already defined, then don't waste time and bandwidth fetching
188
+ * previous revisions which should have no changes.
189
+ *
190
+ * We're running this check after updating the information for this version in case this
191
+ * is the latest and is a WIP, as we have already downloaded it anyway.
192
+ */
193
+ if ( ! wasAlreadyDefined || REFETCH_OLD_VERSIONS ) {
194
+ const previousUrl = processURL ( infoTable [ 'Previous Version' ] ) ;
195
+ if ( previousUrl ) {
196
+ recurseStandard ( num , previousUrl , latestId , cb ) ;
197
+ return ;
198
+ }
199
+ }
123
200
cb ( ) ;
124
201
} ) ;
125
- } , ( err ) => {
126
- if ( err ) {
127
- console . log ( 'there was an error' ) ;
128
- console . error ( err ) ;
129
- return ;
130
- }
131
- const output = { } ;
132
- for ( const key of Object . keys ( current ) . sort ( ) ) {
133
- output [ key ] = current [ key ] ;
134
- }
135
- helper . writeBiblio ( FILENAME , output ) ;
136
- } ) ;
202
+ }
137
203
138
204
/**
 * Generate the numbers from `from` up to and including `until`, in steps of one.
 * Yields nothing when `from` is greater than `until`.
 */
function* range(from, until) {
  let value = from;
  while (value <= until) {
    yield value;
    value++;
  }
}
142
208
143
209
/**
 * Normalize a piece of text: drop every "®" character, trim both ends and
 * collapse each run of whitespace into a single space.
 * Falsy input (null, undefined, '') is returned unchanged.
 */
function trimText(str) {
  if (!str)
    return str;
  const withoutMark = str.replace(/®/g, '');
  const trimmed = withoutMark.trim();
  return trimmed.replace(/\s+/g, ' ');
}
146
214
@@ -154,9 +222,9 @@ function gatherText(element) {
154
222
if ( node . nodeType === node . ELEMENT_NODE && node . tagName === 'BR' )
155
223
str += '\n' ;
156
224
else
157
- str += trimText ( node . textContent ) + ' ' ;
225
+ str += node . textContent ;
158
226
}
159
- return str ;
227
+ return trimText ( str ) ;
160
228
}
161
229
162
230
function parseTable ( tableEl ) {
@@ -173,7 +241,16 @@ function parseTable(tableEl) {
173
241
}
174
242
175
243
/**
 * Clean up a URL taken from an information table cell.
 *
 * Returns null when the input is empty or when the normalized text does not
 * begin with "http". The latter covers "Previous Version" cells such as the
 * one in https://www.unicode.org/reports/tr38/tr38-5.html, which hold "n/a".
 * Otherwise returns the normalized text with a leading "http:" scheme
 * rewritten to "https:".
 */
function processURL(str) {
  if (!str)
    return null;
  const text = trimText(str);
  if (!text.startsWith('http'))
    return null;
  return text.replace(/^http:/, 'https:');
}
178
255
179
256
function parseEditor ( str ) {
@@ -184,3 +261,22 @@ function parseEditor(str) {
184
261
}
185
262
return arr ;
186
263
}
264
+
265
/**
 * Extract the revision number from a versioned report URL.
 *
 * @param {string} url - URL of a specific report revision.
 * @returns {?string} The revision digits as a string, or null when `url` is
 *   empty or does not contain the expected pattern.
 */
function parseRevision(url) {
  if (!url)
    return null;
  /*
   * Find in the URL the pattern "/tr<num>/tr<num>-<revision>". The back-reference
   * requires both "tr<num>" parts to be identical. This works for the two cases:
   * - /tr<num>/tr<num>-<rev>/tr<num>.html (only UTS #35?)
   * - /tr<num>/tr<num>-<rev>.html (all others)
   */
  const match = url.match(/\/(tr\d+)\/\1-(?<rev>\d+)/);
  return match ? match.groups.rev : null;
}
276
+
277
/**
 * Normalize the "Version" cell of an information table.
 * Returns null for empty input; otherwise the normalized text with a leading
 * "Unicode" prefix (and any whitespace after it) removed, since some reports
 * write "Unicode 11.0.0" instead of the bare version.
 */
function parseVersion(str) {
  if (!str)
    return null;
  const normalized = trimText(str);
  return normalized.replace(/^Unicode\s*/, '');
}
0 commit comments