@@ -19,53 +19,63 @@ import {
1919import fs from "fs" ;
2020import langTagsJson from "../language-data/langtags.json" with { type : "json" } ;
2121
// Returns the best autonym (the language's name for itself) that a single
// langtags.json entry can offer.
//
// The first item of the entry's "localnames" list is preferred; the entry's
// "localname" field is the fallback. The result is undefined when the entry
// has neither.
function bestAutonymFromEntry(entry: any) {
  // `?.[0]` handles a missing "localnames" list. `||` additionally handles a
  // list that is present but empty (or whose first item is an empty string),
  // so we still fall back to "localname" instead of returning undefined.
  return entry.localnames?.[0] || entry.localname;
}
25+
2226// We want to have one entry for every ISO 639-3 code, whereas langtags.json sometimes has multiple entries per code
2327// Combine entry into the entry with matching ISO 639-3 code in langs if there is one, otherwise create a new entry
2428function addOrCombineLangtagsEntry (
2529 entry : ILangtagsJsonEntryInternal ,
2630 langs : { [ key : string ] : ILanguageInternal }
2731) {
28- if ( ! entry . iso639_3 ) {
32+ if ( ! entry . indivIsoCode ) {
33+ console . log ( "Missing indivIsoCode for " , entry . full ) ;
2934 // langTags.json has metadata items in the same list mixed in with the data entries
3035 return ;
3136 }
3237
33- if ( langs [ entry . iso639_3 ] ) {
38+ if ( langs [ entry . indivIsoCode ] ) {
3439 // We already have an entry with this code, combine with it
3540
3641 // We prioritize autonyms from the "localnames" field (which matches ethnologue if present)
3742 // over the "localname" field (which is from CLDR and may be specific to a region e.g. "español de México")
3843 // Some languages may have an entry with "localnames" but not "localname" and another entry with "localname" but not "localnames"
39- langs [ entry . iso639_3 ] . autonym = entry . localnames
44+ langs [ entry . indivIsoCode ] . autonym = entry . localnames
4045 ? entry . localnames [ 0 ]
41- : langs [ entry . iso639_3 ] . autonym || entry . localname ;
42- langs [ entry . iso639_3 ] . regionNames . add ( entry . regionname ) ;
46+ : langs [ entry . indivIsoCode ] . autonym || entry . localname ;
47+ if ( ! entry . tag . includes ( "-" ) ) {
48+ langs [ entry . indivIsoCode ] . defaultScriptAutonym =
49+ bestAutonymFromEntry ( entry ) ;
50+ }
51+ langs [ entry . indivIsoCode ] . regionNames . add ( entry . regionname ) ;
4352 if (
4453 // some languages will have multiple entries with the same script. If so we just want to make sure we take one that has an autonym if possible
45- ! langs [ entry . iso639_3 ] . scripts [ entry . script ] ||
54+ ! langs [ entry . indivIsoCode ] . scripts [ entry . script ] ||
4655 ( entry . localnames ?. length || 0 ) > 0
4756 ) {
48- langs [ entry . iso639_3 ] . scripts [ entry . script ] = {
57+ langs [ entry . indivIsoCode ] . scripts [ entry . script ] = {
4958 code : entry . script ,
5059 name : scriptNames [ entry . script ] ,
5160 languageNameInScript :
5261 ( entry . localnames || [ undefined ] ) [ 0 ] ||
53- langs [ entry . iso639_3 ] . scripts [ entry . script ] ?. languageNameInScript ||
62+ langs [ entry . indivIsoCode ] . scripts [ entry . script ]
63+ ?. languageNameInScript ||
5464 entry . localname ,
5565 } as IScript ;
5666 }
57- langs [ entry . iso639_3 ] . names = new Set ( [
58- ...langs [ entry . iso639_3 ] . names ,
67+ langs [ entry . indivIsoCode ] . names = new Set ( [
68+ ...langs [ entry . indivIsoCode ] . names ,
5969 ...getAllPossibleNames ( entry ) ,
6070 ] ) ;
61- langs [ entry . iso639_3 ] . alternativeTags = new Set ( [
62- ...langs [ entry . iso639_3 ] . alternativeTags ,
71+ langs [ entry . indivIsoCode ] . alternativeTags = new Set ( [
72+ ...langs [ entry . indivIsoCode ] . alternativeTags ,
6373 entry . full ,
6474 ...( entry . tags ?? [ ] ) ,
6575 ] ) ;
6676
67- langs [ entry . iso639_3 ] . isRepresentativeForMacrolanguage =
68- langs [ entry . iso639_3 ] . isRepresentativeForMacrolanguage ||
77+ langs [ entry . indivIsoCode ] . isRepresentativeForMacrolanguage =
78+ langs [ entry . indivIsoCode ] . isRepresentativeForMacrolanguage ||
6979 entry . isRepresentativeForMacrolanguage ;
7080 } else {
7181 const scriptCode = entry . script ;
@@ -79,20 +89,28 @@ function addOrCombineLangtagsEntry(
7989 } as IScript ;
8090 }
8191 // create a new entry for this language code
82- langs [ entry . iso639_3 ] = {
83- autonym : entry . localnames ? entry . localnames [ 0 ] : entry . localname ,
92+ langs [ entry . indivIsoCode ] = {
93+ autonym : bestAutonymFromEntry ( entry ) ,
94+ // if there is no "-" in entry.tag, the language subtag alone is considered equivalent to this entry i.e. this is the default script
95+ defaultScriptAutonym : entry . tag . includes ( "-" )
96+ ? undefined
97+ : bestAutonymFromEntry ( entry ) ,
8498 exonym : entry . name ,
85- iso639_3_code : entry . iso639_3 as string ,
86- languageSubtag : entry . tag . split ( "-" ) [ 0 ] , // might be 2-letter
99+ iso639_3_code : entry . indivIsoCode as string ,
100+ // If the indivIsoCode is different from the iso639_3 code, the iso639_3 (and so probably also the tag) was a
101+ // macrolanguage code so we want to make sure to use the individual language code instead
102+ languageSubtag :
103+ entry . indivIsoCode != entry . iso639_3
104+ ? entry . indivIsoCode
105+ : entry . tag . split ( "-" ) [ 0 ] , // might be 2-letter
87106 regionNames : new Set ( [ entry . regionname ] ) ,
88107 names : getAllPossibleNames ( entry ) ,
89108 scripts,
90109 parentMacrolanguage :
91- macrolanguagesByCode [ indivlangsToMacrolangs [ entry . iso639_3 ] ] ,
110+ macrolanguagesByCode [ indivlangsToMacrolangs [ entry . indivIsoCode ] ] ,
92111 isRepresentativeForMacrolanguage : entry . isRepresentativeForMacrolanguage ,
93- isMacrolanguage : isMacrolanguage ( entry . iso639_3 ) ,
94112 alternativeTags : new Set ( [ entry . full , ...( entry . tags || [ ] ) ] ) ,
95- languageType : getLanguageType ( entry . iso639_3 ) ,
113+ languageType : getLanguageType ( entry . indivIsoCode ) ,
96114 } as ILanguageInternal ;
97115 }
98116}
@@ -102,57 +120,52 @@ function parseLangtagsJson() {
102120 const langTags = langTagsJson as any [ ] ;
103121 const consolidatedLangTags : { [ key : string ] : ILanguageInternal } = { } ;
104122 for ( const entry of langTags ) {
123+ const augmentedEntry = entry as ILangtagsJsonEntryInternal ;
105124 const languageSubtag = entry . tag . split ( "-" ) [ 0 ] ;
125+
106126 // If listed with a macrolanguage code, this is a "representative language", we need to identify it by its equivalent
107127 // individual language code. See macrolanguageNotes.md
108128 if ( isMacrolanguage ( entry . iso639_3 ) || isMacrolanguage ( languageSubtag ) ) {
109- const indivIsoCode = isMacrolanguage ( entry . iso639_3 )
129+ augmentedEntry [ "isRepresentativeForMacrolanguage" ] = true ;
130+ augmentedEntry [ "indivIsoCode" ] = isMacrolanguage ( entry . iso639_3 )
110131 ? macrolangsToRepresentativeLangs [ entry . iso639_3 ]
111132 : entry . iso639_3 ;
112- if ( indivIsoCode ) {
113- addOrCombineLangtagsEntry (
114- {
115- ...entry ,
116- iso639_3 : indivIsoCode ,
117- tag : indivIsoCode ,
118- isRepresentativeForMacrolanguage : true ,
119- } as ILangtagsJsonEntryInternal ,
120- consolidatedLangTags
121- ) ;
122- } else {
133+
134+ if ( ! augmentedEntry [ "indivIsoCode" ] ) {
123135 // This is a data anomaly but we do have 5 as of Feb 2025: bnc, nor, san, hbs, zap
124136 // See macrolanguageNotes.md. These cases should be specially handled.
125137 console . log (
126138 "No indivIsoCode found for macrolang" ,
127139 entry . iso639_3 ,
128140 entry . tag
129141 ) ;
130- addOrCombineLangtagsEntry (
131- {
132- ...entry ,
133- isRepresentativeForMacrolanguage : true ,
134- } as ILangtagsJsonEntryInternal ,
135- consolidatedLangTags
136- ) ;
137142 }
138- } else {
139- addOrCombineLangtagsEntry ( entry , consolidatedLangTags ) ;
140143 }
144+
145+ // in normal cases, indivIsoCode is just the iso639_3 code
146+ augmentedEntry [ "indivIsoCode" ] =
147+ augmentedEntry [ "indivIsoCode" ] || entry . iso639_3 ;
148+
149+ addOrCombineLangtagsEntry ( augmentedEntry , consolidatedLangTags ) ;
141150 }
142151
143152 // Tweak some of the data into the format we want
144153 const reformattedLangs : ILanguage [ ] = Object . values ( consolidatedLangTags ) . map (
145154 ( langData : ILanguageInternal ) => {
155+ const autonym = stripMacrolanguageParenthetical (
156+ langData . defaultScriptAutonym || langData . autonym
157+ ) ;
158+ const exonym = stripMacrolanguageParenthetical ( langData . exonym ) ;
146159 // Don't repeat the autonym and exonym in the names list
147- langData . names . delete ( langData . autonym ) ;
148- langData . names . delete ( langData . exonym ) ;
160+ langData . names . delete ( autonym ) ;
161+ langData . names . delete ( exonym ) ;
149162 const regionNamesForSearch = [
150163 ...( uncommaAll ( langData . regionNames ) as Set < string > ) ,
151164 ] . filter ( ( regionName ) => ! ! regionName ) ;
152165 const regionNamesForDisplay = regionNamesForSearch . join ( COMMA_SEPARATOR ) ;
153166 return {
154- autonym : uncomma ( stripMacrolanguageParenthetical ( langData . autonym ) ) ,
155- exonym : uncomma ( stripMacrolanguageParenthetical ( langData . exonym ) ) ,
167+ autonym : uncomma ( autonym ) ,
168+ exonym : uncomma ( exonym ) ,
156169 iso639_3_code : langData . iso639_3_code ,
157170 languageSubtag : langData . languageSubtag ,
158171 // For all these normal individual languages, we display and search key the same region list
@@ -164,7 +177,7 @@ function parseLangtagsJson() {
164177 ] . filter ( ( name ) => ! ! name ) ,
165178 alternativeTags : [ ...langData . alternativeTags ] ,
166179 parentMacrolanguage : langData . parentMacrolanguage ,
167- isMacrolanguage : langData . isMacrolanguage ,
180+ isMacrolanguage : false , // we add macrolanguages separately below. See macrolanguageNotes.md
168181 isRepresentativeForMacrolanguage :
169182 langData . isRepresentativeForMacrolanguage ,
170183 languageType : langData . languageType ,
@@ -207,7 +220,7 @@ function parseLangTagsTxt() {
207220 if ( line . length === 0 ) {
208221 continue ;
209222 }
210- const tags = line . split ( " = " ) . map ( ( tag ) => tag . trim ( ) ) ;
223+ const tags = line . split ( " = " ) . map ( ( t ) => t . trim ( ) ) ;
211224 tagLookups . push ( {
212225 shortest : tags [ 0 ] ,
213226 maximal : tags [ tags . length - 1 ] ,
0 commit comments