Skip to content

Commit 4908554

Browse files
authored
Merge pull request #114 from sillsdev/BL-15209_autonyms_in_default_scripts
fix: BL-15209 autonyms should be in the default script (#114)
2 parents 9b05105 + a1ddb9b commit 4908554

File tree

4 files changed

+75
-51
lines changed

4 files changed

+75
-51
lines changed

components/language-chooser/common/find-language/language-data/languageData.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

components/language-chooser/common/find-language/languageSearch.spec.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,16 @@ describe("asyncGetAllLanguageResults", () => {
169169
);
170170
}, 10000);
171171

172+
it("should prefer any autonym on the default script entry", async () => {
173+
const zsmResults = await asyncGetAllLanguageResults("zsm");
174+
expect(zsmResults[0].autonym).toBe("Bahasa Malaysia");
175+
});
176+
177+
it("should use a non-default script autonym if no autonym on default script entry", async () => {
178+
const cjsResults = await asyncGetAllLanguageResults("cjs");
179+
expect(cjsResults[0].autonym).toBe("Тадар тили");
180+
});
181+
172182
it("should prefer localnames[0] for autonym", async () => {
173183
const azerbaijaniResults = await asyncGetAllLanguageResults("azerbaijani");
174184
expect(azerbaijaniResults[0].autonym).toBe("Azərbaycan dili");

components/language-chooser/common/find-language/scripts/langtagProcessing.ts

Lines changed: 61 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,53 +19,63 @@ import {
1919
import fs from "fs";
2020
import langTagsJson from "../language-data/langtags.json" with { type: "json" };
2121

22+
function bestAutonymFromEntry(entry: any) {
23+
return entry.localnames ? entry.localnames[0] : entry.localname;
24+
}
25+
2226
// We want to have one entry for every ISO 639-3 code, whereas langtags.json sometimes has multiple entries per code
2327
// Combine entry into the entry with matching ISO 639-3 code in langs if there is one, otherwise create a new entry
2428
function addOrCombineLangtagsEntry(
2529
entry: ILangtagsJsonEntryInternal,
2630
langs: { [key: string]: ILanguageInternal }
2731
) {
28-
if (!entry.iso639_3) {
32+
if (!entry.indivIsoCode) {
33+
console.log("Missing indivIsoCode for ", entry.full);
2934
// langTags.json has metadata items in the same list mixed in with the data entries
3035
return;
3136
}
3237

33-
if (langs[entry.iso639_3]) {
38+
if (langs[entry.indivIsoCode]) {
3439
// We already have an entry with this code, combine with it
3540

3641
// We prioritize autonyms from the "localnames" field (which matches ethnologue if present)
3742
// over the "localname" field (which is from CLDR and may be specific to a region e.g. "español de México")
3843
// Some languages may have an entry with "localnames" but not "localname" and another entry with "localname" but not "localnames"
39-
langs[entry.iso639_3].autonym = entry.localnames
44+
langs[entry.indivIsoCode].autonym = entry.localnames
4045
? entry.localnames[0]
41-
: langs[entry.iso639_3].autonym || entry.localname;
42-
langs[entry.iso639_3].regionNames.add(entry.regionname);
46+
: langs[entry.indivIsoCode].autonym || entry.localname;
47+
if (!entry.tag.includes("-")) {
48+
langs[entry.indivIsoCode].defaultScriptAutonym =
49+
bestAutonymFromEntry(entry);
50+
}
51+
langs[entry.indivIsoCode].regionNames.add(entry.regionname);
4352
if (
4453
// some languages will have multiple entries with the same script. If so we just want to make sure we take one that has an autonym if possible
45-
!langs[entry.iso639_3].scripts[entry.script] ||
54+
!langs[entry.indivIsoCode].scripts[entry.script] ||
4655
(entry.localnames?.length || 0) > 0
4756
) {
48-
langs[entry.iso639_3].scripts[entry.script] = {
57+
langs[entry.indivIsoCode].scripts[entry.script] = {
4958
code: entry.script,
5059
name: scriptNames[entry.script],
5160
languageNameInScript:
5261
(entry.localnames || [undefined])[0] ||
53-
langs[entry.iso639_3].scripts[entry.script]?.languageNameInScript ||
62+
langs[entry.indivIsoCode].scripts[entry.script]
63+
?.languageNameInScript ||
5464
entry.localname,
5565
} as IScript;
5666
}
57-
langs[entry.iso639_3].names = new Set([
58-
...langs[entry.iso639_3].names,
67+
langs[entry.indivIsoCode].names = new Set([
68+
...langs[entry.indivIsoCode].names,
5969
...getAllPossibleNames(entry),
6070
]);
61-
langs[entry.iso639_3].alternativeTags = new Set([
62-
...langs[entry.iso639_3].alternativeTags,
71+
langs[entry.indivIsoCode].alternativeTags = new Set([
72+
...langs[entry.indivIsoCode].alternativeTags,
6373
entry.full,
6474
...(entry.tags ?? []),
6575
]);
6676

67-
langs[entry.iso639_3].isRepresentativeForMacrolanguage =
68-
langs[entry.iso639_3].isRepresentativeForMacrolanguage ||
77+
langs[entry.indivIsoCode].isRepresentativeForMacrolanguage =
78+
langs[entry.indivIsoCode].isRepresentativeForMacrolanguage ||
6979
entry.isRepresentativeForMacrolanguage;
7080
} else {
7181
const scriptCode = entry.script;
@@ -79,20 +89,28 @@ function addOrCombineLangtagsEntry(
7989
} as IScript;
8090
}
8191
// create a new entry for this language code
82-
langs[entry.iso639_3] = {
83-
autonym: entry.localnames ? entry.localnames[0] : entry.localname,
92+
langs[entry.indivIsoCode] = {
93+
autonym: bestAutonymFromEntry(entry),
94+
// if there is no "-" in entry.tag, the language subtag alone is considered equivalent to this entry i.e. this is the default script
95+
defaultScriptAutonym: entry.tag.includes("-")
96+
? undefined
97+
: bestAutonymFromEntry(entry),
8498
exonym: entry.name,
85-
iso639_3_code: entry.iso639_3 as string,
86-
languageSubtag: entry.tag.split("-")[0], // might be 2-letter
99+
iso639_3_code: entry.indivIsoCode as string,
100+
// If the indivIsoCode is different from the iso639_3 code, the iso639_3 (and so probably also the tag) was a
101+
// macrolanguage code so we want to make sure to use the individual language code instead
102+
languageSubtag:
103+
entry.indivIsoCode != entry.iso639_3
104+
? entry.indivIsoCode
105+
: entry.tag.split("-")[0], // might be 2-letter
87106
regionNames: new Set([entry.regionname]),
88107
names: getAllPossibleNames(entry),
89108
scripts,
90109
parentMacrolanguage:
91-
macrolanguagesByCode[indivlangsToMacrolangs[entry.iso639_3]],
110+
macrolanguagesByCode[indivlangsToMacrolangs[entry.indivIsoCode]],
92111
isRepresentativeForMacrolanguage: entry.isRepresentativeForMacrolanguage,
93-
isMacrolanguage: isMacrolanguage(entry.iso639_3),
94112
alternativeTags: new Set([entry.full, ...(entry.tags || [])]),
95-
languageType: getLanguageType(entry.iso639_3),
113+
languageType: getLanguageType(entry.indivIsoCode),
96114
} as ILanguageInternal;
97115
}
98116
}
@@ -102,57 +120,52 @@ function parseLangtagsJson() {
102120
const langTags = langTagsJson as any[];
103121
const consolidatedLangTags: { [key: string]: ILanguageInternal } = {};
104122
for (const entry of langTags) {
123+
const augmentedEntry = entry as ILangtagsJsonEntryInternal;
105124
const languageSubtag = entry.tag.split("-")[0];
125+
106126
// If listed with a macrolanguage code, this is a "representative language", we need to identify it by its equivalent
107127
// individual language code. See macrolanguageNotes.md
108128
if (isMacrolanguage(entry.iso639_3) || isMacrolanguage(languageSubtag)) {
109-
const indivIsoCode = isMacrolanguage(entry.iso639_3)
129+
augmentedEntry["isRepresentativeForMacrolanguage"] = true;
130+
augmentedEntry["indivIsoCode"] = isMacrolanguage(entry.iso639_3)
110131
? macrolangsToRepresentativeLangs[entry.iso639_3]
111132
: entry.iso639_3;
112-
if (indivIsoCode) {
113-
addOrCombineLangtagsEntry(
114-
{
115-
...entry,
116-
iso639_3: indivIsoCode,
117-
tag: indivIsoCode,
118-
isRepresentativeForMacrolanguage: true,
119-
} as ILangtagsJsonEntryInternal,
120-
consolidatedLangTags
121-
);
122-
} else {
133+
134+
if (!augmentedEntry["indivIsoCode"]) {
123135
// This is a data anomaly but we do have 5 as of Feb 2025: bnc, nor, san, hbs, zap
124136
// See macrolanguageNotes.md. These cases should be specially handled.
125137
console.log(
126138
"No indivIsoCode found for macrolang",
127139
entry.iso639_3,
128140
entry.tag
129141
);
130-
addOrCombineLangtagsEntry(
131-
{
132-
...entry,
133-
isRepresentativeForMacrolanguage: true,
134-
} as ILangtagsJsonEntryInternal,
135-
consolidatedLangTags
136-
);
137142
}
138-
} else {
139-
addOrCombineLangtagsEntry(entry, consolidatedLangTags);
140143
}
144+
145+
// in normal cases, indivIsoCode is just the iso639_3 code
146+
augmentedEntry["indivIsoCode"] =
147+
augmentedEntry["indivIsoCode"] || entry.iso639_3;
148+
149+
addOrCombineLangtagsEntry(augmentedEntry, consolidatedLangTags);
141150
}
142151

143152
// Tweak some of the data into the format we want
144153
const reformattedLangs: ILanguage[] = Object.values(consolidatedLangTags).map(
145154
(langData: ILanguageInternal) => {
155+
const autonym = stripMacrolanguageParenthetical(
156+
langData.defaultScriptAutonym || langData.autonym
157+
);
158+
const exonym = stripMacrolanguageParenthetical(langData.exonym);
146159
// Don't repeat the autonym and exonym in the names list
147-
langData.names.delete(langData.autonym);
148-
langData.names.delete(langData.exonym);
160+
langData.names.delete(autonym);
161+
langData.names.delete(exonym);
149162
const regionNamesForSearch = [
150163
...(uncommaAll(langData.regionNames) as Set<string>),
151164
].filter((regionName) => !!regionName);
152165
const regionNamesForDisplay = regionNamesForSearch.join(COMMA_SEPARATOR);
153166
return {
154-
autonym: uncomma(stripMacrolanguageParenthetical(langData.autonym)),
155-
exonym: uncomma(stripMacrolanguageParenthetical(langData.exonym)),
167+
autonym: uncomma(autonym),
168+
exonym: uncomma(exonym),
156169
iso639_3_code: langData.iso639_3_code,
157170
languageSubtag: langData.languageSubtag,
158171
// For all these normal individual languages, we display and search key the same region list
@@ -164,7 +177,7 @@ function parseLangtagsJson() {
164177
].filter((name) => !!name),
165178
alternativeTags: [...langData.alternativeTags],
166179
parentMacrolanguage: langData.parentMacrolanguage,
167-
isMacrolanguage: langData.isMacrolanguage,
180+
isMacrolanguage: false, // we add macrolanguages separately below. See macrolanguageNotes.md
168181
isRepresentativeForMacrolanguage:
169182
langData.isRepresentativeForMacrolanguage,
170183
languageType: langData.languageType,
@@ -207,7 +220,7 @@ function parseLangTagsTxt() {
207220
if (line.length === 0) {
208221
continue;
209222
}
210-
const tags = line.split(" = ").map((tag) => tag.trim());
223+
const tags = line.split(" = ").map((t) => t.trim());
211224
tagLookups.push({
212225
shortest: tags[0],
213226
maximal: tags[tags.length - 1],

components/language-chooser/common/find-language/scripts/langtagProcessingHelpers.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export const COMMA_SEPARATOR = ", ";
1313

1414
export interface ILanguageInternal {
1515
autonym: string;
16+
defaultScriptAutonym?: string;
1617
exonym: string;
1718
iso639_3_code: string;
1819
languageSubtag: string;
@@ -28,7 +29,7 @@ export interface ILanguageInternal {
2829

2930
export interface ILangtagsJsonEntryInternal {
3031
full: string;
31-
iso639_3: string;
32+
iso639_3: string; //straight from langtags.json, may be a macrolanguage code. See macrolanguageNotes.md
3233
iana: string[];
3334
latnnames: string[];
3435
localname: string;
@@ -53,6 +54,7 @@ export interface ILangtagsJsonEntryInternal {
5354

5455
// These are not in the langtags.json file but may be added in the processing
5556
isRepresentativeForMacrolanguage: boolean;
57+
indivIsoCode: string; // If iso639_3 is a macrolanguage code, this is the corresponding (representative) individual language code - see macrolanguageNotes.md
5658
}
5759

5860
interface IIsoCodeDetailsInternal {
@@ -273,7 +275,6 @@ function findIndivIsoCode(
273275
isoCodesDetails[parts[0]] || isoCodesDetails[iso639_1To639_3[parts[0]]];
274276
if (!isoCodeDetails) {
275277
// probably a deprecated code
276-
console.log("failed to find iso639_3 code for", tag);
277278
continue;
278279
}
279280
if (

0 commit comments

Comments
 (0)