diff --git a/fluent-langneg/src/locale.ts b/fluent-langneg/src/locale.ts index d4a96822..7f4b36db 100644 --- a/fluent-langneg/src/locale.ts +++ b/fluent-langneg/src/locale.ts @@ -1,177 +1,29 @@ -const languageCodeRe = "([a-z]{2,3}|\\*)"; -const scriptCodeRe = "(?:-([a-z]{4}|\\*))"; -const regionCodeRe = "(?:-([a-z]{2}|\\*))"; -const variantCodeRe = "(?:-(([0-9][a-z0-9]{3}|[a-z0-9]{5,8})|\\*))"; +export class LocaleWrapper extends Intl.Locale { + variants?: string; -/** - * Regular expression splitting locale id into four pieces: - * - * Example: `en-Latn-US-macos` - * - * language: en - * script: Latn - * region: US - * variant: macos - * - * It can also accept a range `*` character on any position. - */ -const localeRe = new RegExp( - `^${languageCodeRe}${scriptCodeRe}?${regionCodeRe}?${variantCodeRe}?$`, - "i" -); - -export class Locale { - isWellFormed: boolean; - language?: string; - script?: string; - region?: string; - variant?: string; - - /** - * Parses a locale id using the localeRe into an array with four elements. - * - * If the second argument `range` is set to true, it places range `*` char - * in place of any missing piece. - * - * It also allows skipping the script section of the id, so `en-US` is - * properly parsed as `en-*-US-*`. - */ constructor(locale: string) { - const result = localeRe.exec(locale.replace(/_/g, "-")); - if (!result) { - this.isWellFormed = false; - return; - } - - let [, language, script, region, variant] = result; - - if (language) { - this.language = language.toLowerCase(); - } - if (script) { - this.script = script[0].toUpperCase() + script.slice(1); - } - if (region) { - this.region = region.toUpperCase(); - } - this.variant = variant; - this.isWellFormed = true; - } - - isEqual(other: Locale): boolean { - return ( - this.language === other.language && - this.script === other.script && - this.region === other.region && - this.variant === other.variant - ); - } - - matches(other: Locale, thisRange = false, otherRange = false): boolean { - return ( - (this.language === other.language || - (thisRange && this.language === undefined) || - (otherRange && other.language === undefined)) && - (this.script === other.script || - (thisRange && this.script === undefined) || - (otherRange && other.script === undefined)) && - (this.region === other.region || - (thisRange && this.region === undefined) || - (otherRange && other.region === undefined)) && - (this.variant === other.variant || - (thisRange && this.variant === undefined) || - (otherRange && other.variant === undefined)) - ); - } - - toString(): string { - return [this.language, this.script, this.region, this.variant] - .filter(part => part !== undefined) - .join("-"); - } - - clearVariants(): void { - this.variant = undefined; - } - - clearRegion(): void { - this.region = undefined; - } - - addLikelySubtags(): boolean { - const newLocale = getLikelySubtagsMin(this.toString().toLowerCase()); - if (newLocale) { - this.language = newLocale.language; - this.script = newLocale.script; - this.region = newLocale.region; - this.variant = newLocale.variant; - return true; + let tag = locale + .replace(/_/g, "-") + .replace(/^\*/, "und") + .replace(/-\*/g, ""); + + super(tag); + + if (!("variants" in this)) { + // Available on Firefox 141 & later + let lsrTagLength = this.language.length; + if (this.script) lsrTagLength += this.script.length + 1; + if (this.region) lsrTagLength += this.region.length + 1; + + if (tag.length > lsrTagLength) { + let unicodeExtStart: number | undefined = tag.search(/-[a-zA-Z]-/); + if (unicodeExtStart === -1) unicodeExtStart = undefined; + this.variants = tag.substring(lsrTagLength + 1, unicodeExtStart); + } } - return false; } -} -/** - * Below is a manually a list of likely subtags corresponding to Unicode - * CLDR likelySubtags list. - * This list is curated by the maintainers of Project Fluent and is - * intended to be used in place of the full likelySubtags list in use cases - * where full list cannot be (for example, due to the size). - * - * This version of the list is based on CLDR 30.0.3. - */ -const likelySubtagsMin: Record = { - ar: "ar-arab-eg", - "az-arab": "az-arab-ir", - "az-ir": "az-arab-ir", - be: "be-cyrl-by", - da: "da-latn-dk", - el: "el-grek-gr", - en: "en-latn-us", - fa: "fa-arab-ir", - ja: "ja-jpan-jp", - ko: "ko-kore-kr", - pt: "pt-latn-br", - sr: "sr-cyrl-rs", - "sr-ru": "sr-latn-ru", - sv: "sv-latn-se", - ta: "ta-taml-in", - uk: "uk-cyrl-ua", - zh: "zh-hans-cn", - "zh-hant": "zh-hant-tw", - "zh-hk": "zh-hant-hk", - "zh-mo": "zh-hant-mo", - "zh-tw": "zh-hant-tw", - "zh-gb": "zh-hant-gb", - "zh-us": "zh-hant-us", -}; - -const regionMatchingLangs = [ - "az", - "bg", - "cs", - "de", - "es", - "fi", - "fr", - "hu", - "it", - "lt", - "lv", - "nl", - "pl", - "ro", - "ru", -]; - -function getLikelySubtagsMin(loc: string): Locale | null { - if (Object.prototype.hasOwnProperty.call(likelySubtagsMin, loc)) { - return new Locale(likelySubtagsMin[loc]); - } - const locale = new Locale(loc); - if (locale.language && regionMatchingLangs.includes(locale.language)) { - locale.region = locale.language.toUpperCase(); - return locale; + get language(): string { + return super.language ?? "und"; } - return null; } diff --git a/fluent-langneg/src/matches.ts b/fluent-langneg/src/matches.ts index 213001de..64097187 100644 --- a/fluent-langneg/src/matches.ts +++ b/fluent-langneg/src/matches.ts @@ -1,4 +1,4 @@ -import { Locale } from "./locale.js"; +import { LocaleWrapper } from "./locale.js"; /** * Negotiates the languages between the list of requested locales against @@ -75,133 +75,158 @@ export function filterMatches( availableLocales: Array, strategy: string ): Array { - const supportedLocales: Set = new Set(); - const availableLocalesMap: Map = new Map(); + const supportedLocales = new Set(); + const availableLocalesMap = new Map(); for (let locale of availableLocales) { - let newLocale = new Locale(locale); - if (newLocale.isWellFormed) { - availableLocalesMap.set(locale, new Locale(locale)); + try { + availableLocalesMap.set(locale, new LocaleWrapper(locale)); + } catch { + continue; } } - outer: for (const reqLocStr of requestedLocales) { - const reqLocStrLC = reqLocStr.toLowerCase(); - const requestedLocale = new Locale(reqLocStrLC); - - if (requestedLocale.language === undefined) { + outer: for (const reqTag of requestedLocales) { + let requested: LocaleWrapper; + try { + requested = new LocaleWrapper(reqTag); + } catch { continue; } // 1) Attempt to make an exact match // Example: `en-US` === `en-US` + const reqTagLowerCase = reqTag.toLowerCase(); for (const key of availableLocalesMap.keys()) { - if (reqLocStrLC === key.toLowerCase()) { + if (reqTagLowerCase === key.toLowerCase()) { supportedLocales.add(key); availableLocalesMap.delete(key); - if (strategy === "lookup") { - return Array.from(supportedLocales); - } else if (strategy === "filtering") { - continue; - } else { - continue outer; + switch (strategy) { + case "lookup": + break outer; + case "filtering": + continue; + default: + continue outer; } } } + const reqVariants = requested.variants; + // 2) Attempt to match against the available range - // This turns `en` into `en-*-*-*` and `en-US` into `en-*-US-*` // Example: ['en-US'] * ['en'] = ['en'] - for (const [key, availableLocale] of availableLocalesMap.entries()) { - if (availableLocale.matches(requestedLocale, true, false)) { + for (const [key, available] of availableLocalesMap) { + if ( + languagesMatch(available, requested, false) && + scriptsMatch(available, requested, false) && + regionsMatch(available, requested, false) && + (available.variants === reqVariants || available.variants === undefined) + ) { supportedLocales.add(key); availableLocalesMap.delete(key); - if (strategy === "lookup") { - return Array.from(supportedLocales); - } else if (strategy === "filtering") { - continue; - } else { - continue outer; + switch (strategy) { + case "lookup": + break outer; + case "filtering": + continue; + default: + continue outer; } } } // 3) Attempt to retrieve a maximal version of the requested locale ID - // If data is available, it'll expand `en` into `en-Latn-US` and - // `zh` into `zh-Hans-CN`. + // It'll expand `en` into `en-Latn-US` and `zh` into `zh-Hans-CN`. // Example: ['en'] * ['en-GB', 'en-US'] = ['en-US'] - if (requestedLocale.addLikelySubtags()) { - for (const [key, availableLocale] of availableLocalesMap.entries()) { - if (availableLocale.matches(requestedLocale, true, false)) { - supportedLocales.add(key); - availableLocalesMap.delete(key); - if (strategy === "lookup") { - return Array.from(supportedLocales); - } else if (strategy === "filtering") { + requested = requested.maximize(); + + for (const [key, available] of availableLocalesMap) { + if ( + languagesMatch(available, requested, false) && + scriptsMatch(available, requested, false) && + regionsMatch(available, requested, false) && + (available.variants === reqVariants || available.variants === undefined) + ) { + supportedLocales.add(key); + availableLocalesMap.delete(key); + switch (strategy) { + case "lookup": + break outer; + case "filtering": continue; - } else { + default: continue outer; - } } } } // 4) Attempt to look up for a different variant for the same locale ID // Example: ['en-US-mac'] * ['en-US-win'] = ['en-US-win'] - requestedLocale.clearVariants(); - - for (const [key, availableLocale] of availableLocalesMap.entries()) { - if (availableLocale.matches(requestedLocale, true, true)) { + for (const [key, available] of availableLocalesMap) { + if ( + languagesMatch(available, requested, true) && + scriptsMatch(available, requested, true) && + regionsMatch(available, requested, true) + ) { supportedLocales.add(key); availableLocalesMap.delete(key); - if (strategy === "lookup") { - return Array.from(supportedLocales); - } else if (strategy === "filtering") { - continue; - } else { - continue outer; + switch (strategy) { + case "lookup": + break outer; + case "filtering": + continue; + default: + continue outer; } } } // 5) Attempt to match against the likely subtag without region - // In the example below, addLikelySubtags will turn + // In the example below, maximize() will turn // `zh-Hant` into `zh-Hant-TW` giving `zh-TW` priority match // over `zh-CN`. // // Example: ['zh-Hant-HK'] * ['zh-TW', 'zh-CN'] = ['zh-TW'] - requestedLocale.clearRegion(); - - if (requestedLocale.addLikelySubtags()) { - for (const [key, availableLocale] of availableLocalesMap.entries()) { - if (availableLocale.matches(requestedLocale, true, false)) { - supportedLocales.add(key); - availableLocalesMap.delete(key); - if (strategy === "lookup") { - return Array.from(supportedLocales); - } else if (strategy === "filtering") { + requested = new Intl.Locale(requested.language, { + script: requested.script, + }).maximize(); + + for (const [key, available] of availableLocalesMap) { + if ( + languagesMatch(available, requested, false) && + scriptsMatch(available, requested, false) && + regionsMatch(available, requested, false) + ) { + supportedLocales.add(key); + availableLocalesMap.delete(key); + switch (strategy) { + case "lookup": + break outer; + case "filtering": continue; - } else { + default: continue outer; - } } } } // 6) Attempt to look up for a different region for the same locale ID // Example: ['en-US'] * ['en-AU'] = ['en-AU'] - requestedLocale.clearRegion(); - - for (const [key, availableLocale] of availableLocalesMap.entries()) { - if (availableLocale.matches(requestedLocale, true, true)) { + for (const [key, available] of availableLocalesMap) { + if ( + languagesMatch(available, requested, true) && + scriptsMatch(available, requested, true) + ) { supportedLocales.add(key); availableLocalesMap.delete(key); - if (strategy === "lookup") { - return Array.from(supportedLocales); - } else if (strategy === "filtering") { - continue; - } else { - continue outer; + switch (strategy) { + case "lookup": + break outer; + case "filtering": + continue; + default: + continue outer; } } } @@ -209,3 +234,31 @@ export function filterMatches( return Array.from(supportedLocales); } + +function languagesMatch( + a: Intl.Locale, + b: Intl.Locale, + range = false +): boolean { + return ( + a.language === b.language || + a.language === "und" || + (range && b.language === "und") + ); +} + +function scriptsMatch(a: Intl.Locale, b: Intl.Locale, range = false): boolean { + return ( + a.script === b.script || + a.script === undefined || + (range && b.script === undefined) + ); +} + +function regionsMatch(a: Intl.Locale, b: Intl.Locale, range = false): boolean { + return ( + a.region === b.region || + a.region === undefined || + (range && b.region === undefined) + ); +} diff --git a/fluent-langneg/test/langneg_test.js b/fluent-langneg/test/langneg_test.js index a52e2281..37129fbb 100644 --- a/fluent-langneg/test/langneg_test.js +++ b/fluent-langneg/test/langneg_test.js @@ -1,6 +1,8 @@ import assert from "assert"; import { negotiateLanguages } from "../src/negotiate_languages.ts"; +const nodeVersion = parseInt(process.version.substring(1)); + const data = { filtering: { "exact match": [ @@ -42,11 +44,11 @@ const data = { ], [["fr"], ["fr-CA", "fr-FR"], ["fr-FR", "fr-CA"]], [["az-IR"], ["az-Latn", "az-Arab"], ["az-Arab"]], - [["sr-RU"], ["sr-Cyrl", "sr-Latn"], ["sr-Latn"]], + [["sr-RU"], ["sr-Cyrl", "sr-Latn"], ["sr-Cyrl"], undefined, 22], [["sr"], ["sr-Latn", "sr-Cyrl"], ["sr-Cyrl"]], [["zh-GB"], ["zh-Hans", "zh-Hant"], ["zh-Hant"]], [["sr", "ru"], ["sr-Latn", "ru"], ["ru"]], - [["sr-RU"], ["sr-Latn-RO", "sr-Cyrl"], ["sr-Latn-RO"]], + [["sr-RO"], ["sr-Cyrl", "sr-Latn"], ["sr-Latn"]], ], "should match cross-region": [ [["en"], ["en-US"], ["en-US"]], @@ -82,13 +84,13 @@ const data = { ], "should handle default locale properly": [ [["fr"], ["de", "it"], []], - [["fr"], ["de", "it"], "en-US", ["en-US"]], - [["fr"], ["de", "en-US"], "en-US", ["en-US"]], + [["fr"], ["de", "it"], ["en-US"], "en-US"], + [["fr"], ["de", "en-US"], ["en-US"], "en-US"], [ ["fr", "de-DE"], ["de-DE", "fr-CA"], - "en-US", ["fr-CA", "de-DE", "en-US"], + "en-US", ], ], "should handle all matches on the 1st higher than any on the 2nd": [ @@ -126,21 +128,14 @@ const data = { [ ["fr", "en"], ["en-US", "fr-FR", "en", "fr"], - undefined, - "matching", ["fr", "en"], ], + [["es-419"], ["es", "en"], ["es"]], ], }, lookup: { "should match only one": [ - [ - ["fr-FR", "en"], - ["en-US", "fr-FR", "en", "fr"], - "en-US", - "lookup", - ["fr-FR"], - ], + [["fr-FR", "en"], ["en-US", "fr-FR", "en", "fr"], ["fr-FR"], "en-US"], ], }, }; @@ -153,14 +148,17 @@ suite("Language Negotiation", () => { const group = data[strategy][groupName]; test(`${strategy} - ${groupName}`, () => { - for (const test of group) { - const requested = test[0]; - const available = test[1]; - const supported = test[test.length - 1]; - - const result = negotiateLanguages(test[0], test[1], { - defaultLocale: test.length > 3 ? test[2] : undefined, - strategy: test.length > 4 ? test[3] : undefined, + for (const [ + requested, + available, + supported, + defaultLocale, + minNodeVersion, + ] of group) { + if (nodeVersion < minNodeVersion) continue; + const result = negotiateLanguages(requested, available, { + defaultLocale, + strategy, }); assert.deepEqual( result, diff --git a/fluent-langneg/test/locale_test.js b/fluent-langneg/test/locale_test.js index 673f07be..facff5cb 100644 --- a/fluent-langneg/test/locale_test.js +++ b/fluent-langneg/test/locale_test.js @@ -1,9 +1,14 @@ import assert from "assert"; -import { Locale } from "../src/locale.ts"; +import { LocaleWrapper } from "../src/locale.ts"; function isLocaleEqual(str, ref) { - const locale = new Locale(str); - return locale.isEqual(ref); + const locale = new LocaleWrapper(str); + return ( + locale.language === ref.language && + locale.script === ref.script && + locale.region === ref.region && + locale.variants === ref.variants + ); } suite("Parses simple locales", () => { @@ -61,7 +66,7 @@ suite("Parses simple locales", () => { language: "en", script: "Latn", region: "US", - variant: "macos", + variants: "macos", }) ); @@ -70,7 +75,7 @@ suite("Parses simple locales", () => { language: "lij", script: "Arab", region: "FA", - variant: "linux", + variants: "linux", }) ); }); @@ -87,7 +92,7 @@ suite("Parses simple locales", () => { isLocaleEqual("lij-FA-linux", { language: "lij", region: "FA", - variant: "linux", + variants: "linux", }) ); }); @@ -104,7 +109,17 @@ suite("Parses simple locales", () => { isLocaleEqual("lij-FA-linux", { language: "lij", region: "FA", - variant: "linux", + variants: "linux", + }) + ); + }); + + test("skipping extensions", () => { + assert.ok( + isLocaleEqual("en-US-macos-linux-u-hc-h12", { + language: "en", + region: "US", + variants: "macos-linux", }) ); }); @@ -114,20 +129,20 @@ suite("Parses locale ranges", () => { test("language part", () => { assert.ok( isLocaleEqual("*", { - language: "*", + language: "und", }) ); assert.ok( isLocaleEqual("*-Latn", { - language: "*", + language: "und", script: "Latn", }) ); assert.ok( isLocaleEqual("*-US", { - language: "*", + language: "und", region: "US", }) ); @@ -137,14 +152,12 @@ suite("Parses locale ranges", () => { assert.ok( isLocaleEqual("en-*", { language: "en", - script: "*", }) ); assert.ok( isLocaleEqual("en-*-US", { language: "en", - script: "*", region: "US", }) ); @@ -155,7 +168,6 @@ suite("Parses locale ranges", () => { isLocaleEqual("en-Latn-*", { language: "en", script: "Latn", - region: "*", }) ); }); @@ -166,7 +178,6 @@ suite("Parses locale ranges", () => { language: "en", script: "Latn", region: "US", - variant: "*", }) ); });