Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
createTag,
createTagFromOrthography,
defaultRegionForLangTag,
formatDialectCode,
getMaximalLangtag,
getShortestSufficientLangtag,
isManuallyEnteredTagLanguage,
Expand Down Expand Up @@ -371,6 +372,48 @@ describe("createTagFromOrthography", () => {
})
).toEqual("en-x-foobar");
});
it("should modify dialect name if necessary", () => {
  // Builds a fresh English/Latin orthography whose only custom detail is the
  // given free-form dialect name. A new object is created on every call so
  // the two assertions below cannot influence each other.
  const makeOrthography = (dialect: string) => ({
    language: {
      languageSubtag: "en",
      exonym: "English",
      scripts: [],
      iso639_3_code: "eng",
      regionNamesForDisplay: "",
      regionNamesForSearch: [],
      names: [],
      alternativeTags: [],
      languageType: LanguageType.Living,
      isMacrolanguage: false,
    } as ILanguage,
    script: { code: "Latn", name: "Latin" } as IScript,
    customDetails: {
      dialect,
    } as ICustomizableLanguageDetails,
  });

  // The space and "!" are not alphanumeric, and the remainder is longer than
  // 8 characters, so only "SpecialE" is expected in the private-use section.
  expect(createTagFromOrthography(makeOrthography("Special English!"))).toEqual(
    "en-x-SpecialE"
  );

  // Hyphens are kept as section separators; the over-long second section is
  // expected to be cut down to its first 8 characters.
  expect(
    createTagFromOrthography(makeOrthography("ai-newFancySmartAi"))
  ).toEqual("en-x-ai-newFancy");
});
});

describe("isValidBcp47Tag checking is sane", () => {
Expand All @@ -381,6 +424,7 @@ describe("isValidBcp47Tag checking is sane", () => {
expect(isValidBcp47Tag("en-Latn-US-x-foobar")).toBeTruthy();
expect(isValidBcp47Tag("en-x-foobar")).toBeTruthy();
expect(isValidBcp47Tag("en-US")).toBeTruthy();
expect(isValidBcp47Tag("en-x-ai-google")).toBeTruthy();
});

it("should return true for macrolang-indiv lang formatted tags, including sign language tags", () => {
Expand Down Expand Up @@ -426,3 +470,32 @@ describe("sanity checks for isUnlistedLanguage and isManuallyEnteredTagLanguage"
).toEqual(true);
});
});

// Unit tests for formatDialectCode: each case feeds a raw dialect string and
// checks the exact private-use code that comes back.
describe("formatting dialect codes", () => {
  it("should return empty string for undefined or empty input", () => {
    const fromEmpty = formatDialectCode("");
    const fromUndefined = formatDialectCode(undefined);
    expect(fromEmpty).toEqual("");
    expect(fromUndefined).toEqual("");
  });

  it("should trim whitespace", () => {
    // Leading, trailing and interior spaces are all expected to disappear.
    const formatted = formatDialectCode(" foo bar ");
    expect(formatted).toEqual("foobar");
  });

  it("should remove illegal characters", () => {
    const formatted = formatDialectCode("foo!@#$%^&*()bar");
    expect(formatted).toEqual("foobar");
  });

  it("should retain dashes", () => {
    const formatted = formatDialectCode("ai-google");
    expect(formatted).toEqual("ai-google");
  });

  it("should truncate sections to 8 characters", () => {
    // Every hyphen-separated section is shortened on its own.
    const fourLongSections = formatDialectCode(
      "123456789-123456789-123456789-123456789"
    );
    expect(fourLongSections).toEqual("12345678-12345678-12345678-12345678");

    // A short section is left alone while its long neighbour is cut.
    const mixedLengths = formatDialectCode("ai-newFancySmartAi");
    expect(mixedLengths).toEqual("ai-newFancy");
  });

  it("various combinations", () => {
    // Whitespace, punctuation and over-long sections all in one input.
    const messyInput =
      " 1234 5678 9-1234 5678 9-!@#$%^&*()1234 5678 9-foobar";
    const formatted = formatDialectCode(messyInput);
    expect(formatted).toEqual("12345678-12345678-12345678-foobar");
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ export function getMaximalLangtag(langtag: string): string | undefined {
return lookup.get(langtag.toLowerCase());
}

// This is pretty naive. If you are using the language-chooser-react-hook and there may be a manually entered language
// tag or bracket demarcation, use createTagFromOrthography in language-chooser-react-hook instead
// This is pretty naive. Exported for unit testing, but most situations should use createTagFromOrthography instead
export function createTag({
languageCode,
scriptCode,
Expand Down Expand Up @@ -101,10 +100,9 @@ export function createTag({
if (!languageCode || dialectCode) {
tag += "-x-";
}
// Subtags have a maximum length of 8 characters, so if dialectCode is longer than that, truncate it
// when appending to the tag. See BL-14806.
// Dialect code should have already been formatted, i.e. by formatDialectCode
if (dialectCode) {
tag += `${dialectCode.length <= 8 ? dialectCode : dialectCode.slice(0, 8)}`;
tag += `${dialectCode}`;
}
return getShortestSufficientLangtag(tag) || tag;
}
Expand Down Expand Up @@ -240,6 +238,22 @@ export function defaultRegionForLangTag(
}
}

/// Returns a code for use in the Private Use section of a BCP 47 tag,
/// made up of strings of 1 to 8 alphanumeric characters, separated by hyphens.
/// Removes non-alphanumeric characters (other than hyphens), truncates each section to 8 characters,
/// and drops any section that ends up empty, so the result never starts or ends with a hyphen
/// and never contains two hyphens in a row (BCP 47 subtags must have at least one character).
/// Returns an empty string if the input is undefined, empty, or has no alphanumeric characters.
/// see https://www.rfc-editor.org/rfc/bcp/bcp47.txt
export function formatDialectCode(dialect?: string): string {
  if (!dialect) return "";
  return (
    dialect
      .split("-")
      .map((section) => section.replace(/[^a-zA-Z0-9]/g, "").slice(0, 8))
      // A section can be empty because of doubled/leading/trailing hyphens in the input
      // or because it held only illegal characters; keeping it would produce an invalid tag.
      .filter((section) => section.length > 0)
      .join("-")
  );
}

export function createTagFromOrthography(orthography: IOrthography): string {
const strippedOrthography = deepStripDemarcation(orthography);
if (isManuallyEnteredTagLanguage(strippedOrthography.language)) {
Expand All @@ -259,7 +273,7 @@ export function createTagFromOrthography(orthography: IOrthography): string {
languageCode: strippedOrthography.language?.languageSubtag,
scriptCode,
regionCode: strippedOrthography.customDetails?.region?.code,
dialectCode: strippedOrthography.customDetails?.dialect,
dialectCode: formatDialectCode(strippedOrthography.customDetails?.dialect),
});
}

Expand Down