Skip to content

Commit cff8fd9

Browse files
committed
fix: BL-15204 restrict dialect code format
1 parent c87c163 commit cff8fd9

File tree

2 files changed

+93
-6
lines changed

2 files changed

+93
-6
lines changed

components/language-chooser/common/find-language/languageTagUtils.spec.ts

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import {
33
createTag,
44
createTagFromOrthography,
55
defaultRegionForLangTag,
6+
formatDialectCode,
67
getMaximalLangtag,
78
getShortestSufficientLangtag,
89
isManuallyEnteredTagLanguage,
@@ -371,6 +372,48 @@ describe("createTagFromOrthography", () => {
371372
})
372373
).toEqual("en-x-foobar");
373374
});
375+
it("should modify dialog name if necessary", () => {
376+
expect(
377+
createTagFromOrthography({
378+
language: {
379+
languageSubtag: "en",
380+
exonym: "English",
381+
scripts: [],
382+
iso639_3_code: "eng",
383+
regionNamesForDisplay: "",
384+
regionNamesForSearch: [],
385+
names: [],
386+
alternativeTags: [],
387+
languageType: LanguageType.Living,
388+
isMacrolanguage: false,
389+
} as ILanguage,
390+
script: { code: "Latn", name: "Latin" } as IScript,
391+
customDetails: {
392+
dialect: "Special English!",
393+
} as ICustomizableLanguageDetails,
394+
})
395+
).toEqual("en-x-SpecialE");
396+
expect(
397+
createTagFromOrthography({
398+
language: {
399+
languageSubtag: "en",
400+
exonym: "English",
401+
scripts: [],
402+
iso639_3_code: "eng",
403+
regionNamesForDisplay: "",
404+
regionNamesForSearch: [],
405+
names: [],
406+
alternativeTags: [],
407+
languageType: LanguageType.Living,
408+
isMacrolanguage: false,
409+
} as ILanguage,
410+
script: { code: "Latn", name: "Latin" } as IScript,
411+
customDetails: {
412+
dialect: "ai-newFancySmartAi",
413+
} as ICustomizableLanguageDetails,
414+
})
415+
).toEqual("en-x-ai-newFancy");
416+
});
374417
});
375418

376419
describe("isValidBcp47Tag checking is sane", () => {
@@ -381,6 +424,7 @@ describe("isValidBcp47Tag checking is sane", () => {
381424
expect(isValidBcp47Tag("en-Latn-US-x-foobar")).toBeTruthy();
382425
expect(isValidBcp47Tag("en-x-foobar")).toBeTruthy();
383426
expect(isValidBcp47Tag("en-US")).toBeTruthy();
427+
expect(isValidBcp47Tag("en-x-ai-google")).toBeTruthy();
384428
});
385429

386430
it("should return true for macrolang-indiv lang formatted tags, including sign language tags", () => {
@@ -426,3 +470,32 @@ describe("sanity checks for isUnlistedLanguage and isManuallyEnteredTagLanguage"
426470
).toEqual(true);
427471
});
428472
});
473+
474+
describe("formatting dialect codes", () => {
475+
it("should return empty string for undefined or empty input", () => {
476+
expect(formatDialectCode("")).toEqual("");
477+
expect(formatDialectCode(undefined)).toEqual("");
478+
});
479+
it("should trim whitespace", () => {
480+
expect(formatDialectCode(" foo bar ")).toEqual("foobar");
481+
});
482+
it("should remove illegal characters", () => {
483+
expect(formatDialectCode("foo!@#$%^&*()bar")).toEqual("foobar");
484+
});
485+
it("should retain dashes", () => {
486+
expect(formatDialectCode("ai-google")).toEqual("ai-google");
487+
});
488+
it("should truncate sections to 8 characters", () => {
489+
expect(
490+
formatDialectCode("123456789-123456789-123456789-123456789")
491+
).toEqual("12345678-12345678-12345678-12345678");
492+
expect(formatDialectCode("ai-newFancySmartAi")).toEqual("ai-newFancy");
493+
});
494+
it("various combinations", () => {
495+
expect(
496+
formatDialectCode(
497+
" 1234 5678 9-1234 5678 9-!@#$%^&*()1234 5678 9-foobar"
498+
)
499+
).toEqual("12345678-12345678-12345678-foobar");
500+
});
501+
});

components/language-chooser/common/find-language/languageTagUtils.ts

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,7 @@ export function getMaximalLangtag(langtag: string): string | undefined {
7070
return lookup.get(langtag.toLowerCase());
7171
}
7272

73-
// This is pretty naive. If you are using the language-chooser-react-hook and there may be a manually entered language
74-
// tag or bracket demarcation, use createTagFromOrthography in language-chooser-react-hook instead
73+
// This is pretty naive. Exported for unit testing, but most situations should use createTagFromOrthography instead
7574
export function createTag({
7675
languageCode,
7776
scriptCode,
@@ -101,10 +100,9 @@ export function createTag({
101100
if (!languageCode || dialectCode) {
102101
tag += "-x-";
103102
}
104-
// Subtags have a maximum length of 8 characters, so if dialectCode is longer than that, truncate it
105-
// when appending to the tag. See BL-14806.
103+
// Dialect code should have already been formatted, i.e. by formatDialectCode
106104
if (dialectCode) {
107-
tag += `${dialectCode.length <= 8 ? dialectCode : dialectCode.slice(0, 8)}`;
105+
tag += `${dialectCode}`;
108106
}
109107
return getShortestSufficientLangtag(tag) || tag;
110108
}
@@ -240,6 +238,22 @@ export function defaultRegionForLangTag(
240238
}
241239
}
242240

241+
/// Returns a code for use in the Private Use section of a BCP 47 tag,
/// made up of sections of 1 to 8 alphanumeric characters, separated by single hyphens.
/// - Removes non-alphanumeric characters (other than hyphens), including whitespace.
/// - Truncates each hyphen-separated section to 8 characters.
/// - Drops sections that are empty after cleaning, so the result never has leading,
///   trailing, or doubled hyphens (e.g. "a-!!!-b" becomes "a-b", not "a--b"). BCP 47
///   requires every subtag to have at least one character, and an empty section would
///   otherwise yield a malformed tag such as "en-x-a--b".
/// Returns "" for undefined/empty input or when no alphanumeric characters remain.
/// Enhance: we could further enforce BCP-47 rules, e.g. the overall length of the tag
/// see https://www.rfc-editor.org/rfc/bcp/bcp47.txt
export function formatDialectCode(dialect?: string): string {
  if (!dialect) return "";
  return (
    dialect
      .split("-")
      // Strip illegal characters first, then truncate, so that removed characters
      // do not count toward the 8-character limit.
      .map((section) => section.replace(/[^a-zA-Z0-9]/g, "").slice(0, 8))
      // An empty section would produce consecutive or dangling hyphens in the tag.
      .filter((section) => section.length > 0)
      .join("-")
  );
}
256+
243257
export function createTagFromOrthography(orthography: IOrthography): string {
244258
const strippedOrthography = deepStripDemarcation(orthography);
245259
if (isManuallyEnteredTagLanguage(strippedOrthography.language)) {
@@ -259,7 +273,7 @@ export function createTagFromOrthography(orthography: IOrthography): string {
259273
languageCode: strippedOrthography.language?.languageSubtag,
260274
scriptCode,
261275
regionCode: strippedOrthography.customDetails?.region?.code,
262-
dialectCode: strippedOrthography.customDetails?.dialect,
276+
dialectCode: formatDialectCode(strippedOrthography.customDetails?.dialect),
263277
});
264278
}
265279

0 commit comments

Comments
 (0)