sillsdev · StephenMcConnel · Sep 30, 2025 · Sep 26, 2025
diff --git a/components/language-chooser/common/find-language/languageTagUtils.spec.ts b/components/language-chooser/common/find-language/languageTagUtils.spec.ts
@@ -3,6 +3,7 @@ import {
   createTag,
   createTagFromOrthography,
   defaultRegionForLangTag,
+  formatDialectCode,
   getMaximalLangtag,
   getShortestSufficientLangtag,
   isManuallyEnteredTagLanguage,
@@ -371,6 +372,48 @@ describe("createTagFromOrthography", () => {
       })
     ).toEqual("en-x-foobar");
   });
+  it("should modify dialect name if necessary", () => {
+    // createTagFromOrthography passes the dialect through formatDialectCode
+    // before building the tag, so a free-form dialect name must come out as a
+    // valid private-use ("-x-") suffix.
+
+    // Builds a fresh English orthography (Latin script) for each case, so the
+    // two expectations below never share an object; only the user-entered
+    // dialect name differs between them.
+    const englishWithDialect = (dialect: string) => ({
+      language: {
+        languageSubtag: "en",
+        exonym: "English",
+        scripts: [],
+        iso639_3_code: "eng",
+        regionNamesForDisplay: "",
+        regionNamesForSearch: [],
+        names: [],
+        alternativeTags: [],
+        languageType: LanguageType.Living,
+        isMacrolanguage: false,
+      } as ILanguage,
+      script: { code: "Latn", name: "Latin" } as IScript,
+      customDetails: {
+        dialect,
+      } as ICustomizableLanguageDetails,
+    });
+
+    // Characters that are not ASCII letters or digits (here the space and
+    // the "!") are removed, and the one remaining section is cut to 8
+    // characters: "SpecialEnglish" becomes "SpecialE".
+    const punctuatedTag = createTagFromOrthography(
+      englishWithDialect("Special English!")
+    );
+    expect(punctuatedTag).toEqual("en-x-SpecialE");
+
+    // Hyphens are kept as section separators and every section is truncated
+    // on its own: "ai" is short enough, "newFancySmartAi" becomes "newFancy".
+    const hyphenatedTag = createTagFromOrthography(
+      englishWithDialect("ai-newFancySmartAi")
+    );
+    expect(hyphenatedTag).toEqual("en-x-ai-newFancy");
+  });
 });
 
 describe("isValidBcp47Tag checking is sane", () => {
@@ -381,6 +424,7 @@ describe("isValidBcp47Tag checking is sane", () => {
     expect(isValidBcp47Tag("en-Latn-US-x-foobar")).toBeTruthy();
     expect(isValidBcp47Tag("en-x-foobar")).toBeTruthy();
     expect(isValidBcp47Tag("en-US")).toBeTruthy();
+    expect(isValidBcp47Tag("en-x-ai-google")).toBeTruthy();
   });
 
   it("should return true for macrolang-indiv lang formatted tags, including sign language tags", () => {
@@ -426,3 +470,32 @@ describe("sanity checks for isUnlistedLanguage and isManuallyEnteredTagLanguage"
     ).toEqual(true);
   });
 });
+
+// Covers formatDialectCode: illegal characters removed, sections capped at 8.
+describe("formatting dialect codes", () => {
+  it("should return empty string for undefined or empty input", () => {
+    expect(formatDialectCode(undefined)).toEqual("");
+    expect(formatDialectCode("")).toEqual("");
+  });
+  it("should trim whitespace", () => {
+    expect(formatDialectCode(" foo  bar ")).toEqual("foobar");
+  });
+  it("should remove illegal characters", () => {
+    expect(formatDialectCode("foo!@#$%^&*()bar")).toEqual("foobar");
+  });
+  it("should retain dashes", () => {
+    expect(formatDialectCode("ai-google")).toEqual("ai-google");
+  });
+  it("should truncate sections to 8 characters", () => {
+    const overlong = "123456789-123456789-123456789-123456789";
+    const truncated = "12345678-12345678-12345678-12345678";
+    expect(formatDialectCode(overlong)).toEqual(truncated);
+    expect(formatDialectCode("ai-newFancySmartAi")).toEqual("ai-newFancy");
+  });
+  // Whitespace, punctuation and over-long sections all in a single input.
+  it("various combinations", () => {
+    const messy = "  1234 5678 9-1234 5678 9-!@#$%^&*()1234 5678 9-foobar";
+    const cleaned = formatDialectCode(messy);
+    expect(cleaned).toEqual("12345678-12345678-12345678-foobar");
+  });
+});
diff --git a/components/language-chooser/common/find-language/languageTagUtils.ts b/components/language-chooser/common/find-language/languageTagUtils.ts
@@ -70,8 +70,7 @@ export function getMaximalLangtag(langtag: string): string | undefined {
   return lookup.get(langtag.toLowerCase());
 }
 
-// This is pretty naive. If you are using the language-chooser-react-hook and there may be a manually entered language
-// tag or bracket demarcation, use createTagFromOrthography in language-chooser-react-hook instead
+// This is pretty naive. Exported for unit testing, but most situations should use createTagFromOrthography instead
 export function createTag({
   languageCode,
   scriptCode,
@@ -101,10 +100,9 @@ export function createTag({
   if (!languageCode || dialectCode) {
     tag += "-x-";
   }
-  // Subtags have a maximum length of 8 characters, so if dialectCode is longer than that, truncate it
-  // when appending to the tag.  See BL-14806.
+  // dialectCode is appended as-is (no truncation or filtering here); format it first, e.g. with formatDialectCode
   if (dialectCode) {
-    tag += `${dialectCode.length <= 8 ? dialectCode : dialectCode.slice(0, 8)}`;
+    tag += `${dialectCode}`;
   }
   return getShortestSufficientLangtag(tag) || tag;
 }
@@ -240,6 +238,22 @@ export function defaultRegionForLangTag(
   }
 }
 
+/// Returns a code for use in the Private Use section of a BCP 47 tag:
+/// sections of 1 to 8 ASCII letters or digits, separated by single hyphens.
+/// Removes every other character, truncates each section to 8 characters, and
+/// drops sections left empty (e.g. by doubled, leading or trailing hyphens).
+/// Returns "" for undefined or empty input, or when nothing legal remains.
+/// see https://www.rfc-editor.org/rfc/bcp/bcp47.txt
+export function formatDialectCode(dialect?: string): string {
+  if (!dialect) return "";
+  return dialect
+    .split("-")
+    .map((section) => section.replace(/[^a-zA-Z0-9]/g, "").slice(0, 8))
+    // An empty section would produce an invalid tag such as "en-x-foo--bar".
+    .filter((section) => section.length > 0)
+    .join("-");
+}
+
 export function createTagFromOrthography(orthography: IOrthography): string {
   const strippedOrthography = deepStripDemarcation(orthography);
   if (isManuallyEnteredTagLanguage(strippedOrthography.language)) {
@@ -259,7 +273,7 @@ export function createTagFromOrthography(orthography: IOrthography): string {
     languageCode: strippedOrthography.language?.languageSubtag,
     scriptCode,
     regionCode: strippedOrthography.customDetails?.region?.code,
-    dialectCode: strippedOrthography.customDetails?.dialect,
+    dialectCode: formatDialectCode(strippedOrthography.customDetails?.dialect),
   });
 }