Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"lint": "eslint src --ext ts,tsx",
"preview": "vite preview",
"predeploy": "npm run build",
"deploy": "gh-pages -d dist"
"deploy": "gh-pages -d dist",
"build:cldr-locales": "ts-node scripts/ingest/build-cldr-locales.ts"
},
"dependencies": {
"cldr-core": "^47.0.0",
Expand All @@ -20,6 +21,7 @@
},
"devDependencies": {
"@eslint/js": "^9.25.0",
"@types/node": "^24.3.1",
"@types/react": "^19.0.10",
"@types/react-dom": "^19.0.4",
"@typescript-eslint/eslint-plugin": "^8.30.1",
Expand Down
5 changes: 5 additions & 0 deletions public/data/unicode/cldrLocales.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"release": "0.0.0",
"generatedAt": "",
"locales": []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This output file is empty -- is there supposed to be data in it?

}
197 changes: 197 additions & 0 deletions scripts/ingest/build-cldr-locales.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
/**
* Build script to generate a list of CLDR locale support objects.
*
* This script fetches data from the Unicode CLDR project and combines
* multiple data sources to produce a JSON file consumed by the UI. The
* output file is stored in `public/data/unicode/cldrLocales.json` and
* includes one entry per locale with a variety of support metrics. These
* metrics include whether the locale has an XML file in the CLDR
* repository, the tier of support (core, modern, full), coverage levels
* and percentages, ICU inclusion, default-content flags and more.
*
* This script is not executed at runtime by the browser. Instead it is
* intended to be run manually or as part of a build pipeline. Because
* network access may not be available when run in some environments, the
* script is defensive: network calls are isolated and the data sources
* are configurable via constants at the top of the file.
*/

import fs from 'node:fs/promises';
import path from 'node:path';

// CLDR release version to pull. When bumping this version, update the
// cldr-core package version as well; every URL below is derived from it.
const CLDR_RELEASE = '47.0.0';

// Base URLs (all pinned to the release above).

// Published cldr-core package contents for exactly this release.
const CLDR_CORE_BASE = `https://cdn.jsdelivr.net/npm/cldr-core@${CLDR_RELEASE}`;

// Chart TSVs live in a per-release directory; an "X.0.0" release uses "X".
const CHARTS_TSV_BASE = `https://raw.githubusercontent.com/unicode-org/cldr-staging/main/docs/charts/${CLDR_RELEASE.replace(
  /\.0\.0$/,
  '',
)}/tsv`;

// Locale XML sources. This previously pointed at the moving `main` branch,
// which can list locales that do not exist in the pinned release. Use the
// release tag instead: trailing ".0" segments are dropped and remaining dots
// become dashes (47.0.0 -> release-47, 46.1.0 -> release-46-1).
const CLDR_REPO_RAW_BASE = `https://raw.githubusercontent.com/unicode-org/cldr/release-${CLDR_RELEASE.replace(
  /(?:\.0)+$/,
  '',
).replace(/\./g, '-')}/common/main`;
Comment on lines +23 to +28
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not certain I want to ingest the data -- what were the tradeoffs that you considered when you decided to use these URLs?

There is the CLDR Json repository that we already have access to data in. You can see how we use it in files like UnicodeData.tsx where we access the data with imports like import territoryInfo from 'cldr-core/supplemental/territoryInfo.json';


// Where the generated index is written, relative to the directory the script
// is launched from (expected to be the repository root).
const OUTPUT_FILE = path.join(process.cwd(), 'public/data/unicode', 'cldrLocales.json');

/**
 * Simple TSV parser (no quoted fields needed for CLDR TSVs).
 *
 * The first non-blank line is the header; every following non-blank line
 * becomes one object keyed by the header cells. Cells missing at the end of a
 * short row are filled with the empty string.
 *
 * @param tsv Raw tab-separated text; LF and CRLF line endings are accepted.
 * @returns One record per data row, or an empty array when there is no
 *   header or no data rows.
 */
function parseTsv(tsv: string): Record<string, string>[] {
  // Strip only a byte-order mark. Trimming the whole text would also eat a
  // leading tab (an empty first header cell) and shift every column.
  const lines = tsv
    .replace(/^\uFEFF/, '')
    .split(/\r?\n/)
    // Blank lines would otherwise turn into rows whose cells are all ''.
    .filter((line) => line.trim() !== '');

  const [headerLine, ...dataLines] = lines;
  if (headerLine === undefined) return [];

  const header = headerLine.split('\t');
  return dataLines.map((line) => {
    const values = line.split('\t');
    const row: Record<string, string> = {};
    header.forEach((key, idx) => {
      row[key] = values[idx] ?? '';
    });
    return row;
  });
}

/**
 * Check if `common/main/<locale>.xml` exists in the CLDR repo.
 *
 * CLDR's XML sources are named with underscore-separated subtags
 * (`en_US.xml`), so hyphen-separated ids such as `en-US` are converted before
 * the request is made; ids that already use underscores are unaffected.
 *
 * @param locale Locale id with either `_` or `-` between subtags.
 * @returns `true` when the HEAD request succeeds with a 2xx status; `false`
 *   for any other status and for network failures (best effort).
 */
async function xmlExists(locale: string): Promise<boolean> {
  const fileStem = locale.replace(/-/g, '_');
  const url = `${CLDR_REPO_RAW_BASE}/${encodeURIComponent(fileStem)}.xml`;
  try {
    // HEAD is enough: only the status is needed, not the file contents.
    const response = await fetch(url, { method: 'HEAD' });
    return response.ok;
  } catch {
    // Treat network failures as "unknown/false"
    return false;
  }
}

/**
 * One row of the charts' `locale-coverage.tsv`, keyed by the column headers
 * this script expects. Every cell is kept as the raw string from the file.
 */
interface RawCoverageRow {
  /** Row identifier; used as the lookup key for a locale. */
  'Language/Locale': string;

  /** Copied to `targetLevel` in the output. */
  'Target Level': string;
  /** Present in the expected header; not read by this script. */
  '≟': string;
  /** Copied to `computedLevel` in the output. */
  'Computed Level': string;

  /** Treated as "included in ICU" when the cell contains "icu" (any case). */
  'ICU': string;

  /** Parsed as a number for `confirmedPct` and `pctModern`. */
  '%': string;
  /** Parsed as a number for `pctModerate`. */
  'ⓜ%': string;
  /** Parsed as a number for `pctBasic`. */
  'ⓑ%': string;
  /** Parsed as a number for `pctCore`. */
  'ⓒ%': string;

  /** Comma-separated tokens; split into the output's `notes` array. */
  'Missing Features': string;
  /** Copied to `defaultRegion` in the output. */
  'Default Region': string;
}

/**
 * One row of the charts' `locale-missing-counts.tsv`. The three count columns
 * are raw strings here and are parsed to integers when the output is built.
 */
interface RawMissingCountsRow {
  /** Row identifier; used as the lookup key for a locale. */
  'Language/Locale': string;
  /** Source of `missingCounts.found`. */
  'Found': string;
  /** Source of `missingCounts.unconfirmed`. */
  'Unconfirmed': string;
  /** Source of `missingCounts.missing`. */
  'Missing': string;
}

/**
 * Locale lists this script expects to read from cldr-core's
 * `availableLocales.json`. The union of the three tier lists defines which
 * locales are emitted; a locale's tier is the first list that contains it,
 * checked in the order core, modern, full.
 */
interface AvailableLocales {
  /** Locales reported with tier "core". */
  core: string[];
  /** Locales reported with tier "modern" (unless also listed in `core`). */
  modern: string[];
  /** Remaining locales, reported with tier "full". */
  full: string[];
  /** Locales flagged as `localeIsDefaultForLanguage` in the output. */
  defaultContent: string[];
}

/**
 * Main routine: fetch the CLDR inputs, merge them into one record per locale
 * and write the resulting index to `OUTPUT_FILE`.
 *
 * Inputs, fetched concurrently:
 *  - `availableLocales.json` (cldr-core): defines the set of locales and the
 *    tier of each one.
 *  - `locale-coverage.tsv` and `locale-missing-counts.tsv` (CLDR charts):
 *    coverage columns and found/unconfirmed/missing counts per locale.
 *
 * After the records are assembled, one existence check per locale is issued
 * against the CLDR XML sources (all checks run concurrently).
 *
 * @throws Error when an input responds with a non-2xx status or when no
 *   locale can be read from `availableLocales.json`. Nothing is written in
 *   either case, so an existing output file is never replaced by an empty one.
 */
async function buildCldrLocales(): Promise<void> {
  // --- Fetch helpers -------------------------------------------------------
  // Reject on HTTP errors so that an error page is never parsed as data.
  const fetchChecked = async (url: string) => {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
    }
    return response;
  };
  const fetchJson = async (url: string): Promise<unknown> => (await fetchChecked(url)).json();
  const fetchText = async (url: string): Promise<string> => (await fetchChecked(url)).text();

  const availableUrl = `${CLDR_CORE_BASE}/availableLocales.json`;
  const [availablePayload, coverageTsv, missingTsv] = await Promise.all([
    fetchJson(availableUrl),
    fetchText(`${CHARTS_TSV_BASE}/locale-coverage.tsv`),
    fetchText(`${CHARTS_TSV_BASE}/locale-missing-counts.tsv`),
  ]);

  // --- Normalise availableLocales.json -------------------------------------
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);
  const toLocaleList = (value: unknown): string[] =>
    Array.isArray(value) ? value.filter((item): item is string => typeof item === 'string') : [];

  // Accept the tier lists either at the top level or nested under an
  // `availableLocales` key, and treat an absent list as empty instead of
  // failing with a TypeError when it is spread or searched.
  const payloadRoot: Record<string, unknown> = isRecord(availablePayload) ? availablePayload : {};
  const tierSource: Record<string, unknown> = isRecord(payloadRoot.availableLocales)
    ? payloadRoot.availableLocales
    : payloadRoot;
  // NOTE(review): if cldr-core publishes default-content locales in a separate
  // file, `defaultContent` will be empty here and every locale is flagged
  // false; confirm against the package contents.
  const available: AvailableLocales = {
    core: toLocaleList(tierSource.core),
    modern: toLocaleList(tierSource.modern),
    full: toLocaleList(tierSource.full),
    defaultContent: toLocaleList(tierSource.defaultContent ?? payloadRoot.defaultContent),
  };

  const localeList = new Set<string>([...available.core, ...available.modern, ...available.full]);
  if (localeList.size === 0) {
    throw new Error(`No locales found in ${availableUrl}; refusing to write an empty index`);
  }

  // Sets give constant-time membership tests inside the per-locale loop.
  const coreLocales = new Set(available.core);
  const modernLocales = new Set(available.modern);
  const defaultContentLocales = new Set(available.defaultContent);

  // --- Index the chart rows ------------------------------------------------
  // NOTE(review): only ids containing "_" are indexed (unchanged from the
  // original), so language-only rows such as "en" are dropped; confirm that
  // this is intended.
  const indexById = <T extends { 'Language/Locale': string }>(rows: T[]): Map<string, T> => {
    const index = new Map<string, T>();
    for (const row of rows) {
      const id = row['Language/Locale'];
      if (id && id.includes('_')) index.set(id, row);
    }
    return index;
  };
  // Chart ids use "_"; also try the underscore form of a hyphenated locale id.
  const findRow = <T>(index: Map<string, T>, loc: string): T | undefined =>
    index.get(loc) ?? index.get(loc.replace(/-/g, '_'));

  const coverageMap = indexById(parseTsv(coverageTsv) as unknown as RawCoverageRow[]);
  const missingMap = indexById(parseTsv(missingTsv) as unknown as RawMissingCountsRow[]);

  // --- Cell parsers --------------------------------------------------------
  const pct = (value: string): number | undefined => {
    if (!value || value.trim() === '' || value.trim() === '—') return undefined;
    const num = parseFloat(value);
    return Number.isFinite(num) ? num : undefined;
  };
  // Tolerates thousands separators; an unparsable cell counts as 0 rather
  // than NaN (which JSON would serialise as null).
  const count = (value: string | undefined): number => {
    const parsed = Number.parseInt((value ?? '').replace(/,/g, ''), 10);
    return Number.isFinite(parsed) ? parsed : 0;
  };

  // Subtags are recognised by shape, not position: in "zh_Hans_CN" the script
  // precedes the region, and "sr_Cyrl" has a script but no region.
  const scriptSubtag = /^[A-Za-z]{4}$/;
  const regionSubtag = /^(?:[A-Za-z]{2}|\d{3})$/;

  // --- Assemble one record per locale --------------------------------------
  // Each callback runs synchronously up to its `await`, so all XML existence
  // checks are started before any of them is awaited.
  const output = await Promise.all(
    Array.from(localeList, async (loc) => {
      const [language, ...subtags] = loc.split(/[_-]/);
      const coverage = findRow(coverageMap, loc);
      const missing = findRow(missingMap, loc);
      const tier: 'core' | 'modern' | 'full' = coreLocales.has(loc)
        ? 'core'
        : modernLocales.has(loc)
          ? 'modern'
          : 'full';
      const missingFeatures = coverage?.['Missing Features'];

      return {
        locale: loc,
        language,
        region: subtags.find((subtag) => regionSubtag.test(subtag)),
        script: subtags.find((subtag) => scriptSubtag.test(subtag)),
        tier,
        // use new property name expected by UI
        localeIsDefaultForLanguage: defaultContentLocales.has(loc),
        // Coverage fields
        targetLevel: coverage?.['Target Level'] || undefined,
        computedLevel: coverage?.['Computed Level'] || undefined,
        // NOTE(review): confirmedPct and pctModern both read the "%" column
        // (unchanged from the original); confirm that this is intended.
        confirmedPct: pct(coverage?.['%'] || ''),
        pctModern: pct(coverage?.['%'] || ''),
        pctModerate: pct(coverage?.['ⓜ%'] || ''),
        pctBasic: pct(coverage?.['ⓑ%'] || ''),
        pctCore: pct(coverage?.['ⓒ%'] || ''),
        icuIncluded: coverage?.ICU?.toLowerCase().includes('icu') ?? false,
        defaultRegion: coverage?.['Default Region'] || undefined,
        notes: missingFeatures ? missingFeatures.split(/,\s*/) : [],
        missingCounts: missing
          ? {
              found: count(missing.Found),
              unconfirmed: count(missing.Unconfirmed),
              missing: count(missing.Missing),
            }
          : undefined,
        presentInCLDRDatabase: await xmlExists(loc),
      };
    }),
  );

  // --- Write the index -----------------------------------------------------
  const finalOutput = {
    release: CLDR_RELEASE,
    generatedAt: new Date().toISOString(),
    locales: output,
  };

  await fs.mkdir(path.dirname(OUTPUT_FILE), { recursive: true });
  await fs.writeFile(OUTPUT_FILE, JSON.stringify(finalOutput, null, 2));
  console.log(`Wrote ${output.length} locale records to ${OUTPUT_FILE}`);
}

// Script entry point: run the build; on any failure print the error and exit
// with a non-zero status so calling pipelines notice.
buildCldrLocales().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});
18 changes: 18 additions & 0 deletions src/data/cldrLocales.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import cldrLocalesData from '../../public/data/unicode/cldrLocales.json';
import type { CLDRLocaleIndex, CLDRLocaleSupport } from '../types/CLDRLocaleTypes';

// The generated JSON is loaded through the static import above (not
// `require`). Its inferred literal type does not line up with the declared
// index type, so it is re-typed once here for the accessors below.
const rawData = cldrLocalesData as unknown as CLDRLocaleIndex;

/**
 * List every CLDR locale support entry from the generated index.
 *
 * @returns The index's own `locales` array (not a copy).
 */
export function getAllCldrLocales(): CLDRLocaleSupport[] {
  const { locales } = rawData;
  return locales;
}

/**
 * Retrieve the CLDR support entry for a given locale code.
 *
 * Matching ignores letter case and treats "-" and "_" as the same subtag
 * separator, so "en-US", "en_US" and "EN_us" all resolve to the same entry.
 *
 * @param localeId Locale code to look up.
 * @returns The first matching entry, or `undefined` when none matches.
 */
export function getCldrLocale(localeId: string): CLDRLocaleSupport | undefined {
  const canonical = (id: string): string => id.replace(/-/g, '_').toLowerCase();
  const wanted = canonical(localeId);
  return rawData.locales.find((entry) => canonical(entry.locale) === wanted);
}
63 changes: 63 additions & 0 deletions src/types/CLDRLocaleTypes.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/**
* Types for CLDR locale support. These types mirror the structure of the
* objects generated by the build script in scripts/ingest/build-cldr-locales.ts.
*/

/**
 * Support information for a single CLDR locale: one element of the `locales`
 * array produced by the build script. Apart from `locale`, `language` and
 * `tier`, every field is optional.
 */
export interface CLDRLocaleSupport {
/**
* Locale identifier as emitted by the build script, e.g. "en_US". Subtags
* may be separated by "_" or "-"; note that strict BCP-47 tags use "-".
*/
locale: string;
/** Language code (the first subtag of `locale`), e.g. "en" */
language: string;
/** Region code, e.g. "US" (optional) */
region?: string;
/** Script code, e.g. "Latn" (optional) */
script?: string;

/**
* Tier of locale support: core, modern or full. The build script derives it
* from which locale list (core, modern, full — checked in that order) of the
* availableLocales data contains the locale. These are list names, not the
* coverage levels reported in `targetLevel` / `computedLevel`.
*/
tier: 'core' | 'modern' | 'full';
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look like the CLDR levels that I am used to. I'm used to seeing "core", "basic", "moderate", "modern". Where do these tiers come from?


/**
* True if this locale is listed in defaultContent.json. Indicates that the
* locale is the default content locale for its language.
*/
localeIsDefaultForLanguage?: boolean;

/**
* Whether a corresponding XML exists in CLDR repo. The build script also
* records false when the existence check fails because of a network error.
*/
presentInCLDRDatabase?: boolean;

/** Coverage target (modern/moderate/basic/core), from the charts' "Target Level" column */
targetLevel?: string;
/** Coverage computed level, from the charts' "Computed Level" column */
computedLevel?: string;

/** Aggregate % confirmed, parsed from the charts' "%" column */
confirmedPct?: number;

/**
* Breakdown percentages (optional). The build script currently fills
* `pctModern` from the same "%" column as `confirmedPct`.
*/
pctModern?: number;
pctModerate?: number;
pctBasic?: number;
pctCore?: number;

/** True if the locale is included in ICU datasets (per charts) */
icuIncluded?: boolean;

/** Charts’ default region column (if present) */
defaultRegion?: string;

/** Missing feature tokens from charts (the comma-separated "Missing Features" cell, split) */
notes?: string[];

/** Counts at target level; absent when the locale has no row in the missing-counts chart */
missingCounts?: {
found: number;
unconfirmed: number;
missing: number;
};
}

/**
 * Top-level shape of the generated `cldrLocales.json` file.
 */
export interface CLDRLocaleIndex {
  /** One support entry per locale. */
  locales: CLDRLocaleSupport[];
  /** CLDR release the data was generated from, e.g. "47.0.0". */
  release: string;
  /** ISO-8601 timestamp of the build-script run that produced the file. */
  generatedAt: string;
}
Loading