-
Notifications
You must be signed in to change notification settings - Fork 9
Locales: Add CLDR support per locale #204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 2 commits
0948561
98a6417
38c9c5f
0c8ae16
20fb625
dcdab07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| { | ||
| "release": "0.0.0", | ||
| "generatedAt": "", | ||
| "locales": [] | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,197 @@ | ||
| /** | ||
| * Build script to generate a list of CLDR locale support objects. | ||
wizardsimms marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| * | ||
| * This script fetches data from the Unicode CLDR project and combines | ||
| * multiple data sources to produce a JSON file consumed by the UI. The | ||
| * output file is stored in `public/data/unicode/cldrLocales.json` and | ||
| * includes one entry per locale with a variety of support metrics. These | ||
| * metrics include whether the locale has an XML file in the CLDR | ||
| * repository, the tier of support (core, modern, full), coverage levels | ||
| * and percentages, ICU inclusion, default-content flags and more. | ||
| * | ||
| * This script is not executed at runtime by the browser. Instead it is | ||
| * intended to be run manually or as part of a build pipeline. Because | ||
| * network access may not be available when run in some environments, the | ||
| * script is defensive: network calls are isolated and the data sources | ||
| * are configurable via constants at the top of the file. | ||
| */ | ||
|
|
||
| import fs from 'node:fs/promises'; | ||
| import path from 'node:path'; | ||
|
|
||
/**
 * CLDR release to ingest. When bumping this version, update both the charts
 * path and the cldr-core package version (both are derived from it below).
 */
const CLDR_RELEASE = '47.0.0';

/** cldr-core package on jsDelivr, pinned to exactly `CLDR_RELEASE`. */
const CLDR_CORE_BASE = `https://cdn.jsdelivr.net/npm/cldr-core@${CLDR_RELEASE}`;

/**
 * Directory holding the coverage-chart TSV exports. The charts directory is
 * named after the release with a trailing ".0.0" removed ("47.0.0" -> "47").
 */
const CHARTS_TSV_BASE = [
  'https://raw.githubusercontent.com/unicode-org/cldr-staging/main/docs/charts',
  CLDR_RELEASE.replace(/\.0\.0$/, ''),
  'tsv',
].join('/');

/**
 * Raw view of `common/main` in the CLDR source repository.
 * NOTE(review): this URL follows the `main` branch, so unlike the two bases
 * above it is not pinned to `CLDR_RELEASE`.
 */
const CLDR_REPO_RAW_BASE = 'https://raw.githubusercontent.com/unicode-org/cldr/main/common/main';
|
Comment on lines
+23
to
+28
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not certain I want to ingest the data -- what tradeoffs did you consider when you decided to use these URLs? There is the CLDR JSON repository, whose data we already have access to. You can see how we use it in files like |
||
|
|
||
// Where the generated JSON is written, resolved against the directory the
// script is launched from.
const OUTPUT_FILE = path.resolve('public', 'data', 'unicode', 'cldrLocales.json');
|
|
||
/**
 * Parse tab-separated text into one record per data row.
 *
 * The first non-blank line is the header; every following non-blank line
 * becomes an object keyed by the header cells. Cells missing from a short row
 * are filled with `''`; cells beyond the header width are ignored. Quoted
 * fields are not supported (a tab always separates cells).
 *
 * @param tsv Raw TSV text using `\n` or `\r\n` line endings.
 * @returns One record per data row, or `[]` when there is no header line.
 */
function parseTsv(tsv: string): Record<string, string>[] {
  // Blank / whitespace-only lines carry no cells; dropping them avoids
  // emitting phantom rows whose every value is ''.
  const lines = tsv
    .trim()
    .split(/\r?\n/)
    .filter((line) => line.trim() !== '');

  const [headerLine, ...dataLines] = lines;
  if (headerLine === undefined) return [];

  const header = headerLine.split('\t');
  return dataLines.map((line) => {
    const cells = line.split('\t');
    return Object.fromEntries(header.map((key, idx) => [key, cells[idx] ?? '']));
  });
}
|
|
||
/**
 * Check whether `common/main/<locale>.xml` exists in the CLDR repository.
 *
 * Issues a HEAD request against `CLDR_REPO_RAW_BASE`, so no file body is
 * downloaded.
 *
 * @param locale Locale id; hyphen- and underscore-separated forms are both
 *   accepted (`en-US` and `en_US` probe the same file).
 * @returns `true` when the server answers with a 2xx status; `false` for any
 *   other status or when the request itself fails.
 */
async function xmlExists(locale: string): Promise<boolean> {
  // CLDR XML file names are underscore-separated (e.g. `en_US.xml`), so a
  // hyphenated id would otherwise always be reported as missing.
  const fileStem = locale.replace(/-/g, '_');
  const url = `${CLDR_REPO_RAW_BASE}/${fileStem}.xml`;
  try {
    const response = await fetch(url, { method: 'HEAD' });
    return response.ok;
  } catch {
    // Best effort: a network failure is reported as "unknown/false" rather
    // than aborting the whole build.
    return false;
  }
}
|
|
||
/**
 * One parsed row of `locale-coverage.tsv`, keyed by the chart's column
 * headers. Every cell is the raw, unparsed string from the file.
 */
interface RawCoverageRow {
  /** Locale identifier; used as the lookup key when indexing rows. */
  'Language/Locale': string;
  /** Emitted as `targetLevel`. */
  'Target Level': string;
  /** Present in the chart; not read by the build script. */
  '≟': string;
  /** Emitted as `computedLevel`. */
  'Computed Level': string;
  /** Treated as "included in ICU" when the cell contains "icu" (any case). */
  'ICU': string;
  /** Percentage emitted as both `confirmedPct` and `pctModern`. */
  '%': string;
  /** Percentage emitted as `pctModerate`. */
  'ⓜ%': string;
  /** Percentage emitted as `pctBasic`. */
  'ⓑ%': string;
  /** Percentage emitted as `pctCore`. */
  'ⓒ%': string;
  /** Comma-separated tokens; split into the `notes` array. */
  'Missing Features': string;
  /** Emitted as `defaultRegion`. */
  'Default Region': string;
}
|
|
||
/**
 * One parsed row of `locale-missing-counts.tsv`. The three count cells are
 * raw strings; the build script converts them with base-10 `parseInt`.
 */
interface RawMissingCountsRow {
  /** Locale identifier; used as the lookup key when indexing rows. */
  'Language/Locale': string;
  /** Emitted as `missingCounts.found`. */
  'Found': string;
  /** Emitted as `missingCounts.unconfirmed`. */
  'Unconfirmed': string;
  /** Emitted as `missingCounts.missing`. */
  'Missing': string;
}
|
|
||
/**
 * Locale-id lists that the build script reads from `availableLocales.json`.
 *
 * NOTE(review): the fetched payload is cast to this shape without validation;
 * confirm that the pinned cldr-core release really exposes all four lists at
 * the top level of the JSON document.
 */
interface AvailableLocales {
  /** Locales reported with tier "core" (highest precedence). */
  core: string[];
  /** Locales reported with tier "modern" unless they are also in `core`. */
  modern: string[];
  /** Remaining locales; reported with tier "full". */
  full: string[];
  /** Locales flagged as `localeIsDefaultForLanguage` in the output. */
  defaultContent: string[];
}
|
|
||
/** Fetch `url`, rejecting when the server answers with a non-2xx status. */
async function fetchChecked(url: string): Promise<Response> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Request failed with ${response.status} ${response.statusText}: ${url}`);
  }
  return response;
}

/**
 * Normalise the parsed `availableLocales.json` payload into locale-id lists.
 * Accepts the lists either at the top level or nested under an
 * `availableLocales` key, and substitutes `[]` for any list that is absent.
 */
function toAvailableLocales(json: unknown): AvailableLocales {
  const asRecord = (value: unknown): Record<string, unknown> =>
    typeof value === 'object' && value !== null ? (value as Record<string, unknown>) : {};

  const top = asRecord(json);
  // NOTE(review): cldr-core appears to publish its lists nested under
  // `availableLocales`; the flat shape is still accepted — confirm against
  // the pinned release.
  const root = 'availableLocales' in top ? asRecord(top.availableLocales) : top;

  const list = (key: string): string[] => {
    const value = root[key] ?? top[key];
    return Array.isArray(value)
      ? value.filter((item): item is string => typeof item === 'string')
      : [];
  };

  return {
    core: list('core'),
    modern: list('modern'),
    full: list('full'),
    defaultContent: list('defaultContent'),
  };
}

/**
 * Split a locale id into language, script and region. Script and region are
 * recognised by subtag shape (4 letters vs. 2 letters / 3 digits) rather than
 * by position, so `zh_Hant_TW` and `en_US` are both classified correctly.
 */
function splitLocaleSubtags(locale: string): {
  language: string;
  script: string | undefined;
  region: string | undefined;
} {
  const [language = '', ...rest] = locale.split(/[_-]/);
  return {
    language,
    script: rest.find((subtag) => /^[A-Za-z]{4}$/.test(subtag)),
    region: rest.find((subtag) => /^(?:[A-Za-z]{2}|\d{3})$/.test(subtag)),
  };
}

/** Parse a chart percentage cell; blank, "—" and non-numeric cells yield `undefined`. */
function parsePercent(cell: string | undefined): number | undefined {
  const text = cell?.trim() ?? '';
  if (text === '' || text === '—') return undefined;
  const value = Number.parseFloat(text);
  return Number.isFinite(value) ? value : undefined;
}

/**
 * Main routine: download the CLDR locale lists and coverage charts, merge them
 * into one record per locale, probe the CLDR repository for each locale's XML
 * file, and write the result to `OUTPUT_FILE` as pretty-printed JSON.
 *
 * @throws Error when a data source answers with a non-2xx status, or when the
 *   locale lists come back empty (the existing output file is then left
 *   untouched instead of being overwritten with an empty data set).
 */
async function buildCldrLocales(): Promise<void> {
  const [availableRes, coverageRes, missingRes] = await Promise.all([
    fetchChecked(`${CLDR_CORE_BASE}/availableLocales.json`),
    fetchChecked(`${CHARTS_TSV_BASE}/locale-coverage.tsv`),
    fetchChecked(`${CHARTS_TSV_BASE}/locale-missing-counts.tsv`),
  ]);
  const [availableJson, coverageTsv, missingTsv] = await Promise.all([
    availableRes.json() as Promise<unknown>,
    coverageRes.text(),
    missingRes.text(),
  ]);

  const available = toAvailableLocales(availableJson);
  const coverageRows = parseTsv(coverageTsv) as unknown as RawCoverageRow[];
  const missingRows = parseTsv(missingTsv) as unknown as RawMissingCountsRow[];

  // Index chart rows by locale id; on duplicate ids the last row wins.
  // NOTE(review): only ids containing '_' are indexed, so base-language rows
  // (e.g. "en") never contribute coverage data — confirm this is intended.
  const indexById = <T extends { 'Language/Locale': string }>(rows: T[]): Map<string, T> => {
    const byId = new Map<string, T>();
    for (const row of rows) {
      const id = row['Language/Locale'];
      if (id && id.includes('_')) byId.set(id, row);
    }
    return byId;
  };
  const coverageById = indexById(coverageRows);
  const missingById = indexById(missingRows);

  const coreSet = new Set(available.core);
  const modernSet = new Set(available.modern);
  const defaultContentSet = new Set(available.defaultContent);
  const locales = [...new Set([...available.core, ...available.modern, ...available.full])];
  if (locales.length === 0) {
    throw new Error('availableLocales.json yielded no locales; refusing to write an empty data set');
  }

  // Tier precedence: core, then modern, then full.
  const tierOf = (loc: string): 'core' | 'modern' | 'full' => {
    if (coreSet.has(loc)) return 'core';
    if (modernSet.has(loc)) return 'modern';
    return 'full';
  };

  // Probe XML presence for every locale up front so the records can be built
  // in a single pass without temporary promise-carrying properties.
  const xmlFlags = await Promise.all(locales.map((loc) => xmlExists(loc)));

  const output = locales.map((loc, idx) => {
    const { language, script, region } = splitLocaleSubtags(loc);
    // Chart keys are underscore-separated; normalise hyphenated ids so they match.
    const chartId = loc.replace(/-/g, '_');
    const coverage = coverageById.get(chartId);
    const missing = missingById.get(chartId);

    return {
      locale: loc,
      language,
      region,
      script,
      tier: tierOf(loc),
      // use new property name expected by UI
      localeIsDefaultForLanguage: defaultContentSet.has(loc),
      // Coverage fields ('||' deliberately maps empty cells to undefined)
      targetLevel: coverage?.['Target Level'] || undefined,
      computedLevel: coverage?.['Computed Level'] || undefined,
      confirmedPct: parsePercent(coverage?.['%']),
      pctModern: parsePercent(coverage?.['%']),
      pctModerate: parsePercent(coverage?.['ⓜ%']),
      pctBasic: parsePercent(coverage?.['ⓑ%']),
      pctCore: parsePercent(coverage?.['ⓒ%']),
      icuIncluded: coverage?.ICU?.toLowerCase().includes('icu') ?? false,
      defaultRegion: coverage?.['Default Region'] || undefined,
      notes: coverage?.['Missing Features'] ? coverage['Missing Features'].split(/,\s*/) : [],
      missingCounts: missing
        ? {
            found: Number.parseInt(missing.Found || '0', 10),
            unconfirmed: Number.parseInt(missing.Unconfirmed || '0', 10),
            missing: Number.parseInt(missing.Missing || '0', 10),
          }
        : undefined,
      presentInCLDRDatabase: xmlFlags[idx] ?? false,
    };
  });

  const finalOutput = {
    release: CLDR_RELEASE,
    generatedAt: new Date().toISOString(),
    locales: output,
  };

  await fs.mkdir(path.dirname(OUTPUT_FILE), { recursive: true });
  await fs.writeFile(OUTPUT_FILE, JSON.stringify(finalOutput, null, 2));
  console.log(`Wrote ${output.length} locale records to ${OUTPUT_FILE}`);
}
|
|
||
// Script entry point: run the build; on any failure print the error and
// terminate with exit code 1.
buildCldrLocales().then(undefined, (error: unknown) => {
  console.error(error);
  process.exit(1);
});
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| import cldrLocalesData from '../../public/data/unicode/cldrLocales.json'; | ||
| import type { CLDRLocaleIndex, CLDRLocaleSupport } from '../types/CLDRLocaleTypes'; | ||
|
|
||
| // Import the generated JSON instead of requiring it. | ||
|
|
||
// View the imported JSON as the CLDRLocaleIndex shape. The cast goes through
// `unknown`, so the file contents are not validated against the interface.
const rawData = cldrLocalesData as unknown as CLDRLocaleIndex;
|
|
||
/**
 * List every CLDR locale support entry from the generated data file.
 *
 * @returns The `locales` array of the loaded index (the same array instance
 *   on every call, not a copy).
 */
export function getAllCldrLocales(): CLDRLocaleSupport[] {
  const { locales } = rawData;
  return locales;
}
|
|
||
/**
 * Retrieve the CLDR support entry for a given locale code.
 *
 * Matching ignores letter case and treats `-` and `_` as the same separator,
 * so `en-US`, `en_US` and `EN_us` all resolve to the same entry.
 *
 * @param localeId Locale code to look up.
 * @returns The first matching entry, or `undefined` when none matches.
 */
export function getCldrLocale(localeId: string): CLDRLocaleSupport | undefined {
  const canonical = (id: string): string => id.toLowerCase().replace(/-/g, '_');
  const wanted = canonical(localeId);
  return rawData.locales.find((entry) => canonical(entry.locale) === wanted);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| /** | ||
| * Types for CLDR locale support. These types mirror the structure of the | ||
| * objects generated by the build script in scripts/ingest/build-cldr-locales.ts. | ||
| */ | ||
|
|
||
/** Support metrics for a single locale, as emitted by the build script. */
export interface CLDRLocaleSupport {
  /** Locale identifier as emitted by the build script, e.g. "en_US". */
  locale: string;
  /** Language code, e.g. "en". */
  language: string;
  /** Region code, e.g. "US"; absent when the locale has none. */
  region?: string;
  /** Script code, e.g. "Latn"; absent when the locale has none. */
  script?: string;

  /** Tier of locale support: core, modern or full. */
  tier: 'core' | 'modern' | 'full';

  /**
   * True if this locale is listed in the default-content data, i.e. it is the
   * default content locale for its language.
   */
  localeIsDefaultForLanguage?: boolean;

  /** Whether a corresponding XML file exists in the CLDR repository. */
  presentInCLDRDatabase?: boolean;

  /** Coverage target level (modern/moderate/basic/core). */
  targetLevel?: string;
  /** Coverage level computed by the charts. */
  computedLevel?: string;

  /** Aggregate percentage confirmed. */
  confirmedPct?: number;

  /** Percentage at the modern level. */
  pctModern?: number;
  /** Percentage at the moderate level. */
  pctModerate?: number;
  /** Percentage at the basic level. */
  pctBasic?: number;
  /** Percentage at the core level. */
  pctCore?: number;

  /** True if the locale is included in ICU datasets (per the charts). */
  icuIncluded?: boolean;

  /** The charts' default-region column, when present. */
  defaultRegion?: string;

  /** Missing-feature tokens from the charts. */
  notes?: string[];

  /** Item counts at the target level. */
  missingCounts?: {
    found: number;
    unconfirmed: number;
    missing: number;
  };
}
|
|
||
/** Top-level shape of the generated `cldrLocales.json` file. */
export interface CLDRLocaleIndex {
  /** CLDR release the data was generated from, e.g. "47.0.0". */
  release: string;
  /** ISO-8601 timestamp of generation; empty in the committed placeholder file. */
  generatedAt: string;
  /** One support entry per locale. */
  locales: CLDRLocaleSupport[];
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This output file is empty -- is there supposed to be data in it?