diff --git a/Cargo.lock b/Cargo.lock index c5114bdbf39..1549884ae55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,7 +165,7 @@ dependencies = [ "rustc-hash 2.1.1", "serde", "serde_derive", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -389,6 +389,16 @@ dependencies = [ "shlex", ] +[[package]] +name = "cfg-expr" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a2c5f3bf25ec225351aa1c8e230d04d880d3bd89dea133537dafad4ae291e5c" +dependencies = [ + "smallvec", + "target-lexicon", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -477,7 +487,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -535,6 +545,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -658,7 +678,7 @@ dependencies = [ "databake", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "synstructure", ] @@ -683,7 +703,7 @@ checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -716,7 +736,7 @@ dependencies = [ "diplomat_core", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -763,7 +783,7 @@ dependencies = [ "pulldown-cmark", "quote", "serde", - "syn 2.0.101", + "syn 2.0.106", "syn-inline-mod", "toml", ] @@ -780,7 +800,7 @@ dependencies = [ "serde", "smallvec", "strck", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -791,7 +811,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -872,18 +892,6 @@ version = "0.6.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" -[[package]] -name = "env_preferences" -version = "0.1.0" -dependencies = [ - "core-foundation-sys", - "displaydoc", - "icu_locale_core", - "libc", - "windows", - "windows-core 0.60.1", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -972,6 +980,69 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = 
"futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-core", + "futures-macro", + "futures-task", + "pin-project-lite", + "pin-utils", + "slab", +] + [[package]] name = "getopts" version = "0.2.21" @@ -1012,6 +1083,91 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +[[package]] +name = "gio" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed68efc12b748a771be2dccc49480d8584004382967c98323245fc3c38b74a42" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "gio-sys", + "glib", + "libc", + "pin-project-lite", + "smallvec", +] + +[[package]] +name = "gio-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171ed2f6dd927abbe108cfd9eebff2052c335013f5879d55bab0dc1dee19b706" +dependencies = [ + "glib-sys", + "gobject-sys", + "libc", + "system-deps", + "windows-sys 0.59.0", +] + +[[package]] +name = "glib" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1f2cbc4577536c849335878552f42086bfd25a8dcd6f54a18655cf818b20c8f" +dependencies = [ + "bitflags", + "futures-channel", + "futures-core", + "futures-executor", + "futures-task", + "futures-util", + "gio-sys", + "glib-macros", + "glib-sys", + "gobject-sys", + "libc", + "memchr", + "smallvec", +] + +[[package]] +name = "glib-macros" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55eda916eecdae426d78d274a17b48137acdca6fba89621bd3705f2835bc719f" +dependencies = [ + "heck 0.5.0", + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "glib-sys" +version = "0.21.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d09d3d0fddf7239521674e57b0465dfbd844632fec54f059f7f56112e3f927e1" +dependencies = [ + "libc", + "system-deps", +] + +[[package]] +name = "gobject-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "538e41d8776173ec107e7b0f2aceced60abc368d7e1d81c1f0e2ecd35f59080d" +dependencies = [ + "glib-sys", + "libc", + "system-deps", +] + [[package]] name = "half" version = "2.4.1" @@ -1462,6 +1618,23 @@ dependencies = [ "icu_provider", ] +[[package]] +name = "icu_host_info" +version = "0.1.0" +dependencies = [ + "core-foundation", + "core-foundation-sys", + "displaydoc", + "gio", + "icu", + "icu_calendar", + "icu_datetime", + "icu_locale_core", + "icu_time", + "libc", + "windows", +] + [[package]] name = "icu_list" version = "2.0.0" @@ -1910,7 +2083,7 @@ checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -1943,9 +2116,9 @@ checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" [[package]] name = "libc" -version = "0.2.172" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libc_alloc" @@ -2048,9 +2221,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.4" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "minimal-lexical" @@ -2298,6 +2471,24 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "plotters" version = "0.3.7" @@ -2380,6 +2571,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +dependencies = [ + "toml_edit 0.23.5", +] + [[package]] name = "proc-macro2" version = "1.0.95" @@ -2848,7 +3048,7 @@ checksum = "51e694923b8824cf0e9b382adf0f60d4e05f348f357b38833a3fa5ed7c2ede04" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -2908,6 +3108,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +[[package]] +name = "slab" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" + [[package]] name = "smallvec" version = "1.15.0" @@ -2973,7 +3179,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -2995,9 +3201,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.101" +version = "2.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = 
"ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" dependencies = [ "proc-macro2", "quote", @@ -3011,7 +3217,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fa6dca1fdb7b2ed46dd534a326725419d4fb10f23d8c85a8b2860e5eb25d0f9" dependencies = [ "proc-macro2", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -3022,7 +3228,20 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", +] + +[[package]] +name = "system-deps" +version = "7.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "550b2c61a9c30b85ca1f6ef0afcd2befcb12e73b1d31ef0526423bc7b6a99d7f" +dependencies = [ + "cfg-expr", + "heck 0.5.0", + "pkg-config", + "toml", + "version-compare", ] [[package]] @@ -3041,6 +3260,12 @@ dependencies = [ "libc", ] +[[package]] +name = "target-lexicon" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" + [[package]] name = "thiserror" version = "2.0.12" @@ -3058,7 +3283,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -3148,8 +3373,8 @@ checksum = "05ae329d1f08c4d17a59bed7ff5b5a769d062e64a62d34a3261b219e62cd5aae" dependencies = [ "serde", "serde_spanned", - "toml_datetime", - "toml_edit", + "toml_datetime 0.6.9", + "toml_edit 0.22.26", ] [[package]] @@ -3161,6 +3386,15 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a197c0ec7d131bfc6f7e82c8442ba1595aeab35da7adbf05b6b73cd06a16b6be" +dependencies = [ + "serde_core", +] + [[package]] name = "toml_edit" version = "0.22.26" @@ -3170,11 +3404,32 @@ dependencies = [ "indexmap", "serde", "serde_spanned", - 
"toml_datetime", + "toml_datetime 0.6.9", "toml_write", "winnow", ] +[[package]] +name = "toml_edit" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2ad0b7ae9cfeef5605163839cb9221f453399f15cfb5c10be9885fcf56611f9" +dependencies = [ + "indexmap", + "toml_datetime 0.7.1", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b551886f449aa90d4fe2bdaa9f4a2577ad2dde302c61ecf262d80b116db95c10" +dependencies = [ + "winnow", +] + [[package]] name = "toml_write" version = "0.1.1" @@ -3309,6 +3564,12 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" +[[package]] +name = "version-compare" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b" + [[package]] name = "version_check" version = "0.9.5" @@ -3377,7 +3638,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "wasm-bindgen-shared", ] @@ -3399,7 +3660,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3597,7 +3858,7 @@ checksum = "83577b051e2f49a058c308f17f273b570a6a758386fc291b5f6a934dd84e48c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -3608,7 +3869,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -3858,7 +4119,7 @@ version = "0.8.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "synstructure", "yoke", "zerovec", @@ -3881,7 +4142,7 @@ checksum = 
"28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -3897,7 +4158,7 @@ version = "0.1.6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "synstructure", "zerofrom", "zerovec", @@ -3966,7 +4227,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn 2.0.101", + "syn 2.0.106", "zerofrom", "zerovec", ] diff --git a/Cargo.toml b/Cargo.toml index 72587f57625..e494aedec8f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ members = [ "utils/ixdtf", "utils/litemap", "utils/resb", - "utils/env_preferences", + "utils/host_info", "utils/tinystr", "utils/tzif", "utils/potential_utf", diff --git a/components/locale_core/src/extensions/unicode/attributes.rs b/components/locale_core/src/extensions/unicode/attributes.rs index 357ae8a2667..53152bbbc29 100644 --- a/components/locale_core/src/extensions/unicode/attributes.rs +++ b/components/locale_core/src/extensions/unicode/attributes.rs @@ -148,6 +148,29 @@ impl Attributes { { self.deref().iter().map(|t| t.as_str()).try_for_each(f) } + + /// Extends the `Attributes` with values from another `Attributes`. 
+ /// + /// # Example + /// + /// ``` + /// use icu::locale::extensions::unicode::Attributes; + /// + /// let mut attrs: Attributes = "foobar-foobaz".parse().unwrap(); + /// let attrs2: Attributes = "foobar-fooqux".parse().unwrap(); + /// + /// attrs.extend(attrs2); + /// + /// assert_eq!(attrs, "foobar-foobaz-fooqux".parse().unwrap()); + /// ``` + #[cfg(feature = "alloc")] + pub fn extend(&mut self, other: Attributes) { + for attr in other.0 { + if let Err(idx) = self.binary_search(&attr) { + self.0.insert(idx, attr); + } + } + } } /// ✨ *Enabled with the `alloc` Cargo feature.* diff --git a/components/locale_core/src/extensions/unicode/keywords.rs b/components/locale_core/src/extensions/unicode/keywords.rs index f3ecd4cad72..c31921a318b 100644 --- a/components/locale_core/src/extensions/unicode/keywords.rs +++ b/components/locale_core/src/extensions/unicode/keywords.rs @@ -386,6 +386,27 @@ impl Keywords { Ok(()) } + /// Extends the `Keywords` with values from another `Keywords`. + /// + /// # Example + /// + /// ``` + /// use icu::locale::extensions::unicode::Keywords; + /// + /// let mut kw: Keywords = "ab-cd-ca-buddhist".parse().unwrap(); + /// let kw2: Keywords = "ca-gregory-hc-h12".parse().unwrap(); + /// + /// kw.extend(kw2); + /// + /// assert_eq!(kw, "ab-cd-ca-gregory-hc-h12".parse().unwrap()); + /// ``` + #[cfg(feature = "alloc")] + pub fn extend(&mut self, other: Keywords) { + for (key, value) in other.0 { + self.0.insert(key, value); + } + } + /// This needs to be its own method to help with type inference in helpers.rs #[cfg(test)] pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self { diff --git a/components/locale_core/src/extensions/unicode/mod.rs b/components/locale_core/src/extensions/unicode/mod.rs index 9577d397df3..0b60f779f6d 100644 --- a/components/locale_core/src/extensions/unicode/mod.rs +++ b/components/locale_core/src/extensions/unicode/mod.rs @@ -214,6 +214,26 @@ impl Unicode { } Ok(()) } + + /// Extends the `Unicode` with values 
from another `Unicode`. + /// + /// # Example + /// + /// ``` + /// use icu::locale::extensions::unicode::Unicode; + /// + /// let mut ue: Unicode = "u-foobar-ca-buddhist".parse().unwrap(); + /// let ue2: Unicode = "u-ca-gregory-hc-h12".parse().unwrap(); + /// + /// ue.extend(ue2); + /// + /// assert_eq!(ue, "u-foobar-ca-gregory-hc-h12".parse().unwrap()); + /// ``` + #[cfg(feature = "alloc")] + pub fn extend(&mut self, other: Unicode) { + self.keywords.extend(other.keywords); + self.attributes.extend(other.attributes); + } } /// ✨ *Enabled with the `alloc` Cargo feature.* diff --git a/components/locale_core/src/preferences/mod.rs b/components/locale_core/src/preferences/mod.rs index fe47b1e2b6f..e74ffb04362 100644 --- a/components/locale_core/src/preferences/mod.rs +++ b/components/locale_core/src/preferences/mod.rs @@ -37,7 +37,7 @@ //! //! # Preferences Merging //! -//! In traditional internatonalization APIs, the argument passed to constructors is a locale. +//! In traditional internationalization APIs, the argument passed to constructors is a locale. //! ICU4X changes this paradigm by accepting a `Preferences`, which can be extracted from a [`Locale`] and combined with //! other `Preferences`s provided by the environment. //! diff --git a/utils/env_preferences/README.md b/utils/env_preferences/README.md deleted file mode 100644 index 5b932c9f374..00000000000 --- a/utils/env_preferences/README.md +++ /dev/null @@ -1,14 +0,0 @@ - - -# env_preferences - -`env_preferences` is a crate to retrieve system locale and preferences for -Apple, Linux & Windows systems. - -It provides functionality to fetch preferred locales from the user's operating -system and parse them lossily to an ICU4X [`Locale`](icu_locale_core::Locale). 
- -It also retrieves preferences for [`Calendar`](https://crates.io/crates/icu_calendar) -& [`TimeZone`](https://crates.io/crates/icu_time) - - diff --git a/utils/env_preferences/src/apple.rs b/utils/env_preferences/src/apple.rs deleted file mode 100644 index 597223d588b..00000000000 --- a/utils/env_preferences/src/apple.rs +++ /dev/null @@ -1,170 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -use core_foundation_sys::{ - array::{CFArrayGetCount, CFArrayGetValueAtIndex}, - base::{CFIndex, CFRelease, CFRetain}, - calendar::{CFCalendarCopyCurrent, CFCalendarCopyLocale, CFCalendarGetIdentifier}, - locale::{CFLocaleCopyPreferredLanguages, CFLocaleGetIdentifier}, - string::{ - kCFStringEncodingUTF8, CFStringGetCString, CFStringGetCStringPtr, CFStringGetLength, - CFStringRef, - }, - timezone, -}; -use libc::c_char; -use std::ffi::{CStr, CString}; - -use crate::RetrievalError; - -/// Helps to get string, it tries to get the string directly from the pointer itself, in case it is unable to retrieve -/// the string (c_str_ptr is NULL) a buffer is created of size `length + 1` and we perform manual allocations to get -/// the string -fn get_string(ptr: CFStringRef) -> Result { - // SAFETY: The call to `CFStringGetCStringPtr` because the reference of string we are accessing is not `NULL` - // Returns pointer in O(1) without any memory allocation. This can return NULL so we are handling it by directly - // copying it using `CFStringGetCString` - let c_str_ptr: *const c_char = unsafe { CFStringGetCStringPtr(ptr, kCFStringEncodingUTF8) }; - - if !c_str_ptr.is_null() { - // SAFETY: A valid `NULL` terminator is present which is a requirement of `from_ptr` - let lang_rust_str = unsafe { CStr::from_ptr(c_str_ptr) }.to_str()?; - Ok(lang_rust_str.to_string()) - } else { - // `c_str_ptr` is null, i.e. 
`CFStringGetCStringPtr` couldn't give desired output, trying with - // manual allocations - // SAFETY: It returns length of the string, from above conditional statement we ensure - // that the `lang_ptr` is not NULL thus making it safe to call - let length = unsafe { CFStringGetLength(ptr) as usize }; - - let mut c_str_buf: Vec = vec![0; length + 1]; - - // SAFETY: Safety is ensured by following points - // 1. `lang_ptr` is not NULL, checked through conditional statement - // 2. `c_str_buf` is large enough and in scope after this call - unsafe { - CFStringGetCString( - ptr, - c_str_buf.as_mut_ptr() as *mut c_char, - c_str_buf.len() as CFIndex, - kCFStringEncodingUTF8, - ); - } - - let c_string = CString::from_vec_with_nul(c_str_buf)?; - c_string - .into_string() - .map_err(|e| RetrievalError::ConversionError(e.utf8_error())) - } -} - -/// Retrieves system locales for Apple operating systems, in the order preferred by the -/// user, it consumes [`CFLocaleCopyPreferredLanguages`](https://developer.apple.com/documentation/corefoundation/1542887-cflocalecopypreferredlanguages) -/// to copy the languages preferred by the user. -pub fn get_raw_locales() -> Result, RetrievalError> { - let mut languages: Vec = Vec::new(); - - // SAFETY: The call to `CFLocaleCopyPreferredLanguages` returns an immutable reference to `CFArray` which is owned by us - // https://developer.apple.com/documentation/corefoundation/cfarrayref. 
It is ensured that `locale_carr_ref` is not mutated - // Immutablility ensures that nothing is overridden during it's scope - let locale_carr_ref = unsafe { CFLocaleCopyPreferredLanguages() }; - - if !locale_carr_ref.is_null() { - // SAFETY: The call to `CFArrayGetCount` is only made when is `locale_carr_ref` is not `NULL` - let count = unsafe { CFArrayGetCount(locale_carr_ref as _) }; - - for i in 0..count { - // SAFETY: The call to `CFArrayGetValueAtIndex` is safe because we are iterating from 0 to count (`CFArrayGetCount`) which - // in itself will always be greater than 0 and less than count ensuring we will not get "out of bounds" error - let lang_ptr = unsafe { CFArrayGetValueAtIndex(locale_carr_ref, i) }; - - if !lang_ptr.is_null() { - let locale_str = get_string(lang_ptr as CFStringRef)?; - languages.push(locale_str); - } else { - return Err(RetrievalError::NullLocale); - } - } - } else { - // No need to release memory for `locale_carr_ref` since it is NULL - return Err(RetrievalError::NullLocale); - } - // Release for memory - unsafe { - CFRelease(locale_carr_ref as _); - } - - Ok(languages) -} - -/// Gets the list calendar type and it's corresponding locale. 
It returns a Vec<(String, String)> -/// The first element is the locale of the calendar, second is the calendar identifier -pub fn get_system_calendars() -> Result, RetrievalError> { - let mut calendars = Vec::new(); - let calendar_locale_str: String; - let mut calendar_identifier_str = String::new(); - - // SAFETY: The call to `CFCalendarCopyCurrent` returns a calendar object owned by us - // This calendar object is used extract locale and type of calendar (identifier) - let calendar = unsafe { CFCalendarCopyCurrent() }; - - if !calendar.is_null() { - // SAFETY: Retaining the calendar object when not `NULL` - // It is released when all actions are completed - unsafe { CFRetain(calendar as _) }; - - // SAFETY: Retrieves `CFLocale` object for the calendar, the `if` statement ensures we don't - // pass in a `NULL` references - let locale = unsafe { CFCalendarCopyLocale(calendar as _) }; - - // SAFETY: Retrieves `CFString` (identifier) for the calendar, the `if` statement ensures - // we don't pass in a `NULL` references - let identifier = unsafe { CFCalendarGetIdentifier(calendar as _) }; - - if !locale.is_null() { - // SAFETY: Retain the locale object, released when we extracted the string - unsafe { CFRetain(locale as _) }; - - // SAFETY: Retrieves `CFString` (identifier) for the calendar, the `if` statement ensures - // we don't pass in a `NULL` reference - let locale_identifier = unsafe { CFLocaleGetIdentifier(locale) }; - calendar_locale_str = get_string(locale_identifier as CFStringRef)?; - - // SAFETY: Releases the locale object which was retained - unsafe { CFRelease(locale as _) }; - } else { - return Err(RetrievalError::NullLocale); - } - - if !identifier.is_null() { - calendar_identifier_str = get_string(identifier as CFStringRef)?; - } - // SAFETY: Release the calendar when done to avoid memory leaks - unsafe { CFRelease(calendar as _) }; - - calendars.push((calendar_locale_str, calendar_identifier_str)); - } else { - return 
Err(RetrievalError::NullCalendar); - } - - Ok(calendars) -} - -/// Get the current time zone of the system -pub fn get_system_time_zone() -> Result { - // SAFETY: Returns the time zone currently used by the system - // Returns an immutable reference to TimeZone object owned by us - let timezone = unsafe { timezone::CFTimeZoneCopySystem() }; - - if !timezone.is_null() { - // SAFETY: Extracts name of time zone from the TimeZone object, reference to timezone - // is guaranteed to be not NULL - let cf_string = unsafe { timezone::CFTimeZoneGetName(timezone) }; - - if !cf_string.is_null() { - return get_string(cf_string); - } - } - Err(RetrievalError::NullTimeZone) -} diff --git a/utils/env_preferences/src/error.rs b/utils/env_preferences/src/error.rs deleted file mode 100644 index ebae643a357..00000000000 --- a/utils/env_preferences/src/error.rs +++ /dev/null @@ -1,98 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use displaydoc::Display; -use std::{ffi::FromVecWithNulError, str::Utf8Error}; - -use crate::parse::posix::PosixParseError; - -/// An error encountered while retrieving the system locale -#[derive(Debug, PartialEq)] -pub enum RetrievalError { - /// Error converting into `&CStr` to `&str` - ConversionError(Utf8Error), - - /// Error creating a `CString` from a buffer with a null terminator - FromVecWithNulError(FromVecWithNulError), - - /// Unable to retrieve the calendar - NullCalendar, - - /// Unable to retrieve the locale - NullLocale, - - /// Unable to retrieve TimeZone - NullTimeZone, - - /// UnknownCategory when retrieving locale for linux - #[cfg(any(doc, target_os = "linux"))] - UnknownCategory, - - /// Error handling for windows system - #[cfg(target_os = "windows")] - Windows(windows::core::Error), - - Other(String), -} - -#[cfg(target_os = "windows")] -impl From for RetrievalError { - fn from(input: windows::core::Error) -> Self { - Self::Windows(input) - } -} - -impl From for RetrievalError { - fn from(input: Utf8Error) -> Self { - Self::ConversionError(input) - } -} - -impl From for RetrievalError { - fn from(input: FromVecWithNulError) -> Self { - Self::FromVecWithNulError(input) - } -} - -/// An error encountered while either retrieving or parsing a system locale -#[derive(Display, Debug, PartialEq)] -pub enum ParseError { - #[displaydoc("Locale failed native parsing logic: {0}")] - Posix(PosixParseError), - #[displaydoc("Unable to parse ICU4X locale: {0}")] - Icu(icu_locale_core::ParseError), -} - -impl From for ParseError { - fn from(value: PosixParseError) -> Self { - Self::Posix(value) - } -} - -impl From for ParseError { - fn from(value: icu_locale_core::ParseError) -> Self { - Self::Icu(value) - } -} - -/// An error encountered while either retrieving or parsing a system locale -#[derive(Display, Debug)] -pub enum LocaleError { - #[displaydoc("Unable to retrieve locales: {0:?}")] - Retrieval(RetrievalError), - #[displaydoc("Unable to parse 
locale: {0}")] - Parse(ParseError), -} - -impl From for LocaleError { - fn from(value: RetrievalError) -> Self { - Self::Retrieval(value) - } -} - -impl From for LocaleError { - fn from(value: ParseError) -> Self { - Self::Parse(value) - } -} diff --git a/utils/env_preferences/src/lib.rs b/utils/env_preferences/src/lib.rs deleted file mode 100644 index 2d45c051d55..00000000000 --- a/utils/env_preferences/src/lib.rs +++ /dev/null @@ -1,90 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! # env_preferences -//! -//! `env_preferences` is a crate to retrieve system locale and preferences for -//! Apple, Linux & Windows systems. -//! -//! It provides functionality to fetch preferred locales from the user's operating -//! system and parse them lossily to an ICU4X [`Locale`](icu_locale_core::Locale). -//! -//! It also retrieves preferences for [`Calendar`](https://crates.io/crates/icu_calendar) -//! & [`TimeZone`](https://crates.io/crates/icu_time) - -mod error; -pub mod parse; - -pub use error::{LocaleError, ParseError, RetrievalError}; - -// doc -use core_foundation_sys as _; -#[cfg(target_os = "windows")] -use libc as _; - -#[cfg(any(doc, target_os = "macos"))] -pub mod apple; -#[cfg(any(doc, target_os = "linux"))] -pub mod posix; -#[cfg(any(doc, target_os = "windows"))] -pub mod windows; -#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))] -compile_error!( - "Unsupported target OS. 
Supported operating systems are Apple, Linux & Windows as of now" -); - -#[cfg(target_os = "macos")] -use apple as system; -#[cfg(target_os = "linux")] -use posix as system; -#[cfg(target_os = "windows")] -use windows as system; - -#[cfg(target_os = "macos")] -use parse::apple::AppleLocale as SystemLocale; -#[cfg(target_os = "linux")] -use parse::posix::PosixLocale as SystemLocale; -#[cfg(target_os = "windows")] -use parse::windows::WindowsLocale as SystemLocale; - -/// List the user's available locales as the platform-provided [`String`]s, ordered by preference. -/// -///
-/// -/// The output of this function is platform-dependent and **is not guaranteed** to be a valid -/// BCP-47 identifier. To get a list of parsed locales, see [`get_locales_lossy()`]. -/// -///
-/// -/// Specific information can be found at the platform's implementation: -/// - [`apple::get_raw_locales()`] -/// - [`posix::get_raw_locales()`] -/// - [`windows::get_raw_locales()`] -pub fn get_raw_locales() -> Result, RetrievalError> { - system::get_raw_locales() -} - -/// List the user's available locales as ICU4X [`Locale`](icu_locale_core::Locale)s, ordered by preference. -/// -/// This performs a best-effort conversion that may lose some (or all!) data in certain cases. -/// For getting a list of raw system locales, see [`get_raw_locales()`]. -/// -/// Specific information can be found at the platform's implementation: -/// - [`parse::apple::AppleLocale`] -/// - [`parse::posix::PosixLocale`] -/// - [`parse::windows::WindowsLocale`] -pub fn get_locales_lossy() -> Result, LocaleError> { - let raw_locales = get_raw_locales()?; - let system_locales = raw_locales - .iter() - .map(String::as_str) - .map(SystemLocale::try_from_str) - .collect::, ParseError>>()?; - - system_locales - .iter() - .map(SystemLocale::try_convert_lossy) - .map(|result| result.map_err(LocaleError::from)) - .collect::, LocaleError>>() -} diff --git a/utils/env_preferences/src/parse/aliases.rs b/utils/env_preferences/src/parse/aliases.rs deleted file mode 100644 index 86880a7b1d7..00000000000 --- a/utils/env_preferences/src/parse/aliases.rs +++ /dev/null @@ -1,52 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Platform-specific conversion from locale strings to BCP-47 identifiers. - -/// Strip any Windows "Sort Order Identifier" and return a matching CLDR collation value. 
-/// -/// The full table is available at: -/// -pub fn strip_windows_collation_suffix_lossy( - lcid: &str, -) -> (&str, Option) { - use icu_locale_core::extensions::unicode::value; - - // All known LCIDs containing an underscore are used for a collation suffix - if let Some((prefix, suffix)) = lcid.split_once('_') { - let collation_value = match suffix { - "phoneb" => value!("phonebk"), - "pronun" => value!("zhuyin"), - "radstr" => value!("unihan"), - "stroke" => value!("stroke"), - "tradnl" => value!("trad"), - // Strip the suffix on LCIDs with an underscore but no (known) matching CLDR data - _ => return (prefix, None), - }; - - // Return the LCID with the stripped prefix, and the matching CLDR collation key - (prefix, Some(collation_value)) - } else { - // No underscore found, return the LCID as-is - (lcid, None) - } -} - -/// Find a BCP-47 identifier from a list of known Windows aliases. -pub fn find_windows_language_alias_lossy( - lcid: &str, -) -> Option { - use icu_locale_core::langid; - - match lcid { - "zh-yue-HK" => Some(langid!("yue-HK")), - // LCID with no (known) matching CLDR data: "math alphanumeric sorting" - // This would be `x-IV_mathan`, but the collation suffix may already be stripped by - // `strip_windows_collation_suffix_lossy`. For some reason, `LocaleEnumProcEx` also uses - // `x-IV-mathan`, so that is included here too. - // https://learn.microsoft.com/en-us/windows/win32/api/winnls/nc-winnls-locale_enumprocex - "x-IV" | "x-IV_mathan" | "x-IV-mathan" => Some(langid!("und")), - _ => None, - } -} diff --git a/utils/env_preferences/src/parse/apple.rs b/utils/env_preferences/src/parse/apple.rs deleted file mode 100644 index 275fed332b9..00000000000 --- a/utils/env_preferences/src/parse/apple.rs +++ /dev/null @@ -1,23 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use icu_locale_core::Locale; - -use crate::ParseError; - -pub struct AppleLocale<'src> { - src: &'src str, -} - -impl<'src> AppleLocale<'src> { - pub fn try_from_str(src: &'src str) -> Result { - Ok(Self { src }) - } - - pub fn try_convert_lossy(&self) -> Result { - let locale = Locale::try_from_str(self.src)?; - - Ok(locale) - } -} diff --git a/utils/env_preferences/src/parse/mod.rs b/utils/env_preferences/src/parse/mod.rs deleted file mode 100644 index 4e4a7a206b0..00000000000 --- a/utils/env_preferences/src/parse/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Parsing functionality for various popular operating systems - -// Re-export all alias functions -mod aliases; -pub use aliases::*; - -pub mod apple; -pub mod posix; -pub mod windows; diff --git a/utils/env_preferences/src/parse/posix.rs b/utils/env_preferences/src/parse/posix.rs deleted file mode 100644 index 01bb131ab23..00000000000 --- a/utils/env_preferences/src/parse/posix.rs +++ /dev/null @@ -1,312 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Parsing functionality for POSIX locale identifiers. -//! For more information, see [`PosixLocale`]. -//! -//! # Usage example -//! ``` -//! # use icu_locale_core::locale; -//! # use env_preferences::parse::posix::PosixLocale; -//! # use env_preferences::LocaleError; -//! # fn main() -> Result<(), LocaleError> { -//! let posix_locale = PosixLocale::try_from_str("en_US")?; -//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US")); -//! # Ok(()) -//! # } -//! 
``` - -use displaydoc::Display; -use icu_locale_core::extensions::unicode::{key, value}; -use icu_locale_core::extensions::Extensions; -use icu_locale_core::subtags::{language, script, variant, Language, Region, Variants}; -use icu_locale_core::{locale, LanguageIdentifier, Locale}; - -use crate::ParseError; - -#[derive(Display, Debug, PartialEq)] -/// An error while parsing a POSIX locale identifier -pub enum PosixParseError { - #[displaydoc("Empty locale")] - EmptyLocale, - #[displaydoc("Empty section beginning at offset {offset}")] - EmptySection { offset: usize }, - #[displaydoc("Invalid character at offset {offset}")] - InvalidCharacter { offset: usize }, - #[displaydoc("Invalid locale")] - InvalidLocale, - #[displaydoc("Delimiter repeated at offsets {first_offset} and {second_offset}")] - RepeatedDelimiter { - first_offset: usize, - second_offset: usize, - }, - #[displaydoc("Delimiters found out-of-order at offsets {first_offset} and {second_offset}")] - UnorderedDelimiter { - first_offset: usize, - second_offset: usize, - }, -} - -#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] -enum Delimiter { - Territory, - Codeset, - Modifier, -} - -impl Delimiter { - /// Find any optional sections, returning an error if the delimiters are invalid - pub fn try_find_sections(src: &str) -> Result, PosixParseError> { - // Find the offset and delimiter of each optional section - let optional_sections = src - .chars() - .enumerate() - .flat_map(|(index, character)| match character { - '_' => Some((index, Self::Territory)), - '.' 
=> Some((index, Self::Codeset)), - '@' => Some((index, Self::Modifier)), - _ => None, - }) - .collect::>(); - - // Find any errors in the arrangement of delimiters - for (index, (first_offset, first_delimiter)) in optional_sections.iter().enumerate() { - // Find any repeated delimiters - if let Some((second_offset, _second_delimiter)) = optional_sections - .iter() - // Check all delimiters past this index - .skip(index + 1) - .find(|(_second_offset, second_delimiter)| first_delimiter == second_delimiter) - { - return Err(PosixParseError::RepeatedDelimiter { - first_offset: *first_offset, - second_offset: *second_offset, - }); - } - - // Find any delimiters that have been invalidated by a delimiter that should appear after it - // For example "en.utf8_US" is invalid because codeset appears before territory - if let Some((second_offset, second_delimiter)) = optional_sections.get(index + 1) { - if first_delimiter > second_delimiter { - return Err(PosixParseError::UnorderedDelimiter { - first_offset: *first_offset, - second_offset: *second_offset, - }); - } - } - } - - Ok(optional_sections) - } -} - -#[derive(Debug)] -/// A parsed and validated POSIX locale identifier. -pub struct PosixLocale<'src> { - language: &'src str, - territory: Option<&'src str>, - codeset: Option<&'src str>, - // TODO: is it possible to have multiple modifiers? - modifier: Option<&'src str>, -} - -impl<'src> PosixLocale<'src> { - /// Attempt to parse a POSIX locale. - /// - /// Locales are expected to be in the format `language[_territory][.codeset][@modifier]`; - /// only the language section is mandatory, all other sections are optional. 
- /// For example: - /// - All sections: `en_US.utf8@euro` - /// - Only required sections: `en` - /// - /// See section 8.2 of the POSIX spec for more details: - /// - pub fn try_from_str(src: &'src str) -> Result { - // These cases are implementation-defined and can be ignored: - // - Empty locales - if src.is_empty() { - return Err(ParseError::Posix(PosixParseError::EmptyLocale)); - } - // - Any locale containing '/' - if let Some(offset) = src.find('/') { - return Err(ParseError::Posix(PosixParseError::InvalidCharacter { - offset, - })); - } - // - Locales consisting of "." or ".." - if src == "." || src == ".." { - return Err(ParseError::Posix(PosixParseError::InvalidLocale)); - } - - // Find any optional sections, and return any delimiter-related errors - let optional_sections = Delimiter::try_find_sections(src)?; - - // The language field continues until the start of the first optional section, if one exists - let language = match optional_sections.first() { - Some((offset, _delimiter)) => &src[..*offset], - None => src, - }; - - // Make sure the language itself is non-empty - if language.is_empty() { - return Err(ParseError::Posix(PosixParseError::EmptySection { - offset: 0, - })); - } - - let mut locale = Self { - language, - territory: None, - codeset: None, - modifier: None, - }; - - for (index, (start_offset, delimiter)) in optional_sections.iter().enumerate() { - // Find the offset of the next section, or end of the string if none exist - let end_offset = optional_sections - .get(index + 1) - .map(|(next_offset, _next_delimiter)| *next_offset) - .unwrap_or(src.len()); - - // Make sure this section is non-empty (more characters than just the delimiter) - if start_offset + 1 >= end_offset { - return Err(ParseError::Posix(PosixParseError::EmptySection { - offset: *start_offset, - })); - } - - // Write the section to the appropriate field - let section_value = Some(&src[start_offset + 1..end_offset]); - match delimiter { - Delimiter::Territory => 
locale.territory = section_value, - Delimiter::Codeset => locale.codeset = section_value, - Delimiter::Modifier => locale.modifier = section_value, - } - } - - Ok(locale) - } - - /// Attempt to convert a POSIX locale into a valid BCP-47 identifier. - /// - /// This is a best-effort conversion process, and there are valid - /// POSIX locales that will return an error or silently ignore data. - /// In particular, the codeset section is always ignored, and only some common modifiers are handled - /// (unknown modifiers will be silently ignored). - /// - /// # Examples - /// - /// ## Parsing behaviour - /// ``` - /// # use icu_locale_core::locale; - /// # use env_preferences::parse::posix::PosixLocale; - /// # use env_preferences::LocaleError; - /// # fn main() -> Result<(), LocaleError> { - /// // Locales will always include the `posix` variant - /// assert_eq!( - /// PosixLocale::try_from_str("en_US")?.try_convert_lossy()?, - /// locale!("en-US") - /// ); - /// // The codeset field will be ignored - /// assert_eq!( - /// PosixLocale::try_from_str("en_US.iso88591")?.try_convert_lossy()?, - /// locale!("en-US") - /// ); - /// // Any unknown modifiers will be ignored - /// assert_eq!( - /// PosixLocale::try_from_str("en_US@unknown")?.try_convert_lossy()?, - /// locale!("en-US") - /// ); - /// # Ok(()) - /// # } - /// ``` - /// - /// ## Edge cases - /// ``` - /// # use icu_locale_core::locale; - /// # use env_preferences::parse::posix::PosixLocale; - /// # use env_preferences::LocaleError; - /// # fn main() -> Result<(), LocaleError> { - /// // The default "C"/"POSIX" locale will be converted to "en-US-posix" - /// assert_eq!( - /// PosixLocale::try_from_str("C")?.try_convert_lossy()?, - /// locale!("en-US-posix") - /// ); - /// assert_eq!( - /// PosixLocale::try_from_str("POSIX")?.try_convert_lossy()?, - /// locale!("en-US-posix") - /// ); - /// - /// // Known script modifiers will be converted to the matching CLDR keys - /// assert_eq!( - /// 
PosixLocale::try_from_str("uz_UZ@cyrillic")?.try_convert_lossy()?, - /// locale!("uz-Cyrl-UZ") - /// ); - /// assert_eq!( - /// PosixLocale::try_from_str("ks_IN@devanagari")?.try_convert_lossy()?, - /// locale!("ks-Deva-IN") - /// ); - /// assert_eq!( - /// PosixLocale::try_from_str("be_BY@latin")?.try_convert_lossy()?, - /// locale!("be-Latn-BY") - /// ); - /// - /// // Other known modifiers are handled accordingly - /// assert_eq!( - /// PosixLocale::try_from_str("en_US@euro")?.try_convert_lossy()?, - /// locale!("en-US-u-cu-eur") - /// ); - /// assert_eq!( - /// PosixLocale::try_from_str("aa_ER@saaho")?.try_convert_lossy()?, - /// locale!("ssy-ER") - /// ); - /// # Ok(()) - /// # } - /// ``` - pub fn try_convert_lossy(&self) -> Result { - // The default "C"/"POSIX" locale should map to "en-US-posix", - // which is the default behaviour in ICU4C: - // https://github.com/unicode-org/icu/blob/795d7ac82c4b29cf721d0ad62c0b178347d453bf/icu4c/source/common/putil.cpp#L1738 - if self.language == "C" || self.language == "POSIX" { - return Ok(locale!("en-US-posix")); - } - - let mut extensions = Extensions::new(); - let mut script = None; - let mut variant = None; - - // Parse the language/region - let mut language = Language::try_from_str(self.language)?; - let region = self.territory.map(Region::try_from_str).transpose()?; - - if let Some(modifier) = self.modifier { - match modifier.to_ascii_lowercase().as_str() { - "euro" => { - extensions.unicode.keywords.set(key!("cu"), value!("eur")); - } - // Known script modifiers - "cyrillic" => script = Some(script!("Cyrl")), - "devanagari" => script = Some(script!("Deva")), - "latin" => script = Some(script!("Latn")), - // Saaho seems to be the only "legacy variant" that appears as a modifier: - // https://www.unicode.org/reports/tr35/#table-legacy-variant-mappings - "saaho" => language = language!("ssy"), - "valencia" => variant = Some(variant!("valencia")), - // Some modifiers are known but can't be expressed as a BCP-47 
identifier - // e.g. "@abegede", "@iqtelif" - _ => (), - } - } - - Ok(Locale { - id: LanguageIdentifier { - language, - region, - script, - variants: variant.map_or_else(Variants::new, Variants::from_variant), - }, - extensions, - }) - } -} diff --git a/utils/env_preferences/src/parse/windows.rs b/utils/env_preferences/src/parse/windows.rs deleted file mode 100644 index 4569bb8fafa..00000000000 --- a/utils/env_preferences/src/parse/windows.rs +++ /dev/null @@ -1,83 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Parsing functionality for Windows LCIDs. -//! For more information, see [`WindowsLocale`]. -//! -//! # Usage example -//! ``` -//! # use icu_locale_core::locale; -//! # use env_preferences::parse::windows::WindowsLocale; -//! # use env_preferences::LocaleError; -//! # fn main() -> Result<(), LocaleError> { -//! let windows_locale = WindowsLocale::try_from_str("en-US")?; -//! assert_eq!(windows_locale.try_convert_lossy()?, locale!("en-US")); -//! # Ok(()) -//! # } -//! 
``` - -use icu_locale_core::extensions::unicode::{key, Keywords, Unicode}; -use icu_locale_core::extensions::Extensions; -use icu_locale_core::{LanguageIdentifier, Locale}; - -use super::aliases::{find_windows_language_alias_lossy, strip_windows_collation_suffix_lossy}; -use crate::ParseError; - -pub struct WindowsLocale<'src> { - src: &'src str, -} - -impl<'src> WindowsLocale<'src> { - pub fn try_from_str(src: &'src str) -> Result { - Ok(Self { src }) - } - - /// ## Edge cases - /// ``` - /// # use icu_locale_core::locale; - /// # use env_preferences::parse::windows::WindowsLocale; - /// # use env_preferences::LocaleError; - /// # fn main() -> Result<(), LocaleError> { - /// // Known invalid values are converted to a matching BCP-47 identifier - /// assert_eq!( - /// WindowsLocale::try_from_str("zh-yue-HK")?.try_convert_lossy()?, - /// locale!("yue-HK") - /// ); - /// - /// // Known collation suffixes and converted to `-u-co-VALUE` extension syntax - /// assert_eq!( - /// WindowsLocale::try_from_str("de-DE_phoneb")?.try_convert_lossy()?, - /// locale!("de-DE-u-co-phonebk") - /// ); - /// assert_eq!( - /// WindowsLocale::try_from_str("zh-TW_pronun")?.try_convert_lossy()?, - /// locale!("zh-TW-u-co-zhuyin") - /// ); - /// # Ok(()) - /// # } - /// ``` - pub fn try_convert_lossy(&self) -> Result { - let (lcid, collation_value) = strip_windows_collation_suffix_lossy(self.src); - let keywords = match collation_value { - // Add the -u-co-VALUE extension to the locale - Some(collation_value) => Keywords::new_single(key!("co"), collation_value), - // No collation value found, use default keywords - None => Keywords::new(), - }; - - // Use a matching alias if found - let language = match find_windows_language_alias_lossy(lcid) { - Some(locale) => locale, - None => LanguageIdentifier::try_from_str(lcid)?, - }; - - Ok(Locale { - id: language, - extensions: Extensions::from_unicode(Unicode { - keywords, - ..Unicode::new() - }), - }) - } -} diff --git 
a/utils/env_preferences/src/posix.rs b/utils/env_preferences/src/posix.rs deleted file mode 100644 index 4b6f7626af6..00000000000 --- a/utils/env_preferences/src/posix.rs +++ /dev/null @@ -1,119 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -use libc::{setlocale, LC_ALL, LC_TIME}; -use std::collections::HashMap; -use std::ffi::CStr; -use std::ptr; -use std::str::FromStr; - -use crate::RetrievalError; - -#[derive(Hash, Eq, PartialEq, Debug)] -pub enum LocaleCategory { - Character, - Number, - Time, - Collate, - Monetary, - Messages, - Paper, - Name, - Address, - Telephone, - Measurement, - Identification, - All, -} - -impl FromStr for LocaleCategory { - type Err = RetrievalError; - - fn from_str(s: &str) -> Result { - match s { - "LC_CTYPE" => Ok(Self::Character), - "LC_NUMERIC" => Ok(Self::Number), - "LC_TIME" => Ok(Self::Time), - "LC_COLLATE" => Ok(Self::Collate), - "LC_MONETARY" => Ok(Self::Monetary), - "LC_MESSAGES" => Ok(Self::Messages), - "LC_PAPER" => Ok(Self::Paper), - "LC_NAME" => Ok(Self::Name), - "LC_ADDRESS" => Ok(Self::Address), - "LC_TELEPHONE" => Ok(Self::Telephone), - "LC_MEASUREMENT" => Ok(Self::Measurement), - "LC_IDENTIFICATION" => Ok(Self::Identification), - "LC_ALL" => Ok(Self::All), - _ => Err(RetrievalError::UnknownCategory), - } - } -} - -/// Use [`get_raw_locale_categories`] to find a list of the user's preferred locales -pub fn get_raw_locales() -> Result, RetrievalError> { - let mut categories = get_raw_locale_categories()?; - let mut locales = Vec::with_capacity(categories.len()); - - // Add LC_ALL if it exists - if let Some(primary_locale) = categories.remove(&LocaleCategory::All) { - locales.push(primary_locale); - } - - // Add any remaining locales that were explicitly set - locales.extend(categories.into_values()); - Ok(locales) -} - -// TODO: Add a function to return 
all the locales POSIX categories explicitly - -/// Retrieves locale for `LC_ALL` POSIX category. Also returns other categories if any are explicitly -/// set in the thread -pub fn get_raw_locale_categories() -> Result, RetrievalError> { - let mut locale_map = HashMap::new(); - - // SAFETY: Safety is ensured because we pass a `NULL` pointer and retrieve the locale there is - // no subsequent calls for `setlocale` which could change the locale of this particular thread - let locales_ptr = unsafe { setlocale(LC_ALL, ptr::null()) }; - - if locales_ptr.is_null() { - return Err(RetrievalError::NullLocale); - } - - // SAFETY: A valid `NULL` terminator is present which is a requirement of `from_ptr` - let locales_str = unsafe { CStr::from_ptr(locales_ptr) }.to_str()?; - let locale_pairs = locales_str.split(';'); - for locale_pair in locale_pairs { - let mut parts = locale_pair.split('='); - if let Some(value) = parts.next() { - if let Some(key) = parts.next() { - if let Ok(category) = LocaleCategory::from_str(value) { - locale_map.insert(category, key.to_string()); - } - } else { - // Handle case where only a single locale - locale_map.insert(LocaleCategory::All, value.to_string()); - } - } - } - Ok(locale_map) -} - -/// Get the system calendar locale (LC_TIME). 
-/// -/// This only returns the calendar locale, `gnome-calendar` is the default calendar in linux -/// The locale returned is for `Gregorian` calendar -/// Related issue: `` -pub fn get_system_calendars() -> Result { - // SAFETY: Safety is ensured because we pass a `NULL` pointer and retrieve the locale there is - // no subsequent calls for `setlocale` which could change the locale of this particular thread - let locale_ptr = unsafe { setlocale(LC_TIME, ptr::null()) }; - - if !locale_ptr.is_null() { - // SAFETY: A valid `NULL` terminator is present which is a requirement of `from_ptr` - let rust_str = unsafe { CStr::from_ptr(locale_ptr) }.to_str()?; - let calendar_locale = rust_str.to_string(); - return Ok(calendar_locale); - } - Err(RetrievalError::NullLocale) -} diff --git a/utils/env_preferences/src/windows.rs b/utils/env_preferences/src/windows.rs deleted file mode 100644 index 6b76ae4df82..00000000000 --- a/utils/env_preferences/src/windows.rs +++ /dev/null @@ -1,41 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -use crate::RetrievalError; - -/// Retrieves languages preferred by the user , it consumes [`GlobalizationPreferences::Languages`](https://learn.microsoft.com/en-us/uwp/api/windows.system.userprofile.globalizationpreferences.languages?view=winrt-26100) -pub fn get_raw_locales() -> Result, RetrievalError> { - let mut locale_vec_str: Vec = Vec::new(); - let locale = windows::System::UserProfile::GlobalizationPreferences::Languages()?; - - for i in 0..locale.Size()? { - let hstring = locale.GetAt(i)?; - let string = hstring.to_string_lossy(); - locale_vec_str.push(string); - } - Ok(locale_vec_str) -} - -/// Gets the list calendar type and it's corresponding locale. 
It returns a Vec<(String, String)> -/// The first element is the locale of the calendar, second is the calendar identifier -pub fn get_system_calendars() -> Result, RetrievalError> { - let calendar = windows::Globalization::Calendar::new()?; - let system_calendar = windows::Globalization::Calendar::GetCalendarSystem(&calendar)?; - let calendar_type: String = system_calendar.to_string(); - let locale_list: Vec = get_raw_locales()?; - - let result: Vec<(String, String)> = locale_list - .into_iter() - .map(|locale| (locale, calendar_type.clone())) - .collect(); - - Ok(result) -} - -/// Get the current time zone of the system -pub fn get_system_time_zone() -> Result { - let calendar = windows::Globalization::Calendar::new()?; - let timezone = calendar.GetTimeZone()?; - Ok(timezone.to_string_lossy()) -} diff --git a/utils/env_preferences/tests/parse/mod.rs b/utils/env_preferences/tests/parse/mod.rs deleted file mode 100644 index 767795cfe78..00000000000 --- a/utils/env_preferences/tests/parse/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -mod posix; -mod windows; diff --git a/utils/env_preferences/tests/parse/posix.rs b/utils/env_preferences/tests/parse/posix.rs deleted file mode 100644 index 842546b5af6..00000000000 --- a/utils/env_preferences/tests/parse/posix.rs +++ /dev/null @@ -1,298 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use env_preferences::parse::posix::PosixLocale; -use icu_locale_core::Locale; - -fn expect_success(src: &str, expected: &str) { - let posix_locale = PosixLocale::try_from_str(src).expect(src); - let converted_locale = posix_locale.try_convert_lossy().expect(src); - - let expected_locale = Locale::try_from_str(expected).expect(src); - assert_eq!(converted_locale, expected_locale, "POSIX locale: `{src}`"); -} - -#[test] -fn default_locale() { - expect_success("C", "en-US-posix"); - expect_success("POSIX", "en-US-posix"); -} - -#[test] -fn region() { - expect_success("en_US", "en-US"); - expect_success("ne_NP", "ne-NP"); - expect_success("zh_TW", "zh-TW"); -} - -#[test] -fn codeset_ignored() { - expect_success("lv_LV.iso885913", "lv-LV"); - expect_success("hy_AM.armscii8", "hy-AM"); -} - -#[test] -fn modifier() { - // Currency - expect_success("it_IT@euro", "it-IT-u-cu-eur"); - - // Script - expect_success("uz_UZ@cyrillic", "uz-Cyrl-UZ"); - expect_success("sd_IN@devanagari", "sd-Deva-IN"); - expect_success("sr_RS@latin", "sr-Latn-RS"); - - // Language - expect_success("aa_ER@saaho", "ssy-ER"); - - // Variant - expect_success("ca_ES@valencia", "ca-ES-valencia"); -} - -mod error { - mod parse { - use env_preferences::parse::posix::{PosixLocale, PosixParseError}; - use env_preferences::ParseError; - - fn expect_error(src: &str, posix_error: PosixParseError) { - let result = PosixLocale::try_from_str(src); - let expected = ParseError::Posix(posix_error); - - match result { - Ok(invalid_locale) => { - panic!("Expected the error `{expected:?}`, got the locale `{invalid_locale:?}` from input of `{src}`") - } - Err(error) => { - assert_eq!(error, expected, "Comparing expected output of `{src}`") - } - } - } - - #[test] - fn empty_locale() { - expect_error("", PosixParseError::EmptyLocale); - } - - #[test] - fn empty_section() { - // Single, empty optional section - expect_error("en_", PosixParseError::EmptySection { offset: 2 }); - expect_error("en.", 
PosixParseError::EmptySection { offset: 2 }); - expect_error("en@", PosixParseError::EmptySection { offset: 2 }); - - // Multiple optional sections, one empty - expect_error("en_.utf8@euro", PosixParseError::EmptySection { offset: 2 }); - expect_error("en_US.@euro", PosixParseError::EmptySection { offset: 5 }); - expect_error("en_US.utf8@", PosixParseError::EmptySection { offset: 10 }); - - // Single delimiter (excluding "." as that should return `PosixParseError::InvalidLocale` instead) - expect_error("_", PosixParseError::EmptySection { offset: 0 }); - expect_error("@", PosixParseError::EmptySection { offset: 0 }); - - // All delimiters - expect_error("_.@", PosixParseError::EmptySection { offset: 0 }); - } - - #[test] - fn invalid_character() { - const SAMPLE_LOCALES: [&str; 2] = [ - "en", // No optional fields - "en_US.utf8@euro", // All optional fields - ]; - - for locale in SAMPLE_LOCALES { - // Insert an invalid character ('/') at every position along the sample locale - for offset in 0..=locale.len() { - let (left, right) = locale.split_at(offset); - let invalid_locale = format!("{left}/{right}"); - expect_error( - &invalid_locale, - PosixParseError::InvalidCharacter { offset }, - ); - } - } - - // Test a single '/' character - expect_error("/", PosixParseError::InvalidCharacter { offset: 0 }); - } - - #[test] - fn invalid_locale() { - expect_error(".", PosixParseError::InvalidLocale); - expect_error("..", PosixParseError::InvalidLocale); - } - - #[test] - fn repeated_delimiter() { - // Repeated delimiter at the end of locale - expect_error( - "en_US.utf8@euro_US", - PosixParseError::RepeatedDelimiter { - first_offset: 2, - second_offset: 15, - }, - ); - expect_error( - "en_US.utf8@euro.utf8", - PosixParseError::RepeatedDelimiter { - first_offset: 5, - second_offset: 15, - }, - ); - expect_error( - "en_US.utf8@euro@euro", - PosixParseError::RepeatedDelimiter { - first_offset: 10, - second_offset: 15, - }, - ); - - // Multiple repeated delimiters - 
expect_error( - "en.utf8.utf8.utf8", - PosixParseError::RepeatedDelimiter { - first_offset: 2, - second_offset: 7, - }, - ); - - // Consecutive repeated delimiters - expect_error( - "en__US.utf8@euro", - PosixParseError::RepeatedDelimiter { - first_offset: 2, - second_offset: 3, - }, - ); - expect_error( - "en_US..utf8@euro", - PosixParseError::RepeatedDelimiter { - first_offset: 5, - second_offset: 6, - }, - ); - expect_error( - "en_US.utf8@@euro", - PosixParseError::RepeatedDelimiter { - first_offset: 10, - second_offset: 11, - }, - ); - } - - #[test] - fn unordered_delimiter() { - expect_error( - "en_US@euro.utf8", - PosixParseError::UnorderedDelimiter { - first_offset: 5, - second_offset: 10, - }, - ); - expect_error( - "en.utf8_US@euro", - PosixParseError::UnorderedDelimiter { - first_offset: 2, - second_offset: 7, - }, - ); - expect_error( - "en.utf8@euro_US", - PosixParseError::UnorderedDelimiter { - first_offset: 7, - second_offset: 12, - }, - ); - expect_error( - "en@euro_US.utf8", - PosixParseError::UnorderedDelimiter { - first_offset: 2, - second_offset: 7, - }, - ); - expect_error( - "en@euro.utf8_US", - PosixParseError::UnorderedDelimiter { - first_offset: 2, - second_offset: 7, - }, - ); - } - - #[test] - fn offset() { - // Empty section - let src = "en_.utf8@euro"; - match PosixLocale::try_from_str(src) { - Err(ParseError::Posix(PosixParseError::EmptySection { offset })) => { - assert_eq!(&src[offset..offset + 1], "_"); - } - _ => unreachable!(), - } - - // Invalid character - let src = "en_U/S"; - match PosixLocale::try_from_str(src) { - Err(ParseError::Posix(PosixParseError::InvalidCharacter { offset })) => { - assert_eq!(&src[offset..offset + 1], "/"); - } - _ => unreachable!(), - } - - // Repeated delimiter - let src = "en_US.utf8@euro_US"; - match PosixLocale::try_from_str(src) { - Err(ParseError::Posix(PosixParseError::RepeatedDelimiter { - first_offset, - second_offset, - })) => { - assert_eq!(&src[first_offset..first_offset + 1], "_"); - 
assert_eq!(&src[second_offset..second_offset + 1], "_"); - } - _ => unreachable!(), - } - - // Unordered delimiter - let src = "en_US@euro.utf8"; - match PosixLocale::try_from_str(src) { - Err(ParseError::Posix(PosixParseError::UnorderedDelimiter { - first_offset, - second_offset, - })) => { - assert_eq!(&src[first_offset..first_offset + 1], "@"); - assert_eq!(&src[second_offset..second_offset + 1], "."); - } - _ => unreachable!(), - } - } - } - - mod conversion { - use env_preferences::parse::posix::PosixLocale; - - fn expect_error(src: &str, icu_error: icu_locale_core::ParseError) { - let result = PosixLocale::try_from_str(src) - .expect(src) - .try_convert_lossy(); - let expected = env_preferences::ParseError::Icu(icu_error); - match result { - Ok(invalid_locale) => { - panic!("Expected the error `{icu_error:?}`, got the locale `{invalid_locale:?}` from input of `{src}`") - } - Err(error) => { - assert_eq!(error, expected, "Comparing expected output of `{src}`") - } - } - } - - #[test] - fn invalid_language() { - expect_error("invalid", icu_locale_core::ParseError::InvalidLanguage); - } - - #[test] - fn invalid_region() { - expect_error("en_invalid", icu_locale_core::ParseError::InvalidSubtag); - } - } -} diff --git a/utils/env_preferences/tests/parse/windows.rs b/utils/env_preferences/tests/parse/windows.rs deleted file mode 100644 index 925c608dacb..00000000000 --- a/utils/env_preferences/tests/parse/windows.rs +++ /dev/null @@ -1,60 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use env_preferences::parse::windows::WindowsLocale; -use icu_locale_core::Locale; - -fn expect_success(src: &str, expected: &str) { - let windows_locale = WindowsLocale::try_from_str(src).expect(src); - let locale = windows_locale.try_convert_lossy().expect(src); - - assert_eq!( - locale, - Locale::try_from_str(expected).unwrap(), - "Case: {src}" - ); -} - -#[test] -fn collation() { - /// All MS-LCID collation entries with a known matching CLDR collation value - const CASES: [(&str, &str); 12] = [ - ("de-DE_phoneb", "de-DE-u-co-phonebk"), - ("es-ES_tradnl", "es-ES-u-co-trad"), - ("ja-JP_radstr", "ja-JP-u-co-unihan"), - ("zh-CN_phoneb", "zh-CN-u-co-phonebk"), - ("zh-CN_stroke", "zh-CN-u-co-stroke"), - ("zh-HK_radstr", "zh-HK-u-co-unihan"), - ("zh-MO_radstr", "zh-MO-u-co-unihan"), - ("zh-MO_stroke", "zh-MO-u-co-stroke"), - ("zh-SG_phoneb", "zh-SG-u-co-phonebk"), - ("zh-SG_stroke", "zh-SG-u-co-stroke"), - ("zh-TW_pronun", "zh-TW-u-co-zhuyin"), - ("zh-TW_radstr", "zh-TW-u-co-unihan"), - ]; - - for (src, expected) in CASES { - expect_success(src, expected); - } -} - -#[test] -fn collation_strip_known_invalid() { - // All MS-LCID collation entries with NO known matching CLDR collation value - expect_success("hu-HU_tchncl", "hu-HU"); - expect_success("ka-GE_modern", "ka-GE"); -} - -#[test] -fn collation_strip_unknown() { - expect_success("en-US_unknown", "en-US"); - expect_success("en-US_unknown_multiple_underscores", "en-US"); - expect_success("en-US_unknown-with-hyphens", "en-US"); -} - -#[test] -fn alias() { - expect_success("zh-yue-HK", "yue-HK"); - expect_success("x-IV-mathan", "und"); -} diff --git a/utils/env_preferences/tests/test.rs b/utils/env_preferences/tests/test.rs deleted file mode 100644 index f598bfbd97d..00000000000 --- a/utils/env_preferences/tests/test.rs +++ /dev/null @@ -1,218 +0,0 @@ -// This file is part of ICU4X. 
For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -mod datasets; -mod parse; - -#[cfg(target_os = "linux")] -#[cfg(test)] -mod linux_tests { - use env_preferences::posix::{get_raw_locale_categories, get_system_calendars, LocaleCategory}; - use env_preferences::RetrievalError; - use icu_locale_core::Locale; - use libc::setlocale; - - // Testing fetching of locale, as `get_locales` fetches the locales for category - // `LC_ALL`. For this category this should return non empty - #[test] - fn test_get_raw_locale_categories() { - let locale_res = get_raw_locale_categories().unwrap(); - assert!( - !locale_res.is_empty(), - "Empty hashmap for locales retrieved" - ); - for locale in locale_res.into_values() { - assert!(locale.is_ascii(), "Invalid form of locale retrieved") - } - } - - #[test] - fn test_converting_locales() { - let locale_res: std::collections::HashMap = - get_raw_locale_categories().unwrap(); - for locale in locale_res.into_values() { - let parts: Vec<&str> = locale.split('.').collect(); - - // Skipping "C" and those ending with "UTF-8", as they cannot be converted - // into the locale - if !parts.contains(&"C") && (parts.len() > 1 && parts[parts.len() - 1] != "UTF-8") { - let mut locale_converted: Locale = locale.parse().unwrap(); - locale_converted.extensions.unicode.clear(); - assert_eq!(locale_converted, locale.parse().unwrap()); - } - } - } - - // This test contains unsafe code, the idea is to manually set a locale for `LC_TIME`, - // compare the result from `get_locales` and `get_system_calendar` they must be equal - #[test] - fn test_calendar() { - // Using "C" locale since it is the default, using any other locale like `en_IN` or `en_US` - // may work on some system and may not others depending on the availability - let test_calendar_locale = "C"; - let locale_cstr = - 
std::ffi::CString::new(test_calendar_locale).expect("CString::new failed"); - - // SAFETY: This call is safe because any subsequent call to `setlocale` we pass a `NULL` locale - // to retrieve locale which does not sets the locale. The test locale `locale_cstr` is a CString - // nul terminated string for which we have the ownership - let tr = unsafe { setlocale(libc::LC_TIME, locale_cstr.as_ptr()) }; - - if tr.is_null() { - panic!("{:?}", RetrievalError::NullLocale); - } - - let calendar_locale = get_system_calendars().unwrap(); - assert_eq!(test_calendar_locale.to_string(), calendar_locale); - } -} - -#[cfg(target_os = "macos")] -#[cfg(test)] -mod macos_test { - use env_preferences::apple::{get_raw_locales, get_system_calendars, get_system_time_zone}; - use icu_locale_core::Locale; - - #[test] - fn test_get_raw_locales() { - let locales_res = get_raw_locales(); - match locales_res { - Ok(locales) => { - for locale in locales { - assert!(!locale.is_empty(), "Empty locale retrieved"); - assert!(locale.is_ascii(), "Invalid form of locale retrieved"); - } - } - Err(e) => { - panic!("{e:?}") - } - } - } - - #[test] - fn test_converting_locales() { - let locales = get_raw_locales().unwrap(); - for locale in locales { - let _loc: Locale = locale.parse().unwrap(); - } - } - - #[test] - fn test_calendar() { - let calendar_res = get_system_calendars().unwrap(); - for calendar in calendar_res { - assert!(!calendar.0.is_empty(), "Couldn't retreive calendar locale"); - assert!(calendar.0.is_ascii(), "Calendar locale form is not valid"); - assert!(!calendar.1.is_empty(), "Couldn't retreive calendar"); - assert!( - calendar.1.is_ascii(), - "Calendar identifier form is not valid" - ); - } - } - - #[test] - fn test_time_zone() { - let time_zone = get_system_time_zone().unwrap(); - assert!(!time_zone.is_empty(), "Couldn't retreive time_zone"); - } -} - -#[cfg(target_os = "windows")] -#[cfg(test)] -mod windows_test { - use env_preferences::parse::windows::WindowsLocale; - use 
env_preferences::windows::{get_raw_locales, get_system_calendars, get_system_time_zone}; - use icu_locale_core::Locale; - use std::sync::{LazyLock, Mutex}; - use windows::Win32::{ - Foundation::LPARAM, - Globalization::{EnumSystemLocalesEx, LOCALE_ALL}, - }; - use windows_core::{BOOL, PCWSTR}; - - // Since [`EnumSystemLocalesEx`] iterates using a callback with no obvious (safe) way to return data, - // store them in this static instead. Since this is only a single test with roughly 1,000 items, - // it shouldn't be much of a concern. - static LOCALES: LazyLock>> = LazyLock::new(|| Mutex::new(Vec::new())); - - /// Callback provided to the [`EnumSystemLocalesEx`] to enumerate over locales. - unsafe extern "system" fn callback( - locale_name: PCWSTR, - _locale_flags: u32, - _callback_parameter: LPARAM, - ) -> BOOL { - // SAFETY: caller is the [`EnumSystemLocalesEx`] function, which guarantees a valid null-terminated string - let locale_name = unsafe { locale_name.to_string() }.unwrap(); - - // Skip empty locale 0x007F, marked as "Reserved for invariant locale behavior" - // Source: MS-LCID version 16.0, page 13 (section 2.2 under "Language ID" table) - if !locale_name.is_empty() { - LOCALES.lock().unwrap().push(locale_name); - } - - // Tell [`EnumSystemLocalesEx`] to continue enumeration - BOOL::from(true) - } - - /// Enumerate over all Windows locales, and make sure [`WindowsLocale`] can parse it without any (direct) errors. 
- #[test] - fn system_locales() -> windows_core::Result<()> { - // Find the list of supported locales, using the [`EnumSystemLocalesEx`] API: - // https://learn.microsoft.com/en-us/windows/win32/api/winnls/nf-winnls-enumsystemlocalesex - // SAFETY: a valid function pointer is provided and lpReserved is set to NULL/None as required - unsafe { - EnumSystemLocalesEx(Some(callback), LOCALE_ALL, LPARAM::default(), None)?; - } - - // Get the list of locales which the callback has been modifying - let locales = LOCALES.lock().unwrap(); - - // Make sure [`WindowsLocale`] can parse without any obvious issues - for locale in locales.iter() { - let windows_locale = WindowsLocale::try_from_str(locale).expect(locale); - windows_locale.try_convert_lossy().expect(locale); - } - - Ok(()) - } - - #[test] - fn test_get_raw_locales() { - let locales = get_raw_locales().unwrap(); - for locale in locales { - assert!(!locale.is_empty(), "Empty locale retrieved"); - assert!(locale.is_ascii(), "Invalid form of locale retrieved"); - } - } - - #[test] - fn test_converting_locales() { - let locales = get_raw_locales().unwrap(); - for locale in locales { - let _converted_locale: Locale = locale.parse().unwrap(); - } - } - - #[test] - fn test_calendar() { - let calendars = get_system_calendars().unwrap(); - for calendar in calendars { - assert!(!calendar.0.is_empty(), "Calendar locale is empty"); - assert!(calendar.0.is_ascii(), "Calendar locale form is not valid"); - assert!(!calendar.1.is_empty(), "Calendar identifier is empty"); - assert!( - calendar.1.is_ascii(), - "Calendar identifier form is not valid" - ); - } - } - - #[test] - fn test_time_zone() { - let time_zone = get_system_time_zone().unwrap(); - assert!(!time_zone.is_empty(), "Couldn't retreive time_zone"); - assert!(time_zone.is_ascii(), "Invalid TimeZone format"); - } -} diff --git a/utils/env_preferences/Cargo.toml b/utils/host_info/Cargo.toml similarity index 52% rename from utils/env_preferences/Cargo.toml rename to 
utils/host_info/Cargo.toml index d62264e357f..6d3470d744b 100644 --- a/utils/env_preferences/Cargo.toml +++ b/utils/host_info/Cargo.toml @@ -3,7 +3,7 @@ # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). [package] -name = "env_preferences" +name = "icu_host_info" version = "0.1.0" publish = false @@ -17,11 +17,17 @@ categories.workspace = true include.workspace = true [dependencies] -core-foundation-sys = "0.8.6" displaydoc = { workspace = true } icu_locale_core = { workspace = true, features = ["alloc"] } -libc = "0.2.155" +icu_datetime = { workspace = true, optional = true } +libc = "0.2.175" +[target.'cfg(target_os = "linux")'.dependencies] +gio = { version = "0.21.2", optional = true } + +[target.'cfg(target_os = "macos")'.dependencies] +core-foundation-sys = { version = "0.8.6" } +core-foundation = { version = "0.10.1" } [target.'cfg(target_os = "windows")'.dependencies.windows] version = "0.60.0" @@ -36,5 +42,20 @@ features = [ "Win32_Globalization", ] -[target.'cfg(target_os = "windows")'.dev-dependencies] -windows-core = "0.60.1" +[dev-dependencies] +icu = { path = "../../components/icu", default-features = false } +icu_datetime = { workspace = true, features = ["compiled_data"] } +icu_calendar = { workspace = true } +icu_time = { workspace = true, features = ["compiled_data"] } + +[features] +default = [] +datetime = ["dep:icu_datetime"] +gnome = ["dep:gio"] + +[[example]] +name = "dt_format" +required-features = ["datetime"] + +[package.metadata.cargo-all-features] +skip_optional_dependencies = true diff --git a/utils/env_preferences/LICENSE b/utils/host_info/LICENSE similarity index 100% rename from utils/env_preferences/LICENSE rename to utils/host_info/LICENSE diff --git a/utils/host_info/README.md b/utils/host_info/README.md new file mode 100644 index 00000000000..02921aba250 --- /dev/null +++ b/utils/host_info/README.md @@ -0,0 +1,288 @@ +# icu_host_info 
[![crates.io](https://img.shields.io/crates/v/icu_host_info)](https://crates.io/crates/icu_host_info) + + + +## host_info + +`host_info` is a library providing functionality to retrieve regional preferences +from host environments - primarily the operating system the program is running in. + +The library is designed to bind the different host environment preferences architectures +to ICU4X model. + +## Example + +```rust +use icu_host_info::HostInfo; +use icu::calendar::Date; +use icu::datetime::{fieldsets, DateTimeFormatter}; + +let date = Date::try_new_gregorian(2025, 10, 10) + .expect("Failed to create date"); + +// requires feature `datetime` +let prefs = HostInfo::datetime_preferences() + .expect("Failed to retrieve host info"); + +let dtf = DateTimeFormatter::try_new(prefs, fieldsets::YMD::long()) + .expect("Failed to create datetime formatter."); + +let formatted_dt = dtf.format(&date); + +assert_eq!(formatted_dt.to_string(), "October 10, 2025"); +``` + +## Feature Matrix + +The library intends to provide means to retrieve regional preferences +to ICU4X preferences with a focus on Unicode Extensions, but allow for +propagation of preferences offered by the host environments which may +not have a representation in Unicode Extensions (for example: date format pattern). 
+ +Legend: +- ✅ = OS + `host_info` support +- ⚠️ = OS supports, `host_info` doesn't +- ❌ = OS doesn't support + +| Feature | Android | iOS | Linux (1) | macOS | Windows | +|---------------------| :-----: | :-: | :------------------: | :---: | :-----: | +| Requested Locales | ✅ | ✅ | ✅ | ✅ | ✅ | +| Calendar | ⚠️ | ⚠️ | ⚠️ | ✅ | ✅ | +| Region | ⚠️ | ⚠️ | ⚠️ | ✅ | ✅ | +| Hour cycle | ⚠️ | ⚠️ | ✅ | ✅ | ⚠️ | +| Measurement System | ⚠️ | ⚠️ | ⚠️ | ✅ | ⚠️ | +| Measurement Override| ⚠️ | ⚠️ | ⚠️ | ✅ | ⚠️ | +| First Day of week | ⚠️ | ⚠️ | ⚠️ | ✅ | ✅ | +| Collation | ⚠️ | ⚠️ | ⚠️ | ✅ | ❌ | +| Date format | ⚠️ | ⚠️ | ⚠️ | ⚠️ | ⚠️ | +| Number format | ⚠️ | ⚠️ | ⚠️ | ⚠️ | ⚠️ | + +(1) In the case of Linux, different desktop environments (DEs) such as GNOME and KDE are supported together. + +## Integrating preferences into ICU4X formatters + +The library provides three ways of injecting retrieved values into formatters: + +### 1. Preference Bag + +For most common components, such as `DateTimeFormatter`, the library exposes +a direct getter that retrieves a `Preferences` struct for that component. +This getter is located behind a flag to allow for control over which dependencies are being +pulled. + +#### Example + +```rust +use icu_host_info::HostInfo; +use icu::datetime::{fieldsets, DateTimeFormatter}; + +// requires feature `datetime` +let prefs = HostInfo::datetime_preferences() + .expect("Failed to retrieve host info"); + +let dtf = DateTimeFormatter::try_new(prefs, fieldsets::YMD::long()) + .expect("Failed to create datetime formatter."); +``` + +### 2.
Locale + +For all components that `HostInfo` does not have a special preference getter for, +and for cases where the user prefers to avoid pulling extra dependencies at the cost +of narrowing down the retrieved values to just ones encoded in Unicode Extensions, +the library provides an ergonomic getter: + +#### Example + +```rust +use icu_host_info::HostInfo; +use icu::{ + datetime::{fieldsets, DateTimeFormatter}, + locale::Locale, +}; + +let mut locale = HostInfo::requested_locales() + .expect("Failed to retrieve locales") + .first() + .cloned() + .unwrap_or(Locale::UNKNOWN); + +locale.extensions.unicode = HostInfo::unicode_extensions() + .expect("Failed to retrieve host info"); + +let dtf = DateTimeFormatter::try_new(locale.into(), fieldsets::YMD::long()) + .expect("Failed to create datetime formatter."); +``` + +Notice that the regional preferences encoded in Unicode Extensions +are retrieved separately from the list of requested locales. +There are two reasons for this design: +- The user has to decide whether the regional preferences apply to all locales, or just the first one. +- The locale negotiation may result in a different locale being selected. + +### 3. Individual Preferences + +For each preference the library also attempts to provide a direct getter +allowing the user to retrieve just that preference and use it as they see fit. + +#### Example + +```rust +use icu_host_info::HostInfo; +use icu::locale::preferences::extensions::unicode::keywords::HourCycle; + +let mut calendar: Option<HourCycle> = HostInfo::hour_cycle() + .expect("Failed to retrieve hour_cycle preference"); +``` + +## Locale Negotiation + +Locale Negotiation is an upcoming feature in ICU4X which will enable the system integrating ICU4X to +perform a negotiation between requested locales, and locales for which the data is available in the system.
+The output of `HostInfo` will be utilized in that negotiation allowing the deployment to 1) select +the most appropriate locales for the given user and target modality, 2) apply regional preferences onto that +locale. + +The need to allow `HostInfo` to be pluggable into locale negotiation and multi-source merging (see next section) +guided many design choices in this library. This section will be extended once locale negotiation is implemented. + +## Multi Source Merging + +In simple systems the user will most often use ICU4X to format +some information in a selected locale, and use this library to augment +the formatting with regional preferences set by the user in the host environment. + +In more complex systems, the user may also want to introduce a second source of regional preferences +and mix the values set in the host environment with those set in the program itself. + +For example, a web browser may offer some regional preferences set in the browser +itself, or even set separately for some contexts of the browser. + +In those cases, the deployment requires merging of the preferences. +ICU4X exposes an `extend` method on both the `Preferences` and `Unicode` extensions structs. + +This allows the system to retrieve [`HostInfo`] Preferences or `Unicode`, and the application's +equivalent, and merge them.
+ +### `Preferences` Example + +```rust +use icu_host_info::HostInfo; +use icu::datetime::{fieldsets, DateTimeFormatter}; + +let app_prefs = app.datetime_preferences(); + +// requires feature `datetime` +let mut combined_prefs = HostInfo::datetime_preferences() + .expect("Failed to retrieve host info"); + +combined_prefs.extend(app_prefs); + +let dtf = DateTimeFormatter::try_new(combined_prefs, fieldsets::YMD::long()) + .expect("Failed to create datetime formatter."); +``` + +### `Unicode` Extensions Example + +```rust +use icu_host_info::HostInfo; +use icu::{ + datetime::{fieldsets, DateTimeFormatter}, + locale::locale, +}; + +let mut locale = locale!("fr-CA"); + +let app_ue = app.unicode_extensions(); + +let mut combined_ue = HostInfo::unicode_extensions() + .expect("Failed to retrieve host info"); + +combined_ue.extend(app_ue); + +locale.extensions.unicode = combined_ue; + +let dtf = DateTimeFormatter::try_new(locale.into(), fieldsets::YMD::long()) + .expect("Failed to create datetime formatter."); +``` + +## Design Decisions + +The library operates on the boundary between a diverse set of host +environments and the uniform ICU4X design derived from Unicode LDML. +It requires a number of design tradeoffs that had to be made in +order to achieve the uniformity and scale over time as the host +platforms' designs evolve. + +### Host Environment + +The library is designed to handle retrieval of data from the direct host +environment. This usually means an operating system, but it can mean a +virtual environment, sandbox or runtime. +In such a case it is the responsibility of the execution logic +setting up such an environment to ensure propagation of customer preferences. + +### Lossy Results + +The library makes a best effort to retrieve the values +that can be directly used in ICU4X. As the operating systems, +runtimes and ICU4X evolve, there's always a risk of a mismatch. +This library makes a design decision to be lossy-by-default.
+ +Any value that cannot be directly mapped onto a valid value is ignored +and indistinguishable in the ergonomic API from a missing value. + +Similarly, the API does not distinguish between missing binding logic and an unknown value. +The assumption is that users of this library are aiming to respect user choices +encoded in host environment regional preferences, but are not in a position +to act differently on a failed attempt to retrieve them than on a missing one. +Therefore errors in this library are very rare and only related to catastrophic +cases like memory corruption or OS API error propagation. + +### Normalized vs Raw values + +The main API of this library - [`HostInfo`] - provides methods that return normalized +values, often directly taken from `icu::locale_core::preferences`. +Per-host backends provide an additional trait implementation that returns +raw values, allowing the user to handle or introspect those values manually. +When using `HostInfo`, the library makes a best effort to normalize and parse +those raw values into canonical Unicode ICU4X representation, often discarding +unknown values and values that fail to parse. + +Those raw backends are not exposed in the documentation. + +#### Example + +```rust +use icu_host_info::backends::{ + RawHostInfoBackend, + macos::MacOSHostInfoBackend, +}; + +let raw_cal: Option<String> = MacOSHostInfoBackend::raw_calendar() + .expect("Failed to retrieve raw calendar"); +``` + +### Minimize defaults + +The library attempts to use host APIs in a way that allows distinguishing +preference values that represent defaults for a given locale from ones manually set +by the user. +In some cases, the host API does not allow making that distinction, which may result +in overly expressive locales such as `en-US-u-ca-gregory` (`gregory` already being the default calendar for en-US).
+ +This, like other aspects of the library, operates on a best-effort basis and may be further improved in future +releases as better bindings become available. + +#### Host API Design Guidance + +A note for host API designers - it is useful for foundational libraries such as this one when the host exposes APIs that enable us +to distinguish between regional preference values derived by the host from the defaults of a locale and cases +where the value is explicitly set by the user. +This distinction allows ICU4X to better serve locale negotiation scenarios where a locale other than the first may be used +and the deployment should respect whether the user set a given preference explicitly or left it to the per-locale default. + + + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/utils/host_info/examples/dt_format.rs b/utils/host_info/examples/dt_format.rs new file mode 100644 index 00000000000..80e664a6c47 --- /dev/null +++ b/utils/host_info/examples/dt_format.rs @@ -0,0 +1,24 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+ +use icu_calendar::Date; +use icu_datetime::{fieldsets, input::Time, DateTimeFormatter}; +use icu_host_info::HostInfo; +use icu_time::DateTime; + +fn main() { + let prefs = HostInfo::datetime_preferences().expect("Failed to retrieve host info"); + let dtf = DateTimeFormatter::try_new( + prefs, + fieldsets::YMDT::long().with_alignment(icu_datetime::options::Alignment::Column), + ) + .expect("Failed to create datetime formatter."); + + let date = Date::try_new_gregorian(2020, 10, 10).unwrap(); + let time = Time::try_new(18, 56, 0, 0).unwrap(); + + let formatted_dt = dtf.format(&DateTime { date, time }); + + println!("Today is: {formatted_dt}"); +} diff --git a/utils/host_info/examples/dt_format_locale.rs b/utils/host_info/examples/dt_format_locale.rs new file mode 100644 index 00000000000..9eeea15b1f4 --- /dev/null +++ b/utils/host_info/examples/dt_format_locale.rs @@ -0,0 +1,33 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use icu_calendar::Date; +use icu_datetime::{fieldsets, input::Time, DateTimeFormatter}; +use icu_host_info::HostInfo; +use icu_locale_core::Locale; +use icu_time::DateTime; + +fn main() { + let mut locale = HostInfo::requested_locales() + .unwrap() + .first() + .cloned() + .unwrap_or(Locale::UNKNOWN); + + locale.extensions.unicode = + HostInfo::unicode_extensions().expect("Failed to retrieve host info"); + + let dtf = DateTimeFormatter::try_new( + locale.into(), + fieldsets::YMDT::long().with_alignment(icu_datetime::options::Alignment::Column), + ) + .expect("Failed to create datetime formatter."); + + let date = Date::try_new_gregorian(2025, 10, 10).unwrap(); + let time = Time::try_new(18, 56, 0, 0).unwrap(); + + let formatted_dt = dtf.format(&DateTime { date, time }); + + println!("Today is: {formatted_dt}"); +} diff --git a/utils/host_info/examples/get_data.rs b/utils/host_info/examples/get_data.rs new file mode 100644 index 00000000000..6a47f3e3885 --- /dev/null +++ b/utils/host_info/examples/get_data.rs @@ -0,0 +1,31 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use icu_host_info::HostInfo; + +fn main() { + println!("resolved backend: {:?}", HostInfo::resolved_backend()); + println!("-----"); + println!("requested locales: {:?}", HostInfo::requested_locales()); + println!("calendar: {:?}", HostInfo::calendar()); + println!("region: {:?}", HostInfo::region()); + println!("hour_cycle: {:?}", HostInfo::hour_cycle()); + println!("measurement_system: {:?}", HostInfo::measurement_system()); + println!( + "measurement_unit_override: {:?}", + HostInfo::measurement_unit_override() + ); + println!("first_day: {:?}", HostInfo::first_day_of_week()); + println!("collation: {:?}", HostInfo::collation()); + println!("-----"); + println!( + "unicode_extensions: {:?}", + HostInfo::unicode_extensions().unwrap().to_string() + ); + #[cfg(feature = "datetime")] + println!( + "datetimeformatter_preferences: {:#?}", + HostInfo::datetime_preferences() + ); +} diff --git a/utils/host_info/src/backends/android.rs b/utils/host_info/src/backends/android.rs new file mode 100644 index 00000000000..2b21df3a3ed --- /dev/null +++ b/utils/host_info/src/backends/android.rs @@ -0,0 +1,33 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::{ + backends::{HostInfoBackend, RawHostInfoBackend}, + error::HostInfoError, + posix::{raw_locale_categories, LocaleCategory}, +}; + +pub struct AndroidHostInfoBackend; + +impl HostInfoBackend for AndroidHostInfoBackend {} + +impl RawHostInfoBackend for AndroidHostInfoBackend { + fn raw_requested_locales() -> Result, HostInfoError> { + let mut categories = raw_locale_categories()?; + let mut locales = Vec::with_capacity(categories.len()); + + // Add LC_ALL if it exists + if let Some(primary_locale) = categories.remove(&LocaleCategory::All) { + locales.push(primary_locale); + } + + // Add any remaining locales that were explicitly set + for s in categories.into_values() { + if !locales.contains(&s) { + locales.push(s); + } + } + Ok(locales) + } +} diff --git a/utils/host_info/src/backends/linux.rs b/utils/host_info/src/backends/linux.rs new file mode 100644 index 00000000000..c6be3ffafc5 --- /dev/null +++ b/utils/host_info/src/backends/linux.rs @@ -0,0 +1,101 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::{ + backends::{HostInfoBackend, RawHostInfoBackend}, + error::HostInfoError, + locale::PosixLocale, +}; +use icu_locale_core::{preferences::extensions::unicode::keywords::HourCycle, Locale}; + +pub struct LinuxHostInfoBackend; + +impl HostInfoBackend for LinuxHostInfoBackend { + #[cfg(feature = "datetime")] + fn datetime_preferences() -> Result { + use crate::posix::{raw_locale_categories, LocaleCategory}; + + let mut categories = raw_locale_categories()?; + + let mut locale = Locale::UNKNOWN; + if let Some(lc_time) = categories.remove(&LocaleCategory::Time) { + if let Ok(loc) = PosixLocale::try_from_str(&lc_time) { + if let Ok(loc) = Locale::try_from(loc) { + locale = loc; + } + } + } else { + if let Some(lc_all) = categories.remove(&LocaleCategory::All) { + if let Ok(loc) = PosixLocale::try_from_str(&lc_all) { + if let Ok(loc) = Locale::try_from(loc) { + locale = loc; + } + } + } + } + + let mut result = icu_datetime::DateTimeFormatterPreferences::from(locale); + result.numbering_system = None; + result.hour_cycle = Self::hour_cycle()?; + result.calendar_algorithm = Self::calendar()?; + Ok(result) + } + + fn requested_locales() -> Result, HostInfoError> { + Ok(Self::raw_requested_locales()? 
+ .into_iter() + .filter_map(|s| { + PosixLocale::try_from_str(&s) + .ok() + .and_then(|posix_locale| Locale::try_from(posix_locale).ok()) + }) + .collect()) + } + + fn hour_cycle() -> Result, HostInfoError> { + #[cfg(feature = "gnome")] + if let Some(hc) = gnome_clock_format_hc() { + return Ok(Some(hc)); + } + Ok(None) + } +} + +impl RawHostInfoBackend for LinuxHostInfoBackend { + fn raw_requested_locales() -> Result, HostInfoError> { + // 1) LANGUAGE: colon-separated, ordered + if let Ok(s) = std::env::var("LANGUAGE") { + let v: Vec = s + .split(':') + .filter(|x| !x.is_empty()) + .map(|s| s.to_string()) + .collect(); + if !v.is_empty() { + return Ok(v); + } + } + + // 2) Fallbacks: LC_MESSAGES > LC_ALL > LANG + for k in ["LC_MESSAGES", "LC_ALL", "LANG"] { + if let Ok(s) = std::env::var(k) { + if !s.is_empty() { + return Ok(vec![s]); + } + } + } + + Ok(vec![]) + } +} + +#[cfg(feature = "gnome")] +fn gnome_clock_format_hc() -> Option { + use gio::prelude::*; + let s = gio::Settings::new("org.gnome.desktop.interface"); + match s.string("clock-format").as_str() { + "12h" => Some(HourCycle::H12), + "24h" => Some(HourCycle::H23), + _ => None, + } +} diff --git a/utils/host_info/src/backends/macos.rs b/utils/host_info/src/backends/macos.rs new file mode 100644 index 00000000000..a30905abd37 --- /dev/null +++ b/utils/host_info/src/backends/macos.rs @@ -0,0 +1,519 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use core_foundation::{ + array::CFArray, + base::{kCFAllocatorDefault, CFGetTypeID, CFTypeRef, TCFType}, + dictionary::{ + CFDictionaryGetCount, CFDictionaryGetKeysAndValues, CFDictionaryGetTypeID, CFDictionaryRef, + }, + number::{kCFNumberSInt32Type, CFNumberGetTypeID, CFNumberGetValue}, + string::{CFString, CFStringGetMaximumSizeForEncoding}, +}; +use core_foundation_sys::{ + base::CFRelease, + calendar::{CFCalendarCopyCurrent, CFCalendarGetIdentifier}, + date_formatter::CFDateFormatterCreateDateFormatFromTemplate, + locale::{ + kCFLocaleCountryCode, kCFLocaleMeasurementSystem, CFLocaleCopyCurrent, + CFLocaleCopyPreferredLanguages, CFLocaleGetValue, CFLocaleRef, + }, + preferences::{kCFPreferencesAnyHost, kCFPreferencesCurrentUser, CFPreferencesCopyValue}, + string::{kCFStringEncodingUTF8, CFStringGetCString, CFStringGetCStringPtr, CFStringRef}, +}; +use icu_locale_core::{ + extensions::unicode, + preferences::extensions::unicode::keywords::{ + CalendarAlgorithm, CollationType, HourCycle, MeasurementSystem, + }, + subtags::Language, +}; +use std::os::raw::c_char; +use std::{ffi::CStr, str::FromStr}; + +use crate::{ + backends::{HostInfoBackend, RawHostInfoBackend}, + error::HostInfoError, +}; + +pub struct MacOSHostInfoBackend; + +impl HostInfoBackend for MacOSHostInfoBackend { + fn calendar() -> Result, HostInfoError> { + Ok(Self::raw_calendar()? + .and_then(|raw| { + let canonical = match raw.as_str() { + "gregorian" => "gregory", + r => r, + }; + unicode::Value::from_str(canonical).ok() + }) + .and_then(|value| CalendarAlgorithm::try_from(&value).ok())) + } + + fn hour_cycle() -> Result, HostInfoError> { + with_current_locale(|locale| { + let template = CFString::new("j"); // hour-cycle probe + + // SAFETY: All parameters are valid - kCFAllocatorDefault is a system constant, + // template is a valid CFStringRef, locale is non-null, and 0 is a valid options value. 
+ let format = unsafe { + CFDateFormatterCreateDateFormatFromTemplate( + kCFAllocatorDefault, + template.as_concrete_TypeRef(), + 0, + locale, + ) + }; + + if format.is_null() { + return None; + } + + // SAFETY: format is non-null and owned by us, so we use wrap_under_create_rule. + // This properly handles ownership and will release the format when dropped. + let format_string = unsafe { CFString::wrap_under_create_rule(format) }; + + // Detect hour cycle from the first character of the format pattern + match format_string.to_string().chars().next() { + Some('K') => Some(HourCycle::H11), + Some('h') => Some(HourCycle::H12), + Some('H') => Some(HourCycle::H23), + _ => None, + } + }) + } + + fn measurement_system() -> Result, HostInfoError> { + Ok( + Self::raw_measurement_system()?.and_then(|raw| match raw.as_str() { + "U.S." => Some(MeasurementSystem::USSystem), + "U.K." => Some(MeasurementSystem::UKSystem), + "Metric" => Some(MeasurementSystem::Metric), + _ => None, + }), + ) + } + + fn collation() -> Result, HostInfoError> { + Ok(Self::raw_collation()?.and_then(|(lang, col)| { + if let Ok(val) = unicode::Value::from_str(&col) { + if let Ok(col) = CollationType::try_from(&val) { + let lang = Language::try_from_str(lang.as_str()).unwrap_or(Language::UNKNOWN); + Some((lang, col)) + } else { + None + } + } else { + None + } + })) + } +} + +impl RawHostInfoBackend for MacOSHostInfoBackend { + /// Retrieves system locales for Apple operating systems, in the order preferred by the + /// user, using [`CFLocaleCopyPreferredLanguages`](https://developer.apple.com/documentation/corefoundation/1542887-cflocalecopypreferredlanguages). + fn raw_requested_locales() -> Result, HostInfoError> { + // SAFETY: CFLocaleCopyPreferredLanguages returns an owned CFArrayRef that we must release. + // The function is documented to return NULL only in exceptional circumstances. 
+ let arr_ref = unsafe { CFLocaleCopyPreferredLanguages() }; + if arr_ref.is_null() { + return Ok(vec![]); + } + let arr = unsafe { CFArray::::wrap_under_create_rule(arr_ref as _) }; + + // Use iterator combinators for more idiomatic Rust + let out = (0..arr.len()) + .filter_map(|i| arr.get(i)) + .map(|v| v.to_string()) + .collect(); + Ok(out) + } + + /// Gets the current system calendar identifier. + fn raw_calendar() -> Result, HostInfoError> { + /// RAII wrapper for CFCalendarRef + struct CFCalendarWrapper(core_foundation_sys::calendar::CFCalendarRef); + + impl CFCalendarWrapper { + fn new() -> Option { + // SAFETY: CFCalendarCopyCurrent returns an owned CFCalendarRef that we must release. + let cal = unsafe { CFCalendarCopyCurrent() }; + if cal.is_null() { + None + } else { + Some(CFCalendarWrapper(cal)) + } + } + + fn get_identifier(&self) -> Option { + // SAFETY: self.0 is non-null. CFCalendarGetIdentifier expects a CFCalendarRef + // cast to the appropriate type, and returns a borrowed CFStringRef that doesn't + // need to be released. + let identifier = unsafe { CFCalendarGetIdentifier(self.0 as _) }; + cfstring_to_string(identifier as CFStringRef) + } + } + + impl Drop for CFCalendarWrapper { + fn drop(&mut self) { + // SAFETY: We own the calendar reference and must release it. + unsafe { CFRelease(self.0 as _) }; + } + } + + let calendar = CFCalendarWrapper::new(); + Ok(calendar.and_then(|cal| cal.get_identifier())) + } + + fn raw_region() -> Result, HostInfoError> { + with_current_locale(|locale| { + // SAFETY: locale is non-null and kCFLocaleCountryCode is a valid key. + // CFLocaleGetValue returns a borrowed reference. + let value = unsafe { CFLocaleGetValue(locale, kCFLocaleCountryCode) }; + + if value.is_null() { + return None; + } + + // SAFETY: We use wrap_under_get_rule because the value is borrowed, not owned. 
+ let cf_string = unsafe { CFString::wrap_under_get_rule(value as CFStringRef) }; + Some(cf_string.to_string()) + }) + } + + fn raw_measurement_system() -> Result, HostInfoError> { + with_current_locale(|locale| { + // SAFETY: locale is non-null and kCFLocaleMeasurementSystem is a valid key. + // CFLocaleGetValue returns a borrowed reference. + let value = unsafe { CFLocaleGetValue(locale, kCFLocaleMeasurementSystem) }; + + if value.is_null() { + return None; + } + + // SAFETY: We use wrap_under_get_rule because the value is borrowed, not owned. + let cf_string = unsafe { CFString::wrap_under_get_rule(value as CFStringRef) }; + Some(cf_string.to_string()) + }) + } + + /// Reads the `AppleTemperatureUnit` value from the current user's `.GlobalPreferences` domain. + /// + /// Returns `Ok(None)` when the key is unset or its value is not a string. + fn raw_measurement_unit_override() -> Result, HostInfoError> { + // SAFETY: `key` and `domain` are valid CFStrings that outlive the call. + // CFPreferencesCopyValue follows the Create Rule, so a non-null result is owned + // by us and is released exactly once on every path below. + unsafe { + let key = CFString::new("AppleTemperatureUnit"); + let domain = CFString::new(".GlobalPreferences"); + let v = CFPreferencesCopyValue( + key.as_concrete_TypeRef(), + domain.as_concrete_TypeRef(), + kCFPreferencesCurrentUser, + kCFPreferencesAnyHost, + ); + if v.is_null() { + return Ok(None); + } + // A preference value can be any property-list type; only a CFString may be + // reinterpreted as one. Release anything else and report no preference. + if CFGetTypeID(v) != CFString::type_id() { + CFRelease(v); + return Ok(None); + } + // wrap_under_create_rule adopts our ownership (no extra retain), so the value + // is released when the temporary wrapper is dropped. + let s = core_foundation::string::CFString::wrap_under_create_rule(v as CFStringRef) + .to_string(); + Ok(Some(s)) + } + } + + /// Reads the `AppleFirstWeekday` dictionary from the current user's `.GlobalPreferences` + /// domain and maps its first numeric value (1 = "sun" … 7 = "sat") to a weekday name. + /// + /// Returns `Ok(None)` when the key is unset, is not a dictionary, is empty, or holds a + /// value outside 1..=7. + fn raw_first_day_of_week() -> Result, HostInfoError> { + // SAFETY: `key` and `domain` are valid CFStrings that outlive the call. + // CFPreferencesCopyValue follows the Create Rule, so a non-null `val` is owned by us + // and is released exactly once on every path below. + unsafe { + let key = CFString::new("AppleFirstWeekday"); + let domain = CFString::new(".GlobalPreferences"); + let val: CFTypeRef = CFPreferencesCopyValue( + key.as_concrete_TypeRef(), + domain.as_concrete_TypeRef(), + kCFPreferencesCurrentUser, + kCFPreferencesAnyHost, + ); + if val.is_null() { + return Ok(None); + } + + if CFGetTypeID(val) != CFDictionaryGetTypeID() { + // Not a dictionary: still our reference, so release it before bailing out. + CFRelease(val); + return Ok(None); + } + + // take the first value in the dictionary + let dict = val as CFDictionaryRef; + let count = CFDictionaryGetCount(dict); + if count == 0 { + CFRelease(val); + return Ok(None); + } + let mut keys: Vec = vec![std::ptr::null_mut(); count as usize]; + let mut vals: Vec = vec![std::ptr::null_mut(); count as usize]; + CFDictionaryGetKeysAndValues(dict, keys.as_mut_ptr()
as _, vals.as_mut_ptr() as _); + + unsafe fn cfnum_i32(n: CFTypeRef) -> Option { + if CFGetTypeID(n) != CFNumberGetTypeID() { + return None; + } + let mut out = 0i32; + if CFNumberGetValue(n as _, kCFNumberSInt32Type, &mut out as *mut _ as _) { + Some(out) + } else { + None + } + } + // Read the number while the dictionary is still alive: `vals` only borrows from it. + let weekday = cfnum_i32(vals[0]); + // Balance CFPreferencesCopyValue now that no borrowed value is needed any more. + CFRelease(val); + Ok(weekday.and_then(|n| match n { + 1 => Some("sun".to_string()), + 2 => Some("mon".to_string()), + 3 => Some("tue".to_string()), + 4 => Some("wed".to_string()), + 5 => Some("thu".to_string()), + 6 => Some("fri".to_string()), + 7 => Some("sat".to_string()), + _ => None, + })) + } + } + + fn raw_collation() -> Result, HostInfoError> { + /// Parse macOS "AppleCollationOrder" style locale strings into (language, collation). + /// Accepts: + /// - "zh@collation=stroke" + /// - "zh-Hant@collation=zhuyin;foo=bar" + /// - "zh-u-co-pinyin" + /// + /// Returns None if no collation is present or language is invalid. + pub fn parse_mac_collation_locale(input: &str) -> Option<(String, String)> { + if input.is_empty() { + return None; + } + + // 1) Split off any "@..." suffix first (Apple legacy syntax uses @collation=...) 
+ let (before_at, after_at_opt) = match input.split_once('@') { + Some((head, tail)) => (head, Some(tail)), + None => (input, None), + }; + + // 2) Extract language subtag from the head (before '@'): first token before '-' or '_' + let lang = before_at + .split(['-', '_']) + .next() + .map(|s| s.to_ascii_lowercase()) + .filter(|s| !s.is_empty() && s.chars().all(|c| c.is_ascii_alphabetic())) + .filter(|s| (2..=8).contains(&s.len()))?; // permit 2–8 alpha per BCP47 + + // Helper to validate and normalize collation value + fn norm_co(s: &str) -> Option { + let t = s.to_ascii_lowercase(); + if t.is_empty() { + return None; + } + // Allow a–z and hyphen (e.g., "radical-stroke") + if t.chars().all(|c| c.is_ascii_lowercase() || c == '-') { + Some(t) + } else { + None + } + } + + // 3a) Try legacy "@collation=" form + if let Some(after_at) = after_at_opt { + if let Some(rest) = after_at.strip_prefix("collation=") { + let co = rest + .split([';', '@']) // stop at next key or stray '@' + .next() + .and_then(norm_co); + if let Some(co) = co { + return Some((lang, co)); + } + } + // If "@..." 
present but not collation, fall through to try -u- parsing below + } + + // 3b) Try BCP47 U-extension with "co" + // Look for "-u-" then scan for "-co-" + let lower = input.to_ascii_lowercase(); + if let Some(u_pos) = lower.find("-u-") { + let tail = &lower[u_pos + 3..]; // after "-u-" + let mut it = tail.split('-'); + while let Some(k) = it.next() { + if k == "co" { + if let Some(v) = it.next() { + if let Some(co) = norm_co(v) { + return Some((lang, co)); + } + } + break; + } + // Skip possible multi-part values for other keys; for co it's single + } + } + + None + } + + // SAFETY: `key` and `domain` are valid CFStrings that outlive the call. + // CFPreferencesCopyValue follows the Create Rule, so a non-null `val` is owned by us + // and is released exactly once on every path below. + unsafe { + let key = CFString::new("AppleCollationOrder"); + let domain = CFString::new(".GlobalPreferences"); + let val = CFPreferencesCopyValue( + key.as_concrete_TypeRef(), + domain.as_concrete_TypeRef(), + kCFPreferencesCurrentUser, + kCFPreferencesAnyHost, + ); + if !val.is_null() { + // A preference value can be any property-list type; only a CFString may be + // reinterpreted as one. Release anything else and report no preference. + if CFGetTypeID(val) != CFString::type_id() { + CFRelease(val); + return Ok(None); + } + // wrap_under_create_rule adopts our ownership (no extra retain), so the value + // is released when `cf` is dropped, including on the early return below. + let cf = CFString::wrap_under_create_rule(val as CFStringRef); + let s = cf.to_string(); + if let Some((lang, co)) = parse_mac_collation_locale(&s) { + return Ok(Some((lang, co))); + } + // Some locales like "pl" carry no @collation; ignore and fall through + } + } + + Ok(None) + } +} + +/// RAII wrapper for CFLocaleRef to ensure proper cleanup +struct CFLocaleWrapper(CFLocaleRef); + +impl CFLocaleWrapper { + fn new() -> Option { + // SAFETY: CFLocaleCopyCurrent returns an owned CFLocaleRef that we must release. + let locale = unsafe { CFLocaleCopyCurrent() }; + if locale.is_null() { + None + } else { + Some(CFLocaleWrapper(locale)) + } + } + + fn as_ref(&self) -> CFLocaleRef { + self.0 + } +} + +impl Drop for CFLocaleWrapper { + fn drop(&mut self) { + // SAFETY: We own the locale reference and must release it. + unsafe { CFRelease(self.0 as _) }; + } +} + +/// Helper function to reduce duplication when working with current locale. +/// Handles the common pattern of getting current locale, using it, and releasing it. 
+fn with_current_locale(f: F) -> Result, HostInfoError> +where + F: FnOnce(CFLocaleRef) -> Option, +{ + let locale = CFLocaleWrapper::new(); + Ok(locale.and_then(|loc| f(loc.as_ref()))) +} + +/// Converts a CFStringRef to a Rust String. +/// Returns None if the CFStringRef is null or conversion fails. +fn cfstring_to_string(cf_string: CFStringRef) -> Option { + if cf_string.is_null() { + return None; + } + + // SAFETY: cf_string is non-null as verified above. + unsafe { + // Try the fast path first - get direct pointer to UTF-8 data + let direct_ptr = CFStringGetCStringPtr(cf_string, kCFStringEncodingUTF8); + if !direct_ptr.is_null() { + // SAFETY: CFStringGetCStringPtr returned non-null, so it points to valid UTF-8 data. + return CStr::from_ptr(direct_ptr as *const c_char) + .to_str() + .ok() + .map(str::to_owned); + } + + // Fall back to copying the string data + let length = core_foundation_sys::string::CFStringGetLength(cf_string); + let max_size = CFStringGetMaximumSizeForEncoding(length, kCFStringEncodingUTF8) + 1; + + // Use stack buffer for small strings to avoid heap allocation + const STACK_BUFFER_SIZE: usize = 256; + let mut stack_buffer = [0u8; STACK_BUFFER_SIZE]; + + if max_size <= STACK_BUFFER_SIZE as isize { + // SAFETY: stack_buffer has sufficient size, cf_string is non-null, + // and kCFStringEncodingUTF8 is a valid encoding. + let success = CFStringGetCString( + cf_string, + stack_buffer.as_mut_ptr() as *mut i8, + STACK_BUFFER_SIZE as isize, + kCFStringEncodingUTF8, + ); + + if success != 0 { + // SAFETY: CFStringGetCString succeeded, so buffer contains valid UTF-8 C string. 
+ return CStr::from_ptr(stack_buffer.as_ptr() as *const c_char) + .to_str() + .ok() + .map(str::to_owned); + } + } else { + // Use heap allocation for larger strings + let mut heap_buffer = vec![0u8; max_size as usize]; + + // SAFETY: heap_buffer has the required size as calculated by CFStringGetMaximumSizeForEncoding, + // cf_string is non-null, and kCFStringEncodingUTF8 is a valid encoding. + let success = CFStringGetCString( + cf_string, + heap_buffer.as_mut_ptr() as *mut i8, + max_size, + kCFStringEncodingUTF8, + ); + + if success != 0 { + // SAFETY: CFStringGetCString succeeded, so buffer contains valid UTF-8 C string. + return CStr::from_ptr(heap_buffer.as_ptr() as *const c_char) + .to_str() + .ok() + .map(str::to_owned); + } + } + + None + } +} + +#[cfg(test)] +mod tests { + use crate::backends::{macos::MacOSHostInfoBackend, RawHostInfoBackend}; + use icu_locale_core::Locale; + + #[test] + fn test_get_raw_locales() { + let locales_res = MacOSHostInfoBackend::raw_requested_locales(); + match locales_res { + Ok(locales) => { + for locale in locales { + assert!(!locale.is_empty(), "Empty locale retrieved"); + assert!(locale.is_ascii(), "Invalid form of locale retrieved"); + } + } + Err(e) => { + panic!("{e:?}") + } + } + } + + #[test] + fn test_converting_locales() { + let locales = MacOSHostInfoBackend::raw_requested_locales().unwrap(); + for locale in locales { + let _loc: Locale = locale.parse().unwrap(); + } + } + + #[test] + fn test_calendar() { + let calendar = MacOSHostInfoBackend::raw_calendar().unwrap(); + assert!(calendar.is_some(), "Couldn't retrieve calendar"); + assert!( + calendar.unwrap().is_ascii(), + "Calendar identifier form is not valid" + ); + } +} diff --git a/utils/host_info/src/backends/mod.rs b/utils/host_info/src/backends/mod.rs new file mode 100644 index 00000000000..a11d5fc4b79 --- /dev/null +++ b/utils/host_info/src/backends/mod.rs @@ -0,0 +1,280 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Per-host implementations for `HostInfo`. +//! +//! This module contains traits implemented for per-host backends. +//! +//! When compiling for any given host architecture, the developer +//! may access the per-backend implementation via `icu_host_info::backends::{arch}::{Arch}HostInfoBackend`. +//! +//! # RawHostInfoBackend +//! +//! This trait provides low level implementation of per-host bindings to retrieve regional preferences in their +//! original form. +//! +//! # HostInfoBackend +//! +//! This trait provides high level implementation of per-host bindings to convert raw values into their ICU4X +//! types. +use std::str::FromStr; + +use icu_locale_core::{ + extensions::unicode::{self, key, Unicode, Value}, + preferences::extensions::unicode::keywords::{ + CalendarAlgorithm, CollationType, FirstDay, HourCycle, MeasurementSystem, + MeasurementUnitOverride, + }, + subtags::{Language, Region}, + Locale, +}; + +use crate::error::HostInfoError; + +#[cfg(target_os = "android")] +#[doc(hidden)] +pub mod android; + +#[cfg(target_os = "ios")] +#[doc(hidden)] +pub mod macos; + +#[cfg(target_os = "linux")] +#[doc(hidden)] +pub mod linux; + +#[cfg(target_os = "macos")] +#[doc(hidden)] +pub mod macos; + +#[cfg(target_os = "windows")] +#[doc(hidden)] +pub mod windows; + +#[cfg(not(any( + target_os = "android", + target_os = "ios", + target_os = "linux", + target_os = "macos", + target_os = "windows" +)))] +#[doc(hidden)] +mod unavailable; + +/// High level implementation of per-host bindings to convert raw values into their ICU4X types. +pub trait HostInfoBackend: RawHostInfoBackend { + /// The implementation should attempt to collect all relevant regional preferences available in the given + /// host environment into a unicode extensions bag. 
+ fn unicode_extensions() -> Result { + let mut result = Unicode::new(); + if let Some(calendar) = Self::calendar()? { + result.keywords.set(key!("ca"), calendar.into()); + } + if let Some(hc) = Self::hour_cycle()? { + result.keywords.set(key!("hc"), hc.into()); + } + if let Some(ms) = Self::measurement_system()? { + result.keywords.set(key!("ms"), ms.into()); + } + if let Some(mu) = Self::measurement_unit_override()? { + result.keywords.set(key!("mu"), mu.into()); + } + if let Some(fw) = Self::first_day_of_week()? { + result.keywords.set(key!("fw"), fw.into()); + } + if let Some((_lang, co)) = Self::collation()? { + result.keywords.set(key!("co"), co.into()); + } + if let Some(rg) = Self::region()? { + let mut rg_str = rg.to_string(); + rg_str.push_str("zzzz"); + if let Ok(value) = Value::try_from_str(&rg_str) { + result.keywords.set(key!("rg"), value); + } + } + Ok(result) + } + + /// The implementation should attempt to retrieve date/time related regional preferences and collect + /// them into `DateTimeFormatterPreferences` bag. + #[cfg(feature = "datetime")] + fn datetime_preferences() -> Result { + use icu_locale_core::Locale; + + let requested_locales = Self::requested_locales()?; + let requested_locale = requested_locales + .first() + .cloned() + .unwrap_or(Locale::UNKNOWN); + let mut result = icu_datetime::DateTimeFormatterPreferences::from(requested_locale); + result.numbering_system = None; + result.hour_cycle = Self::hour_cycle()?; + result.calendar_algorithm = Self::calendar()?; + Ok(result) + } + + /// The implementation should attempt to retrieve requested locales set by the user in the host system. + fn requested_locales() -> Result, HostInfoError> { + Ok(Self::raw_requested_locales()? + .into_iter() + .filter_map(|s| Locale::try_from_str(&s).ok()) + .collect()) + } + + /// The implementation should attempt to retrieve calendar set by the user in the host system. + fn calendar() -> Result, HostInfoError> { + Ok(Self::raw_calendar()? 
+ .and_then(|raw| unicode::Value::from_str(&raw).ok()) + .and_then(|value| CalendarAlgorithm::try_from(&value).ok())) + } + + /// The implementation should attempt to retrieve region set by the user in the host system. + fn region() -> Result, HostInfoError> { + Ok(Self::raw_region()?.and_then(|raw| Region::try_from_str(&raw).ok())) + } + + /// The implementation should attempt to retrieve hour_cycle set by the user in the host system. + fn hour_cycle() -> Result, HostInfoError> { + Ok(Self::raw_hour_cycle()? + .and_then(|raw| unicode::Value::from_str(&raw).ok()) + .and_then(|value| HourCycle::try_from(&value).ok())) + } + + /// The implementation should attempt to retrieve measurement system set by the user in the host system. + fn measurement_system() -> Result, HostInfoError> { + Ok(Self::raw_measurement_system()? + .and_then(|raw| unicode::Value::from_str(&raw).ok()) + .and_then(|value| MeasurementSystem::try_from(&value).ok())) + } + + /// The implementation should attempt to retrieve measurement unit override set by the user in the host system. + fn measurement_unit_override() -> Result, HostInfoError> { + Ok(Self::raw_measurement_unit_override()? + .and_then(|raw| unicode::Value::from_str(&raw).ok()) + .and_then(|value| MeasurementUnitOverride::try_from(&value).ok())) + } + + /// The implementation should attempt to retrieve first day of week set by the user in the host system. + fn first_day_of_week() -> Result, HostInfoError> { + Ok(Self::raw_first_day_of_week()? + .and_then(|raw| unicode::Value::from_str(&raw).ok()) + .and_then(|value| FirstDay::try_from(&value).ok())) + } + + /// The implementation should attempt to retrieve collation set by the user in the host system. 
+ fn collation() -> Result, HostInfoError> { + Ok(Self::raw_collation()?.and_then(|(raw_lang, raw_col)| { + unicode::Value::from_str(&raw_col) + .ok() + .and_then(|col| CollationType::try_from(&col).ok()) + .and_then(|col| Language::from_str(&raw_lang).ok().map(|lang| (lang, col))) + })) + } +} + +/// Low level implementation of per-host bindings to retrieve regional preferences in their original form. +/// +/// As per library design, the implementations should attempt to return `None` in scenarios where user +/// did not explicitly set a value for any of the preferences. +/// For example, if the user set `en-US` as their preferred locale, and did not manually set `HourCycle` +/// to any value, the host API may return hour cycle default value for en-US. +/// If possible, the implementation should attempt to distinguish between an explicitly set value that matches +/// default for a given locale, from lack of explicit value set. +/// +/// If that is not possible, the API should return the value retrieved from the system for each field getter. +/// +/// The goal is to avoid constructing an `en-US-u-hc-h12` locale in a scenario where the user set their locale to `en-US` +/// but did not explicitly define hour cycle preference, and the `h12` value is just a default for `en-US`. +/// This becomes impactful when locale negotiation results in the system picking one of the fallback locales, and +/// needs to determine if it should follow its regional preferences, or take some from the host system. +/// For example, if the user set `["en-US", "de-DE"]` as their requested locales, and the host API returns `h12` for +/// the hour cycle getter, it may be problematic to not know if this is explicit preference of the user, or default +/// for `en-US`. As a result, it may become challenging to decide if `h12` should be used even if `de-DE` is being negotiated +/// as the locale for the given application. 
+pub trait RawHostInfoBackend { + /// Attempt to retrieve a list of locales set in the host regional preferences as requested by the user. + /// + /// The list is ordered and should contain locales explicitly requested by the user, with an empty + /// list being a valid response in case no locale has been set by the user, or the backend cannot retrieve any. + fn raw_requested_locales() -> Result, HostInfoError> { + Ok(vec![]) + } + + /// Attempt to retrieve calendar system set in the host regional preferences by the user. + fn raw_calendar() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve region set in the host regional preferences by the user. + fn raw_region() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve hour cycle set in the host regional preferences by the user. + fn raw_hour_cycle() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve measurement system set in the host regional preferences by the user. + fn raw_measurement_system() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve measurement unit override set in the host regional preferences by the user. + /// + /// This should retrieve `temperature` unit. + fn raw_measurement_unit_override() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve first day of week option set in the host regional preferences by the user. + fn raw_first_day_of_week() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve collation set in the host regional preferences by the user. + fn raw_collation() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve customized date format set in the host regional preferences by the user. + fn raw_date_format() -> Result, HostInfoError> { + Ok(None) + } + + /// Attempt to retrieve customized number format set in the host regional preferences by the user. 
+ fn raw_number_format() -> Result, HostInfoError> { + Ok(None) + } +} + +#[cfg(target_os = "android")] +#[doc(hidden)] +pub(crate) type Impl = android::AndroidHostInfoBackend; + +#[cfg(target_os = "ios")] +#[doc(hidden)] +pub(crate) type Impl = macos::MacOSHostInfoBackend; + +#[cfg(target_os = "linux")] +#[doc(hidden)] +pub(crate) type Impl = linux::LinuxHostInfoBackend; + +#[cfg(target_os = "macos")] +#[doc(hidden)] +pub(crate) type Impl = macos::MacOSHostInfoBackend; + +#[cfg(target_os = "windows")] +#[doc(hidden)] +pub(crate) type Impl = windows::WindowsHostInfoBackend; + +#[cfg(not(any( + target_os = "android", + target_os = "ios", + target_os = "linux", + target_os = "macos", + target_os = "windows" +)))] +#[doc(hidden)] +pub(crate) type Impl = unavailable::UnavailableHostInfoBackend; diff --git a/utils/host_info/src/backends/unavailable.rs b/utils/host_info/src/backends/unavailable.rs new file mode 100644 index 00000000000..c7ee94ec84f --- /dev/null +++ b/utils/host_info/src/backends/unavailable.rs @@ -0,0 +1,14 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::{ + backends::{HostInfoBackend, RawHostInfoBackend}, + error::HostInfoError, +}; + +pub struct UnavailableHostInfoBackend; + +impl HostInfoBackend for UnavailableHostInfoBackend {} + +impl RawHostInfoBackend for UnavailableHostInfoBackend {} diff --git a/utils/host_info/src/backends/windows.rs b/utils/host_info/src/backends/windows.rs new file mode 100644 index 00000000000..cfef10fc488 --- /dev/null +++ b/utils/host_info/src/backends/windows.rs @@ -0,0 +1,182 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::{ + backends::{HostInfoBackend, RawHostInfoBackend}, + error::HostInfoError, + locale::WindowsLocale, +}; +use icu_locale_core::{ + extensions::unicode, preferences::extensions::unicode::keywords::CalendarAlgorithm, Locale, +}; +use std::str::FromStr; + +pub struct WindowsHostInfoBackend; + +impl HostInfoBackend for WindowsHostInfoBackend { + fn requested_locales() -> Result, HostInfoError> { + Ok(Self::raw_requested_locales()? + .into_iter() + .filter_map(|s| { + WindowsLocale::try_from_str(&s) + .map_err(|_| HostInfoError::HostLocaleError) + .and_then(|wl| Locale::try_from(wl).map_err(Into::into)) + .ok() + }) + .collect()) + } + + fn calendar() -> Result, HostInfoError> { + Ok(Self::raw_calendar()? + .and_then(|raw| { + let canonical = match raw.as_str() { + "GregorianCalendar" => "gregory", + "JapaneseCalendar" => "japanese", + "TaiwanCalendar" => "roc", + "KoreanCalendar" => "dangi", + "HebrewCalendar" => "hebrew", + "HijriCalendar" => "islamic", + "UmmAlQuraCalendar" => "islamic-umalqura", + "PersianCalendar" => "persian", + "ThaiCalendar" => "buddhist", + "JulianCalendar" => "julian", + r => r, + }; + unicode::Value::from_str(canonical).ok() + }) + .and_then(|value| CalendarAlgorithm::try_from(&value).ok())) + } +} + +impl RawHostInfoBackend for WindowsHostInfoBackend { + fn raw_region() -> Result, HostInfoError> { + let region = + windows::System::UserProfile::GlobalizationPreferences::HomeGeographicRegion()?; + let s = region.to_string_lossy(); + if s.is_empty() { + Ok(None) + } else { + Ok(Some(s)) + } + } + + fn raw_requested_locales() -> Result, HostInfoError> { + let locale = windows::System::UserProfile::GlobalizationPreferences::Languages()?; + let len = locale.Size()?; + + let mut locale_vec_str: Vec = Vec::with_capacity(len as usize); + + for i in 0..len { + let hstring = locale.GetAt(i)?; + let string = hstring.to_string_lossy(); + locale_vec_str.push(string); + } + Ok(locale_vec_str) + } + + fn raw_calendar() -> Result, 
HostInfoError> { + let calendar = ::windows::Globalization::Calendar::new()?; + let system_calendar = ::windows::Globalization::Calendar::GetCalendarSystem(&calendar)?; + let calendar_type: String = system_calendar.to_string(); + Ok(Some(calendar_type)) + } + + fn raw_first_day_of_week() -> Result, HostInfoError> { + Ok( + match ::windows::System::UserProfile::GlobalizationPreferences::WeekStartsOn()?.0 { + 0 => Some("sun".to_string()), + 1 => Some("mon".to_string()), + 2 => Some("tue".to_string()), + 3 => Some("wed".to_string()), + 4 => Some("thu".to_string()), + 5 => Some("fri".to_string()), + 6 => Some("sat".to_string()), + _ => None, + }, + ) + } +} + +#[cfg(test)] +mod tests { + use crate::backends::{windows::WindowsHostInfoBackend, RawHostInfoBackend}; + use crate::locale::WindowsLocale; + use icu_locale_core::Locale; + use std::sync::{LazyLock, Mutex}; + use windows::core::{BOOL, PCWSTR}; + use windows::Win32::{ + Foundation::LPARAM, + Globalization::{EnumSystemLocalesEx, LOCALE_ALL}, + }; + + // Since [`EnumSystemLocalesEx`] iterates using a callback with no obvious (safe) way to return data, + // store them in this static instead. Since this is only a single test with roughly 1,000 items, + // it shouldn't be much of a concern. + static LOCALES: LazyLock>> = LazyLock::new(|| Mutex::new(Vec::new())); + + /// Callback provided to the [`EnumSystemLocalesEx`] to enumerate over locales. 
+ unsafe extern "system" fn callback( + locale_name: PCWSTR, + _locale_flags: u32, + _callback_parameter: LPARAM, + ) -> BOOL { + // SAFETY: caller is the [`EnumSystemLocalesEx`] function, which guarantees a valid null-terminated string + let locale_name = unsafe { locale_name.to_string() }.unwrap(); + + // Skip empty locale 0x007F, marked as "Reserved for invariant locale behavior" + // Source: MS-LCID version 16.0, page 13 (section 2.2 under "Language ID" table) + if !locale_name.is_empty() { + LOCALES.lock().unwrap().push(locale_name); + } + + // Tell [`EnumSystemLocalesEx`] to continue enumeration + BOOL::from(true) + } + + /// Enumerate over all Windows locales, and make sure [`WindowsLocale`] can parse it without any (direct) errors. + #[test] + fn system_locales() -> windows::core::Result<()> { + // Find the list of supported locales, using the [`EnumSystemLocalesEx`] API: + // https://learn.microsoft.com/en-us/windows/win32/api/winnls/nf-winnls-enumsystemlocalesex + // SAFETY: a valid function pointer is provided and lpReserved is set to NULL/None as required + unsafe { + EnumSystemLocalesEx(Some(callback), LOCALE_ALL, LPARAM::default(), None)?; + } + + // Get the list of locales which the callback has been modifying + let locales = LOCALES.lock().unwrap(); + + // Make sure [`WindowsLocale`] can parse without any obvious issues + for locale in locales.iter() { + let windows_locale = WindowsLocale::try_from_str(locale).expect(locale); + Locale::try_from(windows_locale).expect(locale); + } + + Ok(()) + } + + #[test] + fn test_get_raw_requested_locales() { + let locales = WindowsHostInfoBackend::raw_requested_locales().unwrap(); + for locale in locales { + assert!(!locale.is_empty(), "Empty locale retrieved"); + assert!(locale.is_ascii(), "Invalid form of locale retrieved"); + } + } + + #[test] + fn test_converting_locales() { + let locales = WindowsHostInfoBackend::raw_requested_locales().unwrap(); + for locale in locales { + let _converted_locale: Locale = 
locale.parse().unwrap(); + } + } + + #[test] + fn test_calendar() { + let calendar = WindowsHostInfoBackend::raw_calendar().unwrap().unwrap(); + assert!(!calendar.is_empty(), "Calendar identifier is empty"); + assert!(calendar.is_ascii(), "Calendar identifier form is not valid"); + } +} diff --git a/utils/host_info/src/error.rs b/utils/host_info/src/error.rs new file mode 100644 index 00000000000..ce11a04d6b8 --- /dev/null +++ b/utils/host_info/src/error.rs @@ -0,0 +1,64 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use displaydoc::Display; +use icu_locale_core::ParseError; +use std::error::Error; +use std::{ffi::FromVecWithNulError, str::Utf8Error}; + +/// An error encountered while retrieving the host information +#[derive(Debug, Display)] +pub enum HostInfoError { + #[displaydoc("Error converting into `&CStr` to `&str`")] + Conversion(Utf8Error), + + #[displaydoc("Error creating a `CString` from a buffer with a null terminator")] + FromVecWithNul(FromVecWithNulError), + + #[displaydoc("No backend matching backend have been identified")] + UnavailableBackend, + + #[displaydoc("Unknown category when retrieving locale category for linux")] + UnknownCategory, + + #[cfg(target_os = "windows")] + #[displaydoc("Windows error: {0}")] + Windows(windows::core::Error), + + #[displaydoc("Host locale parsing error")] + HostLocaleError, + + #[displaydoc("Failed to parse region")] + UnknownRegion, + + #[displaydoc("Failed to parse a locale: {0}")] + LocaleParse(ParseError), +} + +impl Error for HostInfoError {} + +impl From for HostInfoError { + fn from(input: Utf8Error) -> Self { + Self::Conversion(input) + } +} + +impl From for HostInfoError { + fn from(input: FromVecWithNulError) -> Self { + Self::FromVecWithNul(input) + } +} + +#[cfg(target_os = "windows")] +impl From for HostInfoError { + fn from(input: 
windows::core::Error) -> Self { + Self::Windows(input) + } +} + +impl From for HostInfoError { + fn from(input: ParseError) -> Self { + Self::LocaleParse(input) + } +} diff --git a/utils/host_info/src/host_info.rs b/utils/host_info/src/host_info.rs new file mode 100644 index 00000000000..815957f8d2e --- /dev/null +++ b/utils/host_info/src/host_info.rs @@ -0,0 +1,243 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use icu_locale_core::{ + extensions::unicode::Unicode, + preferences::extensions::unicode::keywords::{ + CalendarAlgorithm, CollationType, FirstDay, HourCycle, MeasurementSystem, + MeasurementUnitOverride, + }, + subtags::{Language, Region}, + Locale, +}; + +use crate::{ + backends::{self, HostInfoBackend}, + error::HostInfoError, +}; + +use super::HostKind; + +pub const RESOLVED_BACKEND: Option = { + #[cfg(target_os = "android")] + { + Some(HostKind::Android) + } + #[cfg(target_os = "ios")] + { + Some(HostKind::Ios) + } + #[cfg(target_os = "linux")] + { + Some(HostKind::Linux) + } + #[cfg(target_os = "macos")] + { + Some(HostKind::MacOS) + } + #[cfg(target_os = "windows")] + { + Some(HostKind::Windows) + } + #[cfg(not(any( + target_os = "android", + target_os = "ios", + target_os = "linux", + target_os = "macos", + target_os = "windows" + )))] + { + None + } +}; + +/// Provides getters for common regional preferences from the host environment. 
+/// +/// # Example +/// +/// ```ignore +/// use icu_host_info::HostInfo; +/// use icu::calendar::Date; +/// use icu::datetime::{fieldsets, DateTimeFormatter}; +/// +/// let date = Date::try_new_gregorian(2025, 10, 10) +/// .expect("Failed to create date"); +/// +/// // requires feature `datetime` +/// let prefs = HostInfo::datetime_preferences() +/// .expect("Failed to retrieve host info"); +/// +/// let dtf = DateTimeFormatter::try_new(prefs, fieldsets::YMD::long()) +/// .expect("Failed to create datetime formatter."); +/// +/// let formatted_dt = dtf.format(&date); +/// +/// assert_eq!(formatted_dt.to_string(), "October 10, 2025"); +/// ``` +pub struct HostInfo; + +impl HostInfo { + /// Retrieves `Unicode` extensions struct populated from host regional preferences. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let ue = HostInfo::unicode_extensions() + /// .expect("Failed to retrieve host info"); + /// ``` + pub fn unicode_extensions() -> Result { + backends::Impl::unicode_extensions() + } + + /// Retrieves `Preferences` object for `DateTimeFormatter`. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let ue = HostInfo::datetime_preferences() + /// .expect("Failed to retrieve datetime preferences"); + /// ``` + #[cfg(feature = "datetime")] + pub fn datetime_preferences( + ) -> Result { + backends::Impl::datetime_preferences() + } + + /// Retrieves an ordered list of locales set as requested by the user in the host + /// environment regional preferences. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let locales = HostInfo::requested_locales() + /// .expect("Failed to retrieve requested locales"); + /// ``` + pub fn requested_locales() -> Result, HostInfoError> { + backends::Impl::requested_locales() + } + + /// Retrieves a calendar preference. + /// + /// In `::unicode_extensions()` this field is being encoded as `ca`. 
+ /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let region = HostInfo::calendar() + /// .expect("Failed to retrieve calendar"); + /// ``` + pub fn calendar() -> Result, HostInfoError> { + backends::Impl::calendar() + } + + /// Retrieves a region set in the host environment regional preferences. + /// + /// That region may be already populated into `requested_locales` or not, depending + /// on the host. + /// In `::unicode_extensions()` this field is being encoded as `rg`. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let region = HostInfo::region() + /// .expect("Failed to retrieve region"); + /// ``` + pub fn region() -> Result, HostInfoError> { + backends::Impl::region() + } + + /// Retrieves an hour_cycle preference. + /// + /// In `::unicode_extensions()` this field is being encoded as `hc`. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let region = HostInfo::hour_cycle() + /// .expect("Failed to retrieve hour cycle"); + /// ``` + pub fn hour_cycle() -> Result, HostInfoError> { + backends::Impl::hour_cycle() + } + + /// Retrieves a measurement system preference. + /// + /// In `::unicode_extensions()` this field is being encoded as `ms`. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let ms = HostInfo::measurement_system() + /// .expect("Failed to retrieve measurement system"); + /// ``` + pub fn measurement_system() -> Result, HostInfoError> { + backends::Impl::measurement_system() + } + + /// Retrieves a first day of week preference. + /// + /// In `::unicode_extensions()` this field is being encoded as `fw`. 
+ /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let region = HostInfo::first_day_of_week() + /// .expect("Failed to retrieve first day of week"); + /// ``` + pub fn first_day_of_week() -> Result, HostInfoError> { + backends::Impl::first_day_of_week() + } + + /// Retrieves a collation preference. + /// + /// In `::unicode_extensions()` this field is being encoded as `co`. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let region = HostInfo::collation() + /// .expect("Failed to retrieve collation"); + /// ``` + pub fn collation() -> Result, HostInfoError> { + backends::Impl::collation() + } + + /// Retrieves measurement unit override preference. + /// + /// In `::unicode_extensions()` this field is being encoded as `mu`. + /// + /// # Example + /// + /// ``` + /// use icu_host_info::HostInfo; + /// + /// let region = HostInfo::measurement_unit_override() + /// .expect("Failed to retrieve measurement unit override"); + /// ``` + pub fn measurement_unit_override() -> Result, HostInfoError> { + backends::Impl::measurement_unit_override() + } + + pub fn resolved_backend() -> Option { + RESOLVED_BACKEND + } +} diff --git a/utils/host_info/src/lib.rs b/utils/host_info/src/lib.rs new file mode 100644 index 00000000000..9aae370f107 --- /dev/null +++ b/utils/host_info/src/lib.rs @@ -0,0 +1,301 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! # host_info +//! +//! `host_info` is a library providing functionality to retrieve regional preferences +//! from host environments - primarily the operating system the program is running in. +//! +//! The library is designed to bind the different host environment preferences architectures +//! to ICU4X model. +//! +//! # Example +//! +//! ```ignore +//! use icu_host_info::HostInfo; +//! 
use icu::calendar::Date; +//! use icu::datetime::{fieldsets, DateTimeFormatter}; +//! +//! let date = Date::try_new_gregorian(2025, 10, 10) +//! .expect("Failed to create date"); +//! +//! // requires feature `datetime` +//! let prefs = HostInfo::datetime_preferences() +//! .expect("Failed to retrieve host info"); +//! +//! let dtf = DateTimeFormatter::try_new(prefs, fieldsets::YMD::long()) +//! .expect("Failed to create datetime formatter."); +//! +//! let formatted_dt = dtf.format(&date); +//! +//! assert_eq!(formatted_dt.to_string(), "October 10, 2025"); +//! ``` +//! +//! # Feature Matrix +//! +//! The library intends to provide means to retrieve regional preferences +//! to ICU4X preferences with a focus on Unicode Extensions, but allow for +//! propagation of preferences offered by the host environments which may +//! not have a representation in Unicode Extensions (for example: date format pattern). +//! +//! Legend: +//! - ✅ = OS + `host_info` support +//! - ⚠️ = OS supports, `host_info` doesn't +//! - ❌ = OS doesn't support it +//! +//! | Feature | Android | iOS | Linux (1) | macOS | Windows | +//! |---------------------| :-----: | :-: | :------------------: | :---: | :-----: | +//! | Requested Locales | ✅ | ✅ | ✅ | ✅ | ✅ | +//! | Calendar | ⚠️ | ⚠️ | ⚠️ | ✅ | ✅ | +//! | Region | ⚠️ | ⚠️ | ⚠️ | ✅ | ✅ | +//! | Hour cycle | ⚠️ | ⚠️ | ✅ | ✅ | ⚠️ | +//! | Measurement System | ⚠️ | ⚠️ | ⚠️ | ✅ | ⚠️ | +//! | Measurement Override| ⚠️ | ⚠️ | ⚠️ | ✅ | ⚠️ | +//! | First Day of week | ⚠️ | ⚠️ | ⚠️ | ✅ | ✅ | +//! | Collation | ⚠️ | ⚠️ | ⚠️ | ✅ | ❌ | +//! | Date format | ⚠️ | ⚠️ | ⚠️ | ⚠️ | ⚠️ | +//! | Number format | ⚠️ | ⚠️ | ⚠️ | ⚠️ | ⚠️ | +//! +//! (1) In the case of Linux, different desktop environments (DEs) such as GNOME and KDE are supported together. +//! +//! # Integrating preferences into ICU4X formatters +//! +//! The library provides three ways of injecting retrieved values into formatters: +//! +//! ## 1. Preference Bag +//! +//! 
For most common components, such as `DateTimeFormatter`, the library exposes +//! a direct getter that retrieves a `Preferences` struct for that component. +//! This getter is located behind a flag to allow for control over which dependencies are being +//! pulled. +//! +//! ### Example +//! +//! ```ignore +//! use icu_host_info::HostInfo; +//! use icu::datetime::{fieldsets, DateTimeFormatter}; +//! +//! // requires feature `datetime` +//! let prefs = HostInfo::datetime_preferences() +//! .expect("Failed to retrieve host info"); +//! +//! let dtf = DateTimeFormatter::try_new(prefs, fieldsets::YMD::long()) +//! .expect("Failed to create datetime formatter."); +//! ``` +//! +//! ## 2. Locale +//! +//! For all components that `HostInfo` does not have special preference getter for, +//! and for cases where the user prefers to avoid pulling extra dependencies at the cost +//! of narrowing down the retrieved values to just ones encoded in Unicode Extensions, +//! the library provides an ergonomic getter: +//! +//! ### Example +//! +//! ``` +//! use icu_host_info::HostInfo; +//! use icu::{ +//! datetime::{fieldsets, DateTimeFormatter}, +//! locale::Locale, +//! }; +//! +//! let mut locale = HostInfo::requested_locales() +//! .expect("Failed to retrieve locales") +//! .first() +//! .cloned() +//! .unwrap_or(Locale::UNKNOWN); +//! +//! locale.extensions.unicode = HostInfo::unicode_extensions() +//! .expect("Failed to retrieve host info"); +//! +//! let dtf = DateTimeFormatter::try_new(locale.into(), fieldsets::YMD::long()) +//! .expect("Failed to create datetime formatter."); +//! ``` +//! +//! Notice that the regional preferences encoded in Unicode Extensions +//! are retrieved separately from the list of requested locales. +//! There are two reasons for this design: +//! - The user has to decide whether the regional preferences apply onto all locales, or just the first one +//! - The locale negotiation may result in a different locale being selected. +//! +//! ## 3. 
Individual Preferences +//! +//! For each preference the library also attempts to provide a direct getter +//! allowing the user to retrieve just that preference and use it as they see fit. +//! +//! ### Example +//! +//! ``` +//! use icu_host_info::HostInfo; +//! use icu::locale::preferences::extensions::unicode::keywords::HourCycle; +//! +//! let hour_cycle: Option<HourCycle> = HostInfo::hour_cycle() +//! .expect("Failed to retrieve hour_cycle preference"); +//! ``` +//! +//! # Locale Negotiation +//! +//! Locale Negotiation is an upcoming feature in ICU4X which will enable the system integrating ICU4X to +//! perform a negotiation between requested locales, and locales for which the data is available in the system. +//! The output of `HostInfo` will be utilized in that negotiation allowing the deployment to 1) select +//! the most appropriate locales for the given user and target modality, 2) apply regional preferences onto that +//! locale. +//! +//! The need to allow `HostInfo` to be pluggable into locale negotiation and multi source merging (see next section) +//! guided many design choices in this library. This section will be extended once locale negotiation is implemented. +//! +//! # Multi Source Merging +//! +//! In simple systems the user will most often use ICU4X to format +//! some information in a selected locale, and use this library to augment +//! the formatting with regional preferences set by the user in the host environment. +//! +//! In more complex systems, the user may also want to introduce a second source of regional preferences +//! and mix the values set in the host environment with those set in the program itself. +//! +//! For example, a web browser may offer some regional preferences set in the browser +//! itself, or even set separately for some contexts of the browser. +//! +//! In those cases, the deployment requires merging of the preferences. +//! ICU4X exposes an `extend` method on both the `Preferences` and `Unicode` extensions structs. +//! 
+//! This allows the system to retrieve [`HostInfo`] `Preferences` or `Unicode`, and the application's +//! equivalent, and merge them. +//! +//! ## `Preferences` Example +//! +//! ```ignore +//! use icu_host_info::HostInfo; +//! use icu::datetime::{fieldsets, DateTimeFormatter}; +//! +//! let app_prefs = app.datetime_preferences(); +//! +//! // requires feature `datetime` +//! let mut combined_prefs = HostInfo::datetime_preferences() +//! .expect("Failed to retrieve host info"); +//! +//! combined_prefs.extend(app_prefs); +//! +//! let dtf = DateTimeFormatter::try_new(combined_prefs, fieldsets::YMD::long()) +//! .expect("Failed to create datetime formatter."); +//! ``` +//! +//! ## `Unicode` Extensions Example +//! +//! ```ignore +//! use icu_host_info::HostInfo; +//! use icu::{ +//! datetime::{fieldsets, DateTimeFormatter}, +//! locale::locale, +//! }; +//! +//! let mut locale = locale!("fr-CA"); +//! +//! let app_ue = app.unicode_extensions(); +//! +//! let mut combined_ue = HostInfo::unicode_extensions() +//! .expect("Failed to retrieve host info"); +//! +//! combined_ue.extend(app_ue); +//! +//! locale.extensions.unicode = combined_ue; +//! +//! let dtf = DateTimeFormatter::try_new(locale.into(), fieldsets::YMD::long()) +//! .expect("Failed to create datetime formatter."); +//! ``` +//! +//! # Design Decisions +//! +//! The library operates on the boundary between a diverse set of host +//! environments and the uniform ICU4X design derived from Unicode LDML. +//! It requires a number of design tradeoffs that had to be made in +//! order to achieve the uniformity and scale over time as the host +//! platforms' design evolves. +//! +//! ## Host Environment +//! +//! The library is designed to handle retrieval of data from the direct host +//! environment. This usually means an operating system, but it can mean a +//! virtual environment, sandbox or runtime. +//! In such a case it is the responsibility of the execution logic +//! 
setting up such an environment to ensure propagation of customer preferences. +//! +//! ## Lossy Results +//! +//! The library makes a best effort to retrieve the values +//! that can be directly used in ICU4X. As the operating systems, +//! runtimes and ICU4X evolve, there's always a risk of a mismatch. +//! This library makes a design decision to be lossy-by-default. +//! +//! Any value that cannot be directly mapped onto a valid value is ignored +//! and indistinguishable in the ergonomic API from a missing value. +//! +//! Similarly, the API does not distinguish between missing binding logic and an unknown value. +//! The assumption is that users of this library are aiming to respect user choices +//! encoded in host environment regional preferences, but are not in a position +//! to act differently on a failed attempt to retrieve them than on a missing one. +//! Therefore errors in this library are very rare and only related to catastrophic +//! cases like memory corruption or OS API error propagation. +//! +//! ## Normalized vs Raw values +//! +//! The main API of this library - [`HostInfo`] - provides methods that return normalized +//! values, often directly taken from `icu::locale_core::preferences`. +//! Per-host backends provide an additional trait implementation that returns +//! raw values, allowing the user to handle or introspect those values manually. +//! When using `HostInfo`, the library makes a best effort to normalize and parse +//! those raw values into canonical Unicode ICU4X representation, often discarding +//! unknown values and values that fail to parse. +//! +//! Those raw backends are not exposed in the generated documentation. +//! +//! ### Example +//! +//! ```ignore +//! use icu_host_info::backends::{ +//! RawHostInfoBackend, +//! macos::MacOSHostInfoBackend, +//! }; +//! +//! let raw_cal: Option = MacOSHostInfoBackend::raw_calendar() +//! .expect("Failed to retrieve raw calendar"); +//! ``` +//! +//! ## Minimize defaults +//! 
+//! The library attempts to use host APIs in a way that allows distinguishing between +//! preference values that represent defaults for a given locale, from ones manually set +//! by the user. +//! In some cases, the host API does not allow making that distinction, which may result +//! in overly expressive locales such as `en-US-u-ca-gregory` (`gregory` being already a default calendar for en-US). +//! +//! This, like other aspects of the library, operates on a best-effort basis and may be further improved in future +//! releases as better bindings become available. +//! +//! ### Host API Design Guidance +//! +//! A note for host API designers - it is useful for foundational libraries such as this one when the host exposes APIs that enable us +//! to distinguish between regional preferences values derived by the host from defaults of a locale, from cases +//! when the value is explicitly set by the user. +//! This distinction allows ICU4X to better serve in locale negotiation scenarios where an other-than-first locale may be used +//! and the deployment should respect whether the user set a given preference explicitly or left it to the per-locale default. +//! +pub mod backends; +mod error; +mod host_info; +pub mod locale; +mod posix; + +pub use host_info::HostInfo; + +/// Enumeration of known hosts. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +#[non_exhaustive] +pub enum HostKind { + Android, + Ios, + Linux, + MacOS, + Windows, +} diff --git a/utils/host_info/src/locale/mod.rs b/utils/host_info/src/locale/mod.rs new file mode 100644 index 00000000000..d5110a85e61 --- /dev/null +++ b/utils/host_info/src/locale/mod.rs @@ -0,0 +1,14 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Host-specific Locale representations. +//! +//! Some popular host environments provide custom definition of a `Locale`. +//! 
This module contains APIs allowing for encoding of those variants and their conversion +//! to ICU4X Locale. +pub mod posix; +pub mod windows; + +pub use posix::PosixLocale; +pub use windows::WindowsLocale; diff --git a/utils/host_info/src/locale/posix.rs b/utils/host_info/src/locale/posix.rs new file mode 100644 index 00000000000..c94cde00fc2 --- /dev/null +++ b/utils/host_info/src/locale/posix.rs @@ -0,0 +1,529 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Parsing functionality for POSIX locale identifiers. +//! For more information, see [`PosixLocale`]. +//! +//! # Usage example +//! ``` +//! use icu_locale_core::{Locale, locale}; +//! use icu_host_info::locale::{posix::PosixParseError, PosixLocale}; +//! +//! # fn main() -> Result<(), PosixParseError> { +//! let posix_locale = PosixLocale::try_from_str("en_US.utf8@euro").unwrap(); +//! +//! assert_eq!(Locale::try_from(posix_locale), Ok(locale!("en-US-u-cu-eur"))); +//! # Ok(()) +//! # } +//! 
``` + +use displaydoc::Display; +use icu_locale_core::extensions::unicode::{key, value}; +use icu_locale_core::extensions::Extensions; +use icu_locale_core::subtags::{language, script, variant, Language, Region, Variants}; +use icu_locale_core::{locale, LanguageIdentifier, Locale, ParseError}; + +#[derive(Display, Debug, PartialEq)] +/// An error while parsing a POSIX locale identifier +pub enum PosixParseError { + #[displaydoc("Empty locale")] + EmptyLocale, + #[displaydoc("Empty section beginning at offset {offset}")] + EmptySection { offset: usize }, + #[displaydoc("Invalid character at offset {offset}")] + InvalidCharacter { offset: usize }, + #[displaydoc("Invalid locale")] + InvalidLocale, + #[displaydoc("Delimiter repeated at offsets {first_offset} and {second_offset}")] + RepeatedDelimiter { + first_offset: usize, + second_offset: usize, + }, + #[displaydoc("Delimiters found out-of-order at offsets {first_offset} and {second_offset}")] + UnorderedDelimiter { + first_offset: usize, + second_offset: usize, + }, +} + +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] +enum Delimiter { + Territory, + Codeset, + Modifier, +} + +impl Delimiter { + /// Find any optional sections, returning an error if the delimiters are invalid + pub fn try_find_sections(src: &str) -> Result, PosixParseError> { + // Find the offset and delimiter of each optional section + let optional_sections = src + .chars() + .enumerate() + .flat_map(|(index, character)| match character { + '_' => Some((index, Self::Territory)), + '.' 
=> Some((index, Self::Codeset)), + '@' => Some((index, Self::Modifier)), + _ => None, + }) + .collect::>(); + + // Find any errors in the arrangement of delimiters + for (index, (first_offset, first_delimiter)) in optional_sections.iter().enumerate() { + // Find any repeated delimiters + if let Some((second_offset, _second_delimiter)) = optional_sections + .iter() + // Check all delimiters past this index + .skip(index + 1) + .find(|(_second_offset, second_delimiter)| first_delimiter == second_delimiter) + { + return Err(PosixParseError::RepeatedDelimiter { + first_offset: *first_offset, + second_offset: *second_offset, + }); + } + + // Find any delimiters that have been invalidated by a delimiter that should appear after it + // For example "en.utf8_US" is invalid because codeset appears before territory + if let Some((second_offset, second_delimiter)) = optional_sections.get(index + 1) { + if first_delimiter > second_delimiter { + return Err(PosixParseError::UnorderedDelimiter { + first_offset: *first_offset, + second_offset: *second_offset, + }); + } + } + } + + Ok(optional_sections) + } +} + +#[derive(Debug)] +/// A parsed and validated POSIX locale identifier. +/// +/// Locales are expected to be in the format `language[_territory][.codeset][@modifier]`; +/// only the language section is mandatory, all other sections are optional. +/// For example: +/// - All sections: `en_US.utf8@euro` +/// - Only required sections: `en` +/// +/// See section 8.2 of the POSIX spec for more details: +/// +pub struct PosixLocale<'src> { + language: &'src str, + territory: Option<&'src str>, + codeset: Option<&'src str>, + // TODO: is it possible to have multiple modifiers? + modifier: Option<&'src str>, +} + +impl<'src> PosixLocale<'src> { + /// Attempt to parse a POSIX locale. 
+ pub fn try_from_str(src: &'src str) -> Result { + // These cases are implementation-defined and can be ignored: + // - Empty locales + if src.is_empty() { + return Err(PosixParseError::EmptyLocale); + } + // - Any locale containing '/' + if let Some(offset) = src.find('/') { + return Err(PosixParseError::InvalidCharacter { offset }); + } + // - Locales consisting of "." or ".." + if src == "." || src == ".." { + return Err(PosixParseError::InvalidLocale); + } + + // Find any optional sections, and return any delimiter-related errors + let optional_sections = Delimiter::try_find_sections(src)?; + + // The language field continues until the start of the first optional section, if one exists + let language = match optional_sections.first() { + Some((offset, _delimiter)) => &src[..*offset], + None => src, + }; + + // Make sure the language itself is non-empty + if language.is_empty() { + return Err(PosixParseError::EmptySection { offset: 0 }); + } + + let mut locale = Self { + language, + territory: None, + codeset: None, + modifier: None, + }; + + for (index, (start_offset, delimiter)) in optional_sections.iter().enumerate() { + // Find the offset of the next section, or end of the string if none exist + let end_offset = optional_sections + .get(index + 1) + .map(|(next_offset, _next_delimiter)| *next_offset) + .unwrap_or(src.len()); + + // Make sure this section is non-empty (more characters than just the delimiter) + if start_offset + 1 >= end_offset { + return Err(PosixParseError::EmptySection { + offset: *start_offset, + }); + } + + // Write the section to the appropriate field + let section_value = Some(&src[start_offset + 1..end_offset]); + match delimiter { + Delimiter::Territory => locale.territory = section_value, + Delimiter::Codeset => locale.codeset = section_value, + Delimiter::Modifier => locale.modifier = section_value, + } + } + + Ok(locale) + } +} + +impl<'s> TryFrom> for Locale { + type Error = ParseError; + + fn try_from(input: PosixLocale<'s>) -> 
Result { + // The default "C"/"POSIX" locale should map to "en-US-posix", + // which is the default behaviour in ICU4C: + // https://github.com/unicode-org/icu/blob/795d7ac82c4b29cf721d0ad62c0b178347d453bf/icu4c/source/common/putil.cpp#L1738 + if input.language == "C" || input.language == "POSIX" { + return Ok(locale!("en-US-posix")); + } + + let mut extensions = Extensions::new(); + let mut script = None; + let mut variant = None; + + // Parse the language/region + let mut language = Language::try_from_str(input.language)?; + let region = input.territory.map(Region::try_from_str).transpose()?; + + if let Some(modifier) = input.modifier { + match modifier.to_ascii_lowercase().as_str() { + "euro" => { + extensions.unicode.keywords.set(key!("cu"), value!("eur")); + } + // Known script modifiers + "cyrillic" => script = Some(script!("Cyrl")), + "devanagari" => script = Some(script!("Deva")), + "latin" => script = Some(script!("Latn")), + // Saaho seems to be the only "legacy variant" that appears as a modifier: + // https://www.unicode.org/reports/tr35/#table-legacy-variant-mappings + "saaho" => language = language!("ssy"), + "valencia" => variant = Some(variant!("valencia")), + // Some modifiers are known but can't be expressed as a BCP-47 identifier + // e.g. 
"@abegede", "@iqtelif" + _ => (), + } + } + + Ok(Locale { + id: LanguageIdentifier { + language, + region, + script, + variants: variant.map_or_else(Variants::new, Variants::from_variant), + }, + extensions, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn expect_success(src: &str, expected: &str) { + let posix_locale = PosixLocale::try_from_str(src).expect(src); + let converted_locale: Locale = posix_locale.try_into().expect(src); + + let expected_locale = Locale::try_from_str(expected).expect(src); + assert_eq!(converted_locale, expected_locale, "POSIX locale: `{src}`"); + } + + #[test] + fn default_locale() { + expect_success("C", "en-US-posix"); + expect_success("POSIX", "en-US-posix"); + } + + #[test] + fn region() { + expect_success("en_US", "en-US"); + expect_success("ne_NP", "ne-NP"); + expect_success("zh_TW", "zh-TW"); + } + + #[test] + fn codeset_ignored() { + expect_success("lv_LV.iso885913", "lv-LV"); + expect_success("hy_AM.armscii8", "hy-AM"); + } + + #[test] + fn modifier() { + // Currency + expect_success("it_IT@euro", "it-IT-u-cu-eur"); + + // Script + expect_success("uz_UZ@cyrillic", "uz-Cyrl-UZ"); + expect_success("sd_IN@devanagari", "sd-Deva-IN"); + expect_success("sr_RS@latin", "sr-Latn-RS"); + + // Language + expect_success("aa_ER@saaho", "ssy-ER"); + + // Variant + expect_success("ca_ES@valencia", "ca-ES-valencia"); + } + + mod error { + mod parse { + use crate::locale::{posix::PosixParseError, PosixLocale}; + + fn expect_error(src: &str, posix_error: PosixParseError) { + let result = PosixLocale::try_from_str(src); + + match result { + Ok(invalid_locale) => { + panic!("Expected the error `{posix_error:?}`, got the locale `{invalid_locale:?}` from input of `{src}`") + } + Err(error) => { + assert_eq!(error, posix_error, "Comparing expected output of `{src}`") + } + } + } + + #[test] + fn empty_locale() { + expect_error("", PosixParseError::EmptyLocale); + } + + #[test] + fn empty_section() { + // Single, empty optional section 
+ expect_error("en_", PosixParseError::EmptySection { offset: 2 }); + expect_error("en.", PosixParseError::EmptySection { offset: 2 }); + expect_error("en@", PosixParseError::EmptySection { offset: 2 }); + + // Multiple optional sections, one empty + expect_error("en_.utf8@euro", PosixParseError::EmptySection { offset: 2 }); + expect_error("en_US.@euro", PosixParseError::EmptySection { offset: 5 }); + expect_error("en_US.utf8@", PosixParseError::EmptySection { offset: 10 }); + + // Single delimiter (excluding "." as that should return `PosixParseError::InvalidLocale` instead) + expect_error("_", PosixParseError::EmptySection { offset: 0 }); + expect_error("@", PosixParseError::EmptySection { offset: 0 }); + + // All delimiters + expect_error("_.@", PosixParseError::EmptySection { offset: 0 }); + } + + #[test] + fn invalid_character() { + const SAMPLE_LOCALES: [&str; 2] = [ + "en", // No optional fields + "en_US.utf8@euro", // All optional fields + ]; + + for locale in SAMPLE_LOCALES { + // Insert an invalid character ('/') at every position along the sample locale + for offset in 0..=locale.len() { + let (left, right) = locale.split_at(offset); + let invalid_locale = format!("{left}/{right}"); + expect_error( + &invalid_locale, + PosixParseError::InvalidCharacter { offset }, + ); + } + } + + // Test a single '/' character + expect_error("/", PosixParseError::InvalidCharacter { offset: 0 }); + } + + #[test] + fn invalid_locale() { + expect_error(".", PosixParseError::InvalidLocale); + expect_error("..", PosixParseError::InvalidLocale); + } + + #[test] + fn repeated_delimiter() { + // Repeated delimiter at the end of locale + expect_error( + "en_US.utf8@euro_US", + PosixParseError::RepeatedDelimiter { + first_offset: 2, + second_offset: 15, + }, + ); + expect_error( + "en_US.utf8@euro.utf8", + PosixParseError::RepeatedDelimiter { + first_offset: 5, + second_offset: 15, + }, + ); + expect_error( + "en_US.utf8@euro@euro", + PosixParseError::RepeatedDelimiter { + 
first_offset: 10, + second_offset: 15, + }, + ); + + // Multiple repeated delimiters + expect_error( + "en.utf8.utf8.utf8", + PosixParseError::RepeatedDelimiter { + first_offset: 2, + second_offset: 7, + }, + ); + + // Consecutive repeated delimiters + expect_error( + "en__US.utf8@euro", + PosixParseError::RepeatedDelimiter { + first_offset: 2, + second_offset: 3, + }, + ); + expect_error( + "en_US..utf8@euro", + PosixParseError::RepeatedDelimiter { + first_offset: 5, + second_offset: 6, + }, + ); + expect_error( + "en_US.utf8@@euro", + PosixParseError::RepeatedDelimiter { + first_offset: 10, + second_offset: 11, + }, + ); + } + + #[test] + fn unordered_delimiter() { + expect_error( + "en_US@euro.utf8", + PosixParseError::UnorderedDelimiter { + first_offset: 5, + second_offset: 10, + }, + ); + expect_error( + "en.utf8_US@euro", + PosixParseError::UnorderedDelimiter { + first_offset: 2, + second_offset: 7, + }, + ); + expect_error( + "en.utf8@euro_US", + PosixParseError::UnorderedDelimiter { + first_offset: 7, + second_offset: 12, + }, + ); + expect_error( + "en@euro_US.utf8", + PosixParseError::UnorderedDelimiter { + first_offset: 2, + second_offset: 7, + }, + ); + expect_error( + "en@euro.utf8_US", + PosixParseError::UnorderedDelimiter { + first_offset: 2, + second_offset: 7, + }, + ); + } + + #[test] + fn offset() { + // Empty section + let src = "en_.utf8@euro"; + match PosixLocale::try_from_str(src) { + Err(PosixParseError::EmptySection { offset }) => { + assert_eq!(&src[offset..offset + 1], "_"); + } + _ => unreachable!(), + } + + // Invalid character + let src = "en_U/S"; + match PosixLocale::try_from_str(src) { + Err(PosixParseError::InvalidCharacter { offset }) => { + assert_eq!(&src[offset..offset + 1], "/"); + } + _ => unreachable!(), + } + + // Repeated delimiter + let src = "en_US.utf8@euro_US"; + match PosixLocale::try_from_str(src) { + Err(PosixParseError::RepeatedDelimiter { + first_offset, + second_offset, + }) => { + 
assert_eq!(&src[first_offset..first_offset + 1], "_"); + assert_eq!(&src[second_offset..second_offset + 1], "_"); + } + _ => unreachable!(), + } + + // Unordered delimiter + let src = "en_US@euro.utf8"; + match PosixLocale::try_from_str(src) { + Err(PosixParseError::UnorderedDelimiter { + first_offset, + second_offset, + }) => { + assert_eq!(&src[first_offset..first_offset + 1], "@"); + assert_eq!(&src[second_offset..second_offset + 1], "."); + } + _ => unreachable!(), + } + } + } + + mod conversion { + use crate::locale::PosixLocale; + use icu_locale_core::Locale; + + fn expect_error(src: &str, icu_error: icu_locale_core::ParseError) { + let result: Result = + PosixLocale::try_from_str(src).expect(src).try_into(); + match result { + Ok(invalid_locale) => { + panic!("Expected the error `{icu_error:?}`, got the locale `{invalid_locale:?}` from input of `{src}`") + } + Err(error) => { + assert_eq!(error, icu_error, "Comparing expected output of `{src}`") + } + } + } + + #[test] + fn invalid_language() { + expect_error("invalid", icu_locale_core::ParseError::InvalidLanguage); + } + + #[test] + fn invalid_region() { + expect_error("en_invalid", icu_locale_core::ParseError::InvalidSubtag); + } + } + } +} diff --git a/utils/host_info/src/locale/windows.rs b/utils/host_info/src/locale/windows.rs new file mode 100644 index 00000000000..df7681ca6c3 --- /dev/null +++ b/utils/host_info/src/locale/windows.rs @@ -0,0 +1,161 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Parsing functionality for Windows LCIDs. +//! For more information, see [`WindowsLocale`]. +//! +//! # Usage example +//! ``` +//! use icu_locale_core::{Locale, locale}; +//! use icu_host_info::locale::{WindowsLocale, windows::WindowsLocaleParseError}; +//! +//! # fn main() -> Result<(), WindowsLocaleParseError> { +//! 
let windows_locale = WindowsLocale::try_from_str("zh-CN_radstr")?; +//! +//! assert_eq!(Locale::try_from(windows_locale), Ok(locale!("zh-CN-u-co-unihan"))); +//! # Ok(()) +//! # } +//! ``` + +use displaydoc::Display; +use icu_locale_core::extensions::unicode::{key, value, Keywords, Unicode, Value}; +use icu_locale_core::extensions::Extensions; +use icu_locale_core::{langid, LanguageIdentifier, Locale, ParseError}; + +#[derive(Display, Debug, PartialEq)] +/// An error while parsing a Windows locale identifier +pub enum WindowsLocaleParseError {} + +/// A parsed and validated Windows locale identifier. +pub struct WindowsLocale<'src> { + src: &'src str, +} + +impl<'src> WindowsLocale<'src> { + pub fn try_from_str(src: &'src str) -> Result { + Ok(Self { src }) + } +} + +impl<'src> TryFrom> for Locale { + type Error = ParseError; + + fn try_from(input: WindowsLocale<'src>) -> Result { + let (lcid, collation_value) = strip_windows_collation_suffix_lossy(input.src); + let keywords = match collation_value { + // Add the -u-co-VALUE extension to the locale + Some(collation_value) => Keywords::new_single(key!("co"), collation_value), + // No collation value found, use default keywords + None => Keywords::new(), + }; + + // Use a matching alias if found + let language = match find_windows_language_alias_lossy(lcid) { + Some(locale) => locale, + None => LanguageIdentifier::try_from_str(lcid)?, + }; + + Ok(Locale { + id: language, + extensions: Extensions::from_unicode(Unicode { + keywords, + ..Unicode::new() + }), + }) + } +} + +fn strip_windows_collation_suffix_lossy(lcid: &str) -> (&str, Option) { + // All known LCIDs containing an underscore are used for a collation suffix + if let Some((prefix, suffix)) = lcid.split_once('_') { + let collation_value = match suffix { + "phoneb" => value!("phonebk"), + "pronun" => value!("zhuyin"), + "radstr" => value!("unihan"), + "stroke" => value!("stroke"), + "tradnl" => value!("trad"), + // Strip the suffix on LCIDs with an underscore 
but no (known) matching CLDR data + _ => return (prefix, None), + }; + + // Return the LCID with the stripped prefix, and the matching CLDR collation key + (prefix, Some(collation_value)) + } else { + // No underscore found, return the LCID as-is + (lcid, None) + } +} + +/// Find a BCP-47 identifier from a list of known Windows aliases. +fn find_windows_language_alias_lossy(lcid: &str) -> Option { + match lcid { + "zh-yue-HK" => Some(langid!("yue-HK")), + // LCID with no (known) matching CLDR data: "math alphanumeric sorting" + // This would be `x-IV_mathan`, but the collation suffix may already be stripped by + // `strip_windows_collation_suffix_lossy`. For some reason, `LocaleEnumProcEx` also uses + // `x-IV-mathan`, so that is included here too. + // https://learn.microsoft.com/en-us/windows/win32/api/winnls/nc-winnls-locale_enumprocex + "x-IV" | "x-IV_mathan" | "x-IV-mathan" => Some(langid!("und")), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn expect_success(src: &str, expected: &str) { + let windows_locale = WindowsLocale::try_from_str(src).expect(src); + let locale = Locale::try_from(windows_locale).expect(src); + + assert_eq!( + locale, + Locale::try_from_str(expected).unwrap(), + "Case: {src}" + ); + } + + #[test] + fn collation() { + /// All MS-LCID collation entries with a known matching CLDR collation value + const CASES: [(&str, &str); 12] = [ + ("de-DE_phoneb", "de-DE-u-co-phonebk"), + ("es-ES_tradnl", "es-ES-u-co-trad"), + ("ja-JP_radstr", "ja-JP-u-co-unihan"), + ("zh-CN_phoneb", "zh-CN-u-co-phonebk"), + ("zh-CN_stroke", "zh-CN-u-co-stroke"), + ("zh-HK_radstr", "zh-HK-u-co-unihan"), + ("zh-MO_radstr", "zh-MO-u-co-unihan"), + ("zh-MO_stroke", "zh-MO-u-co-stroke"), + ("zh-SG_phoneb", "zh-SG-u-co-phonebk"), + ("zh-SG_stroke", "zh-SG-u-co-stroke"), + ("zh-TW_pronun", "zh-TW-u-co-zhuyin"), + ("zh-TW_radstr", "zh-TW-u-co-unihan"), + ]; + + for (src, expected) in CASES { + expect_success(src, expected); + } + } + + #[test] + fn 
/// A locale category, identified by the `LC_*` name it corresponds to.
///
/// Each variant maps to exactly one `LC_*` environment variable name.
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
pub enum LocaleCategory {
    /// `LC_CTYPE`
    Character,
    /// `LC_NUMERIC`
    Number,
    /// `LC_TIME`
    Time,
    /// `LC_COLLATE`
    Collate,
    /// `LC_MONETARY`
    Monetary,
    /// `LC_MESSAGES`
    Messages,
    // GNU extensions (may not exist on non-gnu targets)
    /// `LC_PAPER`
    Paper,
    /// `LC_NAME`
    Name,
    /// `LC_ADDRESS`
    Address,
    /// `LC_TELEPHONE`
    Telephone,
    /// `LC_MEASUREMENT`
    Measurement,
    /// `LC_IDENTIFICATION`
    Identification,
    /// `LC_ALL`
    All,
}
/// Reports whether `raw` names the "C"/"POSIX" locale, or no locale at all.
///
/// Surrounding whitespace is ignored and a blank string counts as C-like. Otherwise the
/// base name, i.e. everything before the first `.` (charset) or `@` (modifier), is
/// compared case-insensitively against `C` and `POSIX`.
#[inline]
fn is_c_like(raw: &str) -> bool {
    let name = raw.trim();
    if name.is_empty() {
        return true;
    }
    // The base name ends at whichever suffix delimiter comes first, e.g. the `C` in
    // `C.UTF-8` or `POSIX@euro`.
    let base = name
        .split(|c: char| c == '.' || c == '@')
        .next()
        .unwrap_or(name);
    // Compare in place rather than allocating an upper-cased copy of the whole name.
    base.eq_ignore_ascii_case("C") || base.eq_ignore_ascii_case("POSIX")
}
+/// Returns None if NULL or C/POSIX-like (uninformative), to trigger env fallback. +/// Note: We only check LC_ALL because if libc is uninitialized, all categories return "C". +/// If initialized, LC_ALL contains the composite snapshot of all category values. +fn parse_setlocale_snapshot() -> Option> { + // SAFETY: read-only query of current thread's locale snapshot + let ptr = unsafe { setlocale(LC_ALL, ptr::null()) }; + if ptr.is_null() { + return None; + } + let s = unsafe { CStr::from_ptr(ptr) }.to_str().ok()?; + if s.is_empty() || is_c_like(s) { + return None; + } + + let mut map = HashMap::new(); + if !s.contains('=') { + // Single composite locale -> LC_ALL + if !is_c_like(s) { + map.insert(LocaleCategory::All, s.to_string()); + } + return if map.is_empty() { None } else { Some(map) }; + } + + for pair in s.split(';') { + let mut it = pair.splitn(2, '='); + let k = it.next().unwrap_or_default().trim(); + let v = it.next().unwrap_or_default().trim(); + if v.is_empty() || is_c_like(v) { + continue; + } + if let Ok(cat) = LocaleCategory::from_str(k) { + map.insert(cat, v.to_string()); + } + } + + if map.is_empty() { + None + } else { + Some(map) + } +} + +// --- public --- + +/// Retrieves locales for LC_ALL and any explicitly-set categories in this thread. +/// If libc is uninitialized (NULL/C/POSIX), falls back to env precedence. +/// If nothing resolves, returns `{ LC_ALL: "en-US-posix" }`. +pub(crate) fn raw_locale_categories() -> Result, HostInfoError> { + if let Some(map) = parse_setlocale_snapshot() { + return Ok(map); + } + + // Env fallback: collect only categories that resolve to non-C/POSIX. 
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locale_core::Locale;

    // `raw_locale_categories` always yields at least one entry (it inserts a default
    // `LC_ALL` value when nothing else resolves), so the map must be non-empty and every
    // value is expected to be ASCII.
    #[test]
    fn test_get_raw_locale_categories() {
        let locale_res = raw_locale_categories().unwrap();
        assert!(
            !locale_res.is_empty(),
            "Empty hashmap for locales retrieved"
        );
        for locale in locale_res.into_values() {
            assert!(locale.is_ascii(), "Invalid form of locale retrieved")
        }
    }

    #[test]
    fn test_converting_locales() {
        let locale_res: std::collections::HashMap<LocaleCategory, String> =
            raw_locale_categories().unwrap();
        for locale in locale_res.into_values() {
            let parts: Vec<&str> = locale.split('.').collect();

            // Skipping "C" and those ending with "UTF-8", as they cannot be converted
            // into the locale
            // NOTE(review): the condition also skips every value without a `.` section
            // (`parts.len() > 1` is required), so only values with a non-"UTF-8" suffix
            // are parsed here. Confirm this matches the intent described above.
            if !parts.contains(&"C") && (parts.len() > 1 && parts[parts.len() - 1] != "UTF-8") {
                let mut locale_converted: Locale = locale.parse().unwrap();
                locale_converted.extensions.unicode.clear();
                assert_eq!(locale_converted, locale.parse().unwrap());
            }
        }
    }
}
242575b2eea..9980a3b8fc1 100644 --- a/utils/env_preferences/tests/datasets/mod.rs +++ b/utils/host_info/tests/datasets/mod.rs @@ -16,22 +16,24 @@ const WINDOWS_DATASET: &str = include_str!("windows.txt"); #[test] fn posix() { - use env_preferences::parse::posix::PosixLocale; + use icu_host_info::locale::PosixLocale; + use icu_locale_core::Locale; for locale in POSIX_DATASET.lines() { let posix_locale = PosixLocale::try_from_str(locale).expect(locale); - posix_locale.try_convert_lossy().expect(locale); + Locale::try_from(posix_locale).expect(locale); } } #[test] fn windows() { - use env_preferences::parse::windows::WindowsLocale; + use icu_host_info::locale::WindowsLocale; + use icu_locale_core::Locale; for locale in WINDOWS_DATASET.lines() { let windows_locale = WindowsLocale::try_from_str(locale).expect(locale); - windows_locale.try_convert_lossy().expect(locale); + Locale::try_from(windows_locale).expect(locale); } } diff --git a/utils/env_preferences/tests/datasets/posix.txt b/utils/host_info/tests/datasets/posix.txt similarity index 100% rename from utils/env_preferences/tests/datasets/posix.txt rename to utils/host_info/tests/datasets/posix.txt diff --git a/utils/env_preferences/tests/datasets/windows.txt b/utils/host_info/tests/datasets/windows.txt similarity index 100% rename from utils/env_preferences/tests/datasets/windows.txt rename to utils/host_info/tests/datasets/windows.txt