Skip to content

Commit f952b03

Browse files
authored
Internal datagen API: Return the distance from skeleton matching (#6485)
1 parent 0bb3a5b commit f952b03

File tree

4 files changed

+101
-62
lines changed

4 files changed

+101
-62
lines changed

components/datetime/src/provider/pattern/hour_cycle.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,8 @@ impl CoarseHourCycle {
104104
// requested fields.
105105
true,
106106
) {
107-
skeleton::BestSkeleton::AllFieldsMatch(patterns)
108-
| skeleton::BestSkeleton::MissingOrExtraFields(patterns) => {
107+
skeleton::BestSkeleton::AllFieldsMatch(patterns, _)
108+
| skeleton::BestSkeleton::MissingOrExtraFields(patterns, _) => {
109109
Some(reference::Pattern::from(&patterns.expect_pattern(
110110
"Only week-of patterns have plural variants",
111111
)))

components/datetime/src/provider/skeleton/helpers.rs

Lines changed: 71 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -60,38 +60,62 @@ const NO_DISTANCE: u32 = 0;
6060
// MM ≅ M (09 ≅ 9)
6161
const WIDTH_MISMATCH_DISTANCE: u32 = 1;
6262

63+
// If a glue pattern is required, give a small penalty.
64+
const GLUE_DISTANCE: u32 = 10;
65+
6366
// C. Numeric and text fields are given a larger distance from each other.
6467
// - MMM ≈ MM (Sep ≈ 09)
6568
// MMM
66-
const TEXT_VS_NUMERIC_DISTANCE: u32 = 10;
69+
const TEXT_VS_NUMERIC_DISTANCE: u32 = 100;
6770

6871
// D. Symbols representing substantial differences (week of year vs week of month) are given much
6972
// larger distances from each other.
7073
// - d ≋ D; (12 ≋ 345) Day of month vs Day of year
71-
const SUBSTANTIAL_DIFFERENCES_DISTANCE: u32 = 100;
74+
const SUBSTANTIAL_DIFFERENCES_DISTANCE: u32 = 1000;
7275

7376
// A skeleton had more symbols than what was requested.
74-
const SKELETON_EXTRA_SYMBOL: u32 = 1000;
77+
const SKELETON_EXTRA_SYMBOL: u32 = 10000;
7578

7679
// A requested symbol is missing in the skeleton. Note that this final value can be more than
7780
// MAX_SKELETON_FIELDS, as it's counting the missing requested fields, which can be longer than
7881
// the stored skeletons. There cannot be any cases higher than this one.
79-
const REQUESTED_SYMBOL_MISSING: u32 = 10000;
82+
const REQUESTED_SYMBOL_MISSING: u32 = 100000;
8083

8184
/// The best skeleton found, alongside information on how well it matches.
8285
///
8386
/// According to the [UTS 35 skeleton matching algorithm](https://unicode.org/reports/tr35/tr35-dates.html#Matching_Skeletons)
8487
/// there will be a guaranteed match for a skeleton. However, with this initial implementation,
8588
/// there is no attempt to add on missing fields. This enum encodes the variants for the current
8689
/// search for a best skeleton.
90+
///
91+
/// The patterns are paired with a measure of their quality.
8792
#[derive(Debug, PartialEq, Clone)]
8893
#[allow(missing_docs)]
8994
pub enum BestSkeleton<T> {
90-
AllFieldsMatch(T),
91-
MissingOrExtraFields(T),
95+
AllFieldsMatch(T, SkeletonQuality),
96+
MissingOrExtraFields(T, SkeletonQuality),
9297
NoMatch,
9398
}
9499

100+
/// A measure of the quality of a skeleton.
101+
///
102+
/// Internally, this is a u32, a "distance" value. This value is highly
103+
/// unstable and should not be compared across versions. It should be used
104+
/// only for comparing against other distances in the same version of ICU4X.
105+
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
106+
pub struct SkeletonQuality(u32);
107+
108+
impl SkeletonQuality {
109+
/// Returns the worst possible quality measure.
110+
pub fn worst() -> SkeletonQuality {
111+
SkeletonQuality(u32::MAX)
112+
}
113+
/// Returns the best possible quality measure.
114+
pub fn best() -> SkeletonQuality {
115+
SkeletonQuality(0)
116+
}
117+
}
118+
95119
/// This function swaps out the time zone name field for the appropriate one. Skeleton matching
96120
/// only needs to find a single "v" field, and then the time zone name can expand from there.
97121
fn naively_apply_time_zone_name(
@@ -140,51 +164,57 @@ pub fn create_best_pattern_for_fields<'data>(
140164
get_best_available_format_pattern(skeletons, fields, prefer_matched_pattern);
141165

142166
// Try to match a skeleton to all of the fields.
143-
if let BestSkeleton::AllFieldsMatch(mut pattern_plurals) = first_pattern_match {
167+
if let BestSkeleton::AllFieldsMatch(mut pattern_plurals, d) = first_pattern_match {
144168
pattern_plurals.for_each_mut(|pattern| {
145169
naively_apply_preferences(pattern, components.hour_cycle);
146170
naively_apply_time_zone_name(pattern, components.time_zone_name);
147171
apply_subseconds(pattern, components.subsecond);
148172
});
149-
return BestSkeleton::AllFieldsMatch(pattern_plurals);
173+
return BestSkeleton::AllFieldsMatch(pattern_plurals, d);
150174
}
151175

152176
let FieldsByType { date, time } = group_fields_by_type(fields);
153177

154178
if date.is_empty() || time.is_empty() {
155179
return match first_pattern_match {
156-
BestSkeleton::AllFieldsMatch(_) => {
180+
BestSkeleton::AllFieldsMatch(_, _) => {
157181
unreachable!("Logic error in implementation. AllFieldsMatch handled above.")
158182
}
159-
BestSkeleton::MissingOrExtraFields(mut pattern_plurals) => {
183+
BestSkeleton::MissingOrExtraFields(mut pattern_plurals, d) => {
160184
if date.is_empty() {
161185
pattern_plurals.for_each_mut(|pattern| {
162186
naively_apply_preferences(pattern, components.hour_cycle);
163187
naively_apply_time_zone_name(pattern, components.time_zone_name);
164188
apply_subseconds(pattern, components.subsecond);
165189
});
166190
}
167-
BestSkeleton::MissingOrExtraFields(pattern_plurals)
191+
BestSkeleton::MissingOrExtraFields(pattern_plurals, d)
168192
}
169193
BestSkeleton::NoMatch => BestSkeleton::NoMatch,
170194
};
171195
}
172196

173197
// Match the date and time, and then simplify the combinatorial logic of the results into
174198
// optional values of the results, and a boolean value.
175-
let (date_patterns, date_missing_or_extra): (Option<PatternPlurals<'data>>, bool) =
176-
match get_best_available_format_pattern(skeletons, &date, prefer_matched_pattern) {
177-
BestSkeleton::MissingOrExtraFields(fields) => (Some(fields), true),
178-
BestSkeleton::AllFieldsMatch(fields) => (Some(fields), false),
179-
BestSkeleton::NoMatch => (None, true),
180-
};
199+
let (date_patterns, date_missing_or_extra, date_distance): (
200+
Option<PatternPlurals<'data>>,
201+
bool,
202+
SkeletonQuality,
203+
) = match get_best_available_format_pattern(skeletons, &date, prefer_matched_pattern) {
204+
BestSkeleton::MissingOrExtraFields(fields, d) => (Some(fields), true, d),
205+
BestSkeleton::AllFieldsMatch(fields, d) => (Some(fields), false, d),
206+
BestSkeleton::NoMatch => (None, true, SkeletonQuality(REQUESTED_SYMBOL_MISSING)),
207+
};
181208

182-
let (time_patterns, time_missing_or_extra): (Option<PatternPlurals<'data>>, bool) =
183-
match get_best_available_format_pattern(skeletons, &time, prefer_matched_pattern) {
184-
BestSkeleton::MissingOrExtraFields(fields) => (Some(fields), true),
185-
BestSkeleton::AllFieldsMatch(fields) => (Some(fields), false),
186-
BestSkeleton::NoMatch => (None, true),
187-
};
209+
let (time_patterns, time_missing_or_extra, time_distance): (
210+
Option<PatternPlurals<'data>>,
211+
bool,
212+
SkeletonQuality,
213+
) = match get_best_available_format_pattern(skeletons, &time, prefer_matched_pattern) {
214+
BestSkeleton::MissingOrExtraFields(fields, d) => (Some(fields), true, d),
215+
BestSkeleton::AllFieldsMatch(fields, d) => (Some(fields), false, d),
216+
BestSkeleton::NoMatch => (None, true, SkeletonQuality(REQUESTED_SYMBOL_MISSING)),
217+
};
188218
let time_pattern: Option<runtime::Pattern<'data>> = time_patterns.map(|pattern_plurals| {
189219
let mut pattern =
190220
pattern_plurals.expect_pattern("Only date patterns can contain plural variants");
@@ -254,12 +284,18 @@ pub fn create_best_pattern_for_fields<'data>(
254284
(None, None) => None,
255285
};
256286

287+
let distance = SkeletonQuality(
288+
date_distance
289+
.0
290+
.saturating_add(time_distance.0)
291+
.saturating_add(GLUE_DISTANCE),
292+
);
257293
match patterns {
258294
Some(patterns) => {
259295
if date_missing_or_extra || time_missing_or_extra {
260-
BestSkeleton::MissingOrExtraFields(patterns)
296+
BestSkeleton::MissingOrExtraFields(patterns, distance)
261297
} else {
262-
BestSkeleton::AllFieldsMatch(patterns)
298+
BestSkeleton::AllFieldsMatch(patterns, distance)
263299
}
264300
}
265301
None => BestSkeleton::NoMatch,
@@ -481,6 +517,7 @@ pub fn get_best_available_format_pattern<'data>(
481517
// (e.g. text vs numeric). We return the field instead of the matched pattern.
482518
return BestSkeleton::AllFieldsMatch(
483519
runtime::Pattern::from(vec![PatternItem::Field(*field)]).into(),
520+
SkeletonQuality(closest_distance),
484521
);
485522
}
486523
}
@@ -496,7 +533,10 @@ pub fn get_best_available_format_pattern<'data>(
496533
}
497534

498535
if closest_distance == NO_DISTANCE {
499-
return BestSkeleton::AllFieldsMatch(closest_format_pattern);
536+
return BestSkeleton::AllFieldsMatch(
537+
closest_format_pattern,
538+
SkeletonQuality(closest_distance),
539+
);
500540
}
501541

502542
// Modify the resulting pattern to have fields of the same length.
@@ -511,8 +551,11 @@ pub fn get_best_available_format_pattern<'data>(
511551
}
512552

513553
if closest_distance >= SKELETON_EXTRA_SYMBOL {
514-
return BestSkeleton::MissingOrExtraFields(closest_format_pattern);
554+
return BestSkeleton::MissingOrExtraFields(
555+
closest_format_pattern,
556+
SkeletonQuality(closest_distance),
557+
);
515558
}
516559

517-
BestSkeleton::AllFieldsMatch(closest_format_pattern)
560+
BestSkeleton::AllFieldsMatch(closest_format_pattern, SkeletonQuality(closest_distance))
518561
}

provider/source/src/datetime/neo_skeleton.rs

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,40 +11,31 @@ use icu::datetime::options::Length;
1111
use icu::datetime::provider::calendar::{DateSkeletonPatterns, TimeLengths};
1212
use icu::datetime::provider::fields::components;
1313
use icu::datetime::provider::pattern::{reference, runtime, CoarseHourCycle};
14-
use icu::datetime::provider::skeleton::PatternPlurals;
14+
use icu::datetime::provider::skeleton::{PatternPlurals, SkeletonQuality};
1515
use icu::datetime::provider::*;
1616
use icu::plurals::PluralElements;
1717
use icu_locale_core::preferences::extensions::unicode::keywords::HourCycle;
1818
use icu_provider::prelude::*;
1919

2020
use super::DatagenCalendar;
2121

22-
enum ExactOrSynthetic<T> {
23-
Exact(T),
24-
Synthetic(T),
22+
struct PatternsWithDistance<T> {
23+
inner: T,
24+
distance: SkeletonQuality,
2525
}
2626

27-
impl<T> ExactOrSynthetic<T> {
28-
pub fn map<V>(self, mut f: impl FnMut(T) -> V) -> ExactOrSynthetic<V> {
29-
use ExactOrSynthetic::*;
30-
match self {
31-
Exact(t) => Exact(f(t)),
32-
Synthetic(t) => Synthetic(f(t)),
27+
impl<T> PatternsWithDistance<T> {
28+
pub fn map<V>(self, mut f: impl FnMut(T) -> V) -> PatternsWithDistance<V> {
29+
PatternsWithDistance {
30+
inner: f(self.inner),
31+
distance: self.distance,
3332
}
3433
}
3534
pub fn inner(&self) -> &T {
36-
use ExactOrSynthetic::*;
37-
match self {
38-
Exact(t) => t,
39-
Synthetic(t) => t,
40-
}
35+
&self.inner
4136
}
4237
pub fn into_inner(self) -> T {
43-
use ExactOrSynthetic::*;
44-
match self {
45-
Exact(t) => t,
46-
Synthetic(t) => t,
47-
}
38+
self.inner
4839
}
4940
}
5041

@@ -53,7 +44,7 @@ fn select_pattern<'data>(
5344
skeletons: &DateSkeletonPatterns<'data>,
5445
preferred_hour_cycle: CoarseHourCycle,
5546
length_patterns: &GenericLengthPatterns<'data>,
56-
) -> ExactOrSynthetic<PatternPlurals<'data>> {
47+
) -> PatternsWithDistance<PatternPlurals<'data>> {
5748
use icu::datetime::provider::pattern::{runtime, PatternItem};
5849
use icu::datetime::provider::skeleton::{create_best_pattern_for_fields, BestSkeleton};
5950
use icu_locale_core::preferences::extensions::unicode::keywords::HourCycle;
@@ -64,8 +55,10 @@ fn select_pattern<'data>(
6455
};
6556
let fields = bag.to_vec_fields(default_hour_cycle);
6657
match create_best_pattern_for_fields(skeletons, length_patterns, &fields, &bag, false) {
67-
BestSkeleton::AllFieldsMatch(p) => ExactOrSynthetic::Exact(p),
68-
BestSkeleton::MissingOrExtraFields(p) => ExactOrSynthetic::Synthetic(p),
58+
BestSkeleton::AllFieldsMatch(p, distance) => PatternsWithDistance { inner: p, distance },
59+
BestSkeleton::MissingOrExtraFields(p, distance) => {
60+
PatternsWithDistance { inner: p, distance }
61+
}
6962
BestSkeleton::NoMatch => {
7063
// Build a last-resort pattern that contains all of the requested fields.
7164
// This is NOT in the CLDR standard! Better would be:
@@ -79,7 +72,10 @@ fn select_pattern<'data>(
7972
.skip(1)
8073
.collect::<Vec<_>>();
8174
let pattern = runtime::Pattern::from(pattern_items);
82-
ExactOrSynthetic::Synthetic(PatternPlurals::SinglePattern(pattern))
75+
PatternsWithDistance {
76+
inner: PatternPlurals::SinglePattern(pattern),
77+
distance: SkeletonQuality::worst(),
78+
}
8379
}
8480
}
8581
}
@@ -133,8 +129,8 @@ impl SourceDataProvider {
133129
DateSkeletonPatterns::from(&data.datetime_formats.available_formats);
134130

135131
fn expand_pp_to_pe(
136-
value: ExactOrSynthetic<PatternPlurals>,
137-
) -> ExactOrSynthetic<PluralElements<runtime::Pattern>> {
132+
value: PatternsWithDistance<PatternPlurals>,
133+
) -> PatternsWithDistance<PluralElements<runtime::Pattern>> {
138134
value.map(|pp| match pp {
139135
PatternPlurals::MultipleVariants(variants) => PluralElements::new(variants.other)
140136
.with_zero_value(variants.zero.clone())

provider/source/src/datetime/skeletons.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@ mod test {
122122
let (_, skeletons) = get_data_payload();
123123

124124
match get_best_available_format_pattern(&skeletons, &requested_fields, false) {
125-
BestSkeleton::AllFieldsMatch(available_format_pattern)
126-
| BestSkeleton::MissingOrExtraFields(available_format_pattern) => {
125+
BestSkeleton::AllFieldsMatch(available_format_pattern, _)
126+
| BestSkeleton::MissingOrExtraFields(available_format_pattern, _) => {
127127
assert_eq!(
128128
available_format_pattern
129129
.expect_pattern("pattern should not have plural variants")
@@ -147,7 +147,7 @@ mod test {
147147
let (_, skeletons) = get_data_payload();
148148

149149
match get_best_available_format_pattern(&skeletons, &requested_fields, false) {
150-
BestSkeleton::MissingOrExtraFields(available_format_pattern) => {
150+
BestSkeleton::MissingOrExtraFields(available_format_pattern, _) => {
151151
assert_eq!(
152152
available_format_pattern
153153
.expect_pattern("pattern should not have plural variants")
@@ -182,7 +182,7 @@ mod test {
182182
&Default::default(),
183183
false,
184184
) {
185-
BestSkeleton::AllFieldsMatch(available_format_pattern) => {
185+
BestSkeleton::AllFieldsMatch(available_format_pattern, _) => {
186186
// TODO - Append items are needed here.
187187
assert_eq!(
188188
available_format_pattern
@@ -376,7 +376,7 @@ mod test {
376376
let (_, skeletons) = get_data_payload();
377377

378378
match get_best_available_format_pattern(&skeletons, &requested_fields, false) {
379-
BestSkeleton::AllFieldsMatch(available_format_pattern) => {
379+
BestSkeleton::AllFieldsMatch(available_format_pattern, _) => {
380380
assert_eq!(
381381
available_format_pattern
382382
.expect_pattern("pattern should not have plural variants")
@@ -399,7 +399,7 @@ mod test {
399399
let (_, skeletons) = get_data_payload();
400400

401401
match get_best_available_format_pattern(&skeletons, &requested_fields, false) {
402-
BestSkeleton::AllFieldsMatch(available_format_pattern) => {
402+
BestSkeleton::AllFieldsMatch(available_format_pattern, _) => {
403403
assert_eq!(
404404
available_format_pattern
405405
.expect_pattern("pattern should not have plural variants")

0 commit comments

Comments
 (0)