Skip to content

Commit fbf31a2

Browse files
committed
Bug 1973876 - Use the geonames SQL collation for weather keywords too
This creates a new `keywords_i18n` table that's the same as `keywords` except the main column uses the collation from the geonames table. Weather keywords are now added to this table instead of the old one. I think we should use the new table in the future especially if/as we expand to non-English locales, but it's useful for English too since it ignores some punctuation. Since the collation is now used for other things besides geonames, I renamed it `i18n_collation`. Let me know if you have ideas on better names for the table and collation. I used "i18n" because I wanted to capture the idea they're useful for locale-specific user-facing strings. "i18n" seemed more appropriate than "l10n" because these things are structures/facilities that enable l10n and aren't the l10n artifacts themselves. I thought about using "unicode" but that's pretty ambiguous I think. This also makes some improvements to keywords metrics.
1 parent 4dc2d82 commit fbf31a2

File tree

6 files changed

+604
-138
lines changed

6 files changed

+604
-138
lines changed

components/suggest/src/db.rs

Lines changed: 197 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use crate::{
2828
},
2929
schema::{clear_database, SuggestConnectionInitializer},
3030
suggestion::{cook_raw_suggestion_url, FtsMatchInfo, Suggestion},
31-
util::{full_keyword, split_keyword},
31+
util::{full_keyword, i18n_transform, split_keyword},
3232
weather::WeatherCache,
3333
Result, SuggestionQuery,
3434
};
@@ -1105,7 +1105,11 @@ impl<'a> SuggestDao<'a> {
11051105
// `suggestion.keywords()` can yield duplicates for dynamic
11061106
// suggestions, so ignore failures on insert in the uniqueness
11071107
// constraint on `(suggestion_id, keyword)`.
1108-
let mut keyword_insert = KeywordInsertStatement::new_with_or_ignore(self.conn)?;
1108+
let mut keyword_insert = KeywordInsertStatement::with_details(
1109+
self.conn,
1110+
"keywords",
1111+
Some(InsertConflictResolution::Ignore),
1112+
)?;
11091113
let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?;
11101114
let mut dynamic_insert = DynamicInsertStatement::new(self.conn)?;
11111115
for suggestion in suggestions {
@@ -1675,29 +1679,27 @@ pub(crate) struct KeywordInsertStatement<'conn>(rusqlite::Statement<'conn>);
16751679

16761680
impl<'conn> KeywordInsertStatement<'conn> {
16771681
pub(crate) fn new(conn: &'conn Connection) -> Result<Self> {
1678-
Ok(Self(conn.prepare(
1679-
"INSERT INTO keywords(
1680-
suggestion_id,
1681-
keyword,
1682-
full_keyword_id,
1683-
rank
1684-
)
1685-
VALUES(?, ?, ?, ?)
1686-
",
1687-
)?))
1682+
Self::with_details(conn, "keywords", None)
16881683
}
16891684

1690-
pub(crate) fn new_with_or_ignore(conn: &'conn Connection) -> Result<Self> {
1691-
Ok(Self(conn.prepare(
1692-
"INSERT OR IGNORE INTO keywords(
1693-
suggestion_id,
1694-
keyword,
1695-
full_keyword_id,
1696-
rank
1697-
)
1698-
VALUES(?, ?, ?, ?)
1699-
",
1700-
)?))
1685+
pub(crate) fn with_details(
1686+
conn: &'conn Connection,
1687+
table: &str,
1688+
conflict_resolution: Option<InsertConflictResolution>,
1689+
) -> Result<Self> {
1690+
Ok(Self(conn.prepare(&format!(
1691+
r#"
1692+
INSERT {} INTO {}(
1693+
suggestion_id,
1694+
keyword,
1695+
full_keyword_id,
1696+
rank
1697+
)
1698+
VALUES(?, ?, ?, ?)
1699+
"#,
1700+
conflict_resolution.as_ref().map(|r| r.as_str()).unwrap_or_default(),
1701+
table,
1702+
))?))
17011703
}
17021704

17031705
pub(crate) fn execute(
@@ -1714,6 +1716,18 @@ impl<'conn> KeywordInsertStatement<'conn> {
17141716
}
17151717
}
17161718

1719+
pub(crate) enum InsertConflictResolution {
1720+
Ignore,
1721+
}
1722+
1723+
impl InsertConflictResolution {
1724+
fn as_str(&self) -> &str {
1725+
match self {
1726+
InsertConflictResolution::Ignore => "OR IGNORE",
1727+
}
1728+
}
1729+
}
1730+
17171731
struct PrefixKeywordInsertStatement<'conn>(rusqlite::Statement<'conn>);
17181732

17191733
impl<'conn> PrefixKeywordInsertStatement<'conn> {
@@ -1752,32 +1766,55 @@ impl<'conn> PrefixKeywordInsertStatement<'conn> {
17521766
}
17531767
}
17541768

1755-
#[derive(Debug, Default)]
1769+
/// Metrics for a set of keywords. Use `KeywordsMetricsUpdater` to write metrics
1770+
/// to the store and `SuggestDao::get_keywords_metrics` to read them.
1771+
#[derive(Debug, Default, Eq, PartialEq)]
17561772
pub(crate) struct KeywordsMetrics {
1773+
/// The max byte count (not chars) of all keywords in the set.
17571774
pub(crate) max_len: usize,
1775+
/// The max word count of all keywords in the set.
17581776
pub(crate) max_word_count: usize,
17591777
}
17601778

1761-
/// This can be used to update metrics as keywords are inserted into the DB.
1762-
/// Create a `KeywordsMetricsUpdater`, call `update` on it as each keyword is
1763-
/// inserted, and then call `finish` after all keywords have been inserted.
1779+
/// This can be used to keep a running tally of metrics as you process keywords
1780+
/// and then to write the metrics to the store. Make a `KeywordsMetricsUpdater`,
1781+
/// call `update` on it as you process each keyword, and then call `finish` to
1782+
/// write the metrics to the store.
17641783
pub(crate) struct KeywordsMetricsUpdater {
1765-
pub(crate) max_len: usize,
1766-
pub(crate) max_word_count: usize,
1784+
metrics: KeywordsMetrics,
17671785
}
17681786

17691787
impl KeywordsMetricsUpdater {
17701788
pub(crate) fn new() -> Self {
17711789
Self {
1772-
max_len: 0,
1773-
max_word_count: 0,
1790+
metrics: KeywordsMetrics::default(),
17741791
}
17751792
}
17761793

1794+
/// Updates the metrics with those of the given keyword.
1795+
///
1796+
/// The keyword's metrics are calculated as the max of its metrics as-is and
1797+
/// its metrics when it's transformed using `i18n_transform`. For example,
1798+
/// `"Qu\u{00e9}bec"` ("Québec" with single two-byte 'é' char) has 7 bytes,
1799+
/// so its `len` is 7, but the `len` of `i18n_transform("Qu\u{00e9}bec")` is
1800+
/// 6, since the 'é' is transformed to an ASCII 'e'. The keyword's `max_len`
1801+
/// will therefore be the max of 7 and 6, which is 7.
17771802
pub(crate) fn update(&mut self, keyword: &str) {
1778-
self.max_len = std::cmp::max(self.max_len, keyword.len());
1779-
self.max_word_count =
1780-
std::cmp::max(self.max_word_count, keyword.split_whitespace().count());
1803+
let transformed_kw = i18n_transform(keyword);
1804+
self.metrics.max_len = std::cmp::max(
1805+
self.metrics.max_len,
1806+
std::cmp::max(transformed_kw.len(), keyword.len()),
1807+
);
1808+
1809+
// Getting the word count is costly, so we get only the transformed word
1810+
// count under the assumption that it will always be at least as large
1811+
// as the original word count. That's currently true because the
1812+
// only relevant transform that `i18n_transform` performs is to replace
1813+
// hyphens with spaces.
1814+
self.metrics.max_word_count = std::cmp::max(
1815+
self.metrics.max_word_count,
1816+
transformed_kw.split_whitespace().count(),
1817+
);
17811818
}
17821819

17831820
/// Inserts keywords metrics into the database. This assumes you have a
@@ -1805,8 +1842,8 @@ impl KeywordsMetricsUpdater {
18051842
.execute((
18061843
record_id.as_str(),
18071844
record_type,
1808-
self.max_len,
1809-
self.max_word_count,
1845+
self.metrics.max_len,
1846+
self.metrics.max_word_count,
18101847
))
18111848
.with_context("keywords metrics insert")?;
18121849

@@ -1845,3 +1882,127 @@ impl<'conn> AmpFtsInsertStatement<'conn> {
18451882
fn provider_config_meta_key(provider: SuggestionProvider) -> String {
18461883
format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8)
18471884
}
1885+
1886+
#[cfg(test)]
1887+
mod tests {
1888+
use super::*;
1889+
use crate::{store::tests::TestStore, testing::*};
1890+
1891+
#[test]
1892+
fn keywords_metrics_updater() -> anyhow::Result<()> {
1893+
// (test keyword, expected `KeywordsMetrics`)
1894+
//
1895+
// Tests are cumulative!
1896+
let tests = [
1897+
(
1898+
"abc",
1899+
KeywordsMetrics {
1900+
max_len: 3,
1901+
max_word_count: 1,
1902+
},
1903+
),
1904+
(
1905+
"a b",
1906+
KeywordsMetrics {
1907+
max_len: 3,
1908+
max_word_count: 2,
1909+
},
1910+
),
1911+
(
1912+
"a b c",
1913+
KeywordsMetrics {
1914+
max_len: 5,
1915+
max_word_count: 3,
1916+
},
1917+
),
1918+
// "Québec" with single 'é' char:
1919+
// With `i18n_transform`: len = 6
1920+
// Without `i18n_transform`: len = 7
1921+
// => Length for this string should be max(6, 7) = 7, which is a new
1922+
// overall max
1923+
(
1924+
"Qu\u{00e9}bec",
1925+
KeywordsMetrics {
1926+
max_len: 7,
1927+
max_word_count: 3,
1928+
},
1929+
),
1930+
// "Québec" with ASCII 'e' followed by combining acute accent:
1931+
// With `i18n_transform`: len = 6
1932+
// Without `i18n_transform`: len = 8
1933+
// => Length for this string should be max(6, 8) = 8, which is a new
1934+
// overall max
1935+
(
1936+
"Que\u{0301}bec",
1937+
KeywordsMetrics {
1938+
max_len: 8,
1939+
max_word_count: 3,
1940+
},
1941+
),
1942+
// Each '-' should be replaced with a space, so the word count for
1943+
// this string should be 4, which is a new overall max
1944+
(
1945+
"Carmel-by-the-Sea",
1946+
KeywordsMetrics {
1947+
max_len: 17,
1948+
max_word_count: 4,
1949+
},
1950+
),
1951+
];
1952+
1953+
// Make an updater and call it with each test in turn.
1954+
let mut updater = KeywordsMetricsUpdater::new();
1955+
for (test_kw, expected_metrics) in &tests {
1956+
updater.update(test_kw);
1957+
assert_eq!(&updater.metrics, expected_metrics);
1958+
}
1959+
1960+
// Make a store and finish the updater so it updates the metrics table.
1961+
let store = TestStore::new(MockRemoteSettingsClient::default());
1962+
store.write(|dao| {
1963+
// Make a dummy cache cell. It doesn't matter what it contains, we
1964+
// just want to make sure it's empty after finishing the updater.
1965+
let mut dummy_cache = OnceCell::new();
1966+
dummy_cache.set("test").expect("dummy cache set");
1967+
assert_ne!(dummy_cache.get(), None);
1968+
1969+
let record_type = SuggestRecordType::Wikipedia;
1970+
updater.finish(
1971+
dao.conn,
1972+
&SuggestRecordId::new("test-record-1".to_string()),
1973+
record_type,
1974+
&mut dummy_cache,
1975+
)?;
1976+
1977+
assert_eq!(dummy_cache.get(), None);
1978+
1979+
// Read back the metrics and make sure they match the ones in the
1980+
// final test.
1981+
let read_metrics_1 = dao.get_keywords_metrics(record_type)?;
1982+
assert_eq!(read_metrics_1, tests.last().unwrap().1);
1983+
1984+
// Update the metrics one more time and finish them again.
1985+
updater.update("a very long keyword with many words");
1986+
let new_expected = KeywordsMetrics {
1987+
max_len: 35,
1988+
max_word_count: 7,
1989+
};
1990+
assert_eq!(updater.metrics, new_expected);
1991+
1992+
updater.finish(
1993+
dao.conn,
1994+
&SuggestRecordId::new("test-record-2".to_string()),
1995+
record_type,
1996+
&mut dummy_cache,
1997+
)?;
1998+
1999+
// Read back the new metrics and make sure they match.
2000+
let read_metrics_2 = dao.get_keywords_metrics(record_type)?;
2001+
assert_eq!(read_metrics_2, new_expected);
2002+
2003+
Ok(())
2004+
})?;
2005+
2006+
Ok(())
2007+
}
2008+
}

0 commit comments

Comments
 (0)