-
Notifications
You must be signed in to change notification settings - Fork 0
compare trie with packtab #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: tajp-icu4x-data
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ | |
| //! | ||
| //! This module provides a benchmark for the default style. | ||
|
|
||
| use crate::{ColorBrush, FONT_STACK, get_contexts, get_samples}; | ||
| use crate::{ColorBrush, FONT_STACK, get_contexts, get_samples, lookup}; | ||
| use parley::{ | ||
| Alignment, AlignmentOptions, FontStack, FontStyle, FontWeight, Layout, RangedBuilder, | ||
| StyleProperty, | ||
|
|
@@ -129,3 +129,15 @@ pub fn styled() -> Vec<Benchmark> { | |
| }) | ||
| .collect() | ||
| } | ||
|
|
||
| /// Benchmark the composite property lookup backends. | ||
| pub fn composite_lookup_latency() -> Vec<Benchmark> { | ||
| let samples = lookup::codepoint_samples(); | ||
| let composite = lookup::composite_data(); | ||
|
|
||
| vec![benchmark_fn("Composite lookup", move |b| { | ||
| b.iter(|| { | ||
| black_box(lookup::checksum_packtab(samples)); | ||
|
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Change this to `lookup::checksum_trie(samples, composite)` to benchmark the CodePointTrie backend instead. |
||
| }) | ||
| })] | ||
| } | ||
|
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These binaries were used to compare sizes. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| // Copyright 2025 the Parley Authors | ||
| // SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
|
||
| //! Binary to exercise the PackTab composite lookup for size comparisons. | ||
|
|
||
| use parley_bench::lookup; | ||
|
|
||
| fn main() { | ||
| let samples = lookup::codepoint_samples(); | ||
| let checksum = lookup::checksum_packtab(samples); | ||
| println!("PackTab checksum: {checksum}"); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| // Copyright 2025 the Parley Authors | ||
| // SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
|
||
| //! Binary to exercise the CodePointTrie composite lookup for size comparisons. | ||
|
|
||
| use parley_bench::lookup; | ||
|
|
||
| fn main() { | ||
| let samples = lookup::codepoint_samples(); | ||
| let composite = lookup::composite_data(); | ||
| let checksum = lookup::checksum_trie(samples, composite); | ||
| println!("Composite trie checksum: {checksum}"); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| // Copyright 2025 the Parley Authors | ||
| // SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
|
||
| //! Helpers for comparing composite property lookups. | ||
|
|
||
| use std::sync::OnceLock; | ||
|
|
||
| use icu_provider::buf::AsDeserializingBufferProvider; | ||
| use icu_provider::{DataMarker, DataPayload, DataRequest, DataResponse, DynamicDataProvider}; | ||
| use unicode_data::{CompositePropsV1, CompositePropsV1Data}; | ||
|
|
||
| const SAMPLE_LEN: usize = 4096; | ||
|
|
||
| fn build_sample_codepoints() -> Vec<u32> { | ||
| let mut values = Vec::with_capacity(SAMPLE_LEN); | ||
| let mut state = 0x1234_5678_u32; | ||
| for _ in 0..SAMPLE_LEN { | ||
| state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); | ||
| values.push(state % 0x11_0000); | ||
| } | ||
| values | ||
| } | ||
|
|
||
| fn load_composite_payload() -> DataPayload<CompositePropsV1> { | ||
| let provider = icu_provider_blob::BlobDataProvider::try_new_from_static_blob( | ||
| unicode_data::generated::COMPOSITE_BLOB, | ||
| ) | ||
| .expect("Composite blob should deserialize"); | ||
|
|
||
| let response: DataResponse<CompositePropsV1> = provider | ||
| .as_deserializing() | ||
| .load_data(CompositePropsV1::INFO, DataRequest::default()) | ||
| .expect("Composite data should load"); | ||
|
|
||
| response.payload | ||
| } | ||
|
|
||
| /// Returns the composite property data, loaded once from the baked blob and cached for the rest of the program. | ||
| pub fn composite_data() -> &'static CompositePropsV1Data<'static> { | ||
| static PAYLOAD: OnceLock<DataPayload<CompositePropsV1>> = OnceLock::new(); | ||
| PAYLOAD.get_or_init(load_composite_payload).get() | ||
| } | ||
|
|
||
| /// Returns a deterministic, pseudo-random sample of code points in `0..=0x10_FFFF` (surrogates may be included), generated once and cached. | ||
| pub fn codepoint_samples() -> &'static [u32] { | ||
| static SAMPLES: OnceLock<Vec<u32>> = OnceLock::new(); | ||
| SAMPLES.get_or_init(build_sample_codepoints) | ||
| } | ||
|
|
||
| /// Accumulates a checksum from the packed properties stored in the CodePointTrie. | ||
| pub fn checksum_trie(samples: &[u32], composite: &'static CompositePropsV1Data<'static>) -> u32 { | ||
| samples | ||
| .iter() | ||
| .fold(0_u32, |acc, &cp| acc ^ u32::from(composite.properties(cp))) | ||
| } | ||
|
|
||
| /// Accumulates a checksum from the properties stored in the PackTab tables. | ||
| pub fn checksum_packtab(samples: &[u32]) -> u32 { | ||
| samples.iter().fold(0_u32, |acc, &cp| { | ||
| acc ^ u32::from(unicode_data::packtab_properties(cp)) | ||
| }) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| #!/usr/bin/env python3 | ||
| """ MUST BE RUN IN PACKTAB REPO ROOT """ | ||
|
|
||
| import sys | ||
| import json | ||
| sys.path.insert(0, 'Lib') | ||
|
|
||
| from packTab import pack_table, Code | ||
|
|
||
| with open('/Users/taj/repos/parley/composite_props.json', 'r') as f: | ||
| data = json.load(f) | ||
|
|
||
| print(f"Loaded {len(data)} Unicode codepoint properties") | ||
|
|
||
| # Find the most common value to use as default | ||
| from collections import Counter | ||
| value_counts = Counter(data) | ||
| default = value_counts.most_common(1)[0][0] | ||
| default_count = value_counts[default] | ||
|
|
||
| print(f"Most common value: {default} (appears {default_count} times, {100*default_count/len(data):.2f}%)") | ||
| print(f"Unique values: {len(value_counts)}") | ||
| print() | ||
|
|
||
| print("Running packTab compression...") | ||
| print("This may take a moment for ~1.1M values...") | ||
|
|
||
| # Pack the table with compression level 3 | ||
| solution = pack_table(data, default=default, compression=3) | ||
|
|
||
| print() | ||
| print("Compression results:") | ||
| print(f" Uncompressed: {len(data) * 4} bytes (u32 array)") | ||
| print(f" Compressed: {solution.cost} bytes") | ||
| print(f" Compression ratio: {(len(data) * 4) / solution.cost:.2f}x") | ||
| print(f" Lookups needed: {solution.nLookups}") | ||
| print(f" Extra operations: {solution.nExtraOps}") | ||
| print() | ||
|
|
||
| print("Generating Rust code...") | ||
| code = Code("composite_props") | ||
| solution.genCode(code, "get", language="rust", private=False) | ||
|
|
||
| output_file = './composite_props_packtab.rs' | ||
| print(f"Writing to {output_file}...") | ||
|
|
||
| with open(output_file, 'w') as f: | ||
| f.write("// Auto-generated by packTab\n") | ||
| f.write("// Unicode composite properties lookup table\n") | ||
| f.write("//\n") | ||
| f.write(f"// Original size: {len(data)} u32 values = {len(data) * 4} bytes\n") | ||
| f.write(f"// Compressed size: {solution.cost} bytes\n") | ||
| f.write(f"// Compression ratio: {(len(data) * 4) / solution.cost:.2f}x\n") | ||
| f.write(f"// Lookups: {solution.nLookups}, Extra ops: {solution.nExtraOps}\n") | ||
| f.write("//\n") | ||
| f.write(f"// Default value: {default}\n") | ||
| f.write("//\n") | ||
| f.write("// Usage: composite_props_get(codepoint as usize) -> u32\n") | ||
| f.write("\n") | ||
|
|
||
| # Write the generated Rust code to the output file | ||
| code.print_code(file=f, language="rust", private=False) | ||
|
|
||
| print(f"✓ Successfully generated {output_file}") | ||
| print() | ||
| print("Summary:") | ||
| print(f" Input: {len(data):,} codepoints") | ||
| print(f" Output: {solution.cost:,} bytes") | ||
| print(f" Savings: {len(data) * 4 - solution.cost:,} bytes") | ||
| print(f" Space efficiency: {100 * solution.cost / (len(data) * 4):.2f}% of original") |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,9 @@ | |
|
|
||
| #![no_std] | ||
|
|
||
| #[cfg(test)] | ||
| extern crate std; | ||
|
|
||
| use icu_collections::codepointtrie::CodePointTrie; | ||
| use icu_properties::props::{GeneralCategory, GraphemeClusterBreak, Script}; | ||
| use zerofrom::ZeroFrom; | ||
|
|
@@ -16,6 +19,8 @@ use zerofrom::ZeroFrom; | |
| #[cfg(feature = "baked")] | ||
| pub mod generated; | ||
|
|
||
| mod packtab; | ||
|
|
||
| /// A data provider of `CompositePropsV1`. | ||
| #[derive(Clone, Debug, Eq, PartialEq, yoke::Yokeable, ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(databake::Bake))] | ||
|
|
@@ -68,6 +73,13 @@ impl CompositePropsV1Data<'_> { | |
| } | ||
| } | ||
|
|
||
| /// Returns the composite properties stored in the PackTab tables. | ||
| #[inline(always)] | ||
| pub fn packtab_properties(ch: u32) -> Properties { | ||
| debug_assert!(ch <= 0x10_FFFF, "Invalid scalar value: {ch:#X}"); | ||
| Properties(packtab::composite_props_get(ch as usize)) | ||
| } | ||
|
|
||
| impl unicode_bidi::BidiDataSource for CompositePropsV1Data<'_> { | ||
| fn bidi_class(&self, cp: char) -> unicode_bidi::BidiClass { | ||
| self.properties(cp as u32).bidi_class() | ||
|
|
@@ -258,6 +270,35 @@ impl From<Properties> for u32 { | |
| } | ||
| } | ||
|
|
||
| #[cfg(all(test, feature = "baked"))] | ||
| mod tests { | ||
| use super::*; | ||
| use icu_provider::buf::AsDeserializingBufferProvider; | ||
| use icu_provider::{DataMarker, DataRequest, DynamicDataProvider}; | ||
|
|
||
| #[test] | ||
| fn packtab_matches_trie() { | ||
|
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Test to ensure that both PackTab and the trie return the same values. |
||
| let provider = icu_provider_blob::BlobDataProvider::try_new_from_static_blob( | ||
| generated::COMPOSITE_BLOB, | ||
| ) | ||
| .expect("Composite blob should deserialize"); | ||
| let response: icu_provider::DataResponse<CompositePropsV1> = provider | ||
| .as_deserializing() | ||
| .load_data(CompositePropsV1::INFO, DataRequest::default()) | ||
| .expect("Composite data should load"); | ||
| let data = response.payload.get().clone(); | ||
|
|
||
| for cp in 0_u32..=0x10_FFFF { | ||
| let trie_value: u32 = data.properties(cp).into(); | ||
| let packtab_value: u32 = packtab_properties(cp).into(); | ||
| assert_eq!( | ||
| trie_value, packtab_value, | ||
| "Mismatch at code point U+{cp:04X}" | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[cfg(feature = "datagen")] | ||
| fn unicode_to_unicode_bidi(bidi: icu_properties::props::BidiClass) -> unicode_bidi::BidiClass { | ||
| use icu_properties::props::BidiClass; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| // Copyright 2025 the Parley Authors | ||
| // SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
|
||
| //! Utility binary that dumps the composite property trie into a JSON array. | ||
|
|
||
| #[cfg(feature = "baked")] | ||
| use std::env; | ||
| #[cfg(feature = "baked")] | ||
| use std::fs::File; | ||
| #[cfg(feature = "baked")] | ||
| use std::io::{BufWriter, Write}; | ||
| #[cfg(feature = "baked")] | ||
| use std::path::PathBuf; | ||
|
|
||
| #[cfg(feature = "baked")] | ||
| use icu_provider::buf::AsDeserializingBufferProvider; | ||
| #[cfg(feature = "baked")] | ||
| use icu_provider::{DataMarker, DataRequest, DataResponse, DynamicDataProvider}; | ||
| #[cfg(feature = "baked")] | ||
| use unicode_data::{CompositePropsV1, CompositePropsV1Data}; | ||
|
|
||
| #[cfg(feature = "baked")] | ||
| fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
| const DEFAULT_OUTPUT: &str = "composite_props.json"; | ||
| const MAX_UNICODE_SCALAR: u32 = 0x10_FFFF; | ||
|
|
||
| let output_path = env::args() | ||
| .nth(1) | ||
| .map(PathBuf::from) | ||
| .unwrap_or_else(|| PathBuf::from(DEFAULT_OUTPUT)); | ||
|
|
||
| let provider = icu_provider_blob::BlobDataProvider::try_new_from_static_blob( | ||
| unicode_data::generated::COMPOSITE_BLOB, | ||
| )?; | ||
|
|
||
| let response: DataResponse<CompositePropsV1> = provider | ||
| .as_deserializing() | ||
| .load_data(CompositePropsV1::INFO, DataRequest::default())?; | ||
| let composite: &CompositePropsV1Data<'_> = response.payload.get(); | ||
|
|
||
| let file = File::create(&output_path)?; | ||
| let mut writer = BufWriter::new(file); | ||
|
|
||
| writer.write_all(b"[")?; | ||
| for cp in 0_u32..=MAX_UNICODE_SCALAR { | ||
| let value: u32 = composite.properties(cp).into(); | ||
| if cp != 0 { | ||
| writer.write_all(b",")?; | ||
| } | ||
| write!(writer, "{value}")?; | ||
| } | ||
| writer.write_all(b"]\n")?; | ||
|
|
||
| println!( | ||
| "Wrote {} composite property values to {}", | ||
| (MAX_UNICODE_SCALAR + 1), | ||
| output_path.display() | ||
| ); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[cfg(not(feature = "baked"))] | ||
| fn main() { | ||
| eprintln!( | ||
| "The unicode_data binary requires the `baked` feature. \ | ||
| Rebuild with `--features baked` (enabled by default)." | ||
| ); | ||
| std::process::exit(1); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the raw data fed to PackTab