Skip to content

Commit ced5509

Browse files
haberman authored and copybara-github committed
Use the standard library's implementation of Utf8Chunks.
Also removed the `chars()` and `utf8_chunks()` methods, as it's equally easy for the user to call `proto_str.as_bytes().utf8_chunks()`. PiperOrigin-RevId: 850439621
1 parent c5a3469 commit ced5509

File tree

4 files changed

+14
-566
lines changed

4 files changed

+14
-566
lines changed

rust/BUILD

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ ALL_RUST_SRCS = PROTOBUF_SHARED + [
7474
"gtest_matchers_impl.rs",
7575
"protobuf.rs",
7676
"upb.rs",
77-
"utf8.rs",
7877
# go/keep-sorted end
7978
]
8079

@@ -97,7 +96,6 @@ rust_library(
9796
],
9897
visibility = [":protobuf_internal"],
9998
deps = [
100-
":utf8",
10199
"//rust/upb",
102100
],
103101
)
@@ -144,7 +142,6 @@ rust_library(
144142
],
145143
visibility = [":protobuf_internal"],
146144
deps = [
147-
":utf8",
148145
"//rust/cpp_kernel:cpp_api",
149146
],
150147
)
@@ -215,11 +212,6 @@ rust_library(
215212
],
216213
)
217214

218-
rust_library(
219-
name = "utf8",
220-
srcs = ["utf8.rs"],
221-
)
222-
223215
proto_lang_toolchain(
224216
name = "proto_rust_upb_toolchain",
225217
command_line = "--rust_out=$(OUT)",

rust/shared.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,6 @@ mod string;
7070
#[path = "upb/lib.rs"]
7171
mod upb;
7272

73-
#[cfg(not(bzl))]
74-
mod utf8;
75-
76-
// Forces the utf8 crate to be accessible from crate::.
77-
#[cfg(bzl)]
78-
#[allow(clippy::single_component_path_imports)]
79-
use utf8;
80-
8173
// If the Upb and C++ kernels are both linked into the same binary, this symbol
8274
// will be defined twice and cause a link error.
8375
#[unsafe(no_mangle)]

rust/string.rs

Lines changed: 14 additions & 244 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,7 @@
1111

1212
use crate::__internal::runtime::InnerProtoString;
1313
use crate::__internal::{Private, SealedInternal};
14-
use crate::{
15-
utf8::Utf8Chunks, AsView, IntoProxied, IntoView, Mut, MutProxied, Optional, Proxied, View,
16-
};
14+
use crate::{AsView, IntoProxied, IntoView, Mut, MutProxied, Optional, Proxied, View};
1715
use std::borrow::Cow;
1816
use std::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd};
1917
use std::convert::{AsMut, AsRef};
@@ -394,43 +392,6 @@ impl ProtoStr {
394392
self.0.len()
395393
}
396394

397-
/// Iterates over the `char`s in this protobuf `string`.
398-
///
399-
/// Invalid UTF-8 sequences are replaced with
400-
/// [`U+FFFD REPLACEMENT CHARACTER`].
401-
///
402-
/// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
403-
pub fn chars(&self) -> impl Iterator<Item = char> + '_ + fmt::Debug {
404-
Utf8Chunks::new(self.as_bytes()).flat_map(|chunk| {
405-
let mut yield_replacement_char = !chunk.invalid().is_empty();
406-
chunk.valid().chars().chain(iter::from_fn(move || {
407-
// Yield a single replacement character for every
408-
// non-empty invalid sequence.
409-
yield_replacement_char.then(|| {
410-
yield_replacement_char = false;
411-
char::REPLACEMENT_CHARACTER
412-
})
413-
}))
414-
})
415-
}
416-
417-
/// Returns an iterator over chunks of UTF-8 data in the string.
418-
///
419-
/// An `Ok(&str)` is yielded for every valid UTF-8 chunk, and an
420-
/// `Err(&[u8])` for each non-UTF-8 chunk. An `Err` will be emitted
421-
/// multiple times in a row for contiguous invalid chunks. Each invalid
422-
/// chunk in an `Err` has a maximum length of 3 bytes.
423-
pub fn utf8_chunks(&self) -> impl Iterator<Item = Result<&str, &[u8]>> + '_ {
424-
Utf8Chunks::new(self.as_bytes()).flat_map(|chunk| {
425-
let valid = chunk.valid();
426-
let invalid = chunk.invalid();
427-
(!valid.is_empty())
428-
.then_some(Ok(valid))
429-
.into_iter()
430-
.chain((!invalid.is_empty()).then_some(Err(invalid)))
431-
})
432-
}
433-
434395
/// Converts known-UTF-8 bytes to a `ProtoStr` without a check.
435396
///
436397
/// # Safety
@@ -486,20 +447,24 @@ impl<'msg> TryFrom<&'msg [u8]> for &'msg ProtoStr {
486447

487448
impl fmt::Debug for ProtoStr {
488449
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
489-
fmt::Debug::fmt(&Utf8Chunks::new(self.as_bytes()).debug(), f)
450+
write!(f, "\"");
451+
for chunk in self.as_bytes().utf8_chunks() {
452+
for ch in chunk.valid().chars() {
453+
write!(f, "{}", ch.escape_debug());
454+
}
455+
for byte in chunk.invalid() {
456+
// Format byte as \xff.
457+
write!(f, "\\x{:02X}", byte);
458+
}
459+
}
460+
write!(f, "\"");
461+
Ok(())
490462
}
491463
}
492464

493465
impl fmt::Display for ProtoStr {
494466
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
495-
use std::fmt::Write as _;
496-
for chunk in Utf8Chunks::new(self.as_bytes()) {
497-
fmt::Display::fmt(chunk.valid(), f)?;
498-
if !chunk.invalid().is_empty() {
499-
// One invalid chunk is emitted per detected invalid sequence.
500-
f.write_char(char::REPLACEMENT_CHARACTER)?;
501-
}
502-
}
467+
fmt::Display::fmt(&String::from_utf8_lossy(self.as_bytes()), f)?;
503468
Ok(())
504469
}
505470
}
@@ -608,199 +573,4 @@ mod tests {
608573
// inside of `ProtoStr`.
609574
unsafe { ProtoStr::from_utf8_unchecked(bytes) }
610575
}
611-
612-
// UTF-8 test cases copied from:
613-
// https://github.com/rust-lang/rust/blob/e8ee0b7/library/core/tests/str_lossy.rs
614-
615-
#[gtest]
616-
fn proto_str_debug() {
617-
assert_eq!(&format!("{:?}", test_proto_str(b"Hello There")), "\"Hello There\"");
618-
assert_eq!(
619-
&format!(
620-
"{:?}",
621-
test_proto_str(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa"),
622-
),
623-
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
624-
);
625-
}
626-
627-
#[gtest]
628-
fn proto_str_display() {
629-
assert_eq!(&test_proto_str(b"Hello There").to_string(), "Hello There");
630-
assert_eq!(
631-
&test_proto_str(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").to_string(),
632-
"Hello�� There� Goodbye\u{10d4ea}",
633-
);
634-
}
635-
636-
#[gtest]
637-
fn proto_str_to_rust_str() {
638-
assert_eq!(test_proto_str(b"hello").to_str(), Ok("hello"));
639-
assert_eq!(test_proto_str("ศไทย中华Việt Nam".as_bytes()).to_str(), Ok("ศไทย中华Việt Nam"));
640-
for expect_fail in [
641-
&b"Hello\xC2 There\xFF Goodbye"[..],
642-
b"Hello\xC0\x80 There\xE6\x83 Goodbye",
643-
b"\xF5foo\xF5\x80bar",
644-
b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
645-
b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
646-
b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
647-
b"\xED\xA0\x80foo\xED\xBF\xBFbar",
648-
] {
649-
assert!(
650-
matches!(test_proto_str(expect_fail).to_str(), Err(Utf8Error { inner: _ })),
651-
"{expect_fail:?}"
652-
);
653-
}
654-
}
655-
656-
#[gtest]
657-
fn proto_str_to_cow() {
658-
assert_eq!(test_proto_str(b"hello").to_cow_lossy(), Cow::Borrowed("hello"));
659-
assert_eq!(
660-
test_proto_str("ศไทย中华Việt Nam".as_bytes()).to_cow_lossy(),
661-
Cow::Borrowed("ศไทย中华Việt Nam")
662-
);
663-
for (bytes, lossy_str) in [
664-
(&b"Hello\xC2 There\xFF Goodbye"[..], "Hello� There� Goodbye"),
665-
(b"Hello\xC0\x80 There\xE6\x83 Goodbye", "Hello�� There� Goodbye"),
666-
(b"\xF5foo\xF5\x80bar", "�foo��bar"),
667-
(b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "�foo�bar�baz"),
668-
(b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "�foo�bar��baz"),
669-
(b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "����foo\u{10000}bar"),
670-
(b"\xED\xA0\x80foo\xED\xBF\xBFbar", "���foo���bar"),
671-
] {
672-
let cow = test_proto_str(bytes).to_cow_lossy();
673-
assert!(matches!(cow, Cow::Owned(_)));
674-
assert_eq!(&*cow, lossy_str, "{bytes:?}");
675-
}
676-
}
677-
678-
#[gtest]
679-
fn proto_str_utf8_chunks() {
680-
macro_rules! assert_chunks {
681-
($bytes:expr, $($chunks:expr),* $(,)?) => {
682-
let bytes = $bytes;
683-
let chunks: &[std::result::Result<&str, &[u8]>] = &[$($chunks),*];
684-
let s = test_proto_str(bytes);
685-
let mut got_chunks = s.utf8_chunks();
686-
let mut expected_chars = chunks.iter().copied();
687-
assert!(got_chunks.eq(expected_chars), "{bytes:?} -> {chunks:?}");
688-
};
689-
}
690-
assert_chunks!(b"hello", Ok("hello"));
691-
assert_chunks!("ศไทย中华Việt Nam".as_bytes(), Ok("ศไทย中华Việt Nam"));
692-
assert_chunks!(
693-
b"Hello\xC2 There\xFF Goodbye",
694-
Ok("Hello"),
695-
Err(b"\xC2"),
696-
Ok(" There"),
697-
Err(b"\xFF"),
698-
Ok(" Goodbye"),
699-
);
700-
assert_chunks!(
701-
b"Hello\xC0\x80 There\xE6\x83 Goodbye",
702-
Ok("Hello"),
703-
Err(b"\xC0"),
704-
Err(b"\x80"),
705-
Ok(" There"),
706-
Err(b"\xE6\x83"),
707-
Ok(" Goodbye"),
708-
);
709-
assert_chunks!(
710-
b"\xF5foo\xF5\x80bar",
711-
Err(b"\xF5"),
712-
Ok("foo"),
713-
Err(b"\xF5"),
714-
Err(b"\x80"),
715-
Ok("bar"),
716-
);
717-
assert_chunks!(
718-
b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
719-
Err(b"\xF1"),
720-
Ok("foo"),
721-
Err(b"\xF1\x80"),
722-
Ok("bar"),
723-
Err(b"\xF1\x80\x80"),
724-
Ok("baz"),
725-
);
726-
assert_chunks!(
727-
b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
728-
Err(b"\xF4"),
729-
Ok("foo"),
730-
Err(b"\xF4\x80"),
731-
Ok("bar"),
732-
Err(b"\xF4"),
733-
Err(b"\xBF"),
734-
Ok("baz"),
735-
);
736-
assert_chunks!(
737-
b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
738-
Err(b"\xF0"),
739-
Err(b"\x80"),
740-
Err(b"\x80"),
741-
Err(b"\x80"),
742-
Ok("foo\u{10000}bar"),
743-
);
744-
assert_chunks!(
745-
b"\xED\xA0\x80foo\xED\xBF\xBFbar",
746-
Err(b"\xED"),
747-
Err(b"\xA0"),
748-
Err(b"\x80"),
749-
Ok("foo"),
750-
Err(b"\xED"),
751-
Err(b"\xBF"),
752-
Err(b"\xBF"),
753-
Ok("bar"),
754-
);
755-
}
756-
757-
#[gtest]
758-
fn proto_str_chars() {
759-
macro_rules! assert_chars {
760-
($bytes:expr, $chars:expr) => {
761-
let bytes = $bytes;
762-
let chars = $chars;
763-
let s = test_proto_str(bytes);
764-
let mut got_chars = s.chars();
765-
let mut expected_chars = chars.into_iter();
766-
assert!(got_chars.eq(expected_chars), "{bytes:?} -> {chars:?}");
767-
};
768-
}
769-
assert_chars!(b"hello", ['h', 'e', 'l', 'l', 'o']);
770-
assert_chars!(
771-
"ศไทย中华Việt Nam".as_bytes(),
772-
['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']
773-
);
774-
assert_chars!(
775-
b"Hello\xC2 There\xFF Goodbye",
776-
[
777-
'H', 'e', 'l', 'l', 'o', '�', ' ', 'T', 'h', 'e', 'r', 'e', '�', ' ', 'G', 'o',
778-
'o', 'd', 'b', 'y', 'e'
779-
]
780-
);
781-
assert_chars!(
782-
b"Hello\xC0\x80 There\xE6\x83 Goodbye",
783-
[
784-
'H', 'e', 'l', 'l', 'o', '�', '�', ' ', 'T', 'h', 'e', 'r', 'e', '�', ' ', 'G',
785-
'o', 'o', 'd', 'b', 'y', 'e'
786-
]
787-
);
788-
assert_chars!(b"\xF5foo\xF5\x80bar", ['�', 'f', 'o', 'o', '�', '�', 'b', 'a', 'r']);
789-
assert_chars!(
790-
b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
791-
['�', 'f', 'o', 'o', '�', 'b', 'a', 'r', '�', 'b', 'a', 'z']
792-
);
793-
assert_chars!(
794-
b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
795-
['�', 'f', 'o', 'o', '�', 'b', 'a', 'r', '�', '�', 'b', 'a', 'z']
796-
);
797-
assert_chars!(
798-
b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
799-
['�', '�', '�', '�', 'f', 'o', 'o', '\u{10000}', 'b', 'a', 'r']
800-
);
801-
assert_chars!(
802-
b"\xED\xA0\x80foo\xED\xBF\xBFbar",
803-
['�', '�', '�', 'f', 'o', 'o', '�', '�', '�', 'b', 'a', 'r']
804-
);
805-
}
806576
}

0 commit comments

Comments
 (0)