Skip to content

Commit f877b32

Browse files
committed
Don't split UTF-8 across threads in str::Bytes
1 parent 30121d8 commit f877b32

File tree

2 files changed

+41
-11
lines changed

2 files changed

+41
-11
lines changed

src/str.rs

Lines changed: 40 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,6 @@
1515
1616
use iter::*;
1717
use iter::plumbing::*;
18-
use slice;
1918
use split_producer::*;
2019

2120

@@ -79,6 +78,11 @@ pub trait ParallelString {
7978

8079
/// Returns a parallel iterator over the bytes of a string.
8180
///
81+
/// Note that multi-byte sequences (for code points greater than `U+007F`)
82+
/// are produced as separate items, but will not be split across threads.
83+
/// If you would prefer an indexed iterator without that guarantee, consider
84+
/// `string.as_bytes().par_iter().cloned()` instead.
85+
///
8286
/// # Examples
8387
///
8488
/// ```
@@ -87,13 +91,12 @@ pub trait ParallelString {
8791
/// assert_eq!(Some(b'o'), max);
8892
/// ```
8993
fn par_bytes(&self) -> Bytes {
90-
let bytes = self.as_parallel_string().as_bytes();
91-
Bytes { inner: bytes.par_iter().cloned() }
94+
Bytes { chars: self.as_parallel_string() }
9295
}
9396

9497
/// Returns a parallel iterator over a string encoded as UTF-16.
9598
///
96-
/// Note that surrogate pairs (for codepoints greater than `U+FFFF`) are
99+
/// Note that surrogate pairs (for code points greater than `U+FFFF`) are
97100
/// produced as separate items, but will not be split across threads.
98101
///
99102
/// # Examples
@@ -445,14 +448,43 @@ impl<'ch> UnindexedProducer for CharIndicesProducer<'ch> {
445448
/// Parallel iterator over the bytes of a string
446449
#[derive(Debug, Clone)]
447450
pub struct Bytes<'ch> {
448-
inner: Cloned<slice::Iter<'ch, u8>>,
451+
chars: &'ch str,
449452
}
450453

451-
delegate_indexed_iterator! {
452-
Bytes<'ch> => u8,
453-
impl<'ch>
454+
struct BytesProducer<'ch> {
455+
chars: &'ch str,
454456
}
455457

458+
impl<'ch> ParallelIterator for Bytes<'ch> {
459+
type Item = u8;
460+
461+
fn drive_unindexed<C>(self, consumer: C) -> C::Result
462+
where C: UnindexedConsumer<Self::Item>
463+
{
464+
bridge_unindexed(BytesProducer { chars: self.chars }, consumer)
465+
}
466+
}
467+
468+
impl<'ch> UnindexedProducer for BytesProducer<'ch> {
469+
type Item = u8;
470+
471+
fn split(mut self) -> (Self, Option<Self>) {
472+
let index = find_char_midpoint(self.chars);
473+
if index > 0 {
474+
let (left, right) = self.chars.split_at(index);
475+
self.chars = left;
476+
(self, Some(BytesProducer { chars: right }))
477+
} else {
478+
(self, None)
479+
}
480+
}
481+
482+
fn fold_with<F>(self, folder: F) -> F
483+
where F: Folder<Self::Item>
484+
{
485+
folder.consume_iter(self.chars.bytes())
486+
}
487+
}
456488

457489
// /////////////////////////////////////////////////////////////////////////
458490

tests/str.rs

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,7 @@ pub fn execute_strings() {
2121
let par_chars: String = vchars.par_iter().collect();
2222
assert_eq!(s, par_chars);
2323

24-
let mut par_bytes: Vec<u8> = s.par_bytes().collect();
25-
assert_eq!(s.as_bytes(), &*par_bytes);
26-
s.par_bytes().collect_into_vec(&mut par_bytes); // indexed!
24+
let par_bytes: Vec<u8> = s.par_bytes().collect();
2725
assert_eq!(s.as_bytes(), &*par_bytes);
2826

2927
let par_utf16: Vec<u16> = s.par_encode_utf16().collect();

0 commit comments

Comments
 (0)