Skip to content

Commit 44a36be

Browse files
committed
Improve parsing of sign characters.
1 parent ba1574c commit 44a36be

File tree

2 files changed

+70
-16
lines changed

2 files changed

+70
-16
lines changed

lexical-parse-integer/src/algorithm.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -209,20 +209,18 @@ macro_rules! parse_sign {
209209
$invalid_positive:ident,
210210
$missing:ident
211211
) => {
212-
// NOTE: `read_if` optimizes poorly since we then match after
213-
match $byte.integer_iter().first() {
214-
Some(&b'+') if !$no_positive => {
212+
match $byte.integer_iter().parse_sign() {
213+
(false, true) if !$no_positive => {
215214
// SAFETY: We have at least 1 item left since we peaked a value
216215
unsafe { $byte.step_unchecked() };
217216
Ok(false)
218217
},
219-
Some(&b'+') if $no_positive => Err(Error::$invalid_positive($byte.cursor())),
220-
Some(&b'-') if $is_signed => {
218+
(false, true) if $no_positive => Err(Error::$invalid_positive($byte.cursor())),
219+
(true, true) if $is_signed => {
221220
// SAFETY: We have at least 1 item left since we peaked a value
222221
unsafe { $byte.step_unchecked() };
223222
Ok(true)
224223
},
225-
Some(_) if $required => Err(Error::$missing($byte.cursor())),
226224
_ if $required => Err(Error::$missing($byte.cursor())),
227225
_ => Ok(false),
228226
}
@@ -742,7 +740,7 @@ macro_rules! algorithm {
742740
// culminates in **way** slower performance overall for simple
743741
// integers, and no improvement for large integers.
744742
let mut value = T::ZERO;
745-
#[allow(unused_mut)]
743+
#[allow(unused_variables, unused_mut)]
746744
let mut has_suffix = false;
747745
if cannot_overflow && is_negative {
748746
parse_digits_unchecked!(
@@ -794,7 +792,8 @@ macro_rules! algorithm {
794792
);
795793
}
796794

797-
if cfg!(all(feature = "format", feature = "power-of-two")) && format.required_base_suffix() && !has_suffix {
795+
#[cfg(all(feature = "format", feature = "power-of-two"))]
796+
if format.required_base_suffix() && !has_suffix {
798797
return Err(Error::MissingBaseSuffix(iter.cursor()));
799798
}
800799

lexical-util/src/iterator.rs

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ pub unsafe trait Iter<'a> {
124124
self.get_buffer().get(self.cursor())
125125
}
126126

127-
/// Check if the next element is a given value.
127+
/// Check if the next element is a given value, in a case-
128+
/// sensitive manner.
128129
#[inline(always)]
129130
fn first_is_cased(&self, value: u8) -> bool {
130131
Some(&value) == self.first()
@@ -167,6 +168,10 @@ pub unsafe trait Iter<'a> {
167168
/// [`increment_count`] afterwards or else the internal count will
168169
/// be incorrect.
169170
///
171+
/// This does not skip digit separators and so if used incorrectly,
172+
/// the buffer may be in an invalid state, such as setting the next
173+
/// return value to a digit separator it should have skipped.
174+
///
170175
/// [`increment_count`]: DigitsIter::increment_count
171176
///
172177
/// # Panics
@@ -183,13 +188,16 @@ pub unsafe trait Iter<'a> {
183188

184189
/// Advance the internal slice by 1 element.
185190
///
186-
///
187191
/// This does not increment the count of items: returns: this only
188192
/// increments the index, not the total digits returned. You must
189193
/// use this carefully: if stepping over a digit, you must then call
190194
/// [`increment_count`] afterwards or else the internal count will
191195
/// be incorrect.
192196
///
197+
/// This does not skip digit separators and so if used incorrectly,
198+
/// the buffer may be in an invalid state, such as setting the next
199+
/// return value to a digit separator it should have skipped.
200+
///
193201
/// [`increment_count`]: DigitsIter::increment_count
194202
///
195203
/// # Panics
@@ -233,6 +241,7 @@ pub unsafe trait Iter<'a> {
233241
/// Try to read a the next four bytes as a u32.
234242
///
235243
/// This does not advance the internal state of the iterator.
244+
/// This will only return a value for contiguous iterators.
236245
#[inline(always)]
237246
fn peek_u32(&self) -> Option<u32> {
238247
if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u32>() {
@@ -309,9 +318,32 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
309318
/// for iterators that find the first non-zero value, etc. We optimize
310319
/// this for the case where we have contiguous iterators, since
311320
/// non-contiguous iterators already have a major performance penalty.
321+
///
322+
/// That is, say we have the following buffer and are skipping `_`
323+
/// characters, peek will advance the internal index to `2` if it
324+
/// can skip characters there.
325+
///
326+
/// +---+---+---+---+---+ +---+---+---+---+
327+
/// | _ | 2 | _ | _ | 3 | -> | 2 | _ | _ | 3 |
328+
/// +---+---+---+---+---+ +---+---+---+---+
329+
///
330+
/// For implementation reasons, where digit separators may not be
331+
/// allowed afterwards that character, it must stop right there.
312332
fn peek(&mut self) -> Option<Self::Item>;
313333

314334
/// Peek the next value of the iterator, and step only if it exists.
335+
///
336+
/// This will always advance to one byte past the peek value, since
337+
/// we may need to know internally if the next character is a digit
338+
/// separator.
339+
///
340+
/// That is, say we have the following buffer and are skipping `_`
341+
/// characters, peek will advance the internal index to `_` if it
342+
/// can skip characters there.
343+
///
344+
/// +---+---+---+---+---+ +---+---+---+
345+
/// | _ | 2 | _ | _ | 3 | -> | _ | _ | 3 |
346+
/// +---+---+---+---+---+ +---+---+---+
315347
#[inline(always)]
316348
fn try_read(&mut self) -> Option<Self::Item> {
317349
if let Some(value) = self.peek() {
@@ -323,13 +355,15 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
323355
}
324356
}
325357

326-
/// Check if the next element is a given value.
358+
/// Check if the next element is a given value, in a case-
359+
/// sensitive manner.
327360
#[inline(always)]
328361
fn peek_is_cased(&mut self, value: u8) -> bool {
329362
Some(&value) == self.peek()
330363
}
331364

332-
/// Check if the next element is a given value without case sensitivity.
365+
/// Check if the next element is a given value without case
366+
/// sensitivity.
333367
#[inline(always)]
334368
fn peek_is_uncased(&mut self, value: u8) -> bool {
335369
if let Some(&c) = self.peek() {
@@ -351,7 +385,7 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
351385
}
352386

353387
/// Peek the next value and consume it if the read value matches the
354-
/// expected one.
388+
/// expected one using a custom predicate.
355389
#[inline(always)]
356390
fn read_if<Pred: FnOnce(u8) -> bool>(&mut self, pred: Pred) -> Option<u8> {
357391
// NOTE: This was implemented to remove usage of unsafe throughout to code
@@ -370,7 +404,8 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
370404
}
371405
}
372406

373-
/// Read a value if the value matches the provided one.
407+
/// Read a value if the value matches the provided one, in a case-
408+
/// sensitive manner.
374409
#[inline(always)]
375410
fn read_if_value_cased(&mut self, value: u8) -> Option<u8> {
376411
if self.peek() == Some(&value) {
@@ -389,7 +424,8 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
389424
self.read_if(|x| x.eq_ignore_ascii_case(&value))
390425
}
391426

392-
/// Read a value if the value matches the provided one.
427+
/// Read a value if the value matches the provided one, with optional
428+
/// case sensitivity.
393429
#[inline(always)]
394430
fn read_if_value(&mut self, value: u8, is_cased: bool) -> Option<u8> {
395431
if is_cased {
@@ -399,7 +435,7 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
399435
}
400436
}
401437

402-
/// Skip zeros from the start of the iterator
438+
/// Skip zeros from the start of the iterator.
403439
#[inline(always)]
404440
fn skip_zeros(&mut self) -> usize {
405441
let start = self.current_count();
@@ -411,4 +447,23 @@ pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
411447

412448
/// Determine if the character is a digit.
413449
fn is_digit(&self, value: u8) -> bool;
450+
451+
// -------
452+
453+
/// Parse the sign from the iterator.
454+
///
455+
/// If this allows leading digit separators, it will handle
456+
/// those internally and advance the state as needed. This
457+
/// returned if the value is negative and if a sign was found.
458+
///
459+
/// The default implementation does not support digit separators.
460+
#[inline(always)]
461+
fn parse_sign(&mut self) -> (bool, bool) {
462+
// NOTE: `read_if` optimizes poorly since we then match after
463+
match self.first() {
464+
Some(&b'+') => (false, true),
465+
Some(&b'-') => (true, true),
466+
_ => (false, false)
467+
}
468+
}
414469
}

0 commit comments

Comments
 (0)