From 8ed33ea603509f5afcb4c06b771197f7a52d8fd0 Mon Sep 17 00:00:00 2001 From: Mahdi Ali-Raihan Date: Tue, 5 May 2026 14:55:52 -0400 Subject: [PATCH 1/2] Redesigned Components iterator to use front and back indexing instead of mutating and subslicing path field; as a result, Components iterator memory size goes from 64 bytes to 40 bytes and as_path does not use cloning at all --- library/std/src/path.rs | 681 ++++++++++++++++++++++++---------------- 1 file changed, 415 insertions(+), 266 deletions(-) diff --git a/library/std/src/path.rs b/library/std/src/path.rs index 222bf77996c7f..81cbc79dd6f9f 100644 --- a/library/std/src/path.rs +++ b/library/std/src/path.rs @@ -391,20 +391,6 @@ fn validate_extension(extension: &OsStr) { // The core iterators //////////////////////////////////////////////////////////////////////////////// -/// Component parsing works by a double-ended state machine; the cursors at the -/// front and back of the path each keep track of what parts of the path have -/// been consumed so far. -/// -/// Going front to back, a path is made up of a prefix, a starting -/// directory component, and a body (of normal components) -#[derive(Copy, Clone, PartialEq, PartialOrd, Debug)] -enum State { - Prefix = 0, // c: - StartDir = 1, // / or . or nothing - Body = 2, // foo/bar/baz - Done = 3, -} - /// A structure wrapping a Windows path prefix as well as its unparsed string /// representation. /// @@ -597,6 +583,18 @@ impl AsRef for Component<'_> { } } +/// This is what the first component of our path is +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum FirstComponent { + /// For all paths starting with `/` + AbsolutePath, + /// For paths without root path like `.`, `..`, `a/` + RelativePath, + /// For Window specific paths like (`C:`, `\\?\UNC\server\share`, + /// `\\.\COM42`, etc.) + Prefix, +} + /// An iterator over the [`Component`]s of a [`Path`]. /// /// This `struct` is created by the [`components`] method on [`Path`]. 
@@ -615,25 +613,26 @@ impl AsRef for Component<'_> { /// ``` /// /// [`components`]: Path::components -#[derive(Clone)] +#[derive(Copy, Clone)] #[must_use = "iterators are lazy and do nothing unless consumed"] #[stable(feature = "rust1", since = "1.0.0")] pub struct Components<'a> { // The path left to parse components from path: &'a [u8], - - // The prefix as it was originally parsed, if any - prefix: Option>, - - // true if path *physically* has a root separator; for most Windows + // The iterator is double-ended, and these two indices keep track of how to + // subslice the path to present the unconsumed components accordingly + // If `front` starts off as non-zero on creating a `Components<'_>` iterator, a + // prefix is present. `back` may not equal to `path.len()` if trailing separators + // are present. + front: usize, + back: usize, + // True if path *physically* has a root separator; for most Windows // prefixes, it may have a "logical" root separator for the purposes of // normalization, e.g., \\server\share == \\server\share\. has_physical_root: bool, - - // The iterator is double-ended, and these two states keep track of what has - // been produced from either end - front: State, - back: State, + // The first component parsed, be it a relative path (""), an absolute path ("/"), + // or a Prefix, which is Windows Specific + first_comp: Option, } /// An iterator over the [`Component`]s of a [`Path`], as [`OsStr`] slices. @@ -665,49 +664,219 @@ impl fmt::Debug for Components<'_> { } impl<'a> Components<'a> { - // how long is the prefix, if any? + /// Is the *original* path rooted? + fn has_root(&self) -> bool { + if self.has_physical_root { + return true; + } + + // SAFETY: This u8 slice is the entire original path unmodified. The caller to + // `Path::components` should have given us a valid `Path`. 
+ if HAS_PREFIXES + && let Some(p) = parse_prefix(unsafe { OsStr::from_encoded_bytes_unchecked(self.path) }) + { + if p.has_implicit_root() { + return true; + } + } + false + } + + /// This is a helper function for consuming the physical first component in + /// either `Components::next`/`Components::next_back`. + /// + /// There are four cases we can have here: + /// - We have an unconsumed absolute component (`/`). We should just output `/` + /// in this case. + /// - We have an unconsumed prefix component (Windows specific, e.g. `C:`). + /// We should just return that prefix component + /// - We have a relative directory, we should just parse the component as + /// normal for the front direction only (due to 0 indexing front index) + /// - We don't have a start component (frequent case), which means we just + /// return `None`. #[inline] - fn prefix_len(&self) -> usize { - if !HAS_PREFIXES { - return 0; + fn consume_first_component(&mut self, dir_front: bool) -> Option> { + if let Some(first_comp) = self.first_comp { + // Our first + self.first_comp = None; + if !matches!(first_comp, FirstComponent::RelativePath) { + if dir_front { + self.advance_through_trailing_sep_front(); + } + if first_comp == FirstComponent::AbsolutePath { + return Some(Component::RootDir); + } else { + // Our front has the length of our Prefix component encoded at the start, + // so this slice is guaranteed to contain the Prefix component if it's + // unconsumed. 
+ let subslice = + unsafe { OsStr::from_encoded_bytes_unchecked(&self.path[0..self.front]) }; + // This prefix is guaranteed to be made since we confirmed + // our first component is a Prefix + let prefix = parse_prefix(subslice).unwrap(); + return Some(Component::Prefix(PrefixComponent { + raw: subslice, + parsed: prefix, + })); + } + } else { + if dir_front { + return self.parse_next_component(); + } + } } - self.prefix.as_ref().map(Prefix::len).unwrap_or(0) + None } + /// Normalizes away trailing separators and current directory ('.') components + /// in the forward direction. #[inline] - fn prefix_verbatim(&self) -> bool { - if !HAS_PREFIXES { - return false; + fn advance_through_trailing_sep_front(&mut self) { + // `Some(false)` is used to denote that + // we haven't seen a '.' component *yet*, + // `Some(true)` means we have seen a '.' component, + // and `None` means that the component is not '.' + let mut curr_dir = Some(false); + // We rebound to the original index for path components + // like '..' or 'abc.' + let mut rebound_ind: Option = None; + loop { + if self.front == self.back { + if let Some(front_ind) = rebound_ind { + self.front = front_ind; + } + break; + } + + if is_sep_byte(self.path[self.front]) { + if let Some(curr_dir_present) = curr_dir + && curr_dir_present + { + curr_dir = Some(false); + rebound_ind = None; + } + } else { + if self.path[self.front] == b'.' { + if let Some(curr_dir_present) = curr_dir { + if !curr_dir_present { + curr_dir = Some(true); + rebound_ind = Some(self.front); + } else { + curr_dir = None; + } + } else { + if let Some(front_ind) = rebound_ind { + self.front = front_ind; + } + break; + } + } else { + if let Some(front_ind) = rebound_ind { + self.front = front_ind; + } + break; + } + } + + self.front += 1; } - self.prefix.as_ref().map(Prefix::is_verbatim).unwrap_or(false) } - /// how much of the prefix is left from the point of view of iteration? 
+ /// Normalizes away trailing separators and current directory ('.') components + /// in the backward direction. #[inline] - fn prefix_remaining(&self) -> usize { - if !HAS_PREFIXES { - return 0; + fn advance_through_trailing_sep_back(&mut self) { + // `Some(false)` is used to denote that + // we haven't seen a '.' component *yet*, + // `Some(true)` means we have seen a '.' component, + // and `None` means that the component is not '.' + let mut curr_dir = Some(false); + // We rebound to the original index for path components + // like '..' or 'abc.' + let mut rebound_ind: Option = None; + loop { + if self.back == self.front { + if let Some(back_ind) = rebound_ind { + self.back = back_ind; + } + break; + } + + if is_sep_byte(self.path[self.back - 1]) { + if let Some(curr_dir_present) = curr_dir + && curr_dir_present + { + curr_dir = Some(false); + rebound_ind = None; + } + } else { + if self.path[self.back - 1] == b'.' { + if let Some(curr_dir_present) = curr_dir { + if !curr_dir_present { + curr_dir = Some(true); + rebound_ind = Some(self.back); + } else { + curr_dir = None; + } + } else { + if let Some(back_ind) = rebound_ind { + self.back = back_ind; + } + break; + } + } else { + if let Some(back_ind) = rebound_ind { + self.back = back_ind; + } + break; + } + } + self.back -= 1; } - if self.front == State::Prefix { self.prefix_len() } else { 0 } } - // Given the iteration so far, how much of the pre-State::Body path is left? + /// Increments our front pointer until we find the + /// next separator byte or have reached the component + /// that back index is pointing at. 
#[inline] - fn len_before_body(&self) -> usize { - let root = if self.front <= State::StartDir && self.has_physical_root { 1 } else { 0 }; - let cur_dir = if self.front <= State::StartDir && self.include_cur_dir() { 1 } else { 0 }; - self.prefix_remaining() + root + cur_dir + fn find_next_separator_front(&mut self) { + while self.front < self.back { + if is_sep_byte(self.path[self.front]) { + self.front += 1; + break; + } + self.front += 1; + } } - // is the iteration complete? + /// Decrements our back pointer until we find the + /// next separator byte or have reached the component + /// that front index is pointing to. #[inline] - fn finished(&self) -> bool { - self.front == State::Done || self.back == State::Done || self.front > self.back + fn find_next_separator_back(&mut self) { + while self.back > self.front { + if is_sep_byte(self.path[self.back - 1]) { + self.back -= 1; + break; + } + self.back -= 1; + } } + /// Parse a u8 slice into an OsStr, which is encoded into a `Component` #[inline] - fn is_sep_byte(&self, b: u8) -> bool { - if self.prefix_verbatim() { is_verbatim_sep(b) } else { is_sep_byte(b) } + fn parse_single_component(&self, slice: &'a [u8]) -> Option> { + match slice { + [] => return None, + [b'.'] => Some(Component::CurDir), + [b'.', b'.'] => Some(Component::ParentDir), + _ => { + // SAFETY: Our sliced path is guaranteed to capture the entire component + // due to delimiting on ascii separators from front and back. + let path_osstr = unsafe { OsStr::from_encoded_bytes_unchecked(slice) }; + Some(Component::Normal(path_osstr)) + } + } } /// Extracts a slice corresponding to the portion of the path remaining for iteration. 
@@ -726,103 +895,76 @@ impl<'a> Components<'a> { #[must_use] #[stable(feature = "rust1", since = "1.0.0")] pub fn as_path(&self) -> &'a Path { - let mut comps = self.clone(); - if comps.front == State::Body { - comps.trim_left(); - } - if comps.back == State::Body { - comps.trim_right(); - } - unsafe { Path::from_u8_slice(comps.path) } - } - - /// Is the *original* path rooted? - fn has_root(&self) -> bool { - if self.has_physical_root { - return true; - } - if HAS_PREFIXES && let Some(p) = self.prefix { - if p.has_implicit_root() { - return true; + if let Some(first_comp) = self.first_comp { + match first_comp { + FirstComponent::AbsolutePath => { + if self.back == 0 { + return Path::new("/"); + } + } + FirstComponent::Prefix => { + return unsafe { + Path::from_u8_slice(&self.path[..self.back]).trim_trailing_sep() + }; + } + FirstComponent::RelativePath => {} } } - false - } - - /// Should the normalized path include a leading . ? - fn include_cur_dir(&self) -> bool { - if self.has_root() { - return false; - } - let slice = &self.path[self.prefix_remaining()..]; - match slice { - [b'.'] => true, - [b'.', b, ..] => self.is_sep_byte(*b), - _ => false, - } - } - - // parse a given byte sequence following the OsStr encoding into the - // corresponding path component - unsafe fn parse_single_component<'b>(&self, comp: &'b [u8]) -> Option> { - match comp { - b"." if HAS_PREFIXES && self.prefix_verbatim() => Some(Component::CurDir), - b"." => None, // . components are normalized away, except at - // the beginning of a path, which is treated - // separately via `include_cur_dir` - b".." 
=> Some(Component::ParentDir), - b"" => None, - _ => Some(Component::Normal(unsafe { OsStr::from_encoded_bytes_unchecked(comp) })), - } - } - // parse a component from the left, saying how many bytes to consume to - // remove the component - fn parse_next_component(&self) -> (usize, Option>) { - debug_assert!(self.front == State::Body); - let (extra, comp) = match self.path.iter().position(|b| self.is_sep_byte(*b)) { - None => (0, self.path), - Some(i) => (1, &self.path[..i]), + // SAFETY: front and back index are delimited by ascii separator bytes, + // where front is a byte after an ascii separator and back is at an ascii + // separator, so this will always produce a valid path. + unsafe { Path::from_u8_slice(&self.path[self.front..self.back]).trim_trailing_sep() } + } + + /// Parses the next component in `Components<'_>` from the left + fn parse_next_component(&mut self) -> Option> { + // Our current `self.front` index at this point is the start + // of the component name + let before_front = self.front; + // We trace our `self.front` idx down the path until + // we hit a separator. + self.find_next_separator_front(); + let curr_front = self.front; + // Normalizes trailing seps and curr dirs in preparation for + // next front component + self.advance_through_trailing_sep_front(); + + // SAFETY: Our curr_front index always stops a byte after the ascii + // separator byte or at self.back (should there be no ascii separator + // in traversal), so we can always construct a valid u8 path slice + let sliced_path = if curr_front > 0 && is_sep_byte(self.path[curr_front - 1]) { + &self.path[before_front..curr_front - 1] + } else { + &self.path[before_front..curr_front] }; - // SAFETY: `comp` is a valid substring, since it is split on a separator. 
- (comp.len() + extra, unsafe { self.parse_single_component(comp) }) - } - - // parse a component from the right, saying how many bytes to consume to - // remove the component - fn parse_next_component_back(&self) -> (usize, Option>) { - debug_assert!(self.back == State::Body); - let start = self.len_before_body(); - let (extra, comp) = match self.path[start..].iter().rposition(|b| self.is_sep_byte(*b)) { - None => (0, &self.path[start..]), - Some(i) => (1, &self.path[start + i + 1..]), + self.parse_single_component(sliced_path) + } + + /// Parses the next back component in `Components<'_>` from the + /// right + fn parse_next_back_component(&mut self) -> Option> { + // Our current `self.back` index at this point encompasses + // the parent path + let before_back = self.back; + // We trace our `self.back` idx up the path until we reach a + // separator byte. This prepares the path we return on the next + // call to this function. + self.find_next_separator_back(); + let curr_back = self.back; + // Normalizes trailing seps and curr dirs in preparation for + // next back component + self.advance_through_trailing_sep_back(); + + // Our curr_back is at the byte before an ascii separator byte or self.front, + // (should there be no ascii separator in traversal), so we can always + // construct a valid u8 path slice + let sliced_path = if is_sep_byte(self.path[curr_back]) { + &self.path[curr_back + 1..before_back] + } else { + &self.path[curr_back..before_back] }; - // SAFETY: `comp` is a valid substring, since it is split on a separator. 
- (comp.len() + extra, unsafe { self.parse_single_component(comp) }) - } - - // trim away repeated separators (i.e., empty components) on the left - fn trim_left(&mut self) { - while !self.path.is_empty() { - let (size, comp) = self.parse_next_component(); - if comp.is_some() { - return; - } else { - self.path = &self.path[size..]; - } - } - } - - // trim away repeated separators (i.e., empty components) on the right - fn trim_right(&mut self) { - while self.path.len() > self.len_before_body() { - let (size, comp) = self.parse_next_component_back(); - if comp.is_some() { - return; - } else { - self.path = &self.path[..self.path.len() - size]; - } - } + self.parse_single_component(sliced_path) } } @@ -921,101 +1063,33 @@ impl<'a> Iterator for Components<'a> { type Item = Component<'a>; fn next(&mut self) -> Option> { - while !self.finished() { - match self.front { - // most likely case first - State::Body if !self.path.is_empty() => { - let (size, comp) = self.parse_next_component(); - self.path = &self.path[size..]; - if comp.is_some() { - return comp; - } - } - State::Body => { - self.front = State::Done; - } - State::StartDir => { - self.front = State::Body; - if self.has_physical_root { - debug_assert!(!self.path.is_empty()); - self.path = &self.path[1..]; - return Some(Component::RootDir); - } else if HAS_PREFIXES && let Some(p) = self.prefix { - if p.has_implicit_root() && !p.is_verbatim() { - return Some(Component::RootDir); - } - } else if self.include_cur_dir() { - debug_assert!(!self.path.is_empty()); - self.path = &self.path[1..]; - return Some(Component::CurDir); - } - } - _ if const { !HAS_PREFIXES } => unreachable!(), - State::Prefix if self.prefix_len() == 0 => { - self.front = State::StartDir; - } - State::Prefix => { - self.front = State::StartDir; - debug_assert!(self.prefix_len() <= self.path.len()); - let raw = &self.path[..self.prefix_len()]; - self.path = &self.path[self.prefix_len()..]; - return Some(Component::Prefix(PrefixComponent { - raw: 
unsafe { OsStr::from_encoded_bytes_unchecked(raw) }, - parsed: self.prefix.unwrap(), - })); - } - State::Done => unreachable!(), - } + // We reach this case when we no longer have anymore paths + // to consume (return `None`), or if our front idx was initially + // equal to back idx (e.g. if we had `C:`, `.`, `/`) + if self.front >= self.back { + return self.consume_first_component(true); } - None + + // Consume our first component if we haven't already. + if let Some(comp) = self.consume_first_component(true) { + return Some(comp); + } + + self.parse_next_component() } } #[stable(feature = "rust1", since = "1.0.0")] impl<'a> DoubleEndedIterator for Components<'a> { fn next_back(&mut self) -> Option> { - while !self.finished() { - match self.back { - State::Body if self.path.len() > self.len_before_body() => { - let (size, comp) = self.parse_next_component_back(); - self.path = &self.path[..self.path.len() - size]; - if comp.is_some() { - return comp; - } - } - State::Body => { - self.back = State::StartDir; - } - State::StartDir => { - self.back = if HAS_PREFIXES { State::Prefix } else { State::Done }; - if self.has_physical_root { - self.path = &self.path[..self.path.len() - 1]; - return Some(Component::RootDir); - } else if HAS_PREFIXES && let Some(p) = self.prefix { - if p.has_implicit_root() && !p.is_verbatim() { - return Some(Component::RootDir); - } - } else if self.include_cur_dir() { - self.path = &self.path[..self.path.len() - 1]; - return Some(Component::CurDir); - } - } - _ if !HAS_PREFIXES => unreachable!(), - State::Prefix if self.prefix_len() > 0 => { - self.back = State::Done; - return Some(Component::Prefix(PrefixComponent { - raw: unsafe { OsStr::from_encoded_bytes_unchecked(self.path) }, - parsed: self.prefix.unwrap(), - })); - } - State::Prefix => { - self.back = State::Done; - return None; - } - State::Done => unreachable!(), - } + // We reach here when we no longer have anymore paths + // to consume, we're dealing with relative paths and + // 
need to output "", or we need to output Prefix component + if self.back <= self.front { + return self.consume_first_component(false); } - None + + self.parse_next_back_component() } } @@ -1026,16 +1100,12 @@ impl FusedIterator for Components<'_> {} impl<'a> PartialEq for Components<'a> { #[inline] fn eq(&self, other: &Components<'a>) -> bool { - let Components { path: _, front: _, back: _, has_physical_root: _, prefix: _ } = self; - // Fast path for exact matches, e.g. for hashmap lookups. // Don't explicitly compare the prefix or has_physical_root fields since they'll // either be covered by the `path` buffer or are only relevant for `prefix_verbatim()`. if self.path.len() == other.path.len() && self.front == other.front - && self.back == State::Body - && other.back == State::Body - && self.prefix_verbatim() == other.prefix_verbatim() + && self.back == other.back { // possible future improvement: this could bail out earlier if there were a // reverse memcmp/bcmp comparing back to front @@ -1068,7 +1138,7 @@ impl Ord for Components<'_> { } } -fn compare_components(mut left: Components<'_>, mut right: Components<'_>) -> cmp::Ordering { +fn compare_components(left: Components<'_>, right: Components<'_>) -> cmp::Ordering { // Fast path for long shared prefixes // // - compare raw bytes to find first mismatch @@ -1078,23 +1148,27 @@ fn compare_components(mut left: Components<'_>, mut right: Components<'_>) -> cm // // The fast path isn't taken for paths with a PrefixComponent to avoid backtracking into // the middle of one - if left.prefix.is_none() && right.prefix.is_none() && left.front == right.front { - // possible future improvement: a [u8]::first_mismatch simd implementation - let first_difference = match left.path.iter().zip(right.path).position(|(&a, &b)| a != b) { - None if left.path.len() == right.path.len() => return cmp::Ordering::Equal, - None => left.path.len().min(right.path.len()), - Some(diff) => diff, - }; + // possible future improvement: a 
[u8]::first_mismatch simd implementation - if let Some(previous_sep) = - left.path[..first_difference].iter().rposition(|&b| left.is_sep_byte(b)) - { - let mismatched_component_start = previous_sep + 1; - left.path = &left.path[mismatched_component_start..]; - left.front = State::Body; - right.path = &right.path[mismatched_component_start..]; - right.front = State::Body; - } + let (left_path_len, left_path) = if left.first_comp.is_some() { + (left.back, &left.path[..left.back]) + } else { + (left.back - left.front, &left.path[left.front..left.back]) + }; + let (right_path_len, right_path) = if right.first_comp.is_some() { + (right.back, &right.path[..right.back]) + } else { + (right.back - right.front, &right.path[right.front..right.back]) + }; + match left_path.iter().zip(right_path).position(|(&a, &b)| a != b) { + // Left path and right path are exactly the same + None if left_path_len == right_path_len => return cmp::Ordering::Equal, + // FIXME: This should check the character that they conflict on so you + // can return Ordering::Greater or Ordering::Less if the conflicting + // characters is not a slash ("/") or current directory character (".") + // Some(pos) => { + // } + _ => {} } Iterator::cmp(left, right) @@ -1344,11 +1418,11 @@ impl PathBuf { let mut need_sep = buf.last().map(|c| !is_sep_byte(*c)).unwrap_or(false); // in the special case of `C:` on Windows, do *not* add a separator - let comps = self.components(); + let parsed_prefix = parse_prefix(&self.inner); - if comps.prefix_len() > 0 - && comps.prefix_len() == comps.path.len() - && comps.prefix.unwrap().is_drive() + if let Some(prefix) = parsed_prefix + && prefix.len() == self.inner.len() + && prefix.is_drive() { need_sep = false } @@ -1367,7 +1441,11 @@ impl PathBuf { self.inner.clear(); // verbatim paths need . and .. 
removed - } else if comps.prefix_verbatim() && !path.inner.is_empty() { + } else if let Some(prefix) = parsed_prefix + && prefix.is_verbatim() + && !path.inner.is_empty() + { + let comps = self.components(); let mut buf: Vec<_> = comps.collect(); for c in path.components() { match c { @@ -1408,7 +1486,9 @@ impl PathBuf { // `path` has a root but no prefix, e.g., `\windows` (Windows only) } else if path.has_root() { - let prefix_len = self.components().prefix_remaining(); + // On creating a components iterator, if front index + // is non-zero we have a prefix. + let prefix_len = self.components().front; self.inner.truncate(prefix_len); // `path` is a pure relative path @@ -2576,7 +2656,7 @@ impl Path { } pub(crate) fn prefix(&self) -> Option> { - self.components().prefix + parse_prefix(&self.inner) } /// Returns `true` if the `Path` has a root. @@ -3235,16 +3315,85 @@ impl Path { /// [`CurDir`]: Component::CurDir #[stable(feature = "rust1", since = "1.0.0")] pub fn components(&self) -> Components<'_> { - let prefix = parse_prefix(self.as_os_str()); - Components { - path: self.as_u8_slice(), - prefix, - has_physical_root: has_physical_root(self.as_u8_slice(), prefix), - // use a platform-specific initial state to avoid one turn of - // the state-machine when the platform doesn't have a Prefix. - front: const { if HAS_PREFIXES { State::Prefix } else { State::StartDir } }, - back: State::Body, + /// Normalizes the trailing portion of given path + /// and returns the number of bytes that it occupied + #[inline] + fn trailing_path_length(path_bytes: &[u8]) -> usize { + let path_len = path_bytes.len(); + // this won't panic because "" does not have + // a trailing separator + let mut idx = path_len; + + // `curr_dir` is `true` while the bytes scanned since + // the last separator (or the end of the path) are + // exactly one '.', i.e. a trailing current directory + // component, and `false` otherwise. 
+ let mut curr_dir = false; + // We rebound to the original index for path components + // like '..' or 'abc.' + let mut rebound_idx: Option = None; + while idx > 0 { + if is_sep_byte(path_bytes[idx - 1]) { + if curr_dir { + rebound_idx = None; + curr_dir = false; + } + } else { + if path_bytes[idx - 1] == b'.' { + if !curr_dir { + rebound_idx = Some(idx); + curr_dir = true; + } else { + if let Some(r_idx) = rebound_idx { + curr_dir = false; + idx = r_idx; + } + break; + } + } else { + if let Some(r_idx) = rebound_idx { + curr_dir = false; + idx = r_idx; + } + break; + } + } + idx -= 1; + } + + // If our path is `./a/b/c`, this `.` is not normalized + // away because it's treated as its own component + if curr_dir { + idx += 1; + } + path_len - idx } + + let os_str_path = self.as_os_str(); + let path_bytes = os_str_path.as_encoded_bytes(); + let trailing_seps = trailing_path_length(path_bytes); + + // Windows specific component + let prefix = parse_prefix(os_str_path); + let prefix_exist = prefix.map(|_| true).unwrap_or(false); + + let mut has_root = false; + let first_comp = if prefix_exist { + Some(FirstComponent::Prefix) + } else if has_physical_root(path_bytes, prefix) { + has_root = true; + Some(FirstComponent::AbsolutePath) + } else { + Some(FirstComponent::RelativePath) + }; + + // If we have a prefix, we encode that index into front + let front = prefix.map(|prefix| prefix.len()).unwrap_or(0); + // Set our back pointer to the last separator byte (without trailing) + // or last byte + let back = path_bytes.len() - trailing_seps; + + Components { path: path_bytes, has_physical_root: has_root, front, back, first_comp } } /// Produces an iterator over the path's components viewed as [`OsStr`] From 3921fffa09432aedd91cfd9d184eb1cd4574a5ae Mon Sep 17 00:00:00 2001 From: Mahdi Ali-Raihan Date: Wed, 13 May 2026 15:01:33 -0400 Subject: [PATCH 2/2] Refactored some code, check subslice in fast path of Components Equality, added safety comments, and check for root dir 
after Prefix component (e.g., '\\?\checkout\src\tools' should produce Prefix, RootDir, Normal, Normal, None, ...) in Components::parse_single_component --- library/std/src/path.rs | 125 ++++++++++++++++++++++++---------------- 1 file changed, 76 insertions(+), 49 deletions(-) diff --git a/library/std/src/path.rs b/library/std/src/path.rs index 81cbc79dd6f9f..52277af930159 100644 --- a/library/std/src/path.rs +++ b/library/std/src/path.rs @@ -696,36 +696,40 @@ impl<'a> Components<'a> { /// return `None`. #[inline] fn consume_first_component(&mut self, dir_front: bool) -> Option> { - if let Some(first_comp) = self.first_comp { - // Our first - self.first_comp = None; - if !matches!(first_comp, FirstComponent::RelativePath) { + match self.first_comp { + Some(FirstComponent::AbsolutePath) => { + self.first_comp = None; if dir_front { self.advance_through_trailing_sep_front(); } - if first_comp == FirstComponent::AbsolutePath { - return Some(Component::RootDir); - } else { - // Our front has the length of our Prefix component encoded at the start, - // so this slice is guaranteed to contain the Prefix component if it's - // unconsumed. - let subslice = - unsafe { OsStr::from_encoded_bytes_unchecked(&self.path[0..self.front]) }; - // This prefix is guaranteed to be made since we confirmed - // our first component is a Prefix - let prefix = parse_prefix(subslice).unwrap(); - return Some(Component::Prefix(PrefixComponent { - raw: subslice, - parsed: prefix, - })); + return Some(Component::RootDir); + } + Some(FirstComponent::Prefix) => { + self.first_comp = None; + if dir_front { + self.advance_through_trailing_sep_front(); } - } else { + + // SAFETY: Our front has the length of our Prefix component encoded at the start, + // so this slice is guaranteed to contain the Prefix component if it's + // unconsumed. 
+ let subslice = + unsafe { OsStr::from_encoded_bytes_unchecked(&self.path[0..self.front]) }; + // This prefix is guaranteed to be made since we confirmed + // our first component is a Prefix + let prefix = parse_prefix(subslice).unwrap(); + + Some(Component::Prefix(PrefixComponent { raw: subslice, parsed: prefix })) + } + Some(FirstComponent::RelativePath) => { + self.first_comp = None; if dir_front { return self.parse_next_component(); } + None } + None => None, } - None } /// Normalizes away trailing separators and current directory ('.') components @@ -863,22 +867,6 @@ impl<'a> Components<'a> { } } - /// Parse a u8 slice into an OsStr, which is encoded into a `Component` - #[inline] - fn parse_single_component(&self, slice: &'a [u8]) -> Option> { - match slice { - [] => return None, - [b'.'] => Some(Component::CurDir), - [b'.', b'.'] => Some(Component::ParentDir), - _ => { - // SAFETY: Our sliced path is guaranteed to capture the entire component - // due to delimiting on ascii separators from front and back. - let path_osstr = unsafe { OsStr::from_encoded_bytes_unchecked(slice) }; - Some(Component::Normal(path_osstr)) - } - } - } - /// Extracts a slice corresponding to the portion of the path remaining for iteration. 
/// /// # Examples @@ -903,6 +891,16 @@ impl<'a> Components<'a> { } } FirstComponent::Prefix => { + // We don't want to trim away separators from a Prefix + // component + if self.front == self.back { + // SAFETY: If the first component is not consumed, then + // front index encodes the whole length of the Prefix + // component + return unsafe { Path::from_u8_slice(&self.path[..self.front]) }; + } + // SAFETY: Our back index is guaranteed to delimit at an ascii + // separator byte, so this should present a valid path return unsafe { Path::from_u8_slice(&self.path[..self.back]).trim_trailing_sep() }; @@ -917,6 +915,26 @@ impl<'a> Components<'a> { unsafe { Path::from_u8_slice(&self.path[self.front..self.back]).trim_trailing_sep() } } + /// Parse a u8 slice into an OsStr, which is encoded into a `Component` + #[inline] + fn parse_single_component(&self, slice: &'a [u8]) -> Option> { + match slice { + [] => return None, + [b'.'] => Some(Component::CurDir), + [b'.', b'.'] => Some(Component::ParentDir), + _ => { + let root_slice = [MAIN_SEPARATOR as u8]; + if slice == root_slice { + return Some(Component::RootDir); + } + // SAFETY: Our sliced path is guaranteed to capture the entire component + // due to delimiting on ascii separators from front and back. + let path_osstr = unsafe { OsStr::from_encoded_bytes_unchecked(slice) }; + Some(Component::Normal(path_osstr)) + } + } + } + /// Parses the next component in `Components<'_>` from the left fn parse_next_component(&mut self) -> Option> { // Our current `self.front` index at this point is the start @@ -1065,16 +1083,12 @@ impl<'a> Iterator for Components<'a> { fn next(&mut self) -> Option> { // We reach this case when we no longer have anymore paths // to consume (return `None`), or if our front idx was initially - // equal to back idx (e.g. if we had `C:`, `.`, `/`) - if self.front >= self.back { + // equal to back idx (e.g. 
if we had `C:`, `.`, `/`), or if we + // had a front component initially + if self.front >= self.back || self.first_comp.is_some() { return self.consume_first_component(true); } - // Consume our first component if we haven't already. - if let Some(comp) = self.consume_first_component(true) { - return Some(comp); - } - self.parse_next_component() } } @@ -1083,8 +1097,8 @@ impl<'a> Iterator for Components<'a> { impl<'a> DoubleEndedIterator for Components<'a> { fn next_back(&mut self) -> Option<Component<'a>> { // We reach here when we no longer have anymore paths - // to consume, we're dealing with relative paths and - // need to output "", or we need to output Prefix component + // to consume, or we need to output Prefix component + // (anything else falls through this conditional) if self.back <= self.front { return self.consume_first_component(false); } @@ -1107,9 +1121,22 @@ impl<'a> PartialEq for Components<'a> { && self.front == other.front && self.back == other.back { - // possible future improvement: this could bail out earlier if there were a - // reverse memcmp/bcmp comparing back to front - if self.path == other.path { + // If either `self` or `other` has a prefix (indicated by `first_comp`) + // we need to start at index 0 (because prefix length is encoded in + // `front`) + let path = if matches!(self.first_comp, Some(FirstComponent::Prefix)) { + &self.path[..self.back] + } else { + &self.path[self.front..self.back] + }; + + let other_path = if matches!(other.first_comp, Some(FirstComponent::Prefix)) { + &other.path[..other.back] + } else { + &other.path[other.front..other.back] + }; + + if path == other_path { return true; } }