diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e6f00e857f7..ee16c442c0f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5516,6 +5516,7 @@ Released 2018-09-13 [`cast_slice_different_sizes`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_different_sizes [`cast_slice_from_raw_parts`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_from_raw_parts [`cfg_not_test`]: https://rust-lang.github.io/rust-clippy/master/index.html#cfg_not_test +[`char_indices_as_byte_indices`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_indices_as_byte_indices [`char_lit_as_u8`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_lit_as_u8 [`chars_last_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_last_cmp [`chars_next_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_next_cmp diff --git a/clippy_lints/src/declared_lints.rs b/clippy_lints/src/declared_lints.rs index c2274f0a619c..7ee898ec75a9 100644 --- a/clippy_lints/src/declared_lints.rs +++ b/clippy_lints/src/declared_lints.rs @@ -287,6 +287,7 @@ pub static LINTS: &[&crate::LintInfo] = &[ crate::literal_representation::UNREADABLE_LITERAL_INFO, crate::literal_representation::UNUSUAL_BYTE_GROUPINGS_INFO, crate::literal_string_with_formatting_args::LITERAL_STRING_WITH_FORMATTING_ARGS_INFO, + crate::loops::CHAR_INDICES_AS_BYTE_INDICES_INFO, crate::loops::EMPTY_LOOP_INFO, crate::loops::EXPLICIT_COUNTER_LOOP_INFO, crate::loops::EXPLICIT_INTO_ITER_LOOP_INFO, diff --git a/clippy_lints/src/loops/char_indices_as_byte_indices.rs b/clippy_lints/src/loops/char_indices_as_byte_indices.rs new file mode 100644 index 000000000000..8916454ada16 --- /dev/null +++ b/clippy_lints/src/loops/char_indices_as_byte_indices.rs @@ -0,0 +1,141 @@ +use std::ops::ControlFlow; + +use clippy_utils::diagnostics::span_lint_hir_and_then; +use clippy_utils::ty::is_type_lang_item; +use clippy_utils::visitors::for_each_expr; +use clippy_utils::{eq_expr_value, higher, path_to_local_id}; +use rustc_errors::{Applicability, MultiSpan}; +use rustc_hir::{Expr, ExprKind, LangItem, Node, Pat, PatKind}; +use rustc_lint::LateContext; +use rustc_middle::ty::Ty; +use rustc_span::{Span, sym}; + +use super::CHAR_INDICES_AS_BYTE_INDICES; + +// The list of `str` methods we want to lint that have a `usize` argument representing a byte index. +// Note: `String` also has methods that work with byte indices, +// but they all take `&mut self` and aren't worth considering since the user couldn't have called +// them while the chars iterator is live anyway. +const BYTE_INDEX_METHODS: &[&str] = &[ + "is_char_boundary", + "floor_char_boundary", + "ceil_char_boundary", + "get", + "index", + "index_mut", + "get_mut", + "get_unchecked", + "get_unchecked_mut", + "slice_unchecked", + "slice_mut_unchecked", + "split_at", + "split_at_mut", + "split_at_checked", + "split_at_mut_checked", +]; + +const CONTINUE: ControlFlow = ControlFlow::Continue(()); + +pub(super) fn check<'tcx>(cx: &LateContext<'tcx>, pat: &Pat<'_>, iterable: &Expr<'_>, body: &'tcx Expr<'tcx>) { + if let ExprKind::MethodCall(_, enumerate_recv, _, enumerate_span) = iterable.kind + && let Some(method_id) = cx.typeck_results().type_dependent_def_id(iterable.hir_id) + && cx.tcx.is_diagnostic_item(sym::enumerate_method, method_id) + && let ExprKind::MethodCall(_, chars_recv, _, chars_span) = enumerate_recv.kind + && let Some(method_id) = cx.typeck_results().type_dependent_def_id(enumerate_recv.hir_id) + && cx.tcx.is_diagnostic_item(sym::str_chars, method_id) + { + if let PatKind::Tuple([pat, _], _) = pat.kind + && let PatKind::Binding(_, binding_id, ..) = pat.kind + { + // Destructured iterator element `(idx, _)`, look for uses of the binding + for_each_expr(cx, body, |expr| { + if path_to_local_id(expr, binding_id) { + check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv); + } + CONTINUE + }); + } else if let PatKind::Binding(_, binding_id, ..) = pat.kind { + // Bound as a tuple, look for `tup.0` + for_each_expr(cx, body, |expr| { + if let ExprKind::Field(e, field) = expr.kind + && path_to_local_id(e, binding_id) + && field.name == sym::integer(0) + { + check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv); + } + CONTINUE + }); + } + } +} + +fn check_index_usage<'tcx>( + cx: &LateContext<'tcx>, + expr: &'tcx Expr<'tcx>, + pat: &Pat<'_>, + enumerate_span: Span, + chars_span: Span, + chars_recv: &Expr<'_>, +) { + let Some(parent_expr) = index_consumed_at(cx, expr) else { + return; + }; + + let is_string_like = |ty: Ty<'_>| ty.is_str() || is_type_lang_item(cx, ty, LangItem::String); + let message = match parent_expr.kind { + ExprKind::MethodCall(segment, recv, ..) + // We currently only lint `str` methods (which `String` can deref to), so a `.is_str()` check is sufficient here + // (contrary to the `ExprKind::Index` case which needs to handle both with `is_string_like` because `String` implements + // `Index` directly and no deref to `str` would happen in that case). + if cx.typeck_results().expr_ty_adjusted(recv).peel_refs().is_str() + && BYTE_INDEX_METHODS.contains(&segment.ident.name.as_str()) + && eq_expr_value(cx, chars_recv, recv) => + { + "passing a character position to a method that expects a byte index" + }, + ExprKind::Index(target, ..) + if is_string_like(cx.typeck_results().expr_ty_adjusted(target).peel_refs()) + && eq_expr_value(cx, chars_recv, target) => + { + "indexing into a string with a character position where a byte index is expected" + }, + _ => return, + }; + + span_lint_hir_and_then( + cx, + CHAR_INDICES_AS_BYTE_INDICES, + expr.hir_id, + expr.span, + message, + |diag| { + diag.note("a character can take up more than one byte, so they are not interchangeable") + .span_note( + MultiSpan::from_spans(vec![pat.span, enumerate_span]), + "position comes from the enumerate iterator", + ) + .span_suggestion_verbose( + chars_span.to(enumerate_span), + "consider using `.char_indices()` instead", + "char_indices()", + Applicability::MaybeIncorrect, + ); + }, + ); +} + +/// Returns the expression which ultimately consumes the index. +/// This is usually the parent expression, i.e. `.split_at(idx)` for `idx`, +/// but for `.get(..idx)` we want to consider the method call the consuming expression, +/// which requires skipping past the range expression. +fn index_consumed_at<'tcx>(cx: &LateContext<'tcx>, expr: &'tcx Expr<'tcx>) -> Option<&'tcx Expr<'tcx>> { + for (_, node) in cx.tcx.hir_parent_iter(expr.hir_id) { + match node { + Node::Expr(expr) if higher::Range::hir(expr).is_some() => {}, + Node::ExprField(_) => {}, + Node::Expr(expr) => return Some(expr), + _ => break, + } + } + None +} diff --git a/clippy_lints/src/loops/mod.rs b/clippy_lints/src/loops/mod.rs index 4b0bf5a4b3c9..2b66827e82ee 100644 --- a/clippy_lints/src/loops/mod.rs +++ b/clippy_lints/src/loops/mod.rs @@ -1,3 +1,4 @@ +mod char_indices_as_byte_indices; mod empty_loop; mod explicit_counter_loop; mod explicit_into_iter_loop; @@ -740,6 +741,49 @@ declare_clippy_lint! { "manually filling a slice with a value" } +declare_clippy_lint! { + /// ### What it does + /// Checks for usage of a character position yielded by `.chars().enumerate()` in a context where a **byte index** is expected, + /// such as an argument to a specific `str` method or indexing into a `str` or `String`. + /// + /// ### Why is this bad? + /// A character (more specifically, a Unicode scalar value) that is yielded by `str::chars` can take up multiple bytes, + /// so a character position does not necessarily have the same byte index at which the character is stored. + /// Thus, using the character position where a byte index is expected can unexpectedly return wrong values + /// or panic when the string consists of multibyte characters. + /// + /// For example, the character `a` in `äa` is stored at byte index 2 but has the character position 1. + /// Using the character position 1 to index into the string will lead to a panic as it is in the middle of the first character. + /// + /// Instead of `.chars().enumerate()`, the correct iterator to use is `.char_indices()`, which yields byte indices. + /// + /// This pattern is technically fine if the strings are known to only use the ASCII subset, + /// though in those cases it would be better to use `bytes()` directly to make the intent clearer, + /// but there is also no downside to just using `.char_indices()` directly and supporting non-ASCII strings. + /// + /// You may also want to read the [chapter on strings in the Rust Book](https://doc.rust-lang.org/book/ch08-02-strings.html) + /// which goes into this in more detail. + /// + /// ### Example + /// ```no_run + /// # let s = "..."; + /// for (idx, c) in s.chars().enumerate() { + /// let _ = s[idx..]; // ⚠️ Panics for strings consisting of multibyte characters + /// } + /// ``` + /// Use instead: + /// ```no_run + /// # let s = "..."; + /// for (idx, c) in s.char_indices() { + /// let _ = s[idx..]; + /// } + /// ``` + #[clippy::version = "1.83.0"] + pub CHAR_INDICES_AS_BYTE_INDICES, + correctness, + "using the character position yielded by `.chars().enumerate()` in a context where a byte index is expected" +} + pub struct Loops { msrv: Msrv, enforce_iter_loop_reborrow: bool, @@ -777,6 +821,7 @@ impl_lint_pass!(Loops => [ UNUSED_ENUMERATE_INDEX, INFINITE_LOOP, MANUAL_SLICE_FILL, + CHAR_INDICES_AS_BYTE_INDICES, ]); impl<'tcx> LateLintPass<'tcx> for Loops { @@ -860,6 +905,7 @@ impl Loops { manual_flatten::check(cx, pat, arg, body, span, self.msrv); manual_find::check(cx, pat, arg, body, span, expr); unused_enumerate_index::check(cx, pat, arg, body); + char_indices_as_byte_indices::check(cx, pat, arg, body); } fn check_for_loop_arg(&self, cx: &LateContext<'_>, _: &Pat<'_>, arg: &Expr<'_>) { diff --git a/tests/ui/char_indices_as_byte_indices.fixed b/tests/ui/char_indices_as_byte_indices.fixed new file mode 100644 index 000000000000..04c8f6782c51 --- /dev/null +++ b/tests/ui/char_indices_as_byte_indices.fixed @@ -0,0 +1,65 @@ +#![feature(round_char_boundary)] +#![warn(clippy::char_indices_as_byte_indices)] + +trait StrExt { + fn use_index(&self, _: usize); +} +impl StrExt for str { + fn use_index(&self, _: usize) {} +} + +fn bad(prim: &str, string: String) { + for (idx, _) in prim.char_indices() { + let _ = prim[..idx]; + //~^ char_indices_as_byte_indices + prim.split_at(idx); + //~^ char_indices_as_byte_indices + + // This won't panic, but it can still return a wrong substring + let _ = prim[..prim.floor_char_boundary(idx)]; + //~^ char_indices_as_byte_indices + + // can't use #[expect] here because the .fixed file will still have the attribute and create an + // unfulfilled expectation, but make sure lint level attributes work on the use expression: + #[allow(clippy::char_indices_as_byte_indices)] + let _ = prim[..idx]; + } + + for c in prim.char_indices() { + let _ = prim[..c.0]; + //~^ char_indices_as_byte_indices + prim.split_at(c.0); + //~^ char_indices_as_byte_indices + } + + for (idx, _) in string.char_indices() { + let _ = string[..idx]; + //~^ char_indices_as_byte_indices + string.split_at(idx); + //~^ char_indices_as_byte_indices + } +} + +fn good(prim: &str, prim2: &str) { + for (idx, _) in prim.chars().enumerate() { + // Indexing into a different string + let _ = prim2[..idx]; + + // Unknown use + std::hint::black_box(idx); + + // Method call to user defined extension trait + prim.use_index(idx); + + // str method taking a usize that doesn't represent a byte index + prim.splitn(idx, prim2); + } + + let mut string = "äa".to_owned(); + for (idx, _) in string.clone().chars().enumerate() { + // Even though the receiver is the same expression, it should not be treated as the same value. + string.clone().remove(idx); + } +} + +fn main() {} diff --git a/tests/ui/char_indices_as_byte_indices.rs b/tests/ui/char_indices_as_byte_indices.rs new file mode 100644 index 000000000000..773a4fc65f12 --- /dev/null +++ b/tests/ui/char_indices_as_byte_indices.rs @@ -0,0 +1,65 @@ +#![feature(round_char_boundary)] +#![warn(clippy::char_indices_as_byte_indices)] + +trait StrExt { + fn use_index(&self, _: usize); +} +impl StrExt for str { + fn use_index(&self, _: usize) {} +} + +fn bad(prim: &str, string: String) { + for (idx, _) in prim.chars().enumerate() { + let _ = prim[..idx]; + //~^ char_indices_as_byte_indices + prim.split_at(idx); + //~^ char_indices_as_byte_indices + + // This won't panic, but it can still return a wrong substring + let _ = prim[..prim.floor_char_boundary(idx)]; + //~^ char_indices_as_byte_indices + + // can't use #[expect] here because the .fixed file will still have the attribute and create an + // unfulfilled expectation, but make sure lint level attributes work on the use expression: + #[allow(clippy::char_indices_as_byte_indices)] + let _ = prim[..idx]; + } + + for c in prim.chars().enumerate() { + let _ = prim[..c.0]; + //~^ char_indices_as_byte_indices + prim.split_at(c.0); + //~^ char_indices_as_byte_indices + } + + for (idx, _) in string.chars().enumerate() { + let _ = string[..idx]; + //~^ char_indices_as_byte_indices + string.split_at(idx); + //~^ char_indices_as_byte_indices + } +} + +fn good(prim: &str, prim2: &str) { + for (idx, _) in prim.chars().enumerate() { + // Indexing into a different string + let _ = prim2[..idx]; + + // Unknown use + std::hint::black_box(idx); + + // Method call to user defined extension trait + prim.use_index(idx); + + // str method taking a usize that doesn't represent a byte index + prim.splitn(idx, prim2); + } + + let mut string = "äa".to_owned(); + for (idx, _) in string.clone().chars().enumerate() { + // Even though the receiver is the same expression, it should not be treated as the same value. + string.clone().remove(idx); + } +} + +fn main() {} diff --git a/tests/ui/char_indices_as_byte_indices.stderr b/tests/ui/char_indices_as_byte_indices.stderr new file mode 100644 index 000000000000..e2b4c1db78cf --- /dev/null +++ b/tests/ui/char_indices_as_byte_indices.stderr @@ -0,0 +1,130 @@ +error: indexing into a string with a character position where a byte index is expected + --> tests/ui/char_indices_as_byte_indices.rs:13:24 + | +LL | let _ = prim[..idx]; + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:12:10 + | +LL | for (idx, _) in prim.chars().enumerate() { + | ^^^ ^^^^^^^^^^^ + = note: `-D clippy::char-indices-as-byte-indices` implied by `-D warnings` + = help: to override `-D warnings` add `#[allow(clippy::char_indices_as_byte_indices)]` +help: consider using `.char_indices()` instead + | +LL - for (idx, _) in prim.chars().enumerate() { +LL + for (idx, _) in prim.char_indices() { + | + +error: passing a character position to a method that expects a byte index + --> tests/ui/char_indices_as_byte_indices.rs:15:23 + | +LL | prim.split_at(idx); + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:12:10 + | +LL | for (idx, _) in prim.chars().enumerate() { + | ^^^ ^^^^^^^^^^^ +help: consider using `.char_indices()` instead + | +LL - for (idx, _) in prim.chars().enumerate() { +LL + for (idx, _) in prim.char_indices() { + | + +error: passing a character position to a method that expects a byte index + --> tests/ui/char_indices_as_byte_indices.rs:19:49 + | +LL | let _ = prim[..prim.floor_char_boundary(idx)]; + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:12:10 + | +LL | for (idx, _) in prim.chars().enumerate() { + | ^^^ ^^^^^^^^^^^ +help: consider using `.char_indices()` instead + | +LL - for (idx, _) in prim.chars().enumerate() { +LL + for (idx, _) in prim.char_indices() { + | + +error: indexing into a string with a character position where a byte index is expected + --> tests/ui/char_indices_as_byte_indices.rs:29:24 + | +LL | let _ = prim[..c.0]; + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:28:9 + | +LL | for c in prim.chars().enumerate() { + | ^ ^^^^^^^^^^^ +help: consider using `.char_indices()` instead + | +LL - for c in prim.chars().enumerate() { +LL + for c in prim.char_indices() { + | + +error: passing a character position to a method that expects a byte index + --> tests/ui/char_indices_as_byte_indices.rs:31:23 + | +LL | prim.split_at(c.0); + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:28:9 + | +LL | for c in prim.chars().enumerate() { + | ^ ^^^^^^^^^^^ +help: consider using `.char_indices()` instead + | +LL - for c in prim.chars().enumerate() { +LL + for c in prim.char_indices() { + | + +error: indexing into a string with a character position where a byte index is expected + --> tests/ui/char_indices_as_byte_indices.rs:36:26 + | +LL | let _ = string[..idx]; + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:35:10 + | +LL | for (idx, _) in string.chars().enumerate() { + | ^^^ ^^^^^^^^^^^ +help: consider using `.char_indices()` instead + | +LL - for (idx, _) in string.chars().enumerate() { +LL + for (idx, _) in string.char_indices() { + | + +error: passing a character position to a method that expects a byte index + --> tests/ui/char_indices_as_byte_indices.rs:38:25 + | +LL | string.split_at(idx); + | ^^^ + | + = note: a character can take up more than one byte, so they are not interchangeable +note: position comes from the enumerate iterator + --> tests/ui/char_indices_as_byte_indices.rs:35:10 + | +LL | for (idx, _) in string.chars().enumerate() { + | ^^^ ^^^^^^^^^^^ +help: consider using `.char_indices()` instead + | +LL - for (idx, _) in string.chars().enumerate() { +LL + for (idx, _) in string.char_indices() { + | + +error: aborting due to 7 previous errors +