diff --git a/NEWS.md b/NEWS.md index fa2aafffb2802..1d0700631ae48 100644 --- a/NEWS.md +++ b/NEWS.md @@ -57,6 +57,8 @@ New library functions * `Base.donotdelete` is now public. It prevents deadcode elimination of its arguments ([#55774]). * `Sys.sysimage_target()` returns the CPU target string used to build the current system image ([#58970]). * `Iterators.findeach` is a lazy version of `findall` ([#54124]) +* `Base.unsafe_substring` is an unexported, public constructor to build a `SubString` without checking for + valid string indices. New library features -------------------- diff --git a/base/public.jl b/base/public.jl index e944e8f3915d6..e83ce7bab7af5 100644 --- a/base/public.jl +++ b/base/public.jl @@ -107,6 +107,8 @@ public # Strings escape_raw_string, + unsafe_substring, + unannotate, # IO # types diff --git a/base/regex.jl b/base/regex.jl index 2bd47c271ce75..f3c976bab705b 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -447,12 +447,10 @@ end function _annotatedmatch(m::RegexMatch{S}, str::AnnotatedString{S}) where {S<:AbstractString} RegexMatch{AnnotatedString{S}}( - (@inbounds SubString{AnnotatedString{S}}( - str, m.match.offset, m.match.ncodeunits, Val(:noshift))), + (@inbounds unsafe_substring(str, m.match.offset + 1, m.match.ncodeunits)), Union{Nothing,SubString{AnnotatedString{S}}}[ if !isnothing(cap) - (@inbounds SubString{AnnotatedString{S}}( - str, cap.offset, cap.ncodeunits, Val(:noshift))) + (@inbounds unsafe_substring(str, cap.offset + 1, cap.ncodeunits)) end for cap in m.captures], m.offset, m.offsets, m.regex) end diff --git a/base/strings/annotated.jl b/base/strings/annotated.jl index 89cba6db42c8d..f98c2dee72ae7 100644 --- a/base/strings/annotated.jl +++ b/base/strings/annotated.jl @@ -158,6 +158,32 @@ eltype(::Type{<:AnnotatedString{S}}) where {S} = AnnotatedChar{eltype(S)} firstindex(s::AnnotatedString) = firstindex(s.string) lastindex(s::AnnotatedString) = lastindex(s.string) +""" + unannotate(s::AnnotatedString{S})::S + 
unannotate(s::SubString{AnnotatedString{S}})::SubString{S} + +Get the underlying string of `s`, without copying. + +# Examples +```jldoctest; setup=:(using Base: AnnotatedString, unannotate) +julia> s = AnnotatedString("abcde", [(1:3, :A, 4)]) +"abcde" + +julia> u = unannotate(s) +"abcde" + +julia> typeof(u) +String +``` +""" +unannotate(s::AnnotatedString) = s.string + +function unannotate(s::SubString{<:AnnotatedString}) + start_index = first(parentindices(s)[1]) + @inbounds unsafe_substring(parent(s).string, start_index, ncodeunits(s)) +end + + function getindex(s::AnnotatedString, i::Integer) @boundscheck checkbounds(s, i) @inbounds if isvalid(s, i) @@ -204,16 +230,14 @@ cmp(a::AnnotatedString, b::AnnotatedString) = cmp(a.string, b.string) # To prevent substring equality from hitting the generic fallback function ==(a::SubString{<:AnnotatedString}, b::SubString{<:AnnotatedString}) - SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == - SubString(b.string.string, b.offset, b.ncodeunits, Val(:noshift)) && - annotations(a) == annotations(b) + unannotate(a) == unannotate(b) && annotations(a) == annotations(b) end ==(a::SubString{<:AnnotatedString}, b::AnnotatedString) = - annotations(a) == annotations(b) && SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == b.string + annotations(a) == annotations(b) && unannotate(a) == b.string ==(a::SubString{<:AnnotatedString}, b::AbstractString) = - isempty(annotations(a)) && SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == b + isempty(annotations(a)) && unannotate(a) == b ==(a::AbstractString, b::SubString{<:AnnotatedString}) = b == a @@ -262,7 +286,7 @@ function annotatedstring(xs...)
push!(annotations, setindex(annot, rstart:rstop, :region)) end end - print(s, SubString(x.string.string, x.offset, x.ncodeunits, Val(:noshift))) + print(s, unannotate(x)) elseif x isa AnnotatedChar for annot in x.annotations push!(annotations, (region=1+size:1+size, annot...)) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index ec9944449bc0e..06caae5277413 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -36,18 +36,67 @@ struct SubString{T<:AbstractString} <: AbstractString end return new(s, i-1, nextind(s,j)-i) end - function SubString{T}(s::T, i::Int, j::Int, ::Val{:noshift}) where T<:AbstractString - @boundscheck if !(i == j == 0) - si, sj = i + 1, prevind(s, j + i + 1) - @inbounds isvalid(s, si) || string_index_err(s, si) - @inbounds isvalid(s, sj) || string_index_err(s, sj) - end - new(s, i, j) + # We don't expose this, because the exposed constructor needs to avoid constructing + # a SubString{SubString{T}} when passed a substring. + global function _unsafe_substring(s::T, offset::Int, ncodeunits::Int) where {T <: AbstractString} + new{T}(s, offset, ncodeunits) + end +end + +function check_codeunit_bounds(s::AbstractString, first_index::Int, n_codeunits::Int) + last_index = first_index + n_codeunits - 1 + bad_index = if first_index < 1 + first_index + elseif last_index > ncodeunits(s) + last_index + else + return nothing end + throw(BoundsError(s, bad_index)) +end + +""" + unsafe_substring(s::AbstractString, first_index::Int, n_codeunits::Int)::SubString{typeof(s)} + unsafe_substring(s::SubString{S}, first_index::Int, n_codeunits::Int)::SubString{S} + +Create a substring of `s` spanning the codeunits `first_index:(first_index + n_codeunits - 1)`. + +If `first_index` < 1, or `first_index + n_codeunits - 1 > ncodeunits(s)`, throw a `BoundsError`. 
+ +This function does check bounds, but does not validate that the arguments correspond to valid +start and end indices in `s`, and so the resulting substring may contain truncated characters. +The presence of truncated characters is safe and well-defined for `String` and `SubString{String}`, +but may not be permitted for custom subtypes of `AbstractString`. + +# Examples +```jldoctest; setup=:(using Base: unsafe_substring) +julia> s = "Hello, Bjørn!"; + +julia> ss = unsafe_substring(s, 4, 10) +"lo, Bjørn" + +julia> typeof(ss) +SubString{String} + +julia> ss2 = unsafe_substring(ss, 2, 6) +"o, Bj\\xc3" + +julia> typeof(ss2) +SubString{String} +``` +""" +function unsafe_substring(s::AbstractString, first_index::Int, n_codeunits::Int) + @boundscheck @inline check_codeunit_bounds(s, first_index, n_codeunits) + return _unsafe_substring(s, first_index - 1, n_codeunits) +end + +function unsafe_substring(s::SubString, first_index::Int, n_codeunits::Int) + @boundscheck @inline check_codeunit_bounds(s, first_index, n_codeunits) + string = s.string + return _unsafe_substring(string, first_index + s.offset - 1, n_codeunits) end @propagate_inbounds SubString(s::T, i::Int, j::Int) where {T<:AbstractString} = SubString{T}(s, i, j) -@propagate_inbounds SubString(s::T, i::Int, j::Int, v::Val{:noshift}) where {T<:AbstractString} = SubString{T}(s, i, j, v) @propagate_inbounds SubString(s::AbstractString, i::Integer, j::Integer=lastindex(s)) = SubString(s, Int(i)::Int, Int(j)::Int) @propagate_inbounds SubString(s::AbstractString, r::AbstractUnitRange{<:Integer}) = SubString(s, first(r), last(r)) @@ -56,8 +105,9 @@ end SubString(s.string, s.offset+i, s.offset+j) end -SubString(s::AbstractString) = SubString(s, 1, lastindex(s)::Int) -SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, lastindex(s)::Int) +SubString(s::AbstractString) = @inbounds unsafe_substring(s, 1, Int(ncodeunits(s))::Int) +SubString{T}(s::T) where {T<:AbstractString} = SubString(s) +SubString(s::SubString) = s
@propagate_inbounds view(s::AbstractString, r::AbstractUnitRange{<:Integer}) = SubString(s, r) @propagate_inbounds maybeview(s::AbstractString, r::AbstractUnitRange{<:Integer}) = view(s, r) diff --git a/base/strings/util.jl b/base/strings/util.jl index 0573893481d3d..da149a8f49b03 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -377,7 +377,7 @@ end end off = s isa String ? 0 : s.offset par = s isa String ? s : s.string - @inbounds @inline SubString{String}(par, off, len, Val{:noshift}()) + @inbounds unsafe_substring(s, 1, len) end """ lstrip([pred=isspace,] str::AbstractString)::SubString diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md index 15a7c0531de4a..a3930312ce3d5 100644 --- a/doc/src/base/strings.md +++ b/doc/src/base/strings.md @@ -15,6 +15,7 @@ Base.repeat(::AbstractChar, ::Integer) Base.repr(::Any) Core.String(::AbstractString) Base.SubString +Base.unsafe_substring Base.LazyString Base.@lazy_str Base.transcode @@ -110,4 +111,5 @@ Base.AnnotatedChar Base.annotatedstring Base.annotations Base.annotate! 
+Base.unannotate ``` diff --git a/test/strings/annotated.jl b/test/strings/annotated.jl index 7f53740b9eec1..022369f8e4772 100644 --- a/test/strings/annotated.jl +++ b/test/strings/annotated.jl @@ -77,6 +77,17 @@ @test Bool === Base.infer_return_type(isvalid, Tuple{Base.AnnotatedString, Vararg}) @test Int === Base.infer_return_type(ncodeunits, Tuple{Base.AnnotatedString}) + + @testset "unannotate" begin + s = "some string" + str = Base.AnnotatedString(s, [(2:5, :A, 3)]) + @test Base.unannotate(str) === s + + str2 = SubString(str, 2:9) + u = Base.unannotate(str2) + @test u isa SubString{String} + @test u == SubString(s, 2:9) + end end @testset "AnnotatedChar" begin diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 214a14ed2443f..458b72d41f6b3 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -222,10 +222,17 @@ end @test (@views (x[3], x[1:2], x[[1,4]])) == ('c', "ab", "ad") end - @testset ":noshift constructor" begin - @test SubString("", 0, 0, Val(:noshift)) == "" - @test SubString("abcd", 0, 1, Val(:noshift)) == "a" - @test SubString("abcd", 0, 4, Val(:noshift)) == "abcd" + @testset "unsafe_substring" begin + s = "abcdefgøø" + @test Base.unsafe_substring(s, 1, 11) == s + @test Base.unsafe_substring(s, 1, 3) == "abc" + @test Base.unsafe_substring(s, 3, 3) == "cde" + @test Base.unsafe_substring(s, 5, 4) == String(codeunits(s)[5:8]) + @test Base.unsafe_substring(s, 1, 2) isa SubString{String} + @test Base.unsafe_substring(Base.unsafe_substring(s, 2, 8), 1, 3) isa SubString{String} + + @test_throws BoundsError Base.unsafe_substring(s, 0, 2) + @test_throws BoundsError Base.unsafe_substring(s, 2, 11) end end