Skip to content

Commit dba8a08

Browse files
author
Jeffrey Lin
authored
add eachsplit for iterative splitting (#39245)
This moves the existing splitting implementation into an iterator named `eachsplit` and changes the definition of `split(...)` to `collect(eachsplit(...))`, plus a few edge cases.
1 parent ed3691f commit dba8a08

File tree

12 files changed

+99
-55
lines changed

12 files changed

+99
-55
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ New language features
1111
* `@inline` and `@noinline` annotations can now be applied to a function callsite or block
1212
to enforce the involved function calls to be (or not to be) inlined. ([#41312])
1313
* The default behavior of observing `@inbounds` declarations is now an option via `auto` in `--check-bounds=yes|no|auto` ([#41551])
14+
* New function `eachsplit(str)` for iteratively performing `split(str)`. ([#39245])
1415

1516
Language changes
1617
----------------

base/binaryplatforms.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ function Base.parse(::Type{Platform}, triplet::AbstractString; validate_strict::
706706
libstdcxx_version = get_field(m, libstdcxx_version_mapping)
707707
cxxstring_abi = get_field(m, cxxstring_abi_mapping)
708708
function split_tags(tagstr)
709-
tag_fields = filter(!isempty, split(tagstr, "-"))
709+
tag_fields = split(tagstr, "-"; keepempty=false)
710710
if isempty(tag_fields)
711711
return Pair{String,String}[]
712712
end

base/cmd.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ function addenv(cmd::Cmd, env::Dict; inherit::Bool = true)
269269
merge!(new_env, ENV)
270270
end
271271
else
272-
for (k, v) in split.(cmd.env, "=")
272+
for (k, v) in eachsplit.(cmd.env, "=")
273273
new_env[string(k)::String] = string(v)::String
274274
end
275275
end
@@ -284,7 +284,7 @@ function addenv(cmd::Cmd, pairs::Pair{<:AbstractString}...; inherit::Bool = true
284284
end
285285

286286
function addenv(cmd::Cmd, env::Vector{<:AbstractString}; inherit::Bool = true)
287-
return addenv(cmd, Dict(k => v for (k, v) in split.(env, "=")); inherit)
287+
return addenv(cmd, Dict(k => v for (k, v) in eachsplit.(env, "=")); inherit)
288288
end
289289

290290
(&)(left::AbstractCmd, right::AbstractCmd) = AndCmds(left, right)

base/exports.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,7 @@ export
578578
codeunits,
579579
digits,
580580
digits!,
581+
eachsplit,
581582
escape_string,
582583
hex2bytes,
583584
hex2bytes!,

base/initdefs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ function init_depot_path()
100100
if haskey(ENV, "JULIA_DEPOT_PATH")
101101
str = ENV["JULIA_DEPOT_PATH"]
102102
isempty(str) && return
103-
for path in split(str, Sys.iswindows() ? ';' : ':')
103+
for path in eachsplit(str, Sys.iswindows() ? ';' : ':')
104104
if isempty(path)
105105
append_default_depot_path!(DEPOT_PATH)
106106
else
@@ -198,7 +198,7 @@ end
198198
function parse_load_path(str::String)
199199
envs = String[]
200200
isempty(str) && return envs
201-
for env in split(str, Sys.iswindows() ? ';' : ':')
201+
for env in eachsplit(str, Sys.iswindows() ? ';' : ':')
202202
if isempty(env)
203203
for env′ in DEFAULT_LOAD_PATH
204204
env′ in envs || push!(envs, env′)

base/logging.jl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -674,10 +674,11 @@ function handle_message(logger::SimpleLogger, level::LogLevel, message, _module,
674674
end
675675
iob = IOContext(buf, stream)
676676
levelstr = level == Warn ? "Warning" : string(level)
677-
msglines = split(chomp(string(message)::String), '\n')
678-
println(iob, "", levelstr, ": ", msglines[1])
679-
for i in 2:length(msglines)
680-
println(iob, "", msglines[i])
677+
msglines = eachsplit(chomp(string(message)::String), '\n')
678+
msg1, rest = Iterators.peel(msglines)
679+
println(iob, "", levelstr, ": ", msg1)
680+
for msg in rest
681+
println(iob, "", msg)
681682
end
682683
for (key, val) in kwargs
683684
key === :maxlog && continue

base/mpfr.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -962,7 +962,7 @@ function string_mpfr(x::BigFloat, fmt::String)
962962
end
963963

964964
function _prettify_bigfloat(s::String)::String
965-
mantissa, exponent = split(s, 'e')
965+
mantissa, exponent = eachsplit(s, 'e')
966966
if !occursin('.', mantissa)
967967
mantissa = string(mantissa, '.')
968968
end
@@ -973,7 +973,7 @@ function _prettify_bigfloat(s::String)::String
973973
expo = parse(Int, exponent)
974974
if -5 < expo < 6
975975
expo == 0 && return mantissa
976-
int, frac = split(mantissa, '.')
976+
int, frac = eachsplit(mantissa, '.')
977977
if expo > 0
978978
expo < length(frac) ?
979979
string(int, frac[1:expo], '.', frac[expo+1:end]) :

base/path.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,8 +368,8 @@ function normpath(path::String)
368368
isabs = isabspath(path)
369369
isdir = isdirpath(path)
370370
drive, path = splitdrive(path)
371-
parts = split(path, path_separator_re)
372-
filter!(x->!isempty(x) && x!=".", parts)
371+
parts = split(path, path_separator_re; keepempty=false)
372+
filter!(!=("."), parts)
373373
while true
374374
clean = true
375375
for j = 1:length(parts)-1

base/strings/util.jl

Lines changed: 80 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,83 @@ function rpad(
383383
r == 0 ? string(s, p^q) : string(s, p^q, first(p, r))
384384
end
385385

386+
"""
387+
eachsplit(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true)
388+
eachsplit(str::AbstractString; limit::Integer=0, keepempty::Bool=false)
389+
390+
Split `str` on occurrences of the delimiter(s) `dlm` and return an iterator over the
391+
substrings. `dlm` can be any of the formats allowed by [`findnext`](@ref)'s first argument
392+
(i.e. as a string, regular expression or a function), or as a single character or collection
393+
of characters.
394+
395+
If `dlm` is omitted, it defaults to [`isspace`](@ref).
396+
397+
The iterator will return a maximum of `limit` results if the keyword argument is supplied.
398+
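The default of `limit=0` implies no maximum. The `keepempty` keyword controls whether empty substrings are yielded; it defaults to `true` when `dlm` is given and to `false` when `dlm` is omitted.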
399+
400+
See also [`split`](@ref).
401+
402+
# Examples
403+
```jldoctest
404+
julia> a = "Ma.rch"
405+
"Ma.rch"
406+
407+
julia> collect(eachsplit(a, "."))
408+
2-element Vector{SubString}:
409+
"Ma"
410+
"rch"
411+
```
412+
"""
413+
function eachsplit end
414+
415+
# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
416+
# and prevents a major invalidation risk (1550 MethodInstances)
417+
struct SplitIterator{S<:AbstractString,F}
418+
str::S
419+
splitter::F
420+
limit::Int
421+
keepempty::Bool
422+
end
423+
424+
eltype(::Type{<:SplitIterator}) = SubString
425+
426+
IteratorSize(::Type{<:SplitIterator}) = SizeUnknown()
427+
428+
# i: the starting index of the substring to be extracted
429+
# k: the index from which the search for the next delimiter starts
430+
# n: the number of substrings returned so far; when iter.limit > 0, splitting stops once n reaches iter.limit - 1 and the rest of the string is returned as the final substring
431+
function iterate(iter::SplitIterator, (i, k, n)=(firstindex(iter.str), firstindex(iter.str), 0))
432+
i - 1 > ncodeunits(iter.str)::Int && return nothing
433+
r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
434+
while r !== nothing && n != iter.limit - 1 && first(r) <= ncodeunits(iter.str)
435+
j, k = first(r), nextind(iter.str, last(r))::Int
436+
k_ = k <= j ? nextind(iter.str, j) : k
437+
if i < k
438+
substr = @inbounds SubString(iter.str, i, prevind(iter.str, j)::Int)
439+
(iter.keepempty || i < j) && return (substr, (k, k_, n + 1))
440+
i = k
441+
end
442+
k = k_
443+
r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
444+
end
445+
iter.keepempty || i <= ncodeunits(iter.str) || return nothing
446+
@inbounds SubString(iter.str, i), (ncodeunits(iter.str) + 2, k, n + 1)
447+
end
448+
449+
eachsplit(str::T, splitter; limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString} =
450+
SplitIterator(str, splitter, limit, keepempty)
451+
452+
eachsplit(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}};
453+
limit::Integer=0, keepempty=true) where {T<:AbstractString} =
454+
eachsplit(str, in(splitter); limit, keepempty)
455+
456+
eachsplit(str::T, splitter::AbstractChar; limit::Integer=0, keepempty=true) where {T<:AbstractString} =
457+
eachsplit(str, isequal(splitter); limit, keepempty)
458+
459+
# a bit oddball, but standard behavior in Perl, Ruby & Python:
460+
eachsplit(str::AbstractString; limit::Integer=0, keepempty=false) =
461+
eachsplit(str, isspace; limit, keepempty)
462+
386463
"""
387464
split(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true)
388465
split(str::AbstractString; limit::Integer=0, keepempty::Bool=false)
@@ -412,52 +489,16 @@ julia> split(a, ".")
412489
"rch"
413490
```
414491
"""
415-
function split end
416-
417492
function split(str::T, splitter;
418493
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
419-
_split(str, splitter, limit, keepempty, T <: SubString ? T[] : SubString{T}[])
420-
end
421-
function split(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}};
422-
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
423-
_split(str, in(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[])
424-
end
425-
function split(str::T, splitter::AbstractChar;
426-
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
427-
_split(str, isequal(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[])
428-
end
429-
430-
function _split(str::AbstractString, splitter::F, limit::Integer, keepempty::Bool, strs::Vector) where F
431-
# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
432-
# and prevents a major invalidation risk (1550 MethodInstances)
433-
i = 1 # firstindex(str)
434-
n = lastindex(str)::Int
435-
r = findfirst(splitter,str)::Union{Nothing,Int,UnitRange{Int}}
436-
if r !== nothing
437-
j, k = first(r), nextind(str,last(r))::Int
438-
while 0 < j <= n && length(strs) != limit-1
439-
if i < k
440-
if keepempty || i < j
441-
push!(strs, @inbounds SubString(str,i,prevind(str,j)::Int))
442-
end
443-
i = k
444-
end
445-
(k <= j) && (k = nextind(str,j)::Int)
446-
r = findnext(splitter,str,k)::Union{Nothing,Int,UnitRange{Int}}
447-
r === nothing && break
448-
j, k = first(r), nextind(str,last(r))::Int
449-
end
450-
end
451-
if keepempty || i <= ncodeunits(str)::Int
452-
push!(strs, @inbounds SubString(str,i))
453-
end
454-
return strs
494+
itr = eachsplit(str, splitter; limit, keepempty)
495+
collect(T <: SubString ? T : SubString{T}, itr)
455496
end
456497

457498
# a bit oddball, but standard behavior in Perl, Ruby & Python:
458499
split(str::AbstractString;
459500
limit::Integer=0, keepempty::Bool=false) =
460-
split(str, isspace; limit=limit, keepempty=keepempty)
501+
split(str, isspace; limit, keepempty)
461502

462503
"""
463504
rsplit(s::AbstractString; limit::Integer=0, keepempty::Bool=false)

base/sysinfo.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,7 @@ function which(program_name::String)
499499
# If we have been given just a program name (not a relative or absolute
500500
# path) then we should search `PATH` for it here:
501501
pathsep = iswindows() ? ';' : ':'
502-
path_dirs = abspath.(split(get(ENV, "PATH", ""), pathsep))
502+
path_dirs = map(abspath, eachsplit(get(ENV, "PATH", ""), pathsep))
503503

504504
# On windows we always check the current directory as well
505505
if iswindows()

0 commit comments

Comments
 (0)