Skip to content

Commit 70586ce

Browse files
Merge pull request #32849 from JuliaLang/sk/count-regex
count(regex, string) computes length(findall(regex, string)) better
2 parents c7ff479 + 26517bf commit 70586ce

File tree

3 files changed

+44
-5
lines changed

3 files changed

+44
-5
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ New library functions
3232
* `findfirst`, `findlast`, `findnext` and `findprev` now accept a character as first argument
3333
to search for that character in a string passed as the second argument ([#31664]).
3434
* New `findall(pattern, string)` method where `pattern` is a string or regex ([#31834]).
35+
* `count(pattern, string)` gives the number of matches that `findall(pattern, string)` would return ([#32849]).
3536
* `istaskfailed` is now documented and exported, like its siblings `istaskdone` and `istaskstarted` ([#32300]).
3637
* `RefArray` and `RefValue` objects now accept index `CartesianIndex()` in `getindex` and `setindex!` ([#32653]).
3738

base/regex.jl

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -328,26 +328,58 @@ findfirst(r::Regex, s::AbstractString) = findnext(r,s,firstindex(s))
328328

329329

330330
"""
331-
findall(pattern::Union{AbstractString,Regex}, string::AbstractString; overlap::Bool=false)
331+
findall(
332+
pattern::Union{AbstractString,Regex},
333+
string::AbstractString;
334+
overlap::Bool = false,
335+
)
332336
333337
Return a `Vector{UnitRange{Int}}` of all the matches for `pattern` in `string`.
334338
Each element of the returned vector is a range of indices where the
335339
matching sequence is found, like the return value of [`findnext`](@ref).
336340
337341
If `overlap=true`, the matching sequences are allowed to overlap indices in the
338-
original string, otherwise they must be from distinct character ranges.
342+
original string; otherwise they must be from disjoint character ranges.
339343
"""
340344
function findall(t::Union{AbstractString,Regex}, s::AbstractString; overlap::Bool=false)
341345
found = UnitRange{Int}[]
342346
i, e = firstindex(s), lastindex(s)
343347
while true
344348
r = findnext(t, s, i)
345-
isnothing(r) && return found
349+
isnothing(r) && break
346350
push!(found, r)
347351
j = overlap || isempty(r) ? first(r) : last(r)
348-
j > e && return found
352+
j > e && break
353+
@inbounds i = nextind(s, j)
354+
end
355+
return found
356+
end
357+
358+
"""
359+
count(
360+
pattern::Union{AbstractString,Regex},
361+
string::AbstractString;
362+
overlap::Bool = false,
363+
)
364+
365+
Return the number of matches for `pattern` in `string`. This is equivalent to
366+
calling `length(findall(pattern, string))` but more efficient.
367+
368+
If `overlap=true`, the matching sequences are allowed to overlap indices in the
369+
original string; otherwise they must be from disjoint character ranges.
370+
"""
371+
function count(t::Union{AbstractString,Regex}, s::AbstractString; overlap::Bool=false)
372+
n = 0
373+
i, e = firstindex(s), lastindex(s)
374+
while true
375+
r = findnext(t, s, i)
376+
isnothing(r) && break
377+
n += 1
378+
j = overlap || isempty(r) ? first(r) : last(r)
379+
j > e && break
349380
@inbounds i = nextind(s, j)
350381
end
382+
return n
351383
end
352384

353385
"""

test/regex.jl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,18 @@
4646
@test_throws ArgumentError match(r"test", GenericString("this is a test"))
4747
@test_throws ArgumentError findfirst(r"test", GenericString("this is a test"))
4848

49-
# findall:
49+
# findall
5050
@test findall(r"\w+", "foo bar") == [1:3, 5:7]
5151
@test findall(r"\w+", "foo bar", overlap=true) == [1:3, 2:3, 3:3, 5:7, 6:7, 7:7]
5252
@test findall(r"\w*", "foo bar") == [1:3, 4:3, 5:7, 8:7]
5353
@test findall(r"\b", "foo bar") == [1:0, 4:3, 5:4, 8:7]
5454

55+
# count
56+
@test count(r"\w+", "foo bar") == 2
57+
@test count(r"\w+", "foo bar", overlap=true) == 6
58+
@test count(r"\w*", "foo bar") == 4
59+
@test count(r"\b", "foo bar") == 4
60+
5561
# Named subpatterns
5662
let m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
5763
@test (m[:a], m[2], m["b"]) == ("x", "y", "z")

0 commit comments

Comments
 (0)