Skip to content

Commit 9fe32cc

Browse files
authored
don't copy regex match strings (#15)
* don't copy regex match strings
* more docs for SVRegexMatch
* AbstractMatch requires Julia 1.6
* bugfixes and tests
1 parent 5ddbe5a commit 9fe32cc

File tree

6 files changed

+104
-10
lines changed

6 files changed

+104
-10
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
fail-fast: false
1515
matrix:
1616
version:
17-
- "1.3"
17+
- "1.6"
1818
- "1"
1919
- "nightly"
2020
os:

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ authors = ["Steven G. Johnson <[email protected]>"]
44
version = "1.1.2"
55

66
[compat]
7-
julia = "1.3"
7+
julia = "1.6"
88

99
[extras]
1010
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ julia> s = StringView(b) # does not make a copy
2525
"foobar"
2626

2727
julia> collect(eachmatch(r"[aeiou]+", s))
28-
2-element Array{RegexMatch,1}:
29-
RegexMatch("oo")
30-
RegexMatch("a")
28+
2-element Vector{SVRegexMatch{StringView{Vector{UInt8}}}}:
29+
SVRegexMatch("oo")
30+
SVRegexMatch("a")
3131

3232
julia> StringView(@view b[1:3]) # also works for subarrays, with no copy
3333
"foo"

src/StringViews.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ a `StringView` is intended to be usable in any context where you might
1010
have otherwise used `String`.
1111
"""
1212
module StringViews
13-
export StringView
13+
export StringView, SVRegexMatch
1414

1515
"""
1616
StringView{T<:AbstractVector{UInt8}} <: AbstractString

src/regex.jl

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ function Base.endswith(s::DenseStringViewAndSub, r::Regex)
1818
return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ENDANCHORED)
1919
end
2020

21-
function Base.match(re::Regex, str::DenseStringViewAndSub, idx::Integer, add_opts::UInt32=UInt32(0))
21+
function Base.match(re::Regex, str::T, idx::Integer, add_opts::UInt32=UInt32(0)) where {T<:DenseStringViewAndSub}
2222
Base.compile(re)
2323
opts = re.match_options | add_opts
2424
matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts)
@@ -29,11 +29,11 @@ function Base.match(re::Regex, str::DenseStringViewAndSub, idx::Integer, add_opt
2929
n = div(PCRE.ovec_length(data), 2) - 1
3030
p = PCRE.ovec_ptr(data)
3131
mat = SubString(str, unsafe_load(p, 1)+1, prevind(str, unsafe_load(p, 2)+1))
32-
cap = Union{Nothing,SubString{String}}[unsafe_load(p,2i+1) == PCRE.UNSET ? nothing :
32+
cap = Union{Nothing,SubString{T}}[unsafe_load(p,2i+1) == PCRE.UNSET ? nothing :
3333
SubString(str, unsafe_load(p,2i+1)+1,
3434
prevind(str, unsafe_load(p,2i+2)+1)) for i=1:n]
3535
off = Int[ unsafe_load(p,2i+1)+1 for i=1:n ]
36-
result = RegexMatch(mat, cap, unsafe_load(p,1)+1, off, re)
36+
result = SVRegexMatch(mat, cap, unsafe_load(p,1)+1, off, re)
3737
PCRE.free_match_data(data)
3838
return result
3939
end
@@ -70,7 +70,8 @@ struct RegexMatchIterator{T<:DenseStringViewAndSub}
7070
overlap::Bool
7171
end
7272
Base.compile(itr::RegexMatchIterator) = (compile(itr.regex); itr)
73-
Base.eltype(::Type{<:RegexMatchIterator}) = RegexMatch
73+
Base.eltype(::Type{RegexMatchIterator{T}}) where {T<:DenseStringView} = SVRegexMatch{T}
74+
Base.eltype(::Type{RegexMatchIterator{SubString{T}}}) where {T<:DenseStringView} = SVRegexMatch{T}
7475
Base.IteratorSize(::Type{<:RegexMatchIterator}) = Base.SizeUnknown()
7576

7677
function Base.iterate(itr::RegexMatchIterator, (offset,prevempty)=(1,false))
@@ -115,3 +116,80 @@ function PCRE.exec(re, subject::DenseStringViewAndSub, offset, options, match_da
115116
rc < -2 && error("PCRE.exec error: $(PCRE.err_message(rc))")
116117
return rc >= 0
117118
end
119+
120+
#####################################################################
121+
# need to duplicate this code from Base because of julia#48617:
122+
"""
123+
SVRegexMatch <: AbstractMatch
124+
125+
This type is identical to `RegexMatch` (in Julia `Base`) except that the
126+
`match` is a `SubString` of a `StringView` instead of a `String`.
127+
128+
A type representing a single match to a `Regex` found in a string.
129+
Typically created from the [`match`](@ref) function.
130+
131+
* The `match` field stores the substring of the entire matched string.
132+
* The `captures` field stores the substrings for each capture group, indexed by number.
133+
To index by capture group name, the entire match object should be indexed instead,
134+
as shown in the examples.
135+
* The location of the start of the match is stored in the `offset` field.
136+
* The `offsets` field stores the locations of the start of each capture group,
137+
with 0 denoting a group that was not captured.
138+
139+
This type can be used as an iterator over the capture groups of the `Regex`,
140+
yielding the substrings captured in each group.
141+
Because of this, the captures of a match can be destructured.
142+
If a group was not captured, `nothing` will be yielded instead of a substring.
143+
"""
144+
struct SVRegexMatch{T<:DenseStringView} <: AbstractMatch
145+
match::SubString{T}
146+
captures::Vector{Union{Nothing,SubString{T}}}
147+
offset::Int
148+
offsets::Vector{Int}
149+
regex::Regex
150+
end
151+
SVRegexMatch(match::SubString{T}, captures, offset, offsets, regex) where {T<:DenseStringViewAndSub} =
152+
SVRegexMatch{T}(match, captures, offset, offsets, regex)
153+
154+
function Base.keys(m::SVRegexMatch)
155+
idx_to_capture_name = PCRE.capture_names(m.regex.regex)
156+
return map(eachindex(m.captures)) do i
157+
# If the capture group is named, return its name, else return its index
158+
get(idx_to_capture_name, i, i)
159+
end
160+
end
161+
162+
function Base.show(io::IO, m::SVRegexMatch)
163+
print(io, "SVRegexMatch(")
164+
show(io, m.match)
165+
capture_keys = keys(m)
166+
if !isempty(capture_keys)
167+
print(io, ", ")
168+
for (i, capture_name) in enumerate(capture_keys)
169+
print(io, capture_name, "=")
170+
show(io, m.captures[i])
171+
if i < length(m)
172+
print(io, ", ")
173+
end
174+
end
175+
end
176+
print(io, ")")
177+
end
178+
179+
# Capture group extraction
180+
Base.getindex(m::SVRegexMatch, idx::Integer) = m.captures[idx]
181+
function Base.getindex(m::SVRegexMatch, name::Union{AbstractString,Symbol})
182+
idx = PCRE.substring_number_from_name(m.regex.regex, name)
183+
idx <= 0 && error("no capture group named $name found in regex")
184+
m[idx]
185+
end
186+
187+
Base.haskey(m::SVRegexMatch, idx::Integer) = idx in eachindex(m.captures)
188+
function Base.haskey(m::SVRegexMatch, name::Union{AbstractString,Symbol})
189+
idx = PCRE.substring_number_from_name(m.regex.regex, name)
190+
return idx > 0
191+
end
192+
193+
Base.iterate(m::SVRegexMatch, args...) = iterate(m.captures, args...)
194+
Base.length(m::SVRegexMatch) = length(m.captures)
195+
Base.eltype(m::SVRegexMatch) = eltype(m.captures)

test/runtests.jl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,24 @@ end
8282
end
8383
@test findnext(r"[aeiou]+", s, 1) == 2:3
8484
@test findnext(r"[aeiou]+", ss, 1) == 1:2
85+
86+
sv = StringView(codeunits("foo 1234 bar"))
87+
@test match(r"[0-9]+", sv).match.string === sv
88+
@test eltype(eachmatch(r"[0-9]+", sv)) == SVRegexMatch{typeof(sv)}
89+
end
90+
91+
@testset "named subpatterns" begin
92+
m = match(r"(?<a>.)(.)(?<b>.)", StringView(codeunits("xyz")))
93+
@test haskey(m, :a)
94+
@test haskey(m, 2)
95+
@test haskey(m, "b")
96+
@test !haskey(m, "foo")
97+
@test (m[:a], m[2], m["b"]) == ("x", "y", "z")
98+
@test sprint(show, m) == "SVRegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"
99+
@test keys(m) == ["a", 2, "b"]
85100
end
86101

102+
87103
@testset "parsing" begin
88104
for val in (true, 1234, 1234.5, 1234.5f0, 4.5+3.25im)
89105
sval = string(val)

0 commit comments

Comments
 (0)