Skip to content

Commit 0557365

Browse files
committed
Reshape pattern free string and add general case
1 parent b32e8c8 commit 0557365

File tree

2 files changed

+164
-165
lines changed

2 files changed

+164
-165
lines changed

src/misc/pattern_free_string.jl

Lines changed: 132 additions & 159 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,18 @@
22
PatternFreeString{T<:String} <: AbstractPointProcess{T}
33
44
Struct with fields
5-
- `alphabet::Vector{Char}`,
6-
- `pattern::Regex`,
5+
- `alphabet::Vector{String}`,
6+
- `pattern::String`,
7+
78
used to generate strings made of characters from `alphabet` avoiding the prescribed `pattern`.
89
"""
910
struct PatternFreeString{T<:String} <: AbstractPointProcess{T}
10-
alphabet::Vector{Char}
11-
pattern::Regex
11+
pattern::T
12+
alphabet::Vector{T}
1213
end
1314

1415
function Base.show(io::IO, pp::PatternFreeString{T}) where {T}
15-
print(io, "PatternFreeString{$T}\n- alphabet = $(pp.alphabet)\n- pattern = $(pp.pattern.pattern)")
16+
print(io, "PatternFreeString{$T}\n- pattern = $(pp.pattern)\n- alphabet = $(pp.alphabet)")
1617
end
1718

1819
"""
@@ -22,22 +23,22 @@ Construct a [`PRS.PatternFreeString`](@ref).
2223
2324
```jldoctest; output = true
2425
using PartialRejectionSampling
25-
PRS.PatternFreeString(['A', 'C', 'G', 'T'], "ATGTA")
26+
PRS.PatternFreeString("ATGTA", ["A", "C", "G", "T"])
2627
2728
# output
2829
2930
PatternFreeString{String}
30-
- alphabet = ['A', 'C', 'G', 'T']
3131
- pattern = ATGTA
32+
- alphabet = ["A", "C", "G", "T"]
3233
```
3334
"""
34-
function PatternFreeString(alphabet::Vector{Char}, pattern::String)
35-
@assert !isempty(alphabet)
35+
function PatternFreeString(pattern::String, alphabet::Vector{String})
3636
@assert !isempty(pattern)
37-
if !issubset(Vector{Char}(pattern), alphabet)
38-
throw(DomainError(pattern, "pattern $(pattern) is not fully made of characters from alphabet $(alphabet)"))
37+
@assert !isempty(alphabet)
38+
if !issubset(string.(unique(pattern)), alphabet)
39+
throw(DomainError(pattern, "pattern is not fully made of characters from alphabet $(alphabet)"))
3940
end
40-
return PatternFreeString{String}(alphabet, Regex(pattern))
41+
return PatternFreeString{String}(pattern, alphabet)
4142
end
4243

4344
"""
@@ -47,7 +48,7 @@ end
4748
size::Int
4849
)::T where {T<:String}
4950
50-
Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurence of the pattern `pp.pattern`.
51+
Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurrence of `pp.pattern`.
5152
5253
Default sampler is [`PRS.generate_sample_prs`](@ref).
5354
"""
@@ -70,192 +71,164 @@ end
7071
size::Int
7172
)::T where {T<:String}
7273
73-
Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurence of the pattern `pp.pattern`, using a tailored version of Partial Rejection Sampling (PRS).
74-
75-
**See also**
76-
- Technical report of [GiAmWe18](@cite)
74+
Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurrence of `pp.pattern`, using a tailored version of Partial Rejection Sampling (PRS).
7775
7876
```@example
7977
using PartialRejectionSampling
80-
pp = PRS.PatternFreeString(['A', 'C', 'G', 'T'], "ATGTA")
78+
pp = PRS.PatternFreeString("ATGTA", ["A", "C", "G", "T"])
8179
PRS.generate_sample_prs(pp, 20)
8280
```
81+
82+
**See also**
83+
- [GiAmWe18](@cite), Sections 2.1 and 4
8384
"""
8485
function generate_sample_prs(
8586
rng::Random.AbstractRNG,
8687
pp::PatternFreeString{T},
8788
size::Int
8889
)::T where {T<:String}
8990
@assert size > 0
90-
@assert !isempty(pp.alphabet)
91-
@assert !isempty(pp.pattern.pattern)
92-
return _pattern_free_string_prs(rng, pp.alphabet, pp.pattern, size)
91+
return _pattern_free_string_prs(rng, pp.pattern, pp.alphabet, size)
9392
end
9493

9594
"""
9695
_pattern_free_string_prs(
9796
rng::Random.AbstractRNG,
98-
alphabet::Vector{Char},
99-
pattern::Regex,
97+
pattern::String,
98+
alphabet::Vector{String},
10099
size::Int
101100
)::String
102101
103-
Generate a string uniformly at random among all strings made of characters from `alphabet` with no occurence of the pattern `pattern`, using a tailored version of Partial Rejection Sampling (PRS) derived by [GiAmWe18](@cite)
102+
Generate a string uniformly at random among all strings made of characters from `alphabet` with no occurrence of the pattern `pattern`, using a tailored version of Partial Rejection Sampling (PRS).
103+
104+
**See also**
105+
- [GiAmWe18](@cite), Sections 2.1 and 4
104106
"""
105107
function _pattern_free_string_prs(
106108
rng::Random.AbstractRNG,
107-
alphabet::Vector{Char},
108-
regex::Regex,
109+
pattern::String,
110+
alphabet::Vector{String},
109111
size::Int
110112
)::String
111-
pref_suff = find_prefix_suffix(regex.pattern)
112-
if isempty(pref_suff)
113-
return _pattern_free_string_prs_extremal(rng, alphabet, regex, size)
113+
if has_common_prefix_suffix(pattern)
114+
return _pattern_free_string_prs_general(rng, pattern, alphabet, size)
114115
else
115-
return _pattern_free_string_prs_general(rng, alphabet, regex, size, pref_suff)
116+
return _pattern_free_string_prs_extremal(rng, pattern, alphabet, size)
116117
end
117118
end
118119

119120
## Extremal PRS
120121

121-
function _pattern_free_string_prs_extremal(rng, alphabet, regex, size)
122+
function _pattern_free_string_prs_extremal(rng, pattern, alphabet, size)
122123
str_vec = rand(rng, alphabet, size)
123124
while true
124125
str = join(str_vec)
125-
bad_ranges = eachmatch_ranges(regex, str)
126-
isempty(bad_ranges) && return str
127-
for b in bad_ranges
128-
str_vec[b] .= rand(rng, alphabet, length(b))
126+
bad = findall(pattern, str; overlap=false)
127+
isempty(bad) && return str
128+
for range_ in bad
129+
str_vec[range_] .= rand(rng, alphabet, length(range_))
129130
end
130131
end
131132
end
132133

133134
## General PRS
134135

135-
function _pattern_free_string_prs_general(rng, alphabet, regex, size, pref_suff)
136-
throw(DomainError(regex.pattern, "Generalized PRS is not yet implemented for pattern free strings"))
137-
# pp = fill("", size)
138-
# resample_indices = Set(1:size)
139-
140-
# while !isempty(resample_indices)
141-
# generate_sample!(rng, pp, resample_indices, alphabet)
142-
# resample_indices = find_characters_to_resample(pp, regex, pref_suff)
143-
# end
144-
# return join(pp)
136+
"""
    _pattern_free_string_prs_general(rng, pattern, alphabet, size)

Partial Rejection Sampling loop used by `_pattern_free_string_prs` when
`has_common_prefix_suffix(pattern)` holds.

A vector of `size` elements drawn from `alphabet` is repeatedly joined into a string and
searched for `pattern` with `findall_overlap`.
When no occurrence is found the joined string is returned.
Otherwise every offending window is grown with `_check_extension!` until it no longer
changes, and the elements covered by the final windows are redrawn from `alphabet`.
"""
function _pattern_free_string_prs_general(rng, pattern, alphabet, size)
    str_vec = Vector{String}(undef, size)
    rand!(rng, str_vec, alphabet)
    # Scratch vector of "revealed" characters: "" marks a position not revealed yet.
    tmp_vec = fill("", size)
    while true
        str = join(str_vec)
        bad = findall_overlap(pattern, str)
        isempty(bad) && return str
        # Reset the scratch vector at each round: positions redrawn during the previous
        # round would otherwise keep stale characters that no longer match `str_vec`,
        # whereas `_check_extension!` expects every entry to be "" or the current one.
        fill!(tmp_vec, "")
        for range_ in bad
            @inbounds tmp_vec[range_] .= str_vec[range_]
        end
        res = empty(bad)
        while !isempty(bad)
            B = popfirst!(bad)
            B̄ = _check_extension!(tmp_vec, str_vec, pattern, B)
            if isequal(B, B̄)
                # The window is stable: it is ready to be resampled.
                push!(res, B)
            else
                # The window grew: queue it again to look for further extensions.
                push!(bad, B̄)
            end
        end
        for range_ in res
            @inbounds str_vec[range_] .= rand(rng, alphabet, length(range_))
        end
    end
end
146162

147-
# """
148-
# generate_sample!(
149-
# rng::Random.AbstractRNG,
150-
# string_vec::Vector{T},
151-
# indices,
152-
# alphabet::Vector{T}
153-
# ) where {T<:AbstractString}
154-
155-
# Generate a character uniformly at random from `alphabet` at positions prescribed by `indices` in `string_vec`.
156-
# """
157-
# function generate_sample!(
158-
# rng::Random.AbstractRNG,
159-
# string_vec::Vector{T},
160-
# indices,
161-
# alphabet::Vector{T}
162-
# ) where {T<:AbstractString}
163-
# for i in indices
164-
# string_vec[i] = rand(rng, alphabet)
165-
# end
166-
# end
167-
168-
# """
169-
# find_bad_ranges(
170-
# pattern::T,
171-
# string::T
172-
# )::Vector{UnitRange} where {T<:AbstractString}
173-
174-
# Identify where `pattern` occur in `string` and return the corresponding ranges of indices.
175-
# """
176-
# function find_bad_ranges(
177-
# pattern::T,
178-
# string::T
179-
# )::Vector{UnitRange} where {T<:AbstractString}
180-
181-
# bad_ranges = UnitRange{Int64}[]
182-
183-
# matches = eachmatch(Regex(pattern), string, overlap=true)
184-
# isempty(matches) && return bad_ranges
185-
186-
# p = length(pattern)
187-
# m, _ = iterate(matches)
188-
189-
# x1, y1 = m.offset, m.offset + p - 1
190-
# for m in Iterators.drop(matches, 1)
191-
# x2, y2 = m.offset, m.offset + p - 1
192-
# if x2 <= y1 + 1
193-
# y1 = y2
194-
# else
195-
# push!(bad_ranges, x1:y1)
196-
# x1, y1 = x2, y2
197-
# end
198-
# end
199-
# push!(bad_ranges, x1:y1)
200-
201-
# return bad_ranges
202-
# end
203-
204-
# """
205-
# find_characters_to_resample(
206-
# string_vec::Vector{T},
207-
# pattern::T,
208-
# pref_suff::Vector{U}
209-
# )::Vector{U} where {T<:String, U<:Int}
210-
211-
# Identify the set of events to be resampled as constructed by Algorithm 5 in [GuJeLi19](@cite) as part of the Partial Rejection Sampling (PRS) method.
212-
# Return the indices of the variables involved in the corresponding events.
213-
214-
# **See also**
215-
216-
# - [`PRS.find_bad_ranges`](@ref)
217-
# """
218-
# function find_characters_to_resample(
219-
# string_vec::Vector{T},
220-
# pattern::T,
221-
# pref_suff::Vector{U}
222-
# )::Vector{U} where {T<:String, U<:Int}
223-
224-
# # Extremal case
225-
# isempty(pref_suff) && return vcat(findall(pattern, join(string_vec), overlap=false)...)
226-
227-
# # General case
228-
# bad_ranges = find_bad_ranges(pattern, join(string_vec))
229-
# isempty(bad_ranges) && return vcat(bad_ranges...)
230-
231-
# p, n = length(pattern), length(string_vec)
232-
# tmp = fill("", n)
163+
"""
    _check_extension!(
        tmp_vec::Vector{String},
        str_vec::Vector{String},
        pattern::String,
        window::UnitRange{U}
    )::UnitRange{U} where {U<:Int}

Assuming `join(tmp_vec[window]) == join(str_vec[window]) == pattern`, check whether a reassignment of `""` elements from left or right of `tmp_vec[window]` can make `pattern` occur.
If this is the case, the identified `""` elements of `tmp_vec` are set with the corresponding elements from `str_vec` and the *extended* window where pattern can arise is returned.
Otherwise the original `window` is returned.

Candidate windows of `length(pattern)` elements are scanned from the farthest to the
closest one on each side, and the scan of a side stops at the first candidate accepted by
`_pattern_can_occur_if_reassignment_at`.
"""
function _check_extension!(
    tmp_vec::Vector{String},
    str_vec::Vector{String},
    pattern::String,
    window::UnitRange{U}
)::UnitRange{U} where {U<:Int}
    p = length(pattern)
    i, j = first(window), last(window)
    # left looking: candidate windows start at i₋ < i and never go past j
    i₋ = max(firstindex(tmp_vec), i - p + 1)
    while i₋ < i
        window_ = i₋:min(i₋ + p - 1, j)
        range_ = _pattern_can_occur_if_reassignment_at(pattern, tmp_vec, window_)
        if !isempty(range_)
            @inbounds tmp_vec[range_] .= str_vec[range_]
            break
        end
        i₋ += 1
    end
    # right looking: candidate windows end at j₊ > j and never start before i
    j₊ = min(lastindex(tmp_vec), j + p - 1)
    while j₊ > j
        # The range must run from its lower to its upper bound; written the other way
        # round (`j₊:lower`) it is always empty and the right side is never extended.
        window_ = max(j₊ - p + 1, i):j₊
        range_ = _pattern_can_occur_if_reassignment_at(pattern, tmp_vec, window_)
        if !isempty(range_)
            @inbounds tmp_vec[range_] .= str_vec[range_]
            break
        end
        j₊ -= 1
    end
    # When a side was not extended its bound has moved back to i (resp. j).
    return i₋:j₊
end
233207

234-
# for bad_range in bad_ranges
235-
# tmp[bad_range] = string_vec[bad_range]
236-
# start, stop = bad_range.start, bad_range.stop
237-
# for ps in pref_suff
238-
# flag_left = flag_right = false
239-
# if !flag_left
240-
# I = (start - p + ps):(start - 1)
241-
# if I.start >= 1
242-
# flag_left = startswith(pattern, join(tmp[I]))
243-
# if flag_left
244-
# tmp[I] = string_vec[I]
245-
# end
246-
# end
247-
# end
248-
# if !flag_right
249-
# J = (stop + 1):(stop + p - ps)
250-
# if J.stop <= n
251-
# flag_right = endswith(pattern, join(tmp[J]))
252-
# if flag_right
253-
# tmp[J] = string_vec[J]
254-
# end
255-
# end
256-
# end
257-
# flag_left && flag_right && break
258-
# end
259-
# end
260-
# return findall(!isempty, tmp)
261-
# end
208+
"""
    _pattern_can_occur_if_reassignment_at(
        pattern::String,
        vec::Vector{String},
        window::UnitRange{U}
    )::UnitRange{U} where {U<:Int}

Find the range of indices of `window` where `""` elements of `vec[window]` can be modified to make the resulting `join(vec[window]) == pattern`.
An empty range is returned otherwise.

The returned range goes from the first to the last `""` element of `vec[window]`.
Only the assigned elements before the first `""` and after the last `""` are compared
with the beginning and the end of `pattern`; assigned elements lying between two `""`
elements are not checked.
An empty range is also returned when `window` is empty, when its length differs from
`length(pattern)`, or when `vec[window]` contains no `""` element.
"""
function _pattern_can_occur_if_reassignment_at(
    pattern::String,
    vec::Vector{String},
    window::UnitRange{U}
)::UnitRange{U} where {U<:Int}
    empty_range = one(U):zero(U)
    (isempty(window) || length(window) != length(pattern)) && return empty_range
    i, j = first(window), last(window)
    f1 = findnext(isempty, vec, i)
    # No "" inside the window means nothing can be reassigned there. Stopping here also
    # guarantees that the backward search below finds an index (at least `f1`), so that
    # `f2` can never be `nothing`.
    (isnothing(f1) || f1 > j) && return empty_range
    f2 = findprev(isempty, vec, j)
    if startswith(pattern, join(vec[i:f1])) && endswith(pattern, join(vec[f2:j]))
        return f1:f2
    end
    return empty_range
end

0 commit comments

Comments
 (0)