Skip to content

Commit 3013e74

Browse files
committed
Start reshaping PatternFreeString, found bug in the general case
Need to work on the general case for pattern free strings!
1 parent 7e6df24 commit 3013e74

File tree

2 files changed

+179
-148
lines changed

2 files changed

+179
-148
lines changed

src/misc/pattern_free_string.jl

Lines changed: 167 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
"""
22
PatternFreeString{T<:String} <: AbstractPointProcess{T}
33
4-
Container with fields `alphabet` and `pattern` used to generate a string made of characters from `alphabet` avoiding the prescribed `pattern`.
4+
Struct with fields
5+
- `alphabet::Vector{Char}`,
6+
- `pattern::Regex`,
7+
used to generate strings made of characters from `alphabet` avoiding the prescribed `pattern`.
58
"""
69
struct PatternFreeString{T<:String} <: AbstractPointProcess{T}
    # Characters that generated strings are made of.
    alphabet::Vector{Char}
    # Pattern that generated strings must avoid, stored as a regular expression.
    pattern::Regex
end
1013

1114
# Print the type name followed by one "- field = value" line per field.
# The pattern is displayed through the source text of the stored `Regex`.
function Base.show(io::IO, pp::PatternFreeString{T}) where {T}
    description = "PatternFreeString{$T}\n- alphabet = $(pp.alphabet)\n- pattern = $(pp.pattern.pattern)"
    print(io, description)
    return nothing
end
1417

1518
"""
@@ -19,22 +22,22 @@ Construct a [`PRS.PatternFreeString`](@ref).
1922
2023
```jldoctest; output = true
2124
using PartialRejectionSampling
22-
PRS.PatternFreeString(["A", "C", "G", "T"], "ATGTA")
25+
PRS.PatternFreeString(['A', 'C', 'G', 'T'], "ATGTA")
2326
2427
# output
2528
2629
PatternFreeString{String}
27-
- alphabet = ["A", "C", "G", "T"]
30+
- alphabet = ['A', 'C', 'G', 'T']
2831
- pattern = ATGTA
2932
```
3033
"""
31-
function PatternFreeString(alphabet::Vector{Char}, pattern::String)
    @assert !isempty(alphabet)
    @assert !isempty(pattern)

    # Every character of the pattern must belong to the alphabet.
    pattern_chars = Vector{Char}(pattern)
    issubset(pattern_chars, alphabet) || throw(
        DomainError(
            pattern,
            "pattern $(pattern) is not fully made of characters from alphabet $(alphabet)",
        ),
    )

    # NOTE(review): `Regex(pattern)` interprets regex metacharacters, so an alphabet
    # containing characters such as '.' or '*' would not be matched literally.
    return PatternFreeString{String}(alphabet, Regex(pattern))
end
3942

4043
"""
@@ -67,11 +70,14 @@ end
6770
size::Int
6871
)::T where {T<:String}
6972
70-
Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurence of the pattern `pp.pattern`, using a tailored version of Partial Rejection Sampling (PRS) derived by [GiAmWe18](@cite).
73+
Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurrence of the pattern `pp.pattern`, using a tailored version of Partial Rejection Sampling (PRS).
74+
75+
**See also**
76+
- Technical report of [GiAmWe18](@cite)
7177
7278
```@example
7379
using PartialRejectionSampling
74-
pp = PRS.PatternFreeString(["A", "C", "G", "T"], "ATGTA")
80+
pp = PRS.PatternFreeString(['A', 'C', 'G', 'T'], "ATGTA")
7581
PRS.generate_sample_prs(pp, 20)
7682
```
7783
"""
@@ -81,162 +87,175 @@ function generate_sample_prs(
8187
size::Int
8288
)::T where {T<:String}
8389
@assert size > 0
84-
return _generate_sample_pattern_free_string_prs(rng, pp.alphabet, pp.pattern, size)
90+
@assert !isempty(pp.alphabet)
91+
@assert !isempty(pp.pattern.pattern)
92+
return _pattern_free_string_prs(rng, pp.alphabet, pp.pattern, size)
8593
end
8694

87-
# function generate_sample_prs(
88-
# pp::PatternFreeString,
89-
# size::Int
90-
# )
91-
# return generate_sample(Random.default_rng(), pp, size)
92-
# end
93-
9495
"""
9596
_pattern_free_string_prs(
9697
rng::Random.AbstractRNG,
97-
alphabet::Vector{T},
98-
pattern::T,
98+
alphabet::Vector{Char},
99+
pattern::Regex,
99100
size::Int
100-
)::T where {T<:AbstractString}
101+
)::String
101102
102103
Generate a string uniformly at random among all strings made of characters from `alphabet` with no occurrence of the pattern `pattern`, using a tailored version of Partial Rejection Sampling (PRS) derived by [GiAmWe18](@cite).
103104
"""
104-
function _pattern_free_string_prs(
    rng::Random.AbstractRNG,
    alphabet::Vector{Char},
    regex::Regex,
    size::Int
)::String
    # Lengths of the prefixes of the pattern text that are also suffixes of it.
    overlaps = find_prefix_suffix(regex.pattern)

    # A pattern that can overlap with itself is handled by the general sampler.
    isempty(overlaps) ||
        return _pattern_free_string_prs_general(rng, alphabet, regex, size, overlaps)

    # Otherwise occurrences cannot overlap: use the extremal sampler.
    return _pattern_free_string_prs_extremal(rng, alphabet, regex, size)
end
126118

127-
find_prefix_suffix(s::String) = [i for i in 1:div(length(s), 2) if s[1:i] == s[end-i+1:end]]
128-
129-
"""
130-
generate_sample!(
131-
[rng::Random.AbstractRNG,]
132-
string_vec::Vector{T},
133-
indices,
134-
alphabet::Vector{T}
135-
) where {T<:AbstractString}
119+
## Extremal PRS
136120

137-
Generate a character uniformly at random from `alphabet` at positions prescribed by `indices` in `string_vec`.
138-
"""
139-
function generate_sample!(
140-
rng::Random.AbstractRNG,
141-
string_vec::Vector{T},
142-
indices,
143-
alphabet::Vector{T}
144-
) where {T<:AbstractString}
145-
for i in indices
146-
string_vec[i] = rand(rng, alphabet)
121+
"""
    _pattern_free_string_prs_extremal(rng, alphabet, regex, size)

Draw `size` characters from `alphabet` with `rng`, then repeatedly redraw the characters
covered by each non-overlapping match of `regex` until the joined string contains no match.
Return that string.

The loop only exits when no match is left, so it does not terminate if no string of
`size` characters over `alphabet` avoids `regex`.
"""
function _pattern_free_string_prs_extremal(rng, alphabet, regex, size)
    str_vec = rand(rng, alphabet, size)
    while true
        str = join(str_vec)
        matches = collect(eachmatch(regex, str))
        isempty(matches) && return str
        for m in matches
            # `m.offset` is a code-unit index into `str`, whereas `str_vec` holds one
            # `Char` per entry: convert the offset into a character position and use
            # the number of characters actually matched.
            start = length(str, 1, m.offset)
            nchars = length(m.match)
            str_vec[start:(start + nchars - 1)] .= rand(rng, alphabet, nchars)
        end
    end
end
149132

150-
"""
151-
find_bad_ranges(
152-
pattern::T,
153-
string::T
154-
)::Vector{UnitRange} where {T<:AbstractString}
133+
## General PRS
155134

156-
Identify where `pattern` occur in `string` and return the corresponding ranges of indices.
157-
"""
158-
function find_bad_ranges(
159-
pattern::T,
160-
string::T
161-
)::Vector{UnitRange} where {T<:AbstractString}
162-
163-
bad_ranges = UnitRange{Int64}[]
164-
165-
matches = eachmatch(Regex(pattern), string, overlap=true)
166-
isempty(matches) && return bad_ranges
167-
168-
p = length(pattern)
169-
m, _ = iterate(matches)
170-
x1, y1 = m.offset, m.offset + p - 1
171-
172-
for m in Iterators.drop(matches, 1)
173-
x2, y2 = m.offset, m.offset + p - 1
174-
if x2 <= y1 + 1
175-
y1 = y2
176-
else
177-
push!(bad_ranges, x1:y1)
178-
x1, y1 = x2, y2
179-
end
180-
end
181-
push!(bad_ranges, x1:y1)
182-
return bad_ranges
183-
end
135+
"""
    _pattern_free_string_prs_general(rng, alphabet, regex, size, pref_suff)

Placeholder for the sampler used when the pattern can overlap with itself
(non-empty `pref_suff`). It is not implemented yet and always throws a `DomainError`
carrying `regex.pattern`.
"""
function _pattern_free_string_prs_general(rng, alphabet, regex, size, pref_suff)
    # TODO: intended algorithm, to be reinstated once the general case is fixed:
    #   1. mark all `size` positions for resampling,
    #   2. redraw the marked positions uniformly from `alphabet`,
    #   3. recompute the positions to resample from `regex` and `pref_suff`,
    #   4. repeat 2-3 until no position is marked, then return the joined characters.
    throw(
        DomainError(
            regex.pattern,
            "Generalized PRS is not yet implemented for pattern free strings",
        ),
    )
end
191146

192-
Identify the set of events to be resampled as constructed by Algorithm 5 in [GuJeLi19](@cite) as part of the Partial Rejection Sampling (PRS) method.
193-
Return the indices of the variables involved in the corresponding events.
147+
# """
148+
# generate_sample!(
149+
# rng::Random.AbstractRNG,
150+
# string_vec::Vector{T},
151+
# indices,
152+
# alphabet::Vector{T}
153+
# ) where {T<:AbstractString}
154+
155+
# Generate a character uniformly at random from `alphabet` at positions prescribed by `indices` in `string_vec`.
156+
# """
157+
# function generate_sample!(
158+
# rng::Random.AbstractRNG,
159+
# string_vec::Vector{T},
160+
# indices,
161+
# alphabet::Vector{T}
162+
# ) where {T<:AbstractString}
163+
# for i in indices
164+
# string_vec[i] = rand(rng, alphabet)
165+
# end
166+
# end
194167

195-
**See also**
168+
# """
169+
# find_bad_ranges(
170+
# pattern::T,
171+
# string::T
172+
# )::Vector{UnitRange} where {T<:AbstractString}
173+
174+
# Identify where `pattern` occur in `string` and return the corresponding ranges of indices.
175+
# """
176+
# function find_bad_ranges(
177+
# pattern::T,
178+
# string::T
179+
# )::Vector{UnitRange} where {T<:AbstractString}
180+
181+
# bad_ranges = UnitRange{Int64}[]
182+
183+
# matches = eachmatch(Regex(pattern), string, overlap=true)
184+
# isempty(matches) && return bad_ranges
185+
186+
# p = length(pattern)
187+
# m, _ = iterate(matches)
188+
189+
# x1, y1 = m.offset, m.offset + p - 1
190+
# for m in Iterators.drop(matches, 1)
191+
# x2, y2 = m.offset, m.offset + p - 1
192+
# if x2 <= y1 + 1
193+
# y1 = y2
194+
# else
195+
# push!(bad_ranges, x1:y1)
196+
# x1, y1 = x2, y2
197+
# end
198+
# end
199+
# push!(bad_ranges, x1:y1)
200+
201+
# return bad_ranges
202+
# end
196203

197-
- [`PRS.find_bad_ranges`](@ref)
198-
"""
199-
function find_characters_to_resample(
200-
string_vec::Vector{T},
201-
pattern::T,
202-
pref_suff::Vector{U}
203-
)::Vector{U} where {T<:String, U<:Int}
204-
205-
# Extremal case
206-
isempty(pref_suff) && return vcat(findall(pattern, join(string_vec), overlap=false)...)
207-
208-
# General case
209-
bad_ranges = find_bad_ranges(pattern, join(string_vec))
210-
isempty(bad_ranges) && return vcat(bad_ranges...)
211-
212-
p, s = length(pattern), length(string_vec)
213-
tmp = fill("", s)
214-
215-
for bad_range in bad_ranges
216-
tmp[bad_range] = string_vec[bad_range]
217-
start, stop = bad_range.start, bad_range.stop
218-
for ps in pref_suff
219-
flag_left = flag_right = false
220-
if !flag_left
221-
I = (start - p + ps):(start - 1)
222-
if I.start >= 1
223-
flag_left = startswith(pattern, join(tmp[I]))
224-
if flag_left
225-
tmp[I] = string_vec[I]
226-
end
227-
end
228-
end
229-
if !flag_right
230-
J = (stop + 1):(stop + p - ps)
231-
if J.stop <= s
232-
flag_right = endswith(pattern, join(tmp[J]))
233-
if flag_right
234-
tmp[J] = string_vec[J]
235-
end
236-
end
237-
end
238-
flag_left && flag_right && break
239-
end
240-
end
241-
return findall(!isempty, tmp)
242-
end
204+
# """
205+
# find_characters_to_resample(
206+
# string_vec::Vector{T},
207+
# pattern::T,
208+
# pref_suff::Vector{U}
209+
# )::Vector{U} where {T<:String, U<:Int}
210+
211+
# Identify the set of events to be resampled as constructed by Algorithm 5 in [GuJeLi19](@cite) as part of the Partial Rejection Sampling (PRS) method.
212+
# Return the indices of the variables involved in the corresponding events.
213+
214+
# **See also**
215+
216+
# - [`PRS.find_bad_ranges`](@ref)
217+
# """
218+
# function find_characters_to_resample(
219+
# string_vec::Vector{T},
220+
# pattern::T,
221+
# pref_suff::Vector{U}
222+
# )::Vector{U} where {T<:String, U<:Int}
223+
224+
# # Extremal case
225+
# isempty(pref_suff) && return vcat(findall(pattern, join(string_vec), overlap=false)...)
226+
227+
# # General case
228+
# bad_ranges = find_bad_ranges(pattern, join(string_vec))
229+
# isempty(bad_ranges) && return vcat(bad_ranges...)
230+
231+
# p, n = length(pattern), length(string_vec)
232+
# tmp = fill("", n)
233+
234+
# for bad_range in bad_ranges
235+
# tmp[bad_range] = string_vec[bad_range]
236+
# start, stop = bad_range.start, bad_range.stop
237+
# for ps in pref_suff
238+
# flag_left = flag_right = false
239+
# if !flag_left
240+
# I = (start - p + ps):(start - 1)
241+
# if I.start >= 1
242+
# flag_left = startswith(pattern, join(tmp[I]))
243+
# if flag_left
244+
# tmp[I] = string_vec[I]
245+
# end
246+
# end
247+
# end
248+
# if !flag_right
249+
# J = (stop + 1):(stop + p - ps)
250+
# if J.stop <= n
251+
# flag_right = endswith(pattern, join(tmp[J]))
252+
# if flag_right
253+
# tmp[J] = string_vec[J]
254+
# end
255+
# end
256+
# end
257+
# flag_left && flag_right && break
258+
# end
259+
# end
260+
# return findall(!isempty, tmp)
261+
# end

src/utils.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,15 @@ function random_neighbor_assignment(
216216
) where {T}
217217
return random_neighbor_assignment(Random.default_rng(), graph, roots)
218218
end
219+
220+
## String methods
221+
222+
"""
    eachmatch_ranges(regex::Regex, string::AbstractString; overlap=false)

Return a lazy iterator over the index ranges of `string` covered by the matches of `regex`.

Each range `r` is expressed in string indices and satisfies `string[r] == m.match` for the
corresponding match `m`. For ASCII input these indices coincide with character positions.
The keyword `overlap` is forwarded to `eachmatch`.
"""
function eachmatch_ranges(regex::Regex, string::AbstractString; overlap=false)
    matches = eachmatch(regex, string; overlap=overlap)
    # The extent of a match is given by the matched text, not by the pattern source:
    # the last index of the range is the index of the last matched character.
    return (m.offset:prevind(string, m.offset + ncodeunits(m.match)) for m in matches)
end
227+
228+
"""
    find_prefix_suffix(s::AbstractString)

Return the lengths `i`, in characters, with `1 <= i <= div(length(s), 2)` such that the
first `i` characters of `s` equal its last `i` characters.

The result is empty when no such length exists, in particular for strings of fewer than
two characters.
"""
function find_prefix_suffix(s::AbstractString)
    # `first`/`last` count characters, consistently with `length(s)`; slicing with
    # `s[1:i]` would use code-unit indices and fail on multi-byte characters.
    return [i for i in 1:div(length(s), 2) if first(s, i) == last(s, i)]
end

0 commit comments

Comments
 (0)