22 PatternFreeString{T<:String} <: AbstractPointProcess{T}
33
44Struct with fields
5- - `alphabet::Vector{Char}`,
6- - `pattern::Regex`,
5+ - `alphabet::Vector{String}`,
6+ - `pattern::String`,
7+
78used to generate strings made of characters from `alphabet` avoiding the prescribed `pattern`.
89"""
struct PatternFreeString{T<:String} <: AbstractPointProcess{T}
    # Pattern that the generated strings must avoid.
    pattern::T
    # Symbols the generated strings are built from.
    alphabet::Vector{T}
end
1314
# Print a three-line summary: the parametric type name, then the pattern, then the alphabet.
function Base.show(io::IO, pp::PatternFreeString{T}) where {T}
    header = "PatternFreeString{$T}"
    pattern_line = "- pattern = $(pp.pattern)"
    alphabet_line = "- alphabet = $(pp.alphabet)"
    print(io, join((header, pattern_line, alphabet_line), "\n"))
end
1718
1819"""
@@ -22,22 +23,22 @@ Construct a [`PRS.PatternFreeString`](@ref).
2223
2324```jldoctest; output = true
2425using PartialRejectionSampling
25- PRS.PatternFreeString(['A', 'C', 'G', 'T'], "ATGTA" )
26+ PRS.PatternFreeString("ATGTA", ["A", "C", "G", "T"] )
2627
2728# output
2829
2930PatternFreeString{String}
30- - alphabet = ['A', 'C', 'G', 'T']
3131- pattern = ATGTA
32+ - alphabet = ["A", "C", "G", "T"]
3233```
3334"""
function PatternFreeString(pattern::String, alphabet::Vector{String})
    @assert !isempty(pattern)
    @assert !isempty(alphabet)
    # Every distinct character of `pattern`, as a one-character string, must be in `alphabet`.
    pattern_symbols = string.(unique(pattern))
    issubset(pattern_symbols, alphabet) || throw(
        DomainError(pattern, "pattern is not fully made of characters from alphabet $(alphabet)"),
    )
    return PatternFreeString{String}(pattern, alphabet)
end
4243
4344"""
4748 size::Int
4849 )::T where {T<:String}
4950
50- Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurence of the pattern `pp.pattern`.
51+ Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurrence of `pp.pattern`.
5152
5253Default sampler is [`PRS.generate_sample_prs`](@ref).
5354"""
@@ -70,192 +71,164 @@ end
7071 size::Int
7172 )::T where {T<:String}
7273
73- Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurence of the pattern `pp.pattern`, using a tailored version of Partial Rejection Sampling (PRS).
74-
75- **See also**
76- - Technical report of [GiAmWe18](@cite)
74+ Generate a string uniformly at random among all strings made of characters from `pp.alphabet` with no occurrence of `pp.pattern`, using a tailored version of Partial Rejection Sampling (PRS).
7775
7876```@example
7977using PartialRejectionSampling
80- pp = PRS.PatternFreeString(['A', 'C', 'G', 'T'], "ATGTA" )
78+ pp = PRS.PatternFreeString("ATGTA", ["A", "C", "G", "T"] )
8179PRS.generate_sample_prs(pp, 20)
8280```
81+
82+ **See also**
83+ - [GiAmWe18](@cite), Sections 2.1 and 4
8384"""
function generate_sample_prs(
    rng::Random.AbstractRNG,
    pp::PatternFreeString{T},
    size::Int
)::T where {T<:String}
    @assert size > 0
    # Unpack the process and hand over to the low-level PRS sampler.
    pattern, alphabet = pp.pattern, pp.alphabet
    return _pattern_free_string_prs(rng, pattern, alphabet, size)
end
9493
"""
    _pattern_free_string_prs(
        rng::Random.AbstractRNG,
        pattern::String,
        alphabet::Vector{String},
        size::Int
    )::String

Generate a string uniformly at random among all strings made of characters from `alphabet` with no occurrence of `pattern`, using a tailored version of Partial Rejection Sampling (PRS).

The general sampler is used when `has_common_prefix_suffix(pattern)` holds, and the extremal sampler otherwise.

**See also**
- [GiAmWe18](@cite), Sections 2.1 and 4
"""
function _pattern_free_string_prs(
    rng::Random.AbstractRNG,
    pattern::String,
    alphabet::Vector{String},
    size::Int
)::String
    sampler = if has_common_prefix_suffix(pattern)
        _pattern_free_string_prs_general
    else
        _pattern_free_string_prs_extremal
    end
    return sampler(rng, pattern, alphabet, size)
end
118119
119120# # Extremal PRS
120121
# Extremal PRS: draw `size` symbols from `alphabet`, then repeatedly redraw the symbols
# covered by the non-overlapping occurrences of `pattern` until none is left.
# Returns the joined string.
function _pattern_free_string_prs_extremal(rng, pattern, alphabet, size)
    letters = rand(rng, alphabet, size)
    candidate = join(letters)
    hits = findall(pattern, candidate; overlap=false)
    while !isempty(hits)
        for hit in hits
            letters[hit] .= rand(rng, alphabet, length(hit))
        end
        candidate = join(letters)
        hits = findall(pattern, candidate; overlap=false)
    end
    return candidate
end
132133
133134# # General PRS
134135
# General PRS, used by `_pattern_free_string_prs` when `has_common_prefix_suffix(pattern)`.
#
# `str_vec` holds the current sample, one alphabet symbol per entry. `tmp_vec` is a
# scratch copy in which `""` marks an entry not yet revealed in the current round.
# Each round:
#   1. find the occurrences of `pattern` (`findall_overlap` is defined elsewhere;
#      presumably it returns the index ranges of all, possibly overlapping, matches),
#   2. reveal those ranges in `tmp_vec`,
#   3. grow each range with `_check_extension!` until it no longer changes,
#   4. redraw the symbols of `str_vec` on the resulting ranges.
# The joined string is returned as soon as no occurrence is found.
function _pattern_free_string_prs_general(rng, pattern, alphabet, size)
    str_vec = Vector{String}(undef, size)
    rand!(rng, str_vec, alphabet)
    tmp_vec = fill("", size)
    while true
        str = join(str_vec)
        bad = findall_overlap(pattern, str)
        isempty(bad) && return str
        # Start every round with nothing revealed. Without this reset, entries
        # revealed in an earlier round (and since redrawn in `str_vec`) would stay
        # in `tmp_vec` as stale "revealed" values and distort the extension checks.
        fill!(tmp_vec, "")
        for range_ in bad
            @inbounds tmp_vec[range_] .= str_vec[range_]
        end
        res = empty(bad)
        while !isempty(bad)
            B = popfirst!(bad)
            B̄ = _check_extension!(tmp_vec, str_vec, pattern, B)
            if isequal(B, B̄)
                # Fixed point reached: this range is final for the round.
                push!(res, B)
            else
                # The range grew: queue it again to look for further extensions.
                push!(bad, B̄)
            end
        end
        for range_ in res
            @inbounds str_vec[range_] .= rand(rng, alphabet, length(range_))
        end
    end
end
146162
147- # """
148- # generate_sample!(
149- # rng::Random.AbstractRNG,
150- # string_vec::Vector{T},
151- # indices,
152- # alphabet::Vector{T}
153- # ) where {T<:AbstractString}
154-
155- # Generate a character uniformly at random from `alphabet` at positions prescribed by `indices` in `string_vec`.
156- # """
157- # function generate_sample!(
158- # rng::Random.AbstractRNG,
159- # string_vec::Vector{T},
160- # indices,
161- # alphabet::Vector{T}
162- # ) where {T<:AbstractString}
163- # for i in indices
164- # string_vec[i] = rand(rng, alphabet)
165- # end
166- # end
167-
168- # """
169- # find_bad_ranges(
170- # pattern::T,
171- # string::T
172- # )::Vector{UnitRange} where {T<:AbstractString}
173-
174- # Identify where `pattern` occur in `string` and return the corresponding ranges of indices.
175- # """
176- # function find_bad_ranges(
177- # pattern::T,
178- # string::T
179- # )::Vector{UnitRange} where {T<:AbstractString}
180-
181- # bad_ranges = UnitRange{Int64}[]
182-
183- # matches = eachmatch(Regex(pattern), string, overlap=true)
184- # isempty(matches) && return bad_ranges
185-
186- # p = length(pattern)
187- # m, _ = iterate(matches)
188-
189- # x1, y1 = m.offset, m.offset + p - 1
190- # for m in Iterators.drop(matches, 1)
191- # x2, y2 = m.offset, m.offset + p - 1
192- # if x2 <= y1 + 1
193- # y1 = y2
194- # else
195- # push!(bad_ranges, x1:y1)
196- # x1, y1 = x2, y2
197- # end
198- # end
199- # push!(bad_ranges, x1:y1)
200-
201- # return bad_ranges
202- # end
203-
204- # """
205- # find_characters_to_resample(
206- # string_vec::Vector{T},
207- # pattern::T,
208- # pref_suff::Vector{U}
209- # )::Vector{U} where {T<:String, U<:Int}
210-
211- # Identify the set of events to be resampled as constructed by Algorithm 5 in [GuJeLi19](@cite) as part of the Partial Rejection Sampling (PRS) method.
212- # Return the indices of the variables involved in the corresponding events.
213-
214- # **See also**
215-
216- # - [`PRS.find_bad_ranges`](@ref)
217- # """
218- # function find_characters_to_resample(
219- # string_vec::Vector{T},
220- # pattern::T,
221- # pref_suff::Vector{U}
222- # )::Vector{U} where {T<:String, U<:Int}
223-
224- # # Extremal case
225- # isempty(pref_suff) && return vcat(findall(pattern, join(string_vec), overlap=false)...)
226-
227- # # General case
228- # bad_ranges = find_bad_ranges(pattern, join(string_vec))
229- # isempty(bad_ranges) && return vcat(bad_ranges...)
230-
231- # p, n = length(pattern), length(string_vec)
232- # tmp = fill("", n)
"""
    _check_extension!(
        tmp_vec::Vector{String},
        str_vec::Vector{String},
        pattern::String,
        window::UnitRange{U}
    )::UnitRange{U} where {U<:Int}

Assuming `tmp_vec` already holds the values of `str_vec` on `window`, check whether a reassignment of `""` elements to the left or to the right of `tmp_vec[window]` can make `pattern` occur.
If this is the case, the identified `""` elements of `tmp_vec` are set with the corresponding elements from `str_vec` and the *extended* window where `pattern` can arise is returned.
Otherwise the original `window` is returned.

Candidate windows have the length of `pattern` and overlap `window`; on each side the candidate farthest from `window` is tried first and the search on that side stops at the first success.
"""
function _check_extension!(
    tmp_vec::Vector{String},
    str_vec::Vector{String},
    pattern::String,
    window::UnitRange{U}
)::UnitRange{U} where {U<:Int}
    p = length(pattern)
    i, j = first(window), last(window)
    # left looking: candidate windows start at i₋ < i
    i₋ = max(firstindex(tmp_vec), i - p + 1)
    while i₋ < i
        window_ = i₋:min(i₋ + p - 1, j)
        range_ = _pattern_can_occur_if_reassignment_at(pattern, tmp_vec, window_)
        if !isempty(range_)
            @inbounds tmp_vec[range_] .= str_vec[range_]
            break
        end
        i₋ += 1
    end
    # right looking: candidate windows end at j₊ > j
    j₊ = min(lastindex(tmp_vec), j + p - 1)
    while j₊ > j
        # The range must run from its lower bound up to j₊; written the other way
        # round (`j₊:lower`) it is always empty and no extension is ever detected.
        window_ = max(j₊ - p + 1, i):j₊
        range_ = _pattern_can_occur_if_reassignment_at(pattern, tmp_vec, window_)
        if !isempty(range_)
            @inbounds tmp_vec[range_] .= str_vec[range_]
            break
        end
        j₊ -= 1
    end
    return i₋:j₊
end

"""
    _pattern_can_occur_if_reassignment_at(
        pattern::String,
        vec::Vector{String},
        window::UnitRange{U}
    )::UnitRange{U} where {U<:Int}

Find the range of indices of `window` where `""` elements of `vec[window]` can be modified to make the resulting `join(vec[window]) == pattern`.
The returned range goes from the first to the last `""` element of `vec[window]`; only the elements before the first and after the last `""` are compared with `pattern`.
An empty range is returned otherwise, in particular when `window` is empty, when its length differs from `length(pattern)`, or when `vec[window]` has no `""` element.
"""
function _pattern_can_occur_if_reassignment_at(
    pattern::String,
    vec::Vector{String},
    window::UnitRange{U}
)::UnitRange{U} where {U<:Int}
    empty_range = one(U):zero(U)
    (isempty(window) || length(window) != length(pattern)) && return empty_range
    i, j = first(window), last(window)
    f1 = findnext(isempty, vec, i)
    # No `""` inside the window means nothing can be reassigned. Bounding the search
    # by `j` also guarantees that the `findprev` below finds an index in `f1:j`
    # (it could otherwise return `nothing` and make `f2:j` throw).
    (isnothing(f1) || f1 > j) && return empty_range
    f2 = findprev(isempty, vec, j)
    # `vec[f1]` and `vec[f2]` are `""`, so the joins are the fixed prefix and suffix.
    if startswith(pattern, join(vec[i:f1])) && endswith(pattern, join(vec[f2:j]))
        return f1:f2
    end
    return empty_range
end
0 commit comments