From dd256591a05bc9b3ad7d7e6045aa861062625463 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 14 Apr 2021 16:20:27 -0400 Subject: [PATCH 1/3] convert BinaryTree to slightly more canonical Julia form With no node mutation, it is unclear why this is mutable (which costs some extra loads), though the recursion forces this to the heap regardless. But singleton struct Empty needs to be non-mutable now for correctness --- src/shootout/binary_trees.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/shootout/binary_trees.jl b/src/shootout/binary_trees.jl index 281341c4..59fdadf4 100644 --- a/src/shootout/binary_trees.jl +++ b/src/shootout/binary_trees.jl @@ -4,17 +4,17 @@ # # Ported from an OCaml version -abstract type BTree end -mutable struct Empty <: BTree -end +struct Empty end -mutable struct Node <: BTree - info - left::BTree - right::BTree +struct Node{T} + info::T + left::Union{Node{T}, Empty} + right::Union{Node{T}, Empty} end +const BTree{T} = Union{Node{T}, Empty} + function make(val, d) if d == 0 Node(val, Empty(), Empty()) From 94dba74e7e4af7c123f67188efdf5ed6805fea74 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 14 Apr 2021 16:21:43 -0400 Subject: [PATCH 2/3] use appropriate optimizations in regex_dna The multi-replace method is being added in https://github.com/JuliaLang/julia/pull/40484 --- src/shootout/regex_dna.jl | 69 ++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/src/shootout/regex_dna.jl b/src/shootout/regex_dna.jl index 8e519fde..887d6746 100644 --- a/src/shootout/regex_dna.jl +++ b/src/shootout/regex_dna.jl @@ -5,30 +5,30 @@ # Fix from David Campbell const variants = [ - "agggtaaa|tttaccct", - "[cgt]gggtaaa|tttaccc[acg]", - "a[act]ggtaaa|tttacc[agt]t", - "ag[act]gtaaa|tttac[agt]ct", - "agg[act]taaa|ttta[agt]cct", - "aggg[acg]aaa|ttt[cgt]ccct", - "agggt[cgt]aa|tt[acg]accct", - "agggta[cgt]a|t[acg]taccct", - "agggtaa[cgt]|[acg]ttaccct" + 
r"agggtaaa|tttaccct", + r"[cgt]gggtaaa|tttaccc[acg]", + r"a[act]ggtaaa|tttacc[agt]t", + r"ag[act]gtaaa|tttac[agt]ct", + r"agg[act]taaa|ttta[agt]cct", + r"aggg[acg]aaa|ttt[cgt]ccct", + r"agggt[cgt]aa|tt[acg]accct", + r"agggta[cgt]a|t[acg]taccct", + r"agggtaa[cgt]|[acg]ttaccct" ] -const subs = [ - (r"B", "(c|g|t)"), - (r"D", "(a|g|t)"), - (r"H", "(a|c|t)"), - (r"K", "(g|t)"), - (r"M", "(a|c)"), - (r"N", "(a|c|g|t)"), - (r"R", "(a|g)"), - (r"S", "(c|g)"), - (r"V", "(a|c|g)"), - (r"W", "(a|t)"), - (r"Y", "(c|t)") -] +const subs = ( + ("B" => "(c|g|t)"), + ("D" => "(a|g|t)"), + ("H" => "(a|c|t)"), + ("K" => "(g|t)"), + ("M" => "(a|c)"), + ("N" => "(a|c|g|t)"), + ("R" => "(a|g)"), + ("S" => "(c|g)"), + ("V" => "(a|c|g)"), + ("W" => "(a|t)"), + ("Y" => "(c|t)") +) function perf_regex_dna() infile = joinpath(SHOOTOUT_DATA_PATH, "regexdna-input.txt") @@ -38,20 +38,29 @@ function perf_regex_dna() seq = replace(seq, r">.*\n|\n" => "") l2 = length(seq) + kk = 0 for v in variants k = 0 - for m in eachmatch(Regex(v), seq) + for m in eachmatch(v, seq) k += 1 end -# @printf("%s %d\n", v, k) + kk += k end - for (u, v) in subs - seq = replace(seq, u => v) + try + # VERSION > 1.7-dev + seq = replace(seq, subs...) + catch ex + ex isa MethodError || rethrow() + # semi-optimized regex + r = Regex(join(first.(subs), "|")) + repl = Dict(subs) + seq = replace(seq, r => (r -> repl[r])) + ## multiple passes + #for sub in subs + # seq = replace(seq, sub) + #end end -# println() -# println(l1) -# println(l2) -# println(length(seq)) + seq, kk end From 9dee1b2cd3f9b3c5e85893414e6855a00c5b7f78 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 14 Apr 2021 16:25:55 -0400 Subject: [PATCH 3/3] improve quality of perf_parse_json implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocating strings with json character-by-character with ""+c+c+… is bad style, and this was not handling \u correctly either. 
We could make strcat a separate micro-benchmark, but it is not a particularly interesting test case currently. --- src/problem/JSONParse.jl | 59 ++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/src/problem/JSONParse.jl b/src/problem/JSONParse.jl index 574ef139..2b803c0e 100644 --- a/src/problem/JSONParse.jl +++ b/src/problem/JSONParse.jl @@ -75,36 +75,53 @@ function perf_parse_json(strng::AbstractString) if strng[pos] != '"' error("AbstractString starting with quotation expected at position $pos") else - pos = pos + 1 + pos += 1 end - str = "" + str = IOBuffer() while pos <= len nc = strng[pos] if nc == '"' + pos += 1 + return String(take!(str)) + elseif nc == '\\' pos = pos + 1 - return string(str) - elseif nc == '\\' - if pos+1 > len - error_pos("End of file reached right after escape character") - end - pos = pos + 1 + pos > len && break # goto error handling anc = strng[pos] - if anc == '"' || anc == '\\' || anc == '/' - str = string(str, strng[pos]) - pos = pos + 1 - elseif anc == 'b' || anc == 'f'|| anc == 'n' || anc == 'r' || anc == 't' - str = string(str, '\\', string[pos]) - pos = pos + 1 + if anc == '"' + write(str, "\"") + pos += 1 + elseif anc == '\\' + write(str, "\\") + pos += 1 + elseif anc == '/' + write(str, "/") + pos += 1 + elseif anc == 'b' + write(str, "\b") + pos += 1 + elseif anc == 'f' + write(str, "\f") + pos += 1 + elseif anc == 'n' + write(str, "\n") + pos += 1 + elseif anc == 'r' + write(str, "\r") + pos += 1 + elseif anc == 't' + write(str, "\t") + pos += 1 elseif anc == 'u' - if pos+4 > len - error_pos("End of file reached in escaped unicode character") - end - str = string(str, strng[pos-1:pos+4]) + pos + 4 > len && break # goto error handling + write(str, Char(parse(Int, strng[pos+1:pos+4], base=16))) # skip the 'u'; parse only the 4 hex digits pos = pos + 5 + else # should rarely happen + write(str, anc) + pos = pos + 1 end - else # should never happen - str = string(str,strng[pos]) - pos = pos + 1 + else # common case + 
write(str, nc) + pos = nextind(strng, pos) + end end error("End of file while expecting end of string")