Skip to content

Commit 729e0d8

Browse files
Add RegEx support using RE2 to sjsonnet (#244)
With this PR, I'm adding a handful of methods to expose regular expressions in jsonnet through std.native(). I'm modeling them after [this open PR from jsonnet](google/jsonnet#1039), which was ported to jrsonnet. For now, they live in std.native() because they are not part of the default std package and they use RE2 instead of the native regexp package (for performance reasons and for compatibility with a future go-jsonnet implementation).

- regexFullMatch(pattern, str) -- Full match regex
- regexPartialMatch(pattern, str) -- Partial match regex
- regexReplace(str, pattern, to) -- Replace a single occurrence using regex
- regexGlobalReplace(str, pattern, to) -- Replace globally using regex

and the utility function:

- regexQuoteMeta(str) -- Escape regex metacharacters

The match functions (regexFullMatch and regexPartialMatch) return an object:

```
std.native("regexFullMatch")("h(?P<mid>.*)o", "hello")
{ "captures": [ "ell" ], "string": "hello" }
```

This PR does not add support for the "namedCaptures" return field due to some complications with Scala.js and Scala Native. Both of those platforms use the JDK Pattern class API (Scala.js being powered by ECMAScript regex and Scala Native being powered by RE2(!)), but the JDK < 20 Pattern class does not have a straightforward way to list the names of groups without some additional hacks. This will be dealt with in a follow-up PR.

This PR also adds the ability to cache compiled patterns, and refactors all existing users of regexes to use the cache.
1 parent 1497955 commit 729e0d8

File tree

11 files changed

+187
-36
lines changed

11 files changed

+187
-36
lines changed

build.sbt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ lazy val main = (project in file("sjsonnet"))
2121
"org.scala-lang.modules" %% "scala-collection-compat" % "2.11.0",
2222
"org.tukaani" % "xz" % "1.8",
2323
"org.yaml" % "snakeyaml" % "1.33",
24+
"com.google.re2j" % "re2j" % "1.7",
2425
),
2526
libraryDependencies ++= Seq(
2627
"com.lihaoyi" %% "utest" % "0.8.2",

build.sc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ object sjsonnet extends Module {
108108
ivy"org.json:json:20240303",
109109
ivy"org.tukaani:xz::1.10",
110110
ivy"org.lz4:lz4-java::1.8.0",
111-
ivy"org.yaml:snakeyaml::1.33"
111+
ivy"org.yaml:snakeyaml::1.33",
112+
ivy"com.google.re2j:re2j:1.7",
112113
)
113114
def scalacOptions = Seq("-opt:l:inline", "-opt-inline-from:sjsonnet.**")
114115

sjsonnet/src-js/sjsonnet/Platform.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
package sjsonnet
2+
23
import java.io.File
4+
import java.util
5+
import java.util.regex.Pattern
6+
7+
38
object Platform {
49
def gzipBytes(s: Array[Byte]): String = {
510
throw new Exception("GZip not implemented in Scala.js")
@@ -34,4 +39,12 @@ object Platform {
3439
def hashFile(file: File): String = {
3540
throw new Exception("hashFile not implemented in Scala.js")
3641
}
42+
43+
private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern]
44+
45+
// scala.js does not rely on re2. Per https://www.scala-js.org/doc/regular-expressions.html.
46+
// Expect to see some differences in behavior.
47+
def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat))
48+
49+
def regexQuote(s: String): String = Pattern.quote(s)
3750
}

sjsonnet/src-jvm/sjsonnet/Platform.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
package sjsonnet
22

33
import java.io.{BufferedInputStream, ByteArrayOutputStream, File, FileInputStream}
4+
import java.util
45
import java.util.Base64
56
import java.util.zip.GZIPOutputStream
7+
import com.google.re2j.Pattern
68
import net.jpountz.xxhash.{StreamingXXHash64, XXHashFactory}
79
import org.json.{JSONArray, JSONObject}
810
import org.tukaani.xz.LZMA2Options
911
import org.tukaani.xz.XZOutputStream
1012
import org.yaml.snakeyaml.{LoaderOptions, Yaml}
1113
import org.yaml.snakeyaml.constructor.SafeConstructor
14+
1215
import scala.jdk.CollectionConverters._
1316

1417
object Platform {
@@ -107,4 +110,9 @@ object Platform {
107110

108111
hash.getValue.toString
109112
}
113+
114+
private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern]
115+
def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat))
116+
117+
def regexQuote(s: String): String = Pattern.quote(s)
110118
}

sjsonnet/src-native/sjsonnet/Platform.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package sjsonnet
22

33
import java.io.{ByteArrayOutputStream, File}
4+
import java.util
45
import java.util.Base64
56
import java.util.zip.GZIPOutputStream
7+
import java.util.regex.Pattern
68

79
object Platform {
810
def gzipBytes(b: Array[Byte]): String = {
@@ -50,4 +52,11 @@ object Platform {
5052
// File hashes in Scala Native are just the file content
5153
scala.io.Source.fromFile(file).mkString
5254
}
55+
56+
private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern]
57+
// scala native is powered by RE2, per https://scala-native.org/en/latest/lib/javalib.html#regular-expressions-java-util-regexp
58+
// It should perform similarly to the JVM implementation.
59+
def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat))
60+
61+
def regexQuote(s: String): String = Pattern.quote(s)
5362
}

sjsonnet/src/sjsonnet/Std.scala

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,10 @@ import java.io.StringWriter
44
import java.nio.charset.StandardCharsets.UTF_8
55
import java.util.Base64
66
import java.util
7-
import java.util.regex.Pattern
87
import sjsonnet.Expr.Member.Visibility
98

109
import scala.collection.Searching._
1110
import scala.collection.mutable
12-
import scala.util.matching.Regex
1311

1412
/**
1513
* The Jsonnet standard library, `std`, with each builtin function implemented
@@ -19,9 +17,9 @@ import scala.util.matching.Regex
1917
class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.empty) {
2018
private val dummyPos: Position = new Position(null, 0)
2119
private val emptyLazyArray = new Array[Lazy](0)
22-
private val leadingWhiteSpacePattern = Pattern.compile("^[ \t\n\f\r\u0085\u00A0']+")
23-
private val trailingWhiteSpacePattern = Pattern.compile("[ \t\n\f\r\u0085\u00A0']+$")
24-
private val oldNativeFunctions = Map(
20+
private val leadingWhiteSpacePattern = Platform.getPatternFromCache("^[ \t\n\f\r\u0085\u00A0']+")
21+
private val trailingWhiteSpacePattern = Platform.getPatternFromCache("[ \t\n\f\r\u0085\u00A0']+$")
22+
private val builtinNativeFunctions = Map(
2523
builtin("gzip", "v"){ (_, _, v: Val) =>
2624
v match{
2725
case Val.Str(_, value) => Platform.gzipString(value)
@@ -46,9 +44,9 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.
4644
case x => Error.fail("Cannot xz encode " + x.prettyName)
4745
}
4846
},
49-
)
50-
require(oldNativeFunctions.forall(k => !additionalNativeFunctions.contains(k._1)), "Conflicting native functions")
51-
private val nativeFunctions = oldNativeFunctions ++ additionalNativeFunctions
47+
) ++ StdRegex.functions
48+
require(builtinNativeFunctions.forall(k => !additionalNativeFunctions.contains(k._1)), "Conflicting native functions")
49+
private val nativeFunctions = builtinNativeFunctions ++ additionalNativeFunctions
5250

5351
private object AssertEqual extends Val.Builtin2("assertEqual", "a", "b") {
5452
def evalRhs(v1: Val, v2: Val, ev: EvalScope, pos: Position): Val = {
@@ -474,26 +472,25 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.
474472
Val.Str(pos, str.asString.replaceAll(from.asString, to.asString))
475473
override def specialize(args: Array[Expr]) = args match {
476474
case Array(str, from: Val.Str, to) =>
477-
try { (new SpecFrom(Pattern.compile(from.value)), Array(str, to)) } catch { case _: Exception => null }
475+
try { (new SpecFrom(from.value), Array(str, to)) } catch { case _: Exception => null }
478476
case _ => null
479477
}
480-
private class SpecFrom(from: Pattern) extends Val.Builtin2("strReplaceAll", "str", "to") {
478+
private class SpecFrom(from: String) extends Val.Builtin2("strReplaceAll", "str", "to") {
479+
private[this] val pattern = Platform.getPatternFromCache(from)
481480
def evalRhs(str: Val, to: Val, ev: EvalScope, pos: Position): Val =
482-
Val.Str(pos, from.matcher(str.asString).replaceAll(to.asString))
481+
Val.Str(pos, pattern.matcher(str.asString).replaceAll(to.asString))
483482
}
484483
}
485484

486485
private object StripUtils {
487-
private def getLeadingPattern(chars: String): Pattern =
488-
Pattern.compile("^[" + Regex.quote(chars) + "]+")
486+
private def getLeadingPattern(chars: String): String = "^[" + Platform.regexQuote(chars) + "]+"
489487

490-
private def getTrailingPattern(chars: String): Pattern =
491-
Pattern.compile("[" + Regex.quote(chars) + "]+$")
488+
private def getTrailingPattern(chars: String): String = "[" + Platform.regexQuote(chars) + "]+$"
492489

493490
def unspecializedStrip(str: String, chars: String, left: Boolean, right: Boolean): String = {
494491
var s = str
495-
if (right) s = getTrailingPattern(chars).matcher(s).replaceAll("")
496-
if (left) s = getLeadingPattern(chars).matcher(s).replaceAll("")
492+
if (right) s = Platform.getPatternFromCache(getTrailingPattern(chars)).matcher(s).replaceAll("")
493+
if (left) s = Platform.getPatternFromCache(getLeadingPattern(chars)).matcher(s).replaceAll("")
497494
s
498495
}
499496

@@ -503,8 +500,8 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.
503500
right: Boolean,
504501
functionName: String
505502
) extends Val.Builtin1(functionName, "str") {
506-
private[this] val leftPattern = getLeadingPattern(chars)
507-
private[this] val rightPattern = getTrailingPattern(chars)
503+
private[this] val leftPattern = Platform.getPatternFromCache(getLeadingPattern(chars))
504+
private[this] val rightPattern = Platform.getPatternFromCache(getTrailingPattern(chars))
508505

509506
def evalRhs(str: Val, ev: EvalScope, pos: Position): Val = {
510507
var s = str.asString
@@ -1522,7 +1519,7 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.
15221519
Error.fail("Native function " + name + " not found", pos)(ev)
15231520
}
15241521
},
1525-
) ++ oldNativeFunctions
1522+
) ++ builtinNativeFunctions
15261523

15271524
private def toSetArrOrString(args: Array[Val], idx: Int, pos: Position, ev: EvalScope) = {
15281525
args(idx) match {

sjsonnet/src/sjsonnet/StdRegex.scala

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package sjsonnet
2+
3+
import sjsonnet.Expr.Member.Visibility
4+
import sjsonnet.Val.Obj
5+
6+
object StdRegex {
7+
def functions: Map[String, Val.Builtin] = Map(
8+
"regexPartialMatch" -> new Val.Builtin2("regexPartialMatch", "pattern", "str") {
9+
override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = {
10+
val compiledPattern = Platform.getPatternFromCache(pattern.asString)
11+
val matcher = compiledPattern.matcher(str.asString)
12+
var returnStr: Val = null
13+
val captures = Array.newBuilder[Val]
14+
val groupCount = matcher.groupCount()
15+
while (matcher.find()) {
16+
if (returnStr == null) {
17+
val m = matcher.group(0)
18+
if (m != null) {
19+
returnStr = Val.Str(pos.noOffset, matcher.group(0))
20+
} else {
21+
returnStr = Val.Null(pos.noOffset)
22+
}
23+
}
24+
for (i <- 1 to groupCount) {
25+
val m = matcher.group(i)
26+
if (m == null) {
27+
captures += Val.Null(pos.noOffset)
28+
} else {
29+
captures += Val.Str(pos.noOffset, m)
30+
}
31+
}
32+
}
33+
val result = captures.result()
34+
Val.Obj.mk(pos.noOffset,
35+
"string" -> new Obj.ConstMember(true, Visibility.Normal,
36+
if (returnStr == null) Val.Null(pos.noOffset) else returnStr),
37+
"captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result))
38+
)
39+
}
40+
},
41+
"regexFullMatch" -> new Val.Builtin2("regexFullMatch", "pattern", "str") {
42+
override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = {
43+
val compiledPattern = Platform.getPatternFromCache(pattern.asString)
44+
val matcher = compiledPattern.matcher(str.asString)
45+
if (!matcher.matches()) {
46+
Val.Obj.mk(pos.noOffset,
47+
"string" -> new Obj.ConstMember(true, Visibility.Normal, Val.Null(pos.noOffset)),
48+
"captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, Array.empty[Lazy]))
49+
)
50+
} else {
51+
val captures = Array.newBuilder[Val]
52+
val groupCount = matcher.groupCount()
53+
for (i <- 0 to groupCount) {
54+
val m = matcher.group(i)
55+
if (m == null) {
56+
captures += Val.Null(pos.noOffset)
57+
} else {
58+
captures += Val.Str(pos.noOffset, m)
59+
}
60+
}
61+
val result = captures.result()
62+
Val.Obj.mk(pos.noOffset,
63+
"string" -> new Obj.ConstMember(true, Visibility.Normal, result.head),
64+
"captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result.drop(1)))
65+
)
66+
}
67+
}
68+
},
69+
"regexGlobalReplace" -> new Val.Builtin3("regexGlobalReplace", "str", "pattern", "to") {
70+
override def evalRhs(str: Val, pattern: Val, to: Val, ev: EvalScope, pos: Position): Val = {
71+
val compiledPattern = Platform.getPatternFromCache(pattern.asString)
72+
val matcher = compiledPattern.matcher(str.asString)
73+
Val.Str(pos.noOffset, matcher.replaceAll(to.asString))
74+
}
75+
},
76+
"regexReplace" -> new Val.Builtin3("regexReplace", "str", "pattern", "to") {
77+
override def evalRhs(str: Val, pattern: Val, to: Val, ev: EvalScope, pos: Position): Val = {
78+
val compiledPattern = Platform.getPatternFromCache(pattern.asString)
79+
val matcher = compiledPattern.matcher(str.asString)
80+
Val.Str(pos.noOffset, matcher.replaceFirst(to.asString))
81+
}
82+
},
83+
"regexQuoteMeta" -> new Val.Builtin1("regexQuoteMeta", "str") {
84+
override def evalRhs(str: Val, ev: EvalScope, pos: Position): Val = {
85+
Val.Str(pos.noOffset, Platform.regexQuote(str.asString))
86+
}
87+
}
88+
)
89+
}

sjsonnet/src/sjsonnet/TomlRenderer.scala

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package sjsonnet
33
import upickle.core.{ArrVisitor, CharBuilder, ObjVisitor, SimpleVisitor, Visitor}
44

55
import java.io.StringWriter
6-
import java.util.regex.Pattern
76

87

98
class TomlRenderer(out: StringWriter = new java.io.StringWriter(), cumulatedIndent: String, indent: String) extends SimpleVisitor[StringWriter, StringWriter]{
@@ -117,7 +116,7 @@ class TomlRenderer(out: StringWriter = new java.io.StringWriter(), cumulatedInde
117116
}
118117

119118
object TomlRenderer {
120-
private val bareAllowed = Pattern.compile("[A-Za-z0-9_-]+")
119+
private val bareAllowed = Platform.getPatternFromCache("[A-Za-z0-9_-]+")
121120
def escapeKey(key: String): String = if (bareAllowed.matcher(key).matches()) key else {
122121
val out = new StringWriter()
123122
BaseRenderer.escape(out, key, unicode = true)

sjsonnet/src/sjsonnet/YamlRenderer.scala

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
package sjsonnet
22

33
import java.io.StringWriter
4-
import java.util.regex.Pattern
54
import upickle.core.{ArrVisitor, ObjVisitor, SimpleVisitor, Visitor}
65

7-
import scala.util.Try
8-
9-
106

117
class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayInObject: Boolean = false,
128
quoteKeys: Boolean = true, indent: Int = 2) extends BaseCharRenderer(_out, indent){
@@ -52,7 +48,7 @@ class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayI
5248
elemBuilder.append('"')
5349
elemBuilder.append('"')
5450
} else if (s.charAt(len - 1) == '\n') {
55-
val splits = YamlRenderer.newlinePattern.split(s)
51+
val splits = YamlRenderer.newlinePattern.split(s.toString)
5652
elemBuilder.append('|')
5753
depth += 1
5854
splits.foreach { split =>
@@ -174,15 +170,15 @@ class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayI
174170
}
175171
}
176172
object YamlRenderer{
177-
val newlinePattern: Pattern = Pattern.compile("\n")
178-
private val safeYamlKeyPattern = Pattern.compile("^[a-zA-Z0-9/._-]+$")
173+
private[sjsonnet] val newlinePattern = Platform.getPatternFromCache("\n")
174+
private val safeYamlKeyPattern = Platform.getPatternFromCache("^[a-zA-Z0-9/._-]+$")
179175
private val yamlReserved = Set("true", "false", "null", "yes", "no", "on", "off", "y", "n", ".nan",
180176
"+.inf", "-.inf", ".inf", "null", "-", "---", "''")
181-
private val yamlTimestampPattern = Pattern.compile("^(?:[0-9]*-){2}[0-9]*$")
182-
private val yamlBinaryPattern = Pattern.compile("^[-+]?0b[0-1_]+$")
183-
private val yamlHexPattern = Pattern.compile("[-+]?0x[0-9a-fA-F_]+")
184-
private val yamlFloatPattern = Pattern.compile( "^-?([0-9_]*)*(\\.[0-9_]*)?(e[-+][0-9_]+)?$" )
185-
private val yamlIntPattern = Pattern.compile("^[-+]?[0-9_]+$")
177+
private val yamlTimestampPattern = Platform.getPatternFromCache("^(?:[0-9]*-){2}[0-9]*$")
178+
private val yamlBinaryPattern = Platform.getPatternFromCache("^[-+]?0b[0-1_]+$")
179+
private val yamlHexPattern = Platform.getPatternFromCache("[-+]?0x[0-9a-fA-F_]+")
180+
private val yamlFloatPattern = Platform.getPatternFromCache( "^-?([0-9_]*)*(\\.[0-9_]*)?(e[-+][0-9_]+)?$" )
181+
private val yamlIntPattern = Platform.getPatternFromCache("^[-+]?[0-9_]+$")
186182

187183
private def isSafeBareKey(k: String) = {
188184
val l = k.toLowerCase

sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class OldYamlRenderer(out: StringWriter = new java.io.StringWriter(), indentArra
2020
val len = s.length()
2121
if (len == 0) out.append("\"\"")
2222
else if (s.charAt(len - 1) == '\n') {
23-
val splits = YamlRenderer.newlinePattern.split(s)
23+
val splits = YamlRenderer.newlinePattern.split(s.toString)
2424
out.append('|')
2525
depth += 1
2626
splits.foreach { split =>

0 commit comments

Comments
 (0)