From 9bce632ca959cec161ea7108f1d5b1854f8e3362 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Wed, 2 Apr 2025 23:51:57 +0200 Subject: [PATCH 1/6] regex.ex - add support for to_embed() to_embed(regex,strict) returns an embeddable representation of regex. For instance ~r/foo/i can be represented as ~r/(?i-msx:foo)/. If the option :strict is true (the default) then it will raise an ArgumentError if the regex was compiled with an option/modifier which cannot be represented as an embeddable pattern. If :strict is false then any unembeddable options will be silently ignored. This may be perfectly reasonable, for instance the wrapped pattern may be compiled with the same modifiers as the pattern, or reusing the pattern without the unembeddable modifiers may not change its semantics. --- lib/elixir/lib/regex.ex | 97 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/lib/elixir/lib/regex.ex b/lib/elixir/lib/regex.ex index 6e478d3d05c..7c3727bb678 100644 --- a/lib/elixir/lib/regex.ex +++ b/lib/elixir/lib/regex.ex @@ -412,6 +412,80 @@ defmodule Regex do opts end + @doc """ + Returns the pattern as an embeddable string. + + If the pattern was compiled with an option which cannot be represented + as an embeddable modifier in the current version of PCRE and strict is true + (the default) then an ArgumentError exception will be raised. + + When strict is false the pattern will be returned as though any offending + options had not be used and the function will not raise any exceptions. + + Embeddable modifiers/options are currently: + * 'i' - :caseless + * 'm' - :multiline + * 's' - :dotall, :newline, :anycrlf}) + * 'x' - :extended + + And unembeddable modifiers are + * 'f' - :firstline + * 'U' - :ungreedy + * 'u' - :unicode, :ucp + + Any other regex compilation option not listed here is considered unembeddable. 
+ + ## Examples + iex> Regex.to_embed(~r/foo/) + "(?-imsx:foo)" + + iex> Regex.to_embed(~r/^foo/m) + "(?m-isx:^foo)" + + iex> Regex.to_embed(~r/foo # comment/ix) + "(?ix-ms:foo # comment\\n)" + + iex> Regex.to_embed(~r/foo/iu) + ** (ArgumentError) regex compiled with options [:ucp, :unicode] which cannot be represented as an embedded pattern in this version of PCRE + + iex> Regex.to_embed(~r/foo/imsxu, strict: false) + "(?imsx:foo\\n)" + + """ + @spec to_embed(t, [term]) :: String.t() + def to_embed(%Regex{source: source, opts: regex_opts}, embed_opts \\ []) do + strict = Keyword.get(embed_opts, :strict, true) + + modifiers = + case embeddable_modifiers(regex_opts) do + {:ok, modifiers} -> + modifiers + + {:error, modifiers, untranslatable} -> + if strict do + raise ArgumentError, + "regex compiled with options #{inspect(untranslatable)} which cannot be " <> + "represented as an embedded pattern in this version of PCRE" + else + modifiers + end + end + + disabled = + Enum.reject([?i, ?m, ?s, ?x], &(&1 in modifiers)) + |> List.to_string() + + disabled = if disabled != "", do: "-#{disabled}", else: "" + + modifiers = + Enum.sort(modifiers) + |> List.to_string() + + nl = if Enum.member?(regex_opts, :extended), do: "\n", else: "" + + "(?#{modifiers}#{disabled}:#{source}#{nl})" + end + @doc """ Returns a list of names in the regex. 
@@ -845,6 +919,29 @@ defmodule Regex do # Helpers + # translate options to modifiers as required for embedding + defp embeddable_modifiers(list), do: embeddable_modifiers(list, [], []) + + defp embeddable_modifiers([:dotall, {:newline, :anycrlf} | t], acc, err), + do: embeddable_modifiers(t, [?s | acc], err) + + defp embeddable_modifiers([:caseless | t], acc, err), + do: embeddable_modifiers(t, [?i | acc], err) + + defp embeddable_modifiers([:extended | t], acc, err), + do: embeddable_modifiers(t, [?x | acc], err) + + defp embeddable_modifiers([:multiline | t], acc, err), + do: embeddable_modifiers(t, [?m | acc], err) + + defp embeddable_modifiers([option | t], acc, err), + do: embeddable_modifiers(t, acc, [option | err]) + + defp embeddable_modifiers([], acc, []), do: {:ok, acc} + defp embeddable_modifiers([], acc, err), do: {:error, acc, err} + + # translate modifiers to options + defp translate_options(<<?s, t::binary>>, acc), do: translate_options(t, [:dotall, {:newline, :anycrlf} | acc]) From 65d1215ddc0d27d1d98464a72a879ae88344260a Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Sat, 5 Apr 2025 11:09:50 +0200 Subject: [PATCH 2/6] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor fixups and simplifications. Co-authored-by: José Valim --- lib/elixir/lib/regex.ex | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/elixir/lib/regex.ex b/lib/elixir/lib/regex.ex index 7c3727bb678..008f5bba27b 100644 --- a/lib/elixir/lib/regex.ex +++ b/lib/elixir/lib/regex.ex @@ -423,15 +423,17 @@ defmodule Regex do options had not be used and the function will not raise any exceptions. 
Embeddable modifiers/options are currently: - * 'i' - :caseless - * 'm' - :multiline - * 's' - :dotall, :newline, :anycrlf}) - * 'x' - :extended - And unembeddable modifiers are - * 'f' - :firstline - * 'U' - :ungreedy - * 'u' - :unicode, :ucp + * 'i' - `:caseless` + * 'm' - `:multiline` + * 's' - `:dotall, {:newline, :anycrlf}` + * 'x' - `:extended` + + And unembeddable modifiers are: + + * 'f' - `:firstline` + * 'U' - `:ungreedy` + * 'u' - `:unicode, :ucp` Any other regex compilation option not listed here is considered unembeddable. @@ -452,6 +454,7 @@ defmodule Regex do "(?imsx:foo\\n)" """ + @doc since: "1.19.0" @spec to_embed(t, [term]) :: String.t() def to_embed(%Regex{source: source, opts: regex_opts}, embed_opts \\ []) do strict = Keyword.get(embed_opts, :strict, true) @@ -471,9 +474,7 @@ defmodule Regex do end end - disabled = - Enum.reject([?i, ?m, ?s, ?x], &(&1 in modifiers)) - |> List.to_string() + disabled = List.to_string([?i, ?m, ?s, ?x] -- modifiers) disabled = if disabled != "", do: "-#{disabled}", else: "" From 70ee357cb68089918aae8c1cecc6b39c0b5781cc Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Sat, 5 Apr 2025 11:20:32 +0200 Subject: [PATCH 3/6] embeddable regex - more doc tweaks. (squash me) * Sentences should not start with 'And'. * Rework sentence about unlisted regex compile options. * Consistent formatting for the 'strict' option. --- lib/elixir/lib/regex.ex | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/elixir/lib/regex.ex b/lib/elixir/lib/regex.ex index 008f5bba27b..dee3fff9867 100644 --- a/lib/elixir/lib/regex.ex +++ b/lib/elixir/lib/regex.ex @@ -419,8 +419,9 @@ defmodule Regex do as an embeddable modifier in the current version of PCRE and strict is true (the default) then an ArgumentError exception will be raised. - When strict is false the pattern will be returned as though any offending - options had not be used and the function will not raise any exceptions. 
+ When the `:strict` option is false the pattern will be returned as though + any offending options had not been used and the function will not raise any + exceptions. Embeddable modifiers/options are currently: @@ -429,13 +430,14 @@ defmodule Regex do * 's' - `:dotall, {:newline, :anycrlf}` * 'x' - `:extended` - And unembeddable modifiers are: + Unembeddable modifiers are: * 'f' - `:firstline` * 'U' - `:ungreedy` * 'u' - `:unicode, :ucp` - Any other regex compilation option not listed here is considered unembeddable. + Any other regex compilation option not listed here is considered unembeddable + and will raise an exception unless the `:strict` option is false. ## Examples iex> Regex.to_embed(~r/foo/) From 622bff3dcb504ef6927d8a6b34c67b5650da1260 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Sat, 5 Apr 2025 11:24:06 +0200 Subject: [PATCH 4/6] simplify code - don't use to_string() where it is not needed also add comment about why we sort the modifiers --- lib/elixir/lib/regex.ex | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/elixir/lib/regex.ex b/lib/elixir/lib/regex.ex index dee3fff9867..446729721a8 100644 --- a/lib/elixir/lib/regex.ex +++ b/lib/elixir/lib/regex.ex @@ -476,13 +476,12 @@ defmodule Regex do end end - disabled = List.to_string([?i, ?m, ?s, ?x] -- modifiers) + disabled = [?i, ?m, ?s, ?x] -- modifiers - disabled = if disabled != "", do: "-#{disabled}", else: "" + disabled = if disabled != [], do: "-#{disabled}", else: "" - modifiers = - Enum.sort(modifiers) - |> List.to_string() + # Future proof option ordering consistency by sorting + modifiers = Enum.sort(modifiers) nl = if Enum.member?(regex_opts, :extended), do: "\n", else: "" From 6351efde95a4f6e8098782bb8919c7a4da1508da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Sat, 5 Apr 2025 13:15:35 +0200 Subject: [PATCH 5/6] Update lib/elixir/lib/regex.ex --- lib/elixir/lib/regex.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lib/elixir/lib/regex.ex b/lib/elixir/lib/regex.ex index 446729721a8..c3884b1118f 100644 --- a/lib/elixir/lib/regex.ex +++ b/lib/elixir/lib/regex.ex @@ -457,7 +457,7 @@ defmodule Regex do """ @doc since: "1.19.0" - @spec to_embed(t, [term]) :: String.t() + @spec to_embed(t, [strict: boolean()]) :: String.t() def to_embed(%Regex{source: source, opts: regex_opts}, embed_opts \\ []) do strict = Keyword.get(embed_opts, :strict, true) From 207516c96811a2a9fca6ee975da5402b6731f91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Sat, 5 Apr 2025 13:27:58 +0200 Subject: [PATCH 6/6] Update lib/elixir/lib/regex.ex --- lib/elixir/lib/regex.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/elixir/lib/regex.ex b/lib/elixir/lib/regex.ex index c3884b1118f..429247a311f 100644 --- a/lib/elixir/lib/regex.ex +++ b/lib/elixir/lib/regex.ex @@ -457,7 +457,7 @@ defmodule Regex do """ @doc since: "1.19.0" - @spec to_embed(t, [strict: boolean()]) :: String.t() + @spec to_embed(t, strict: boolean()) :: String.t() def to_embed(%Regex{source: source, opts: regex_opts}, embed_opts \\ []) do strict = Keyword.get(embed_opts, :strict, true)