Skip to content

Commit 7ab69cd

Browse files
author
Yves Orton
committed
Initial attempt at embeddable regex objects
This patch, which works on 1.18.x but does not work on 1.19, is an attempt to implement the String.Chars protocol for Regex, along with the Regex.to_string(), Regex.modifiers(), Regex.to_string!() and Regex.modifiers!() functions. The idea is to make it possible to safely embed precompiled regexes into other regexes, similar to what Perl supports. The general idea is that ~r/foo/x turns into "(?x-ims:foo\n)", and so on. Thus a regex should match the same as it would have in its original form even when it is embedded into a pattern which has a different set of modifiers. For review by Jose.
1 parent 33b6e6d commit 7ab69cd

File tree

4 files changed

+212
-14
lines changed

4 files changed

+212
-14
lines changed

lib/elixir/lib/inspect.ex

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ end
489489

490490
defimpl Inspect, for: Regex do
491491
def inspect(regex = %{opts: regex_opts}, opts) when is_list(regex_opts) do
492-
case translate_options(regex_opts, []) do
492+
case Regex.modifiers(regex) do
493493
:error ->
494494
concat([
495495
"Regex.compile!(",
@@ -499,7 +499,7 @@ defimpl Inspect, for: Regex do
499499
")"
500500
])
501501

502-
translated_opts ->
502+
{:ok, translated_opts} ->
503503
{escaped, _} =
504504
regex.source
505505
|> normalize(<<>>)
@@ -510,18 +510,6 @@ defimpl Inspect, for: Regex do
510510
end
511511
end
512512

513-
defp translate_options([:dotall, {:newline, :anycrlf} | t], acc),
514-
do: translate_options(t, [?s | acc])
515-
516-
defp translate_options([:unicode, :ucp | t], acc), do: translate_options(t, [?u | acc])
517-
defp translate_options([:caseless | t], acc), do: translate_options(t, [?i | acc])
518-
defp translate_options([:extended | t], acc), do: translate_options(t, [?x | acc])
519-
defp translate_options([:firstline | t], acc), do: translate_options(t, [?f | acc])
520-
defp translate_options([:ungreedy | t], acc), do: translate_options(t, [?U | acc])
521-
defp translate_options([:multiline | t], acc), do: translate_options(t, [?m | acc])
522-
defp translate_options([], acc), do: acc
523-
defp translate_options(_t, _acc), do: :error
524-
525513
defp normalize(<<?\\, ?\\, rest::binary>>, acc), do: normalize(rest, <<acc::binary, ?\\, ?\\>>)
526514
defp normalize(<<?\\, ?/, rest::binary>>, acc), do: normalize(rest, <<acc::binary, ?/>>)
527515
defp normalize(<<?\\, ?#, ?{, rest::binary>>, acc), do: normalize(rest, <<acc::binary, ?#, ?{>>)

lib/elixir/lib/regex.ex

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# SPDX-FileCopyrightText: 2012 Plataformatec
44

55
defmodule Regex do
6+
import Kernel, except: [to_string: 1]
7+
68
@moduledoc ~S"""
79
Provides regular expressions for Elixir.
810
@@ -172,6 +174,29 @@ defmodule Regex do
172174
defexception message: "regex could not be compiled"
173175
end
174176

177+
defmodule ModifierError do
  @moduledoc """
  An exception raised by `Regex.to_string!/1` and `Regex.modifiers!/2` when a
  regular expression was compiled with a modifier or option that cannot be
  represented in an embeddable expression.

  The `:regex` field holds the offending regex. The `:modifiers` field holds
  the modifier letters that could not be embedded, or `nil` when the regex
  was compiled with options that have no modifier letter at all.

  See `Regex.modifiers/2` for more details on embeddable modifiers.
  """

  defexception [:regex, :modifiers]

  # No modifier letters were recorded: the options themselves were not recognised.
  def message(%{regex: regex, modifiers: nil}) do
    "regex #{inspect(regex)} compiled with unknown options"
  end

  # One or more modifier letters were rejected; pluralise "modifier" accordingly.
  def message(%{regex: regex, modifiers: modifiers}) do
    ess =
      case String.length(modifiers) do
        count when count > 1 -> "s"
        _ -> ""
      end

    "regex #{inspect(regex)} compiled with modifier#{ess}" <>
      " #{inspect(modifiers)} which cannot be embedded"
  end
end
199+
175200
@doc """
176201
Compiles the regular expression.
177202
@@ -393,6 +418,71 @@ defmodule Regex do
393418
source
394419
end
395420

421+
@embeddable_modifiers [?i, ?m, ?s, ?x]

@doc """
Returns the regex source in an embeddable form, wrapped in a
`(?flags-flags:...)` construct.

The result is one of:

  * `{:ok, binary}` - the embeddable pattern. Modifiers the regex was
    compiled with are listed before the `-`, and the remaining embeddable
    modifiers (`i`, `m`, `s`, `x`) are listed after it.
  * `{:error, letters}` - the regex was compiled with modifiers that
    cannot be embedded; `letters` holds the offending modifier letters.
  * `:error` - the regex was compiled with options that are not recognised.

See `Regex.modifiers/2` for more details on embeddable modifiers.

## Examples

    iex> Regex.to_string(~r/foo/ix)
    {:ok, "(?ix-ms:foo\\n)"}
    iex> Regex.to_string(~r/foo/u)
    {:error, "u"}

"""
@spec to_string(t) :: :error | {:ok, String.t()} | {:error, String.t()}
def to_string(%Regex{source: source, opts: opts}) do
  # Extended patterns get a newline before the closing paren, presumably so
  # that a trailing `# comment` in the source cannot swallow the paren.
  terminator = if :extended in opts, do: "\n", else: ""

  case _modifiers(opts, true) do
    {:ok, enabled} ->
      switched_off =
        case @embeddable_modifiers -- enabled do
          [] -> ""
          remaining -> "-" <> to_sorted_string(remaining)
        end

      flags = to_sorted_string(enabled) <> switched_off

      {:ok, "(?#{flags}:#{source}#{terminator})"}

    {:error, rejected} ->
      {:error, to_sorted_string(rejected)}

    :error ->
      :error
  end
end
460+
461+
@doc """
Returns the regex source as a binary in an embeddable form, wrapped in a
`(?flags-flags:...)` construct.

Raises `Regex.ModifierError` if the pattern was compiled with an option
that cannot be represented in such a construct.

See `Regex.modifiers/2` for more details on embeddable modifiers.

## Examples

    iex> Regex.to_string!(~r/foo/xism)
    "(?imsx:foo\\n)"
    iex> Regex.to_string!(~r/foo/uf)
    ** (Regex.ModifierError) regex ~r/foo/fu compiled with modifiers "fu" which cannot be embedded

"""
@spec to_string!(t) :: String.t()
def to_string!(%Regex{} = regex) do
  case to_string(regex) do
    {:ok, embeddable} ->
      embeddable

    {:error, rejected} ->
      raise Regex.ModifierError, regex: regex, modifiers: rejected

    :error ->
      raise Regex.ModifierError, regex: regex
  end
end
485+
396486
@doc """
397487
Returns the regex options.
398488
@@ -843,8 +933,107 @@ defmodule Regex do
843933
[binary_part(original, 0, length), ?\\, char | escape(rest, 0, rest)]
844934
end
845935

936+
@doc """
Returns a binary containing the modifier letters that the regex was
compiled with.

If `embed_only` is `true`, a `Regex.ModifierError` is raised when the regex
was compiled with a modifier which cannot be embedded in a pattern. The
same exception is raised when the regex was compiled with options that are
not recognised.

See `Regex.modifiers/2` for more details.

## Examples

    iex> Regex.modifiers!(~r/foo/x)
    "x"
    iex> Regex.modifiers!(~r/foo/u, true)
    ** (Regex.ModifierError) regex ~r/foo/u compiled with modifier "u" which cannot be embedded
"""
@spec modifiers!(t, boolean) :: String.t()
def modifiers!(%Regex{} = regex, embed_only \\ false) do
  case modifiers(regex, embed_only) do
    {:ok, letters} ->
      letters

    {:error, rejected} ->
      raise Regex.ModifierError, regex: regex, modifiers: rejected

    :error ->
      raise Regex.ModifierError, regex: regex
  end
end
959+
960+
@doc """
Returns an `{:ok, modifiers}` tuple containing the regex options as a
binary of sorted modifier letters.

If `embed_only` is `true`, `{:ok, modifiers}` is only returned when all
the options the regex was compiled with are found in the following list:

  * (m) :multiline
  * (s) :dotall
  * (i) :caseless
  * (x) :extended

If any option is in the following list:

  * (u) :unicode
  * (f) :firstline
  * (U) :ungreedy

then an `{:error, modifiers}` tuple containing the offending modifier
letters is returned instead.

Returns `:error` if an option that is not listed here is found in the
`%Regex{}` struct.

## Examples

    iex> Regex.modifiers(~r/foo/x)
    {:ok, "x"}
    iex> Regex.modifiers(~r/foo/u)
    {:ok, "u"}
    iex> Regex.modifiers(~r/foo/xsim)
    {:ok, "imsx"}
    iex> Regex.modifiers(~r/foo/u, true)
    {:error, "u"}
"""
# The spec lists every shape the function returns; the tagged tuples and the
# bare `:error` are what `modifiers!/2` matches on.
@spec modifiers(t, boolean()) :: {:ok, String.t()} | {:error, String.t()} | :error
def modifiers(%Regex{opts: opts}, embed_only \\ false) do
  case _modifiers(opts, embed_only) do
    {:ok, list} -> {:ok, to_sorted_string(list)}
    {:error, list} -> {:error, to_sorted_string(list)}
    :error -> :error
  end
end
1003+
8461004
# Helpers
8471005

1006+
# Sorts a list of modifier code points and renders it as a binary.
defp to_sorted_string(list) do
  list
  |> Enum.sort()
  |> List.to_string()
end

# Translates a compiled option list into modifier code points.
#
# Returns `{:ok, kept}` when nothing was rejected, `{:error, rejected}` when
# `embed_only` is true and non-embeddable modifiers were found, and `:error`
# when the option list contains something that is not recognised.
defp _modifiers(list, embed_only), do: _modifiers(list, [], [], embed_only)

# End of the option list: succeed only if nothing was rejected.
defp _modifiers([], kept, [], _embed_only), do: {:ok, kept}
defp _modifiers([], _kept, rejected, _embed_only), do: {:error, rejected}

# Modifiers that are always embeddable.
defp _modifiers([:dotall, {:newline, :anycrlf} | rest], kept, rejected, embed_only) do
  _modifiers(rest, [?s | kept], rejected, embed_only)
end

defp _modifiers([:caseless | rest], kept, rejected, embed_only) do
  _modifiers(rest, [?i | kept], rejected, embed_only)
end

defp _modifiers([:extended | rest], kept, rejected, embed_only) do
  _modifiers(rest, [?x | kept], rejected, embed_only)
end

defp _modifiers([:multiline | rest], kept, rejected, embed_only) do
  _modifiers(rest, [?m | kept], rejected, embed_only)
end

# Modifiers that cannot be embedded: kept when `embed_only` is false and
# rejected when it is true. Any non-boolean flag falls through to the
# catch-all clause below.
defp _modifiers([:unicode, :ucp | rest], kept, rejected, embed_only)
     when is_boolean(embed_only) do
  if embed_only,
    do: _modifiers(rest, kept, [?u | rejected], true),
    else: _modifiers(rest, [?u | kept], rejected, false)
end

defp _modifiers([:firstline | rest], kept, rejected, embed_only)
     when is_boolean(embed_only) do
  if embed_only,
    do: _modifiers(rest, kept, [?f | rejected], true),
    else: _modifiers(rest, [?f | kept], rejected, false)
end

defp _modifiers([:ungreedy | rest], kept, rejected, embed_only)
     when is_boolean(embed_only) do
  if embed_only,
    do: _modifiers(rest, kept, [?U | rejected], true),
    else: _modifiers(rest, [?U | kept], rejected, false)
end

# Anything else is an option with no known modifier letter.
defp _modifiers(_other, _kept, _rejected, _embed_only), do: :error
1036+
8481037
defp translate_options(<<?s, t::binary>>, acc),
8491038
do: translate_options(t, [:dotall, {:newline, :anycrlf} | acc])
8501039

lib/elixir/lib/string/chars.ex

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,9 @@ defimpl String.Chars, for: Float do
6464
Float.to_string(term)
6565
end
6666
end
67+
68+
defimpl String.Chars, for: Regex do
  # Delegates to `Regex.to_string!/1`, so converting a regex to a string
  # (for example through interpolation) yields its embeddable form and
  # raises when the regex cannot be embedded.
  def to_string(term), do: Regex.to_string!(term)
end

lib/elixir/test/elixir/string/chars_test.exs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,21 @@ defmodule String.Chars.URITest do
100100
end
101101
end
102102

103+
defmodule String.Chars.RegexTest do
  use ExUnit.Case, async: true

  test "regex" do
    re1 = ~r/foo|bar/
    assert "#{re1}" == "(?-imsx:foo|bar)"

    # Compare the source rather than the whole struct: compiled regexes are
    # not guaranteed to be equal even when built from the same pattern.
    re2 = ~r/(#{re1})/
    assert re2.source == "((?-imsx:foo|bar))"

    re3 = ~r/thing # comment/x
    assert "#{re3}" == "(?x-ims:thing # comment\n)"

    re4 = ~r/#{re3}#{re1}ee/i
    assert "#{re4}" == "(?i-msx:(?x-ims:thing # comment\n)(?-imsx:foo|bar)ee)"
  end

  test "regex with modifiers that cannot be embedded" do
    assert_raise Regex.ModifierError, fn -> to_string(~r/foo/u) end
  end
end
117+
103118
defmodule String.Chars.ErrorsTest do
104119
use ExUnit.Case, async: true
105120

0 commit comments

Comments
 (0)