Add support for regular expressions in Tokenizers.Normalizer.replace/2 (#56)

mruoss · web-flow · commit 0c8f4b72cd83 · 2024-04-22T09:18:24.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+### Added
+
+- Support for regular expressions in the replace normalizer. See
+  `Tokenizers.Normalizer.replace_regex/2`.
+- Support for regular expressions in the split pre-tokenizer. See
+  `Tokenizers.PreTokenizer.split_regex/3`.
+
 ## [v0.4.0] - 2023-08-09
 
 ### Added
diff --git a/lib/tokenizers/normalizer.ex b/lib/tokenizers/normalizer.ex
@@ -117,12 +117,23 @@ defmodule Tokenizers.Normalizer do
   defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase
 
   @doc """
-  Replaces a custom string or regexp and changes it with given content.
+  Replaces the plain (non-regexp) string `search` with the given `content`; for regexps see `replace_regex/2`.
   """
   @spec replace(String.t(), String.t()) :: t()
-  defdelegate replace(pattern, content),
-    to: Tokenizers.Native,
-    as: :normalizers_replace
+  def replace(search, content) do
+    Tokenizers.Native.normalizers_replace({:string, search}, content) # :string tag marks a non-regex pattern
+  end
+
+  @doc """
+  Replaces occurrences of a custom regexp `pattern` with the given `content`.
+
+  The `pattern` must be given as a string containing a regular expression
+  according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
+  """
+  @spec replace_regex(String.t(), String.t()) :: t()
+  def replace_regex(pattern, content) do
+    Tokenizers.Native.normalizers_replace({:regex, pattern}, content) # :regex tag marks the pattern as a regex
+  end
 
   @doc """
   Creates a Nmt normalizer.
diff --git a/native/ex_tokenizers/src/normalizers.rs b/native/ex_tokenizers/src/normalizers.rs
@@ -1,7 +1,9 @@
 use crate::{new_info, util::Info, ExTokenizersError};
 use rustler::NifTaggedEnum;
 use serde::{Deserialize, Serialize};
-use tokenizers::{NormalizedString, Normalizer, NormalizerWrapper};
+use tokenizers::{
+    normalizers::replace::ReplacePattern, NormalizedString, Normalizer, NormalizerWrapper,
+};
 
 pub struct ExTokenizersNormalizerRef(pub NormalizerWrapper);
 
@@ -241,13 +243,24 @@ pub fn normalizers_lowercase() -> ExTokenizersNormalizer {
     ExTokenizersNormalizer::new(tokenizers::normalizers::utils::Lowercase)
 }
 
+#[derive(NifTaggedEnum)]
+pub enum LocalReplacePattern { // NIF argument type; mapped onto tokenizers' `ReplacePattern` in `normalizers_replace`
+    String(String), // built on the Elixir side as `{:string, search}`
+    Regex(String), // built on the Elixir side as `{:regex, pattern}`
+}
+
 #[rustler::nif]
 pub fn normalizers_replace(
-    pattern: String,
+    pattern: LocalReplacePattern,
     content: String,
 ) -> Result<ExTokenizersNormalizer, rustler::Error> {
+    let final_pattern = match pattern { // re-wrap variant-for-variant as the tokenizers crate's own pattern enum
+        LocalReplacePattern::String(pattern) => ReplacePattern::String(pattern),
+        LocalReplacePattern::Regex(pattern) => ReplacePattern::Regex(pattern),
+    };
+
     Ok(ExTokenizersNormalizer::new(
-        tokenizers::normalizers::replace::Replace::new(pattern, content)
+        tokenizers::normalizers::replace::Replace::new(final_pattern, content) // any error here (presumably an invalid regex) becomes BadArg; the cause is discarded
             .map_err(|_| rustler::Error::BadArg)?,
     ))
 }
diff --git a/test/tokenizers/normalizer_test.exs b/test/tokenizers/normalizer_test.exs
@@ -90,4 +90,28 @@ defmodule Tokenizers.NormalizerTest do
                {:ok, "▁Hello"}
     end
   end
+
+  describe "Replace" do # plain-string form: Tokenizers.Normalizer.replace/2
+    test "can be initialized" do
+      assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace("find", "replace")
+    end
+
+    test "can normalize strings" do
+      assert Tokenizers.Normalizer.replace("Hello", "World")
+             |> Tokenizers.Normalizer.normalize("Hello") ==
+               {:ok, "World"}
+    end
+  end
+
+  describe "Replace Regex" do # regex form: Tokenizers.Normalizer.replace_regex/2
+    test "can be initialized" do
+      assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace_regex("\\d*", "") # "\\d*" is the regex \d*
+    end
+
+    test "can normalize strings" do
+      assert Tokenizers.Normalizer.replace_regex("\\d*", "") # digit runs are replaced with the empty string
+             |> Tokenizers.Normalizer.normalize("1Hel2lo3") ==
+               {:ok, "Hello"}
+    end
+  end
 end