Skip to content

Commit 0c8f4b7

Browse files
authored
Add support for regular expressions in Tokenizers.Normalizer.replace/2 (#56)
1 parent f560e8f commit 0c8f4b7

File tree

4 files changed

+64
-7
lines changed

4 files changed

+64
-7
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## Unreleased
9+
10+
### Added
11+
12+
- Support for regular expressions in the replace normalizer. See
13+
`Tokenizers.Normalizer.replace_regex/2`.
14+
- Support for regular expressions in the split pre-tokenizer. See
15+
`Tokenizers.PreTokenizer.split_regex/3`.
16+
817
## [v0.4.0] - 2023-08-09
918

1019
### Added

lib/tokenizers/normalizer.ex

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,23 @@ defmodule Tokenizers.Normalizer do
117117
defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase
118118

119119
@doc """
120-
Replaces a custom string or regexp and changes it with given content.
120+
Replaces a custom `search` string with the given `content`.
121121
"""
122122
@spec replace(String.t(), String.t()) :: t()
123-
defdelegate replace(pattern, content),
124-
to: Tokenizers.Native,
125-
as: :normalizers_replace
123+
def replace(search, content) do
124+
Tokenizers.Native.normalizers_replace({:string, search}, content)
125+
end
126+
127+
@doc """
128+
Replaces occurrences of a custom regexp `pattern` with the given `content`.
129+
130+
The `pattern` should be a string representing a regular expression
131+
according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
132+
"""
133+
@spec replace_regex(String.t(), String.t()) :: t()
134+
def replace_regex(pattern, content) do
135+
Tokenizers.Native.normalizers_replace({:regex, pattern}, content)
136+
end
126137

127138
@doc """
128139
Creates a Nmt normalizer.

native/ex_tokenizers/src/normalizers.rs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
use crate::{new_info, util::Info, ExTokenizersError};
22
use rustler::NifTaggedEnum;
33
use serde::{Deserialize, Serialize};
4-
use tokenizers::{NormalizedString, Normalizer, NormalizerWrapper};
4+
use tokenizers::{
5+
normalizers::replace::ReplacePattern, NormalizedString, Normalizer, NormalizerWrapper,
6+
};
57

68
pub struct ExTokenizersNormalizerRef(pub NormalizerWrapper);
79

@@ -241,13 +243,24 @@ pub fn normalizers_lowercase() -> ExTokenizersNormalizer {
241243
ExTokenizersNormalizer::new(tokenizers::normalizers::utils::Lowercase)
242244
}
243245

246+
#[derive(NifTaggedEnum)]
247+
pub enum LocalReplacePattern {
248+
String(String),
249+
Regex(String),
250+
}
251+
244252
#[rustler::nif]
245253
pub fn normalizers_replace(
246-
pattern: String,
254+
pattern: LocalReplacePattern,
247255
content: String,
248256
) -> Result<ExTokenizersNormalizer, rustler::Error> {
257+
let final_pattern = match pattern {
258+
LocalReplacePattern::String(pattern) => ReplacePattern::String(pattern),
259+
LocalReplacePattern::Regex(pattern) => ReplacePattern::Regex(pattern),
260+
};
261+
249262
Ok(ExTokenizersNormalizer::new(
250-
tokenizers::normalizers::replace::Replace::new(pattern, content)
263+
tokenizers::normalizers::replace::Replace::new(final_pattern, content)
251264
.map_err(|_| rustler::Error::BadArg)?,
252265
))
253266
}

test/tokenizers/normalizer_test.exs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,28 @@ defmodule Tokenizers.NormalizerTest do
9090
{:ok, "▁Hello"}
9191
end
9292
end
93+
94+
describe "Replace" do
95+
test "can be initialized" do
96+
assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace("find", "replace")
97+
end
98+
99+
test "can normalize strings" do
100+
assert Tokenizers.Normalizer.replace("Hello", "World")
101+
|> Tokenizers.Normalizer.normalize("Hello") ==
102+
{:ok, "World"}
103+
end
104+
end
105+
106+
describe "Replace Regex" do
107+
test "can be initialized" do
108+
assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace_regex("\\d*", "")
109+
end
110+
111+
test "can normalize strings" do
112+
assert Tokenizers.Normalizer.replace_regex("\\d*", "")
113+
|> Tokenizers.Normalizer.normalize("1Hel2lo3") ==
114+
{:ok, "Hello"}
115+
end
116+
end
93117
end

0 commit comments

Comments
 (0)