Skip to content

Commit a8a7464

Browse files
authored
Add support for regular expressions in Tokenizers.PreTokenizer.split/3 (#54)
1 parent 39f87df commit a8a7464

File tree

3 files changed

+55
-7
lines changed

3 files changed

+55
-7
lines changed

lib/tokenizers/pre_tokenizer.ex

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,20 +134,44 @@ defmodule Tokenizers.PreTokenizer do
134134
| :contiguous
135135

136136
@doc """
137-
Creates a Split pre-tokenizer.
137+
Creates a Split pre-tokenizer using a string as split pattern.
138138
139139
Versatile pre-tokenizer that splits on provided pattern and according
140-
to provided behavior. The pattern can be inverted if necessary.
140+
to provided behavior.
141141
142142
## Options
143143
144144
* `:invert` - whether to invert the split or not. Defaults to `false`
145145
146146
"""
147147
@spec split(String.t(), split_delimiter_behaviour(), keyword()) :: t()
148-
defdelegate split(pattern, behavior, opts \\ []),
149-
to: Tokenizers.Native,
150-
as: :pre_tokenizers_split
148+
def split(pattern, behavior, opts \\ []) when is_binary(pattern) do
149+
Tokenizers.Native.pre_tokenizers_split({:string, pattern}, behavior, opts)
150+
end
151+
152+
@doc ~S"""
153+
Creates a Split pre-tokenizer using a regular expression as split pattern.
154+
155+
Versatile pre-tokenizer that splits on provided regex pattern and according
156+
to provided behavior.
157+
158+
The `pattern` should be a string representing a regular expression
159+
according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
160+
161+
## Options
162+
163+
* `:invert` - whether to invert the split or not. Defaults to `false`
164+
165+
## Example
166+
167+
iex> Tokenizers.PreTokenizer.split_regex(~S(\?\d{2}\?), :removed)
168+
#Tokenizers.PreTokenizer<[pre_tokenizer_type: "Split"]>
169+
170+
"""
171+
@spec split_regex(String.t(), split_delimiter_behaviour(), keyword()) :: t()
172+
def split_regex(pattern, behavior, opts \\ []) when is_binary(pattern) do
173+
Tokenizers.Native.pre_tokenizers_split({:regex, pattern}, behavior, opts)
174+
end
151175

152176
@doc """
153177
Creates a Punctuation pre-tokenizer.

native/ex_tokenizers/src/pre_tokenizers.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use crate::util::Info;
22
use crate::{new_info, ExTokenizersError};
33
use rustler::NifTaggedEnum;
44
use serde::{Deserialize, Serialize};
5+
use tokenizers::pre_tokenizers::split::SplitPattern;
56
use tokenizers::PreTokenizer;
67
use tokenizers::{processors::byte_level::ByteLevel, PreTokenizedString, PreTokenizerWrapper};
78

@@ -241,24 +242,35 @@ pub enum SplitOption {
241242
Invert(bool),
242243
}
243244

245+
#[derive(NifTaggedEnum)]
246+
pub enum LocalSplitPattern {
247+
String(String),
248+
Regex(String),
249+
}
250+
244251
#[rustler::nif]
245252
pub fn pre_tokenizers_split(
246-
pattern: String,
253+
pattern: LocalSplitPattern,
247254
behavior: SplitDelimiterBehavior,
248255
options: Vec<SplitOption>,
249256
) -> Result<ExTokenizersPreTokenizer, rustler::Error> {
250257
struct Opts {
251258
invert: bool,
252259
}
253260
let mut opts = Opts { invert: false };
261+
let final_pattern = match pattern {
262+
LocalSplitPattern::String(pattern) => SplitPattern::String(pattern),
263+
LocalSplitPattern::Regex(pattern) => SplitPattern::Regex(pattern),
264+
};
265+
254266
for option in options {
255267
match option {
256268
SplitOption::Invert(invert) => opts.invert = invert,
257269
}
258270
}
259271

260272
Ok(ExTokenizersPreTokenizer::new(
261-
tokenizers::pre_tokenizers::split::Split::new(pattern, behavior.into(), opts.invert)
273+
tokenizers::pre_tokenizers::split::Split::new(final_pattern, behavior.into(), opts.invert)
262274
.map_err(|_| rustler::Error::BadArg)?,
263275
))
264276
}

test/tokenizers/pre_tokenizer_test.exs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,18 @@ defmodule Tokenizers.PreTokenizerTest do
2424
end
2525
end
2626

27+
describe "Regex split pretokenizer" do
28+
test "accepts regular expressions" do
29+
assert %Tokenizers.PreTokenizer{} =
30+
Tokenizers.PreTokenizer.split_regex(".*", :removed)
31+
end
32+
33+
test "accepts options" do
34+
assert %Tokenizers.PreTokenizer{} =
35+
Tokenizers.PreTokenizer.split_regex(".*", :removed, invert: true)
36+
end
37+
end
38+
2739
describe "WhitespaceSplit pretokenizer" do
2840
test "accepts no parameters" do
2941
assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.whitespace_split()

0 commit comments

Comments (0)