Skip to content

Commit 97b3220

Browse files
authored
feat: under support for regex (#809)
1 parent 4ca30b9 commit 97b3220

File tree

8 files changed

+185
-5
lines changed

8 files changed

+185
-5
lines changed

changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ This version is not yet released. If you are reading this on the website, then t
4848
- [`under ⍜`](https://uiua.org/docs/under)[`&cd`](https://uiua.org/docs/&cd) will return to the original directory afterward
4949
- Add experimental [`reciprocal ⨪`](https://uiua.org/docs/reciprocal) function, which computes the multiplicative inverse AKA reciprocal of a number
5050
- Make subscripted [`stack ?`](https://uiua.org/docs/stack) merge adjacent non-subscripted [`stack ?`](https://uiua.org/docs/stack) chains
51+
- Implement [`under ⍜`](https://uiua.org/docs/under)[`regex`](https://uiua.org/docs/regex) for replacing using regex (called `gsub` in some other languages)
5152
### Interpreter
5253
- Speed up the implementation of [`or ∨`](https://uiua.org/docs/or)
5354
- The formatter no longer truncates trailing decimal `0`s from number literals

parser/src/defs.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2610,12 +2610,15 @@ primitive!(
26102610
/// ex: regex "hi" "dog"
26112611
/// : △.
26122612
/// ex: regex "[a-z]+" "hello world"
2613-
/// Escaped regex characters must be double-escaped.
2613+
/// If the pattern contains escaped characters such as `\w`, either these must be double-escaped or the whole pattern must be represented with a raw string.
26142614
/// ex: regex "\\d+" "123"
2615-
/// ex: P ← $"(\\d{_})"
2615+
/// ex: P ← $$ (\d{_})
26162616
/// : regex $"_-_-_"P3P3P4 "123-456-7890"
26172617
/// Regex patterns with optional captures can be used with [fill].
26182618
/// ex: ⬚""regex "a(b)?" "a ab"
2619+
/// [under] can be used to run arbitrary regex-based substitutions.
2620+
/// ex: Lorem
2621+
/// : ⍜regex≡(□⊂⋅⊙⇌°□₃) $ (\w)(\w+)
26192622
///
26202623
/// Uiua uses the [Rust regex crate](https://docs.rs/regex/latest/regex/) internally.
26212624
(2, Regex, Algorithm, "regex"),

site/src/primitive.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,7 @@ fn all_unders() -> impl IntoView {
353353
{ inverse_row([Repeat], Required, "Inner function must be invertible", "⍜⍥(×2). 5 1") }
354354
{ inverse_row([Repeat], No, "Inner function must be invertible", "°⍥°(↧1000×2) 5") }
355355
{ inverse_row([Switch], No, "", "⍜(⨬⊢⊣|×10) 1 [1 2 3 4]") }
356+
{ inverse_row([Regex], Optional, "Substitution", r#"⍜regex≡(□⊂⋅⊙⇌°□₃) ⊙"Hello, world!" $ (\w)(\w+)"#) }
356357
{ inverse_row([Now], No, "Times execution", "⍜now(&sl 0.005)") }
357358
{ inverse_row([Sys(FOpen)], Optional, view!("Calls "<Prim prim=Sys(Close)/>" on handle"), None) }
358359
{ inverse_row([Sys(FCreate)], Optional, view!("Calls "<Prim prim=Sys(Close)/>" on handle"), None) }

src/compile/invert/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ as_node!(A, B, C);
9191
as_node!(A, B, C, D);
9292
as_node!(A, B, C, D, E);
9393
as_node!(A, B, C, D, E, F);
94+
as_node!(A, B, C, D, E, F, G);
95+
as_node!(A, B, C, D, E, F, G, H);
9496
as_node!(A, B, C, D, E, F, G, H, I);
9597

9698
trait SpanFromNodes: Sized + fmt::Debug + Sync {

src/compile/invert/under.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,19 @@ static UNDER_PATTERNS: &[&dyn UnderPattern] = &[
163163
&Stash(2, Pick, UndoPick),
164164
&Stash(2, Select, UndoSelect),
165165
&Stash(2, AntiOrient, UndoAntiOrient),
166+
&(
167+
Regex,
168+
(
169+
Over,
170+
Flip,
171+
DoRegex,
172+
PushUnd(1),
173+
CopyUnd(1),
174+
Flip,
175+
PushUnd(1),
176+
),
177+
(PopUnd(3), UndoRegex),
178+
),
166179
// Map control
167180
&MaybeVal((Get, (CopyUnd(2), Get), (PopUnd(1), Flip, PopUnd(1), Insert))),
168181
&Stash(2, Remove, UndoRemove),

src/impl_prim.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ impl_primitive!(
186186
(2, UndoWhere),
187187
(2, AntiOrient),
188188
(3, UndoAntiOrient),
189+
(2(2), DoRegex),
190+
(4, UndoRegex),
189191
(3(2), UndoJoin),
190192
(1(1)[1], UndoPartition1),
191193
(3, UndoPartition2),
@@ -374,6 +376,8 @@ impl fmt::Display for ImplPrimitive {
374376
UndoWhere => write!(f, "{Under}{Where}"),
375377
AntiOrient => write!(f, "{Anti}{Orient}"),
376378
UndoAntiOrient => write!(f, "{Under}{Orient}"),
379+
DoRegex => write!(f, "{Regex}"),
380+
UndoRegex => write!(f, "{Under}{Regex}"),
377381
UndoInsert => write!(f, "{Under}{Insert}"),
378382
UndoRemove => write!(f, "{Under}{Remove}"),
379383
UndoPartition1 | UndoPartition2 => write!(f, "{Under}{Partition}"),

src/run_prim.rs

Lines changed: 137 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
//! For the meat of the actual array algorithms, see [`crate::algorithm`].
44
55
use ecow::EcoVec;
6+
use rayon::iter::Either;
67
use regex::Regex;
78
use smallvec::SmallVec;
89

@@ -288,7 +289,12 @@ pub fn run_prim_func(prim: &Primitive, env: &mut Uiua) -> UiuaResult {
288289
env.push(vals);
289290
}
290291
Primitive::Stack => stack(env, false)?,
291-
Primitive::Regex => regex(env)?,
292+
Primitive::Regex => {
293+
regex(env)?;
294+
// NOTE: if you want to expose the match locations, note that they are given in bytes rather than codepoints
295+
// that is, you should also convert between byte and codepoint positions in `regex` and `undo_regex`
296+
env.pop("regex locations")?;
297+
}
292298
Primitive::Hsv => env.monadic_env(Value::rgb_to_hsv)?,
293299
Primitive::Json => env.monadic_ref_env(Value::to_json_string)?,
294300
Primitive::Binary => env.monadic_ref_env(Value::to_binary)?,
@@ -834,6 +840,8 @@ impl ImplPrimitive {
834840
let from = env.pop(3)?;
835841
env.push(from.undo_anti_orient(indices, into, env)?);
836842
}
843+
ImplPrimitive::DoRegex => regex(env)?,
844+
ImplPrimitive::UndoRegex => undo_regex(env)?,
837845
ImplPrimitive::UndoRerank => {
838846
let rank = env.pop(1)?;
839847
let shape = Shape::from(
@@ -1565,11 +1573,12 @@ fn regex(env: &mut Uiua) -> UiuaResult {
15651573

15661574
let mut matches: Value =
15671575
Array::<Boxed>::new([0, regex.captures_len()].as_slice(), []).into();
1576+
let mut locations: Value = Array::<f64>::new([0].as_slice(), []).into();
15681577

15691578
for caps in regex.captures_iter(&target) {
1570-
let row: EcoVec<Boxed> = caps
1579+
let row: EcoVec<_> = caps
15711580
.iter()
1572-
.flat_map(|m| {
1581+
.filter_map(|m| {
15731582
m.map(|m| Boxed(Value::from(m.as_str()))).or_else(|| {
15741583
env.value_fill()
15751584
.map(|fv| fv.value.clone())
@@ -1578,13 +1587,138 @@ fn regex(env: &mut Uiua) -> UiuaResult {
15781587
})
15791588
.collect();
15801589
matches.append(row.into(), false, env)?;
1590+
locations.append(
1591+
(caps
1592+
.get(0)
1593+
.expect("existence of 0 group is guaranteed")
1594+
.start() as f64)
1595+
.into(),
1596+
false,
1597+
env,
1598+
)?;
15811599
}
15821600

15831601
env.push(matches);
1602+
env.push(locations);
15841603
Ok(())
15851604
})
15861605
}
15871606

1607+
fn undo_regex(env: &mut Uiua) -> UiuaResult {
1608+
use std::iter::{once, repeat, zip, Repeat};
1609+
let locations = env
1610+
.pop(1)?
1611+
.as_nats(env, "Capture locations should be natural numbers")?;
1612+
let captures = env.pop(2)?;
1613+
let haystack = env.pop(3)?.as_string(env, "Haystack should be a string")?;
1614+
let mut repls = env.pop(4)?;
1615+
debug_assert_eq!(locations.len(), captures.row_count());
1616+
let captures: Vec<_> = captures
1617+
.into_rows()
1618+
.map(|x| {
1619+
x.first(env)
1620+
.expect("Capture group rows should have at least one element")
1621+
.as_string_opt()
1622+
.expect("Capture group content should be a string")
1623+
.len()
1624+
})
1625+
.collect();
1626+
enum OneOrMany<T> {
1627+
One(T),
1628+
Many(Vec<T>),
1629+
}
1630+
impl<T: Clone> IntoIterator for OneOrMany<T> {
1631+
type Item = T;
1632+
type IntoIter = Either<Repeat<T>, std::vec::IntoIter<T>>;
1633+
1634+
fn into_iter(self) -> Self::IntoIter {
1635+
match self {
1636+
Self::One(x) => Either::Left(repeat(x)),
1637+
Self::Many(items) => Either::Right(items.into_iter()),
1638+
}
1639+
}
1640+
}
1641+
use OneOrMany::{Many, One};
1642+
impl<'a, T> IntoIterator for &'a OneOrMany<T> {
1643+
type Item = &'a T;
1644+
type IntoIter = Either<Repeat<&'a T>, std::slice::Iter<'a, T>>;
1645+
1646+
fn into_iter(self) -> Self::IntoIter {
1647+
match self {
1648+
One(x) => Either::Left(repeat(x)),
1649+
Many(items) => Either::Right(items.iter()),
1650+
}
1651+
}
1652+
}
1653+
let repls: OneOrMany<String> = loop {
1654+
let rank = repls.rank();
1655+
repls =
1656+
match (repls, rank) {
1657+
(Value::Char(arr), 0 | 1) => break One(arr.data.into_iter().collect()),
1658+
(Value::Char(arr), 2) => {
1659+
break Many(arr.row_slices().map(|x| x.iter().collect()).collect())
1660+
}
1661+
(Value::Char(arr), 3) => {
1662+
break Many(
1663+
arr.into_rows()
1664+
.map(|x| x.first(env).map(|x| x.data.into_iter().collect()))
1665+
.collect::<UiuaResult<Vec<_>>>()?,
1666+
)
1667+
}
1668+
(Value::Box(bx), 0) => bx
1669+
.into_unboxed()
1670+
.expect("Scalar box array should be unboxable"),
1671+
(Value::Box(arr), 1) => {
1672+
break Many(
1673+
arr.data
1674+
.into_iter()
1675+
.map(|x| x.0)
1676+
.map(|x| x.as_string(env, "Expected boxed replacements to be strings"))
1677+
.collect::<UiuaResult<_>>()?,
1678+
)
1679+
}
1680+
(Value::Box(arr), 2) => {
1681+
break Many(
1682+
arr.into_rows()
1683+
.map(|x| {
1684+
x.first(env).and_then(|x| {
1685+
x.into_unboxed()
1686+
.expect("rank 0 box array should be unboxable")
1687+
.as_string(env, "Expected boxed replacements to be strings")
1688+
})
1689+
})
1690+
.collect::<UiuaResult<_>>()?,
1691+
)
1692+
}
1693+
_ => return Err(env.error(
1694+
"Expected replacements to be a string, array of strings or 2d array of strings",
1695+
)),
1696+
}
1697+
};
1698+
if let Many(ref x) = repls {
1699+
if locations.len() != x.len() {
1700+
return Err(env.error("Expected to have the same amount of captures and replacements"));
1701+
}
1702+
}
1703+
let repls = (&repls)
1704+
.into_iter()
1705+
.take(locations.len())
1706+
.map(String::as_str)
1707+
.chain(once(""));
1708+
let ends = zip(&locations, captures).map(|(&loc, len)| loc + len);
1709+
let chunks = zip(
1710+
once(0).chain(ends),
1711+
locations.iter().copied().chain(once(haystack.len())),
1712+
)
1713+
.map(|(s, e)| &haystack[s..e]);
1714+
env.push(
1715+
zip(chunks, repls)
1716+
.flat_map(|(chunk, repl)| [chunk, repl])
1717+
.collect::<String>(),
1718+
);
1719+
Ok(())
1720+
}
1721+
15881722
thread_local! {
15891723
pub(crate) static RNG: RefCell<SmallRng> = RefCell::new(SmallRng::from_entropy());
15901724
}

tests/under.ua

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,3 +482,25 @@ F ← ⌅(+|⌅(-|⟜-|×10+))
482482
⍤⤙≍ [¯1 ∞ ¯2] [⍜𝄐⊟¯ 1 ∞ 2]
483483
⍤⤙≍ [4 ∞ 5] [⍜(⊟⊙◌)+ 1 ∞ 2 3]
484484
⍤⤙≍ [¯1 ∞ ¯2] [⍜(⊟⊙◌)¯ 1 ∞ 2]
485+
486+
# Regex
487+
⍤⤙≍ Lorem ⍜(regex"\\w+")∘ Lorem
488+
H ← "Hello, world!!"
489+
⍤⤙≍ "world, Hello!!" ⍜regex⇌ ⊙H$ \w+
490+
⍤⤙≍ "Holle, wdlro!!" ⍜regex≡(□⊂⋅⊙⇌°□₃) ⊙H$ (\w)(\w+)
491+
⍤⤙≍ 0 ⍤⤙≍ H ⍜regex⟜⧻ ⊙H$ \d+
492+
⍤⤙≍ "@, w!!" ⬚@@⍜regex≡⊣ ⊙H$ \b([a-z])?\w*\s?
493+
⍤⤙≍ "HHello, wworld!!" ⍜regex≡(▽2°□⊣) ⊙H$ \b(\w)
494+
⍤⤙≍ ".H.e.l.l.o.,. . . .w.o.r.l.d.!.!." ⍜regex⋅@. "" H
495+
# different replacement forms:
496+
⊙H $ (\w)(\w*)
497+
⍤⤙≍ "olleH, dlrow!!" ◡⍜regex≡⍜⊢⇌ # 2d, boxed strings
498+
# TODO: remove ∘ when the misoptimization is fixed
499+
⍤⤙≍ "olleH, dlrow!!" ◡⍜regex≡(∘⇌⊢) # 1d boxed strings
500+
⍤⤙≍ "bip, bip!!" ◡⍜regex⋅(□"bip") # boxed string
501+
⍤⤙≍ "*, *!!" ◡⍜regex⋅(□@*) # boxed char
502+
⍤⤙≍ "olleH, dlrow!!" ◡⍜regex≡(⇌°□⊢) # 2d strings
503+
⍤⤙≍ "olleH, dlrow!!" ◡⍜regex≡(⊟⊸⇌°□⊢) # 1d strings
504+
⍤⤙≍ "bip, bip!!" ◡⍜regex⋅"bip" # string
505+
⍤⤙≍ "*, *!!" ◡⍜regex⋅@* # char
506+
⋅◌

0 commit comments

Comments
 (0)