Skip to content

Commit 1274983

Browse files
refactor: By default, sample only simple characters in strings (#252)
1 parent f35e9dc commit 1274983

File tree

2 files changed

+21
-2
lines changed

2 files changed

+21
-2
lines changed

dataframely/columns/string.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from ._base import Check, Column
1515
from ._registry import register
1616

17+
# Character class matching a single ASCII letter or digit. It is the building
# block of the regex used to sample string values when the column does not
# provide its own regex, so sampled strings contain only these characters.
DEFAULT_SAMPLING_REGEX = r"[0-9a-zA-Z]"
18+
1719

1820
@register
1921
class String(Column):
@@ -126,9 +128,9 @@ def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
126128
str_max = f"{self.max_length}" if self.max_length is not None else ""
127129
# NOTE: We generate single-byte unicode characters here as validation uses
128130
# `len_bytes()`. Potentially we need to be more accurate at some point...
129-
regex = f"[\x01-\x7a]{{{str_min},{str_max}}}"
131+
regex = f"{DEFAULT_SAMPLING_REGEX}{{{str_min},{str_max}}}"
130132
else:
131-
regex = r"[\x01-\x7a]*"
133+
regex = rf"{DEFAULT_SAMPLING_REGEX}*"
132134

133135
return generator.sample_string(
134136
n,

tests/columns/test_str.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
# Copyright (c) QuantCo 2025-2025
22
# SPDX-License-Identifier: BSD-3-Clause
3+
import re
34

45
import pytest
56

67
import dataframely as dy
78
from dataframely.columns import Column
9+
from dataframely.columns.string import DEFAULT_SAMPLING_REGEX
10+
from dataframely.random import Generator
811
from dataframely.testing import ALL_COLUMN_TYPES
912

1013

@@ -32,3 +35,17 @@ def test_string_representation_array() -> None:
3235
def test_string_representation_struct() -> None:
3336
column = dy.Struct({"a": dy.String()})
3437
assert str(column) == dy.Struct.__name__.lower()
38+
39+
40+
@pytest.mark.parametrize("min_length", [None, 5, 10])
@pytest.mark.parametrize("max_length", [None, 20])
def test_string_sampling_without_regex(
    min_length: int | None, max_length: int | None
) -> None:
    """Check that sampling without a regex only yields default characters.

    When a string column has no regex, every sampled value must consist
    exclusively of characters from ``DEFAULT_SAMPLING_REGEX``.
    """
    column = dy.String(min_length=min_length, max_length=max_length)
    generator = Generator(seed=42)
    sample = column.sample(generator=generator, n=1000)

    # NOTE: `re.match` only anchors at the start of the string, and a
    # `*`-quantified pattern also matches the empty prefix, so it would accept
    # *any* value. `fullmatch` requires the entire value to consist of allowed
    # characters, which is what this test is meant to verify.
    allowed = re.compile(f"{DEFAULT_SAMPLING_REGEX}*")
    assert all(allowed.fullmatch(value) for value in sample)

0 commit comments

Comments
 (0)