Skip to content

Commit 2e700c6

Browse files
authored
Merge pull request numpy#27789 from ArvidJB/string_slice_gufunc
ENH: Implement np.strings.slice as a gufunc
2 parents 767ccd8 + cafa9c2 commit 2e700c6

File tree

10 files changed

+544
-3
lines changed

10 files changed

+544
-3
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
* New function `numpy.strings.slice`
2+
The new function `numpy.strings.slice` was added, which implements fast
3+
native slicing of string arrays. It supports the full slicing API including
4+
negative slice offsets and steps.

doc/source/reference/routines.strings.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ String operations
4646
rjust
4747
rpartition
4848
rstrip
49+
slice
4950
strip
5051
swapcase
5152
title

numpy/_core/code_generators/generate_umath.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1338,6 +1338,11 @@ def english_upper(s):
13381338
docstrings.get('numpy._core.umath._rpartition'),
13391339
None,
13401340
),
1341+
'_slice':
1342+
Ufunc(4, 1, None,
1343+
docstrings.get('numpy._core.umath._slice'),
1344+
None,
1345+
),
13411346
}
13421347

13431348
def indent(st, spaces):

numpy/_core/code_generators/ufunc_docstrings.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5451,3 +5451,40 @@ def add_newdoc(place, name, doc):
54515451
array(['', ' ', 'Bba'], dtype=StringDType()))
54525452
54535453
""")
5454+
5455+
add_newdoc('numpy._core.umath', '_slice',
5456+
"""
5457+
Slice the strings in `a` by slices specified by `start`, `stop`, `step`.
5458+
Like in the regular Python `slice` object, if only `start` is
5459+
specified then it is interpreted as the `stop`.
5460+
5461+
Parameters
5462+
----------
5463+
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
5464+
5465+
start : the start of the slice, an integer or an array of integers
5466+
which can be broadcast to `a`'s shape
5467+
5468+
stop : the end point of the slice, an integer or an array of integers
5469+
which can be broadcast to `a`'s shape
5470+
5471+
step : the step for the slice, an integer or an array of integers
5472+
which can be broadcast to `a`'s shape
5473+
5474+
Returns
5475+
-------
5476+
out : ndarray
5477+
Output array of str or unicode, depending on input type
5478+
5479+
Examples
5480+
--------
5481+
>>> import numpy as np
5482+
5483+
The ufunc is used most easily via ``np.strings.slice``,
5484+
which calls it under the hood::
5485+
5486+
>>> a = np.array(['hello', 'world'])
5487+
>>> np.strings.slice(a, 2)
5488+
array(['he', 'wo'], dtype='<U5')
5489+
5490+
""")

numpy/_core/defchararray.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1272,7 +1272,7 @@ class adds the following functionality:
12721272
fastest). If order is 'A', then the returned array may
12731273
be in any order (either C-, Fortran-contiguous, or even
12741274
discontiguous).
1275-
1275+
12761276
Examples
12771277
--------
12781278

numpy/_core/src/umath/string_ufuncs.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,67 @@ string_partition_index_loop(PyArrayMethod_Context *context,
633633
}
634634

635635

636+
template <ENCODING enc>
637+
static int
638+
string_slice_loop(PyArrayMethod_Context *context,
639+
char *const data[], npy_intp const dimensions[],
640+
npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
641+
{
642+
int insize = context->descriptors[0]->elsize;
643+
int outsize = context->descriptors[4]->elsize;
644+
645+
char *in_ptr = data[0];
646+
char *start_ptr = data[1];
647+
char *stop_ptr = data[2];
648+
char *step_ptr = data[3];
649+
char *out_ptr = data[4];
650+
651+
npy_intp N = dimensions[0];
652+
653+
while (N--) {
654+
Buffer<enc> inbuf(in_ptr, insize);
655+
Buffer<enc> outbuf(out_ptr, outsize);
656+
657+
// get the slice
658+
npy_intp start = *(npy_intp*)start_ptr;
659+
npy_intp stop = *(npy_intp*)stop_ptr;
660+
npy_intp step = *(npy_intp*)step_ptr;
661+
662+
// adjust slice to string length in codepoints
663+
// and handle negative indices
664+
size_t num_codepoints = inbuf.num_codepoints();
665+
npy_intp slice_length = PySlice_AdjustIndices(num_codepoints, &start, &stop, step);
666+
667+
// iterate over slice and copy each character of the string
668+
inbuf.advance_chars_or_bytes(start);
669+
for (npy_intp i = 0; i < slice_length; i++) {
670+
// copy one codepoint
671+
inbuf.buffer_memcpy(outbuf, 1);
672+
673+
// Move in inbuf by step.
674+
inbuf += step;
675+
676+
// Move in outbuf by the number of chars or bytes written
677+
outbuf.advance_chars_or_bytes(1);
678+
}
679+
680+
// fill remaining outbuf with zero bytes
681+
for (char *tmp = outbuf.buf; tmp < outbuf.after; tmp++) {
682+
*tmp = 0;
683+
}
684+
685+
// Go to the next array element
686+
in_ptr += strides[0];
687+
start_ptr += strides[1];
688+
stop_ptr += strides[2];
689+
step_ptr += strides[3];
690+
out_ptr += strides[4];
691+
}
692+
693+
return 0;
694+
}
695+
696+
636697
/* Resolve descriptors & promoter functions */
637698

638699
static NPY_CASTING
@@ -1064,6 +1125,53 @@ string_partition_resolve_descriptors(
10641125
}
10651126

10661127

1128+
static int
1129+
string_slice_promoter(PyObject *NPY_UNUSED(ufunc),
1130+
PyArray_DTypeMeta *const op_dtypes[], PyArray_DTypeMeta *const signature[],
1131+
PyArray_DTypeMeta *new_op_dtypes[])
1132+
{
1133+
Py_INCREF(op_dtypes[0]);
1134+
new_op_dtypes[0] = op_dtypes[0];
1135+
new_op_dtypes[1] = NPY_DT_NewRef(&PyArray_IntpDType);
1136+
new_op_dtypes[2] = NPY_DT_NewRef(&PyArray_IntpDType);
1137+
new_op_dtypes[3] = NPY_DT_NewRef(&PyArray_IntpDType);
1138+
Py_INCREF(op_dtypes[0]);
1139+
new_op_dtypes[4] = op_dtypes[0];
1140+
return 0;
1141+
}
1142+
1143+
static NPY_CASTING
1144+
string_slice_resolve_descriptors(
1145+
PyArrayMethodObject *self,
1146+
PyArray_DTypeMeta *const NPY_UNUSED(dtypes[5]),
1147+
PyArray_Descr *const given_descrs[5],
1148+
PyArray_Descr *loop_descrs[5],
1149+
npy_intp *NPY_UNUSED(view_offset))
1150+
{
1151+
if (given_descrs[4]) {
1152+
PyErr_Format(PyExc_TypeError,
1153+
"The '%s' ufunc does not "
1154+
"currently support the 'out' keyword",
1155+
self->name);
1156+
return _NPY_ERROR_OCCURRED_IN_CAST;
1157+
}
1158+
1159+
for (int i = 0; i < 4; i++) {
1160+
loop_descrs[i] = NPY_DT_CALL_ensure_canonical(given_descrs[i]);
1161+
if (loop_descrs[i] == NULL) {
1162+
return _NPY_ERROR_OCCURRED_IN_CAST;
1163+
}
1164+
}
1165+
1166+
loop_descrs[4] = PyArray_DescrNew(loop_descrs[0]);
1167+
if (loop_descrs[4] == NULL) {
1168+
return _NPY_ERROR_OCCURRED_IN_CAST;
1169+
}
1170+
loop_descrs[4]->elsize = loop_descrs[0]->elsize;
1171+
1172+
return NPY_NO_CASTING;
1173+
}
1174+
10671175
/*
10681176
* Machinery to add the string loops to the existing ufuncs.
10691177
*/
@@ -1744,6 +1852,28 @@ init_string_ufuncs(PyObject *umath)
17441852
}
17451853
}
17461854

1855+
dtypes[0] = NPY_OBJECT;
1856+
dtypes[1] = NPY_INTP;
1857+
dtypes[2] = NPY_INTP;
1858+
dtypes[3] = NPY_INTP;
1859+
dtypes[4] = NPY_OBJECT;
1860+
if (init_ufunc(
1861+
umath, "_slice", 4, 1, dtypes, ENCODING::ASCII,
1862+
string_slice_loop<ENCODING::ASCII>,
1863+
string_slice_resolve_descriptors, NULL) < 0) {
1864+
return -1;
1865+
}
1866+
if (init_ufunc(
1867+
umath, "_slice", 4, 1, dtypes, ENCODING::UTF32,
1868+
string_slice_loop<ENCODING::UTF32>,
1869+
string_slice_resolve_descriptors, NULL) < 0) {
1870+
return -1;
1871+
}
1872+
if (init_promoter(umath, "_slice", 4, 1,
1873+
string_slice_promoter) < 0) {
1874+
return -1;
1875+
}
1876+
17471877
return 0;
17481878
}
17491879

0 commit comments

Comments
 (0)