Skip to content

Commit 0279383

Browse files
committed
Improve: skip_empty arg for Python compatibility
1 parent 07c4d1c commit 0279383

File tree

2 files changed

+306
-91
lines changed

2 files changed

+306
-91
lines changed

python/stringzilla.c

Lines changed: 129 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -206,6 +206,9 @@ typedef struct {
206206
/// @brief Should we include the newline characters in the resulting slices?
207207
sz_bool_t keepends;
208208

209+
/// @brief Should we skip empty segments (trailing, leading, consecutive)?
210+
sz_bool_t skip_empty;
211+
209212
} Utf8SplitLinesIterator;
210213

211214
/**
@@ -223,6 +226,9 @@ typedef struct {
223226
sz_cptr_t end; //< End of text (immutable)
224227
sz_size_t match_length; //< Length of current segment to yield
225228

229+
/// @brief Should we skip empty segments (trailing, leading, consecutive)?
230+
sz_bool_t skip_empty;
231+
226232
} Utf8SplitWhitespaceIterator;
227233

228234
/**
@@ -3983,6 +3989,7 @@ static char const doc_utf8_splitlines_iter[] = //
39833989
"Args:\n"
39843990
" text (Str or str or bytes): The string object.\n"
39853991
" keepends (bool, optional): Include line endings in results (default is False).\n"
3992+
" skip_empty (bool, optional): Skip empty lines (default is False).\n"
39863993
"Returns:\n"
39873994
" iterator: An iterator yielding lines as Str objects.\n"
39883995
"\n"
@@ -3995,14 +4002,15 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
39954002
// Check minimum arguments
39964003
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
39974004
Py_ssize_t min_args = !is_member;
3998-
Py_ssize_t max_args = !is_member + 1;
4005+
Py_ssize_t max_args = !is_member + 2;
39994006
if (positional_args_count < min_args || positional_args_count > max_args) {
40004007
PyErr_Format(PyExc_TypeError, "utf8_splitlines_iter() requires %zd to %zd arguments", min_args, max_args);
40014008
return NULL;
40024009
}
40034010

40044011
PyObject *text_obj = is_member ? self : args[0];
40054012
PyObject *keepends_obj = positional_args_count > !is_member ? args[!is_member] : NULL;
4013+
PyObject *skip_empty_obj = positional_args_count > !is_member + 1 ? args[!is_member + 1] : NULL;
40064014

40074015
// Parse keyword arguments
40084016
if (args_names_tuple) {
@@ -4011,12 +4019,16 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
40114019
PyObject *key = PyTuple_GET_ITEM(args_names_tuple, i);
40124020
PyObject *value = args[positional_args_count + i];
40134021
if (PyUnicode_CompareWithASCIIString(key, "keepends") == 0 && !keepends_obj) { keepends_obj = value; }
4022+
else if (PyUnicode_CompareWithASCIIString(key, "skip_empty") == 0 && !skip_empty_obj) {
4023+
skip_empty_obj = value;
4024+
}
40144025
else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; }
40154026
}
40164027
}
40174028

40184029
sz_string_view_t text;
40194030
int keepends = 0;
4031+
int skip_empty = 0;
40204032

40214033
// Validate and convert `text`
40224034
if (!sz_py_export_string_like(text_obj, &text.start, &text.length)) {
@@ -4033,6 +4045,15 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
40334045
}
40344046
}
40354047

4048+
// Validate and convert `skip_empty`
4049+
if (skip_empty_obj) {
4050+
skip_empty = PyObject_IsTrue(skip_empty_obj);
4051+
if (skip_empty == -1) {
4052+
wrap_current_exception("The skip_empty argument must be a boolean");
4053+
return NULL;
4054+
}
4055+
}
4056+
40364057
// Create the iterator
40374058
Utf8SplitLinesIterator *result_obj =
40384059
(Utf8SplitLinesIterator *)Utf8SplitLinesIteratorType.tp_alloc(&Utf8SplitLinesIteratorType, 0);
@@ -4042,6 +4063,7 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
40424063
result_obj->start = text.start;
40434064
result_obj->end = text.start + text.length;
40444065
result_obj->keepends = keepends;
4066+
result_obj->skip_empty = skip_empty;
40454067

40464068
// Find first segment length
40474069
sz_size_t newline_length = 0;
@@ -4062,6 +4084,7 @@ static char const doc_utf8_split_iter[] = //
40624084
"\n"
40634085
"Args:\n"
40644086
" text (Str or str or bytes): The string object.\n"
4087+
" skip_empty (bool, optional): Skip empty segments (default is False).\n"
40654088
"Returns:\n"
40664089
" iterator: An iterator yielding non-whitespace tokens as Str objects.\n"
40674090
"\n"
@@ -4076,27 +4099,44 @@ static PyObject *Str_like_utf8_split_iter(PyObject *self, PyObject *const *args,
40764099
// Check minimum arguments
40774100
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
40784101
Py_ssize_t min_args = !is_member;
4079-
Py_ssize_t max_args = !is_member;
4102+
Py_ssize_t max_args = !is_member + 1;
40804103
if (positional_args_count < min_args || positional_args_count > max_args) {
4081-
PyErr_Format(PyExc_TypeError, "utf8_split_iter() takes exactly %zd argument(s)", min_args);
4104+
PyErr_Format(PyExc_TypeError, "utf8_split_iter() requires %zd to %zd arguments", min_args, max_args);
40824105
return NULL;
40834106
}
40844107

4085-
// No keyword arguments expected
4086-
if (args_names_tuple && PyTuple_GET_SIZE(args_names_tuple) > 0) {
4087-
PyErr_SetString(PyExc_TypeError, "utf8_split_iter() takes no keyword arguments");
4088-
return NULL;
4108+
PyObject *text_obj = is_member ? self : args[0];
4109+
PyObject *skip_empty_obj = positional_args_count > !is_member ? args[!is_member] : NULL;
4110+
4111+
// Parse keyword arguments
4112+
if (args_names_tuple) {
4113+
Py_ssize_t args_names_count = PyTuple_GET_SIZE(args_names_tuple);
4114+
for (Py_ssize_t i = 0; i < args_names_count; ++i) {
4115+
PyObject *key = PyTuple_GET_ITEM(args_names_tuple, i);
4116+
PyObject *value = args[positional_args_count + i];
4117+
if (PyUnicode_CompareWithASCIIString(key, "skip_empty") == 0 && !skip_empty_obj) { skip_empty_obj = value; }
4118+
else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; }
4119+
}
40894120
}
40904121

4091-
PyObject *text_obj = is_member ? self : args[0];
40924122
sz_string_view_t text;
4123+
int skip_empty = 0;
40934124

40944125
// Validate and convert `text`
40954126
if (!sz_py_export_string_like(text_obj, &text.start, &text.length)) {
40964127
wrap_current_exception("The text argument must be string-like");
40974128
return NULL;
40984129
}
40994130

4131+
// Validate and convert `skip_empty`
4132+
if (skip_empty_obj) {
4133+
skip_empty = PyObject_IsTrue(skip_empty_obj);
4134+
if (skip_empty == -1) {
4135+
wrap_current_exception("The skip_empty argument must be a boolean");
4136+
return NULL;
4137+
}
4138+
}
4139+
41004140
// Create the iterator
41014141
Utf8SplitWhitespaceIterator *result_obj =
41024142
(Utf8SplitWhitespaceIterator *)Utf8SplitWhitespaceIteratorType.tp_alloc(&Utf8SplitWhitespaceIteratorType, 0);
@@ -4105,6 +4145,7 @@ static PyObject *Str_like_utf8_split_iter(PyObject *self, PyObject *const *args,
41054145
result_obj->text_obj = text_obj;
41064146
result_obj->start = text.start;
41074147
result_obj->end = text.start + text.length;
4148+
result_obj->skip_empty = skip_empty;
41084149
// Find first segment length
41094150
sz_size_t ws_len = 0;
41104151
sz_cptr_t ws = sz_utf8_find_whitespace(result_obj->start, text.length, &ws_len);
@@ -4740,51 +4781,54 @@ static PyObject *Utf8SplitLinesIteratorType_next(Utf8SplitLinesIterator *self) {
47404781
Str *result_obj = (Str *)StrType.tp_alloc(&StrType, 0);
47414782
if (result_obj == NULL && PyErr_NoMemory()) return NULL;
47424783

4743-
// Build the result from current state
4784+
// Find next non-empty segment (or any segment if skip_empty is false)
47444785
sz_string_view_t result_memory;
4745-
result_memory.start = self->start;
4746-
result_memory.length = self->match_length;
4747-
4748-
// Include newline in result if keepends is set
4749-
if (self->keepends && self->start + self->match_length < self->end) {
4750-
sz_size_t newline_length = 0;
4751-
sz_cptr_t newline_ptr =
4752-
sz_utf8_find_newline(self->start + self->match_length,
4753-
(sz_size_t)(self->end - self->start - self->match_length), &newline_length);
4754-
if (newline_ptr == self->start + self->match_length) { result_memory.length += newline_length; }
4755-
}
4756-
4757-
// Advance to next segment
4758-
self->start += self->match_length;
4759-
4760-
// Skip delimiter at current position (if any)
4761-
if (self->start < self->end) {
4762-
sz_size_t newline_length = 0;
4763-
sz_cptr_t newline_ptr =
4764-
sz_utf8_find_newline(self->start, (sz_size_t)(self->end - self->start), &newline_length);
4765-
if (newline_ptr == self->start) { self->start += newline_length; }
4766-
}
4767-
// Handle the case where we're exactly at end after consuming content
4768-
else if (self->start == self->end) {
4769-
// We've consumed all content - signal termination after this empty segment
4770-
self->start = self->end + 1;
4771-
self->match_length = 0;
4772-
// But we still return the current result
4773-
result_obj->memory = result_memory;
4774-
result_obj->parent = self->text_obj;
4775-
Py_INCREF(self->text_obj);
4776-
return (PyObject *)result_obj;
4777-
}
4778-
4779-
// If we're now past end, we're done after this
4780-
if (self->start > self->end) { self->match_length = 0; }
4781-
else {
4782-
// Find next delimiter to determine segment length
4783-
sz_size_t newline_length = 0;
4784-
sz_cptr_t newline_ptr =
4785-
sz_utf8_find_newline(self->start, (sz_size_t)(self->end - self->start), &newline_length);
4786-
self->match_length =
4787-
newline_ptr ? (sz_size_t)(newline_ptr - self->start) : (sz_size_t)(self->end - self->start);
4786+
do {
4787+
// Build the result from current state
4788+
result_memory.start = self->start;
4789+
result_memory.length = self->match_length;
4790+
4791+
// Include newline in result if keepends is set
4792+
if (self->keepends && self->start + self->match_length < self->end) {
4793+
sz_size_t newline_length = 0;
4794+
sz_cptr_t newline_ptr =
4795+
sz_utf8_find_newline(self->start + self->match_length,
4796+
(sz_size_t)(self->end - self->start - self->match_length), &newline_length);
4797+
if (newline_ptr == self->start + self->match_length) { result_memory.length += newline_length; }
4798+
}
4799+
4800+
// Advance to next segment
4801+
self->start += self->match_length;
4802+
4803+
// Skip delimiter at current position (if any)
4804+
if (self->start < self->end) {
4805+
sz_size_t newline_length = 0;
4806+
sz_cptr_t newline_ptr =
4807+
sz_utf8_find_newline(self->start, (sz_size_t)(self->end - self->start), &newline_length);
4808+
if (newline_ptr == self->start) { self->start += newline_length; }
4809+
}
4810+
// Handle the case where we're exactly at end after consuming content
4811+
else if (self->start == self->end) {
4812+
self->start = self->end + 1;
4813+
self->match_length = 0;
4814+
}
4815+
4816+
// If we're now past end, we're done after this
4817+
if (self->start > self->end) { self->match_length = 0; }
4818+
else {
4819+
// Find next delimiter to determine segment length
4820+
sz_size_t newline_length = 0;
4821+
sz_cptr_t newline_ptr =
4822+
sz_utf8_find_newline(self->start, (sz_size_t)(self->end - self->start), &newline_length);
4823+
self->match_length =
4824+
newline_ptr ? (sz_size_t)(newline_ptr - self->start) : (sz_size_t)(self->end - self->start);
4825+
}
4826+
} while (self->skip_empty && result_memory.length == 0 && self->start <= self->end);
4827+
4828+
// If we exhausted all segments while skipping empties, free and return NULL
4829+
if (self->skip_empty && result_memory.length == 0) {
4830+
Py_DECREF(result_obj);
4831+
return NULL;
47884832
}
47894833

47904834
// Set its properties based on the slice
@@ -4854,33 +4898,42 @@ static PyObject *Utf8SplitWhitespaceIteratorType_next(Utf8SplitWhitespaceIterato
48544898
Str *result_obj = (Str *)StrType.tp_alloc(&StrType, 0);
48554899
if (result_obj == NULL && PyErr_NoMemory()) return NULL;
48564900

4857-
// Current segment to yield
4901+
// Find next non-empty segment (or any segment if skip_empty is false)
48584902
sz_string_view_t result_memory;
4859-
result_memory.start = self->start;
4860-
result_memory.length = self->match_length;
4861-
4862-
// Advance to next segment
4863-
self->start += self->match_length;
4864-
if (self->start > self->end) {
4865-
// Already yielding final segment, mark termination
4866-
self->match_length = 0;
4867-
}
4868-
else if (self->start == self->end) {
4869-
// At end - move past to terminate after yielding this segment
4870-
self->start = self->end + 1;
4871-
self->match_length = 0;
4872-
}
4873-
else {
4874-
// Skip delimiter at current position
4875-
sz_size_t ws_len = 0;
4876-
sz_cptr_t ws = sz_utf8_find_whitespace(self->start, (sz_size_t)(self->end - self->start), &ws_len);
4877-
if (ws == self->start) self->start += ws_len;
4878-
if (self->start > self->end) { self->match_length = 0; }
4903+
do {
4904+
// Current segment to yield
4905+
result_memory.start = self->start;
4906+
result_memory.length = self->match_length;
4907+
4908+
// Advance to next segment
4909+
self->start += self->match_length;
4910+
if (self->start > self->end) {
4911+
// Already yielding final segment, mark termination
4912+
self->match_length = 0;
4913+
}
4914+
else if (self->start == self->end) {
4915+
// At end - move past to terminate after yielding this segment
4916+
self->start = self->end + 1;
4917+
self->match_length = 0;
4918+
}
48794919
else {
4880-
// Find next delimiter
4881-
ws = sz_utf8_find_whitespace(self->start, (sz_size_t)(self->end - self->start), &ws_len);
4882-
self->match_length = ws ? (sz_size_t)(ws - self->start) : (sz_size_t)(self->end - self->start);
4920+
// Skip delimiter at current position
4921+
sz_size_t ws_len = 0;
4922+
sz_cptr_t ws = sz_utf8_find_whitespace(self->start, (sz_size_t)(self->end - self->start), &ws_len);
4923+
if (ws == self->start) self->start += ws_len;
4924+
if (self->start > self->end) { self->match_length = 0; }
4925+
else {
4926+
// Find next delimiter
4927+
ws = sz_utf8_find_whitespace(self->start, (sz_size_t)(self->end - self->start), &ws_len);
4928+
self->match_length = ws ? (sz_size_t)(ws - self->start) : (sz_size_t)(self->end - self->start);
4929+
}
48834930
}
4931+
} while (self->skip_empty && result_memory.length == 0 && self->start <= self->end);
4932+
4933+
// If we exhausted all segments while skipping empties, free and return NULL
4934+
if (self->skip_empty && result_memory.length == 0) {
4935+
Py_DECREF(result_obj);
4936+
return NULL;
48844937
}
48854938

48864939
// Set its properties based on the slice

0 commit comments

Comments
 (0)