@@ -206,6 +206,9 @@ typedef struct {
206206 /// @brief Should we include the newline characters in the resulting slices?
207207 sz_bool_t keepends ;
208208
209+ /// @brief Should we skip empty segments (trailing, leading, consecutive)?
210+ sz_bool_t skip_empty ;
211+
209212} Utf8SplitLinesIterator ;
210213
211214/**
@@ -223,6 +226,9 @@ typedef struct {
223226 sz_cptr_t end ; //< End of text (immutable)
224227 sz_size_t match_length ; //< Length of current segment to yield
225228
229+ /// @brief Should we skip empty segments (trailing, leading, consecutive)?
230+ sz_bool_t skip_empty ;
231+
226232} Utf8SplitWhitespaceIterator ;
227233
228234/**
@@ -3983,6 +3989,7 @@ static char const doc_utf8_splitlines_iter[] = //
39833989 "Args:\n"
39843990 " text (Str or str or bytes): The string object.\n"
39853991 " keepends (bool, optional): Include line endings in results (default is False).\n"
3992+ " skip_empty (bool, optional): Skip empty lines (default is False).\n"
39863993 "Returns:\n"
39873994 " iterator: An iterator yielding lines as Str objects.\n"
39883995 "\n"
@@ -3995,14 +4002,15 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
39954002 // Check minimum arguments
39964003 int is_member = self != NULL && PyObject_TypeCheck (self , & StrType );
39974004 Py_ssize_t min_args = !is_member ;
3998- Py_ssize_t max_args = !is_member + 1 ;
4005+ Py_ssize_t max_args = !is_member + 2 ;
39994006 if (positional_args_count < min_args || positional_args_count > max_args ) {
40004007 PyErr_Format (PyExc_TypeError , "utf8_splitlines_iter() requires %zd to %zd arguments" , min_args , max_args );
40014008 return NULL ;
40024009 }
40034010
40044011 PyObject * text_obj = is_member ? self : args [0 ];
40054012 PyObject * keepends_obj = positional_args_count > !is_member ? args [!is_member ] : NULL ;
4013+ PyObject * skip_empty_obj = positional_args_count > !is_member + 1 ? args [!is_member + 1 ] : NULL ;
40064014
40074015 // Parse keyword arguments
40084016 if (args_names_tuple ) {
@@ -4011,12 +4019,16 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
40114019 PyObject * key = PyTuple_GET_ITEM (args_names_tuple , i );
40124020 PyObject * value = args [positional_args_count + i ];
40134021 if (PyUnicode_CompareWithASCIIString (key , "keepends" ) == 0 && !keepends_obj ) { keepends_obj = value ; }
4022+ else if (PyUnicode_CompareWithASCIIString (key , "skip_empty" ) == 0 && !skip_empty_obj ) {
4023+ skip_empty_obj = value ;
4024+ }
40144025 else if (PyErr_Format (PyExc_TypeError , "Got an unexpected keyword argument '%U'" , key )) { return NULL ; }
40154026 }
40164027 }
40174028
40184029 sz_string_view_t text ;
40194030 int keepends = 0 ;
4031+ int skip_empty = 0 ;
40204032
40214033 // Validate and convert `text`
40224034 if (!sz_py_export_string_like (text_obj , & text .start , & text .length )) {
@@ -4033,6 +4045,15 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
40334045 }
40344046 }
40354047
4048+ // Validate and convert `skip_empty`
4049+ if (skip_empty_obj ) {
4050+ skip_empty = PyObject_IsTrue (skip_empty_obj );
4051+ if (skip_empty == -1 ) {
4052+ wrap_current_exception ("The skip_empty argument must be a boolean" );
4053+ return NULL ;
4054+ }
4055+ }
4056+
40364057 // Create the iterator
40374058 Utf8SplitLinesIterator * result_obj =
40384059 (Utf8SplitLinesIterator * )Utf8SplitLinesIteratorType .tp_alloc (& Utf8SplitLinesIteratorType , 0 );
@@ -4042,6 +4063,7 @@ static PyObject *Str_like_utf8_splitlines_iter(PyObject *self, PyObject *const *
40424063 result_obj -> start = text .start ;
40434064 result_obj -> end = text .start + text .length ;
40444065 result_obj -> keepends = keepends ;
4066+ result_obj -> skip_empty = skip_empty ;
40454067
40464068 // Find first segment length
40474069 sz_size_t newline_length = 0 ;
@@ -4062,6 +4084,7 @@ static char const doc_utf8_split_iter[] = //
40624084 "\n"
40634085 "Args:\n"
40644086 " text (Str or str or bytes): The string object.\n"
4087+ " skip_empty (bool, optional): Skip empty segments (default is False).\n"
40654088 "Returns:\n"
40664089 " iterator: An iterator yielding non-whitespace tokens as Str objects.\n"
40674090 "\n"
@@ -4076,27 +4099,44 @@ static PyObject *Str_like_utf8_split_iter(PyObject *self, PyObject *const *args,
40764099 // Check minimum arguments
40774100 int is_member = self != NULL && PyObject_TypeCheck (self , & StrType );
40784101 Py_ssize_t min_args = !is_member ;
4079- Py_ssize_t max_args = !is_member ;
4102+ Py_ssize_t max_args = !is_member + 1 ;
40804103 if (positional_args_count < min_args || positional_args_count > max_args ) {
4081- PyErr_Format (PyExc_TypeError , "utf8_split_iter() takes exactly %zd argument(s) " , min_args );
4104+ PyErr_Format (PyExc_TypeError , "utf8_split_iter() requires %zd to %zd arguments " , min_args , max_args );
40824105 return NULL ;
40834106 }
40844107
4085- // No keyword arguments expected
4086- if (args_names_tuple && PyTuple_GET_SIZE (args_names_tuple ) > 0 ) {
4087- PyErr_SetString (PyExc_TypeError , "utf8_split_iter() takes no keyword arguments" );
4088- return NULL ;
4108+ PyObject * text_obj = is_member ? self : args [0 ];
4109+ PyObject * skip_empty_obj = positional_args_count > !is_member ? args [!is_member ] : NULL ;
4110+
4111+ // Parse keyword arguments
4112+ if (args_names_tuple ) {
4113+ Py_ssize_t args_names_count = PyTuple_GET_SIZE (args_names_tuple );
4114+ for (Py_ssize_t i = 0 ; i < args_names_count ; ++ i ) {
4115+ PyObject * key = PyTuple_GET_ITEM (args_names_tuple , i );
4116+ PyObject * value = args [positional_args_count + i ];
4117+ if (PyUnicode_CompareWithASCIIString (key , "skip_empty" ) == 0 && !skip_empty_obj ) { skip_empty_obj = value ; }
4118+ else if (PyErr_Format (PyExc_TypeError , "Got an unexpected keyword argument '%U'" , key )) { return NULL ; }
4119+ }
40894120 }
40904121
4091- PyObject * text_obj = is_member ? self : args [0 ];
40924122 sz_string_view_t text ;
4123+ int skip_empty = 0 ;
40934124
40944125 // Validate and convert `text`
40954126 if (!sz_py_export_string_like (text_obj , & text .start , & text .length )) {
40964127 wrap_current_exception ("The text argument must be string-like" );
40974128 return NULL ;
40984129 }
40994130
4131+ // Validate and convert `skip_empty`
4132+ if (skip_empty_obj ) {
4133+ skip_empty = PyObject_IsTrue (skip_empty_obj );
4134+ if (skip_empty == -1 ) {
4135+ wrap_current_exception ("The skip_empty argument must be a boolean" );
4136+ return NULL ;
4137+ }
4138+ }
4139+
41004140 // Create the iterator
41014141 Utf8SplitWhitespaceIterator * result_obj =
41024142 (Utf8SplitWhitespaceIterator * )Utf8SplitWhitespaceIteratorType .tp_alloc (& Utf8SplitWhitespaceIteratorType , 0 );
@@ -4105,6 +4145,7 @@ static PyObject *Str_like_utf8_split_iter(PyObject *self, PyObject *const *args,
41054145 result_obj -> text_obj = text_obj ;
41064146 result_obj -> start = text .start ;
41074147 result_obj -> end = text .start + text .length ;
4148+ result_obj -> skip_empty = skip_empty ;
41084149 // Find first segment length
41094150 sz_size_t ws_len = 0 ;
41104151 sz_cptr_t ws = sz_utf8_find_whitespace (result_obj -> start , text .length , & ws_len );
@@ -4740,51 +4781,54 @@ static PyObject *Utf8SplitLinesIteratorType_next(Utf8SplitLinesIterator *self) {
47404781 Str * result_obj = (Str * )StrType .tp_alloc (& StrType , 0 );
47414782 if (result_obj == NULL && PyErr_NoMemory ()) return NULL ;
47424783
4743- // Build the result from current state
4784+ // Find next non-empty segment (or any segment if skip_empty is false)
47444785 sz_string_view_t result_memory ;
4745- result_memory .start = self -> start ;
4746- result_memory .length = self -> match_length ;
4747-
4748- // Include newline in result if keepends is set
4749- if (self -> keepends && self -> start + self -> match_length < self -> end ) {
4750- sz_size_t newline_length = 0 ;
4751- sz_cptr_t newline_ptr =
4752- sz_utf8_find_newline (self -> start + self -> match_length ,
4753- (sz_size_t )(self -> end - self -> start - self -> match_length ), & newline_length );
4754- if (newline_ptr == self -> start + self -> match_length ) { result_memory .length += newline_length ; }
4755- }
4756-
4757- // Advance to next segment
4758- self -> start += self -> match_length ;
4759-
4760- // Skip delimiter at current position (if any)
4761- if (self -> start < self -> end ) {
4762- sz_size_t newline_length = 0 ;
4763- sz_cptr_t newline_ptr =
4764- sz_utf8_find_newline (self -> start , (sz_size_t )(self -> end - self -> start ), & newline_length );
4765- if (newline_ptr == self -> start ) { self -> start += newline_length ; }
4766- }
4767- // Handle the case where we're exactly at end after consuming content
4768- else if (self -> start == self -> end ) {
4769- // We've consumed all content - signal termination after this empty segment
4770- self -> start = self -> end + 1 ;
4771- self -> match_length = 0 ;
4772- // But we still return the current result
4773- result_obj -> memory = result_memory ;
4774- result_obj -> parent = self -> text_obj ;
4775- Py_INCREF (self -> text_obj );
4776- return (PyObject * )result_obj ;
4777- }
4778-
4779- // If we're now past end, we're done after this
4780- if (self -> start > self -> end ) { self -> match_length = 0 ; }
4781- else {
4782- // Find next delimiter to determine segment length
4783- sz_size_t newline_length = 0 ;
4784- sz_cptr_t newline_ptr =
4785- sz_utf8_find_newline (self -> start , (sz_size_t )(self -> end - self -> start ), & newline_length );
4786- self -> match_length =
4787- newline_ptr ? (sz_size_t )(newline_ptr - self -> start ) : (sz_size_t )(self -> end - self -> start );
4786+ do {
4787+ // Build the result from current state
4788+ result_memory .start = self -> start ;
4789+ result_memory .length = self -> match_length ;
4790+
4791+ // Include newline in result if keepends is set
4792+ if (self -> keepends && self -> start + self -> match_length < self -> end ) {
4793+ sz_size_t newline_length = 0 ;
4794+ sz_cptr_t newline_ptr =
4795+ sz_utf8_find_newline (self -> start + self -> match_length ,
4796+ (sz_size_t )(self -> end - self -> start - self -> match_length ), & newline_length );
4797+ if (newline_ptr == self -> start + self -> match_length ) { result_memory .length += newline_length ; }
4798+ }
4799+
4800+ // Advance to next segment
4801+ self -> start += self -> match_length ;
4802+
4803+ // Skip delimiter at current position (if any)
4804+ if (self -> start < self -> end ) {
4805+ sz_size_t newline_length = 0 ;
4806+ sz_cptr_t newline_ptr =
4807+ sz_utf8_find_newline (self -> start , (sz_size_t )(self -> end - self -> start ), & newline_length );
4808+ if (newline_ptr == self -> start ) { self -> start += newline_length ; }
4809+ }
4810+ // Handle the case where we're exactly at end after consuming content
4811+ else if (self -> start == self -> end ) {
4812+ self -> start = self -> end + 1 ;
4813+ self -> match_length = 0 ;
4814+ }
4815+
4816+ // If we're now past end, we're done after this
4817+ if (self -> start > self -> end ) { self -> match_length = 0 ; }
4818+ else {
4819+ // Find next delimiter to determine segment length
4820+ sz_size_t newline_length = 0 ;
4821+ sz_cptr_t newline_ptr =
4822+ sz_utf8_find_newline (self -> start , (sz_size_t )(self -> end - self -> start ), & newline_length );
4823+ self -> match_length =
4824+ newline_ptr ? (sz_size_t )(newline_ptr - self -> start ) : (sz_size_t )(self -> end - self -> start );
4825+ }
4826+ } while (self -> skip_empty && result_memory .length == 0 && self -> start <= self -> end );
4827+
4828+ // If we exhausted all segments while skipping empties, free and return NULL
4829+ if (self -> skip_empty && result_memory .length == 0 ) {
4830+ Py_DECREF (result_obj );
4831+ return NULL ;
47884832 }
47894833
47904834 // Set its properties based on the slice
@@ -4854,33 +4898,42 @@ static PyObject *Utf8SplitWhitespaceIteratorType_next(Utf8SplitWhitespaceIterato
48544898 Str * result_obj = (Str * )StrType .tp_alloc (& StrType , 0 );
48554899 if (result_obj == NULL && PyErr_NoMemory ()) return NULL ;
48564900
4857- // Current segment to yield
4901+ // Find next non-empty segment (or any segment if skip_empty is false)
48584902 sz_string_view_t result_memory ;
4859- result_memory .start = self -> start ;
4860- result_memory .length = self -> match_length ;
4861-
4862- // Advance to next segment
4863- self -> start += self -> match_length ;
4864- if (self -> start > self -> end ) {
4865- // Already yielding final segment, mark termination
4866- self -> match_length = 0 ;
4867- }
4868- else if (self -> start == self -> end ) {
4869- // At end - move past to terminate after yielding this segment
4870- self -> start = self -> end + 1 ;
4871- self -> match_length = 0 ;
4872- }
4873- else {
4874- // Skip delimiter at current position
4875- sz_size_t ws_len = 0 ;
4876- sz_cptr_t ws = sz_utf8_find_whitespace (self -> start , (sz_size_t )(self -> end - self -> start ), & ws_len );
4877- if (ws == self -> start ) self -> start += ws_len ;
4878- if (self -> start > self -> end ) { self -> match_length = 0 ; }
4903+ do {
4904+ // Current segment to yield
4905+ result_memory .start = self -> start ;
4906+ result_memory .length = self -> match_length ;
4907+
4908+ // Advance to next segment
4909+ self -> start += self -> match_length ;
4910+ if (self -> start > self -> end ) {
4911+ // Already yielding final segment, mark termination
4912+ self -> match_length = 0 ;
4913+ }
4914+ else if (self -> start == self -> end ) {
4915+ // At end - move past to terminate after yielding this segment
4916+ self -> start = self -> end + 1 ;
4917+ self -> match_length = 0 ;
4918+ }
48794919 else {
4880- // Find next delimiter
4881- ws = sz_utf8_find_whitespace (self -> start , (sz_size_t )(self -> end - self -> start ), & ws_len );
4882- self -> match_length = ws ? (sz_size_t )(ws - self -> start ) : (sz_size_t )(self -> end - self -> start );
4920+ // Skip delimiter at current position
4921+ sz_size_t ws_len = 0 ;
4922+ sz_cptr_t ws = sz_utf8_find_whitespace (self -> start , (sz_size_t )(self -> end - self -> start ), & ws_len );
4923+ if (ws == self -> start ) self -> start += ws_len ;
4924+ if (self -> start > self -> end ) { self -> match_length = 0 ; }
4925+ else {
4926+ // Find next delimiter
4927+ ws = sz_utf8_find_whitespace (self -> start , (sz_size_t )(self -> end - self -> start ), & ws_len );
4928+ self -> match_length = ws ? (sz_size_t )(ws - self -> start ) : (sz_size_t )(self -> end - self -> start );
4929+ }
48834930 }
4931+ } while (self -> skip_empty && result_memory .length == 0 && self -> start <= self -> end );
4932+
4933+ // If we exhausted all segments while skipping empties, free and return NULL
4934+ if (self -> skip_empty && result_memory .length == 0 ) {
4935+ Py_DECREF (result_obj );
4936+ return NULL ;
48844937 }
48854938
48864939 // Set its properties based on the slice
0 commit comments