Skip to content

Commit ae865e7

Browse files
committed
Document new utf8_to_uv function family
1 parent 77b3314 commit ae865e7

File tree

4 files changed

+463
-254
lines changed

4 files changed

+463
-254
lines changed

embed.fnc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -914,7 +914,7 @@ CTp |Signal_t|csighandler1 |int sig
914914
CTp |Signal_t|csighandler3 |int sig \
915915
|NULLOK Siginfo_t *info \
916916
|NULLOK void *uap
917-
ATmp |bool |c9strict_utf8_to_uv \
917+
ATdmp |bool |c9strict_utf8_to_uv \
918918
|NN const U8 * const s \
919919
|NN const U8 * const e \
920920
|NN UV *cp_p \
@@ -1174,7 +1174,7 @@ AOdp |SV * |eval_pv |NN const char *p \
11741174
|I32 croak_on_error
11751175
AOdp |SSize_t|eval_sv |NN SV *sv \
11761176
|I32 flags
1177-
ATmp |bool |extended_utf8_to_uv \
1177+
ATdmp |bool |extended_utf8_to_uv \
11781178
|NN const U8 * const s \
11791179
|NN const U8 * const e \
11801180
|NN UV *cp_p \
@@ -3075,7 +3075,7 @@ dopx |PerlIO *|start_glob |NN SV *tmpglob \
30753075
|NN IO *io
30763076
Adp |I32 |start_subparse |I32 is_format \
30773077
|U32 flags
3078-
ATmp |bool |strict_utf8_to_uv \
3078+
ATdmp |bool |strict_utf8_to_uv \
30793079
|NN const U8 * const s \
30803080
|NN const U8 * const e \
30813081
|NN UV *cp_p \
@@ -3742,7 +3742,7 @@ EMXp |U8 * |utf16_to_utf8_reversed \
37423742
|NN U8 *d \
37433743
|Size_t bytelen \
37443744
|NN Size_t *newlen
3745-
ATmp |bool |utf8_to_uv |NN const U8 * const s \
3745+
ATdmp |bool |utf8_to_uv |NN const U8 * const s \
37463746
|NN const U8 * const e \
37473747
|NN UV *cp_p \
37483748
|NULLOK Size_t *advance_p
@@ -3752,20 +3752,20 @@ AMdip |UV |utf8_to_uvchr_buf \
37523752
|NN const U8 *s \
37533753
|NN const U8 *send \
37543754
|NULLOK STRLEN *retlen
3755-
ATmp |bool |utf8_to_uv_errors \
3755+
ATdmp |bool |utf8_to_uv_errors \
37563756
|NN const U8 * const s \
37573757
|NN const U8 * const e \
37583758
|NN UV *cp_p \
37593759
|NULLOK Size_t *advance_p \
37603760
|const U32 flags \
37613761
|NULLOK U32 *errors
3762-
ATmp |bool |utf8_to_uv_flags \
3762+
ATdmp |bool |utf8_to_uv_flags \
37633763
|NN const U8 * const s \
37643764
|NN const U8 * const e \
37653765
|NN UV *cp_p \
37663766
|NULLOK Size_t *advance_p \
37673767
|const U32 flag
3768-
ATip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \
3768+
ATdip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \
37693769
|NN const U8 *e \
37703770
|NN UV *cp_p \
37713771
|NULLOK Size_t *advance_p \

inline.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2053,7 +2053,7 @@ C<L</is_strict_utf8_string>> (and kin); and if C<flags> is
20532053
C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, they give the same results as
20542054
C<L</is_c9strict_utf8_string>> (and kin). Otherwise C<flags> may be any
20552055
combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
2056-
C<L</utf8n_to_uvchr>>, with the same meanings.
2056+
C<L</utf8_to_uv>>, with the same meanings.
20572057
20582058
It's better to use one of the non-C<_flags> functions if they give you the
20592059
desired strictness, as those have a better chance of being inlined by the C
@@ -2306,7 +2306,7 @@ as C<L</isSTRICT_UTF8_CHAR>>;
23062306
and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
23072307
the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
23082308
Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2309-
understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2309+
understood by C<L</utf8_to_uv>>, with the same meanings.
23102310
23112311
The three alternative macros are for the most commonly needed validations; they
23122312
are likely to run somewhat faster than this more general one, as they can be
@@ -2931,7 +2931,7 @@ C<is_utf8_valid_partial_char_flags> when the latter is called with a zero
29312931
C<flags> parameter. This parameter is used to restrict the classes of code
29322932
points that are considered to be valid. When zero, Perl's extended UTF-8 is
29332933
used. Otherwise C<flags> can be any combination of the C<UTF8_DISALLOW_I<foo>>
2934-
flags accepted by C<L</utf8n_to_uvchr>>. If there is any sequence of bytes
2934+
flags accepted by C<L</utf8_to_uv>>. If there is any sequence of bytes
29352935
that can complete the input partial character in such a way that a
29362936
non-prohibited character is formed, the function returns TRUE; otherwise FALSE.
29372937
Non-character code points cannot be determined based on partial character
@@ -3003,7 +3003,7 @@ complete code point, this will return TRUE anyway, provided that
30033003
C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
30043004
30053005
C<flags> can be zero or any combination of the C<UTF8_DISALLOW_I<foo>> flags
3006-
accepted by C<L</utf8n_to_uvchr>>, and with the same meanings.
3006+
accepted by C<L</utf8_to_uv>>, and with the same meanings.
30073007
30083008
The functions differ from C<L</is_utf8_string_flags>> only in that the latter
30093009
returns FALSE if the final few bytes of the string don't form a complete code

mathoms.c

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -202,29 +202,6 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
202202
return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
203203
}
204204

205-
/*
206-
=for apidoc_section $unicode
207-
=for apidoc utf8_to_uvchr
208-
209-
Returns the native code point of the first character in the string C<s>
210-
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
211-
length, in bytes, of that character.
212-
213-
Some, but not all, UTF-8 malformations are detected, and in fact, some
214-
malformed input could cause reading beyond the end of the input buffer, which
215-
is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
216-
217-
If C<s> points to one of the detected malformations, and UTF8 warnings are
218-
enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
219-
C<NULL>) to -1. If those warnings are off, the computed value, if well-defined (or
220-
the Unicode REPLACEMENT CHARACTER, if not), is silently returned, and C<*retlen>
221-
is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is the
222-
next possible position in C<s> that could begin a non-malformed character.
223-
See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
224-
225-
=cut
226-
*/
227-
228205
UV
229206
Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
230207
{

0 commit comments

Comments
 (0)