11/*
2- * Copyright (c) 2018, 2023 , Oracle and/or its affiliates. All rights reserved.
2+ * Copyright (c) 2018, 2024 , Oracle and/or its affiliates. All rights reserved.
33 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44 *
55 * The Universal Permissive License (UPL), Version 1.0
4343import static com .oracle .graal .python .runtime .exception .PythonErrorType .ValueError ;
4444import static com .oracle .graal .python .util .PythonUtils .TS_ENCODING ;
4545
46- import java .text .Normalizer ;
4746import java .util .List ;
4847
4948import org .graalvm .shadowed .com .ibm .icu .lang .UCharacter ;
5049import org .graalvm .shadowed .com .ibm .icu .lang .UProperty ;
50+ import org .graalvm .shadowed .com .ibm .icu .text .Normalizer2 ;
51+ import org .graalvm .shadowed .com .ibm .icu .util .VersionInfo ;
5152
5253import com .oracle .graal .python .annotations .ArgumentClinic ;
5354import com .oracle .graal .python .builtins .Builtin ;
6465import com .oracle .truffle .api .CompilerDirectives .TruffleBoundary ;
6566import com .oracle .truffle .api .dsl .Bind ;
6667import com .oracle .truffle .api .dsl .Cached ;
68+ import com .oracle .truffle .api .dsl .Cached .Exclusive ;
6769import com .oracle .truffle .api .dsl .GenerateNodeFactory ;
70+ import com .oracle .truffle .api .dsl .ImportStatic ;
6871import com .oracle .truffle .api .dsl .NodeFactory ;
6972import com .oracle .truffle .api .dsl .Specialization ;
7073import com .oracle .truffle .api .nodes .Node ;
@@ -78,90 +81,10 @@ protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFa
7881 }
7982
8083 public static String getUnicodeVersion () {
81-
82- // Preliminary Unicode 11 data obtained from
83- // <https://www.unicode.org/Public/11.0.0/ucd/DerivedAge-11.0.0d13.txt>.
84- if (Character .getType ('\u0560' ) != Character .UNASSIGNED ) {
85- return "11.0.0" ; // 11.0, June 2018.
86- }
87-
88- if (Character .getType ('\u0860' ) != Character .UNASSIGNED ) {
89- return "10.0.0" ; // 10.0, June 2017.
90- }
91-
92- if (Character .getType ('\u08b6' ) != Character .UNASSIGNED ) {
93- return "9.0.0" ; // 9.0, June 2016.
94- }
95-
96- if (Character .getType ('\u08b3' ) != Character .UNASSIGNED ) {
97- return "8.0.0" ; // 8.0, June 2015.
98- }
99-
100- if (Character .getType ('\u037f' ) != Character .UNASSIGNED ) {
101- return "7.0.0" ; // 7.0, June 2014.
102- }
103-
104- if (Character .getType ('\u061c' ) != Character .UNASSIGNED ) {
105- return "6.3.0" ; // 6.3, September 2013.
106- }
107-
108- if (Character .getType ('\u20ba' ) != Character .UNASSIGNED ) {
109- return "6.2.0" ; // 6.2, September 2012.
110- }
111-
112- if (Character .getType ('\u058f' ) != Character .UNASSIGNED ) {
113- return "6.1.0" ; // 6.1, January 2012.
114- }
115-
116- if (Character .getType ('\u0526' ) != Character .UNASSIGNED ) {
117- return "6.0.0" ; // 6.0, October 2010.
118- }
119-
120- if (Character .getType ('\u0524' ) != Character .UNASSIGNED ) {
121- return "5.2.0" ; // 5.2, October 2009.
122- }
123-
124- if (Character .getType ('\u0370' ) != Character .UNASSIGNED ) {
125- return "5.1.0" ; // 5.1, March 2008.
126- }
127-
128- if (Character .getType ('\u0242' ) != Character .UNASSIGNED ) {
129- return "5.0.0" ; // 5.0, July 2006.
130- }
131-
132- if (Character .getType ('\u0237' ) != Character .UNASSIGNED ) {
133- return "4.1.0" ; // 4.1, March 2005.
134- }
135-
136- if (Character .getType ('\u0221' ) != Character .UNASSIGNED ) {
137- return "4.0.0" ; // 4.0, April 2003.
138- }
139-
140- if (Character .getType ('\u0220' ) != Character .UNASSIGNED ) {
141- return "3.2.0" ; // 3.2, March 2002.
142- }
143-
144- if (Character .getType ('\u03f4' ) != Character .UNASSIGNED ) {
145- return "3.1.0" ; // 3.1, March 2001.
146- }
147-
148- if (Character .getType ('\u01f6' ) != Character .UNASSIGNED ) {
149- return "3.0.0" ; // 3.0, September 1999.
150- }
151-
152- if (Character .getType ('\u20ac' ) != Character .UNASSIGNED ) {
153- return "2.1.0" ; // 2.1, May 1998.
154- }
155-
156- if (Character .getType ('\u0591' ) != Character .UNASSIGNED ) {
157- return "2.0.0" ; // 2.0, July 1996.
158- }
159-
160- if (Character .getType ('\u0000' ) != Character .UNASSIGNED ) {
161- return "1.1.0" ; // 1.1, June 1993.
162- }
163-
164- return "1.0.0" ; // 1.0
84+ VersionInfo version = UCharacter .getUnicodeVersion ();
85+ return Integer .toString (version .getMajor ()) + '.' +
86+ version .getMinor () + '.' +
87+ version .getMicro ();
16588 }
16689
16790 /**
@@ -186,39 +109,44 @@ public void initialize(Python3Core core) {
186109 addBuiltinConstant ("unidata_version" , getUnicodeVersion ());
187110 }
188111
112+ static final int NORMALIZER_FORM_COUNT = 4 ;
113+
114+ @ TruffleBoundary
115+ static Normalizer2 getNormalizer (TruffleString form ) {
116+ return switch (form .toJavaStringUncached ()) {
117+ case "NFC" -> Normalizer2 .getNFCInstance ();
118+ case "NFKC" -> Normalizer2 .getNFKCInstance ();
119+ case "NFD" -> Normalizer2 .getNFDInstance ();
120+ case "NFKD" -> Normalizer2 .getNFKDInstance ();
121+ default -> null ;
122+ };
123+ }
124+
189125 // unicodedata.normalize(form, unistr)
190126 @ Builtin (name = "normalize" , minNumOfPositionalArgs = 2 , parameterNames = {"form" , "unistr" })
191127 @ ArgumentClinic (name = "form" , conversion = ArgumentClinic .ClinicConversion .TString )
192128 @ ArgumentClinic (name = "unistr" , conversion = ArgumentClinic .ClinicConversion .TString )
193129 @ GenerateNodeFactory
130+ @ ImportStatic (UnicodeDataModuleBuiltins .class )
194131 public abstract static class NormalizeNode extends PythonBinaryClinicBuiltinNode {
195- @ TruffleBoundary
196- protected Normalizer .Form getForm (TruffleString form ) {
197- try {
198- return Normalizer .Form .valueOf (form .toJavaStringUncached ());
199- } catch (IllegalArgumentException e ) {
200- return null ;
201- }
202- }
203-
204- @ Specialization (guards = {"stringEquals(form, cachedForm, equalNode)" }, limit = "4" )
132+ @ Specialization (guards = {"cachedNormalizer != null" , "stringEquals(form, cachedForm, equalNode)" }, limit = "NORMALIZER_FORM_COUNT" )
205133 static TruffleString normalize (@ SuppressWarnings ("unused" ) TruffleString form , TruffleString unistr ,
206- @ Bind ("this" ) Node inliningTarget ,
207134 @ SuppressWarnings ("unused" ) @ Cached ("form" ) TruffleString cachedForm ,
208- @ Cached ("getForm (cachedForm)" ) Normalizer . Form cachedNormForm ,
135+ @ Cached ("getNormalizer (cachedForm)" ) Normalizer2 cachedNormalizer ,
209136 @ SuppressWarnings ("unused" ) @ Cached TruffleString .EqualNode equalNode ,
210137 @ Cached TruffleString .ToJavaStringNode toJavaStringNode ,
211- @ Cached TruffleString .FromJavaStringNode fromJavaStringNode ,
212- @ Cached PRaiseNode .Lazy raiseNode ) {
213- if (cachedNormForm == null ) {
214- throw raiseNode .get (inliningTarget ).raise (ValueError , ErrorMessages .INVALID_NORMALIZATION_FORM );
215- }
216- return fromJavaStringNode .execute (normalize (toJavaStringNode .execute (unistr ), cachedNormForm ), TS_ENCODING );
138+ @ Exclusive @ Cached TruffleString .FromJavaStringNode fromJavaStringNode ) {
139+ return fromJavaStringNode .execute (normalize (toJavaStringNode .execute (unistr ), cachedNormalizer ), TS_ENCODING );
140+ }
141+
142+ @ Specialization (guards = "getNormalizer(form) == null" )
143+ TruffleString invalidForm (@ SuppressWarnings ("unused" ) TruffleString form , @ SuppressWarnings ("unused" ) TruffleString unistr ) {
144+ throw PRaiseNode .raiseUncached (this , ValueError , ErrorMessages .INVALID_NORMALIZATION_FORM );
217145 }
218146
219147 @ TruffleBoundary
220- private static String normalize (String str , Normalizer . Form normForm ) {
221- return Normalizer .normalize (str , normForm );
148+ private static String normalize (String str , Normalizer2 normalizer ) {
149+ return normalizer .normalize (str );
222150 }
223151
224152 @ Override
@@ -232,26 +160,20 @@ protected ArgumentClinicProvider getArgumentClinic() {
232160 @ ArgumentClinic (name = "form" , conversion = ArgumentClinic .ClinicConversion .TString )
233161 @ ArgumentClinic (name = "unistr" , conversion = ArgumentClinic .ClinicConversion .TString )
234162 @ GenerateNodeFactory
163+ @ ImportStatic (UnicodeDataModuleBuiltins .class )
235164 public abstract static class IsNormalizedNode extends PythonBinaryClinicBuiltinNode {
236- @ TruffleBoundary
237- protected Normalizer .Form getForm (TruffleString form ) {
238- try {
239- return Normalizer .Form .valueOf (form .toJavaStringUncached ());
240- } catch (IllegalArgumentException e ) {
241- return null ;
242- }
243- }
244-
245- @ Specialization (guards = {"stringEquals(form, cachedForm, equalNode)" }, limit = "4" )
165+ @ Specialization (guards = {"cachedNormalizer != null" , "stringEquals(form, cachedForm, equalNode)" }, limit = "NORMALIZER_FORM_COUNT" )
246166 @ TruffleBoundary
247167 boolean isNormalized (@ SuppressWarnings ("unused" ) TruffleString form , TruffleString unistr ,
248168 @ SuppressWarnings ("unused" ) @ Cached ("form" ) TruffleString cachedForm ,
249- @ Cached ("getForm (cachedForm)" ) Normalizer . Form cachedNormForm ,
169+ @ Cached ("getNormalizer (cachedForm)" ) Normalizer2 cachedNormalizer ,
250170 @ SuppressWarnings ("unused" ) @ Cached TruffleString .EqualNode equalNode ) {
251- if (cachedNormForm == null ) {
252- throw PRaiseNode .raiseUncached (this , ValueError , ErrorMessages .INVALID_NORMALIZATION_FORM );
253- }
254- return Normalizer .isNormalized (unistr .toJavaStringUncached (), cachedNormForm );
171+ return cachedNormalizer .isNormalized (unistr .toJavaStringUncached ());
172+ }
173+
174+ @ Specialization (guards = "getNormalizer(form) == null" )
175+ TruffleString invalidForm (@ SuppressWarnings ("unused" ) TruffleString form , @ SuppressWarnings ("unused" ) TruffleString unistr ) {
176+ throw PRaiseNode .raiseUncached (this , ValueError , ErrorMessages .INVALID_NORMALIZATION_FORM );
255177 }
256178
257179 @ Override
0 commit comments