Skip to content

Commit 7e7d7f0

Browse files
committed
[GR-68711] Fixes for markitdown
PullRequest: graalpython/3955
2 parents 35582d9 + 928e4a3 commit 7e7d7f0

File tree

6 files changed

+163
-72
lines changed

6 files changed

+163
-72
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019, 2024, Oracle and/or its affiliates.
1+
# Copyright (c) 2019, 2025, Oracle and/or its affiliates.
22
# Copyright (C) 1996-2017 Python Software Foundation
33
#
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
@@ -28,6 +28,16 @@ def test_import():
2828
imported = False
2929
assert imported
3030

31+
def test_no_aliases_to_missing_codecs():
    """Check that every codec module targeted by ``encodings.aliases`` can be imported.

    Raises:
        AssertionError: if importing ``encodings.<name>`` fails for any alias target;
            the original import error is attached as ``__cause__``.
    """
    import importlib
    from encodings.aliases import aliases

    # This is how charset-normalizer discovers all encodings, so we need to make sure
    # they are importable.
    # NOTE(review): "rot_13", "tactis" and "mbcs" are skipped, presumably because they
    # are not expected to be importable here - confirm the reason for each.
    skipped = {"rot_13", "tactis", "mbcs"}
    # Sorted so that the first reported failure is deterministic across runs.
    for a in sorted(set(aliases.values()) - skipped):
        try:
            importlib.import_module(f"encodings.{a}")
        except Exception as e:
            # Chain the underlying error so the failure report shows why the import broke.
            raise AssertionError(f"Cannot import encodings.{a}, it should be fixed or removed from encodings.aliases") from e
40+
3141

3242
def test_decode():
3343
import codecs

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_unicodedata.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ test.test_unicodedata.NormalizationTest.test_normalization @ darwin-arm64,darwin
44
test.test_unicodedata.UnicodeFunctionsTest.test_category @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
55
test.test_unicodedata.UnicodeFunctionsTest.test_combining @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
66
test.test_unicodedata.UnicodeFunctionsTest.test_decimal @ darwin-x86_64
7-
test.test_unicodedata.UnicodeFunctionsTest.test_decomposition @ darwin-x86_64
7+
test.test_unicodedata.UnicodeFunctionsTest.test_decomposition @ darwin-x86_64,linux-x86_64
88
test.test_unicodedata.UnicodeFunctionsTest.test_digit @ darwin-x86_64
99
test.test_unicodedata.UnicodeFunctionsTest.test_east_asian_width @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
1010
test.test_unicodedata.UnicodeFunctionsTest.test_east_asian_width_9_0_changes @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64

graalpython/com.oracle.graal.python/src/META-INF/native-image/org.graalvm.python/python-language/reflect-config.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,45 @@
2626
}
2727
]
2828
},
29+
{
30+
"name": "org.graalvm.shadowed.com.ibm.icu.charset.CharsetHZ",
31+
"methods": [
32+
{
33+
"name": "<init>",
34+
"parameterTypes": [
35+
"java.lang.String",
36+
"java.lang.String",
37+
"java.lang.String[]"
38+
]
39+
}
40+
]
41+
},
42+
{
43+
"name": "org.graalvm.shadowed.com.ibm.icu.charset.CharsetMBCS",
44+
"methods": [
45+
{
46+
"name": "<init>",
47+
"parameterTypes": [
48+
"java.lang.String",
49+
"java.lang.String",
50+
"java.lang.String[]"
51+
]
52+
}
53+
]
54+
},
55+
{
56+
"name": "org.graalvm.shadowed.com.ibm.icu.charset.CharsetUTF7",
57+
"methods": [
58+
{
59+
"name": "<init>",
60+
"parameterTypes": [
61+
"java.lang.String",
62+
"java.lang.String",
63+
"java.lang.String[]"
64+
]
65+
}
66+
]
67+
},
2968
{
3069
"name": "com.sun.crypto.provider.AESCipher$General",
3170
"methods": [

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/UnicodeDataModuleBuiltins.java

Lines changed: 89 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
*/
4141
package com.oracle.graal.python.builtins.modules;
4242

43-
import static com.oracle.graal.python.runtime.exception.PythonErrorType.TypeError;
4443
import static com.oracle.graal.python.nodes.BuiltinNames.J_UNICODEDATA;
4544
import static com.oracle.graal.python.nodes.BuiltinNames.T_UNICODEDATA;
4645
import static com.oracle.graal.python.runtime.exception.PythonErrorType.KeyError;
@@ -50,15 +49,8 @@
5049

5150
import java.util.List;
5251

53-
import com.oracle.graal.python.builtins.objects.module.PythonModule;
54-
import com.oracle.graal.python.nodes.function.builtins.PythonUnaryBuiltinNode;
55-
import com.oracle.graal.python.nodes.util.CannotCastException;
56-
import com.oracle.graal.python.nodes.util.CastToTruffleStringNode;
57-
import com.oracle.truffle.api.strings.TruffleString.CodePointAtByteIndexNode;
58-
import com.oracle.truffle.api.strings.TruffleString.CodePointLengthNode;
59-
import com.oracle.truffle.api.strings.TruffleString.FromJavaStringNode;
60-
import com.oracle.truffle.api.strings.TruffleString.ToJavaStringNode;
6152
import org.graalvm.shadowed.com.ibm.icu.lang.UCharacter;
53+
import org.graalvm.shadowed.com.ibm.icu.lang.UCharacter.DecompositionType;
6254
import org.graalvm.shadowed.com.ibm.icu.lang.UProperty;
6355
import org.graalvm.shadowed.com.ibm.icu.text.Normalizer2;
6456
import org.graalvm.shadowed.com.ibm.icu.util.VersionInfo;
@@ -69,6 +61,7 @@
6961
import com.oracle.graal.python.builtins.Python3Core;
7062
import com.oracle.graal.python.builtins.PythonBuiltins;
7163
import com.oracle.graal.python.builtins.objects.PNone;
64+
import com.oracle.graal.python.builtins.objects.module.PythonModule;
7265
import com.oracle.graal.python.nodes.ErrorMessages;
7366
import com.oracle.graal.python.nodes.PRaiseNode;
7467
import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode;
@@ -85,6 +78,8 @@
8578
import com.oracle.truffle.api.dsl.Specialization;
8679
import com.oracle.truffle.api.nodes.Node;
8780
import com.oracle.truffle.api.strings.TruffleString;
81+
import com.oracle.truffle.api.strings.TruffleString.FromJavaStringNode;
82+
import com.oracle.truffle.api.strings.TruffleString.ToJavaStringNode;
8883

8984
@CoreFunctions(defineModule = J_UNICODEDATA, isEager = true)
9085
public final class UnicodeDataModuleBuiltins extends PythonBuiltins {
@@ -143,7 +138,7 @@ static Normalizer2 getNormalizer(TruffleString form) {
143138
@ArgumentClinic(name = "unistr", conversion = ArgumentClinic.ClinicConversion.TString)
144139
@GenerateNodeFactory
145140
@ImportStatic(UnicodeDataModuleBuiltins.class)
146-
public abstract static class NormalizeNode extends PythonBinaryClinicBuiltinNode {
141+
abstract static class NormalizeNode extends PythonBinaryClinicBuiltinNode {
147142
@Specialization(guards = {"cachedNormalizer != null", "stringEquals(form, cachedForm, equalNode)"}, limit = "NORMALIZER_FORM_COUNT")
148143
static TruffleString normalize(@SuppressWarnings("unused") TruffleString form, TruffleString unistr,
149144
@SuppressWarnings("unused") @Cached("form") TruffleString cachedForm,
@@ -155,8 +150,9 @@ static TruffleString normalize(@SuppressWarnings("unused") TruffleString form, T
155150
}
156151

157152
@Specialization(guards = "getNormalizer(form) == null")
158-
TruffleString invalidForm(@SuppressWarnings("unused") TruffleString form, @SuppressWarnings("unused") TruffleString unistr) {
159-
throw PRaiseNode.raiseStatic(this, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
153+
static TruffleString invalidForm(@SuppressWarnings("unused") TruffleString form, @SuppressWarnings("unused") TruffleString unistr,
154+
@Bind Node inliningTarget) {
155+
throw PRaiseNode.raiseStatic(inliningTarget, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
160156
}
161157

162158
@TruffleBoundary
@@ -176,19 +172,20 @@ protected ArgumentClinicProvider getArgumentClinic() {
176172
@ArgumentClinic(name = "unistr", conversion = ArgumentClinic.ClinicConversion.TString)
177173
@GenerateNodeFactory
178174
@ImportStatic(UnicodeDataModuleBuiltins.class)
179-
public abstract static class IsNormalizedNode extends PythonBinaryClinicBuiltinNode {
175+
abstract static class IsNormalizedNode extends PythonBinaryClinicBuiltinNode {
180176
@Specialization(guards = {"cachedNormalizer != null", "stringEquals(form, cachedForm, equalNode)"}, limit = "NORMALIZER_FORM_COUNT")
181177
@TruffleBoundary
182-
boolean isNormalized(@SuppressWarnings("unused") TruffleString form, TruffleString unistr,
178+
static boolean isNormalized(@SuppressWarnings("unused") TruffleString form, TruffleString unistr,
183179
@SuppressWarnings("unused") @Cached("form") TruffleString cachedForm,
184180
@Cached("getNormalizer(cachedForm)") Normalizer2 cachedNormalizer,
185181
@SuppressWarnings("unused") @Cached TruffleString.EqualNode equalNode) {
186182
return cachedNormalizer.isNormalized(unistr.toJavaStringUncached());
187183
}
188184

189185
@Specialization(guards = "getNormalizer(form) == null")
190-
TruffleString invalidForm(@SuppressWarnings("unused") TruffleString form, @SuppressWarnings("unused") TruffleString unistr) {
191-
throw PRaiseNode.raiseStatic(this, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
186+
static TruffleString invalidForm(@SuppressWarnings("unused") TruffleString form, @SuppressWarnings("unused") TruffleString unistr,
187+
@Bind Node inliningTarget) {
188+
throw PRaiseNode.raiseStatic(inliningTarget, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
192189
}
193190

194191
@Override
@@ -201,7 +198,7 @@ protected ArgumentClinicProvider getArgumentClinic() {
201198
@Builtin(name = "lookup", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"name"})
202199
@ArgumentClinic(name = "name", conversion = ArgumentClinic.ClinicConversion.TString)
203200
@GenerateNodeFactory
204-
public abstract static class LookupNode extends PythonUnaryClinicBuiltinNode {
201+
abstract static class LookupNode extends PythonUnaryClinicBuiltinNode {
205202

206203
private static final int NAME_MAX_LENGTH = 256;
207204

@@ -267,7 +264,7 @@ private static String getCharacterByUnicodeNameAlias(String unicodeName) {
267264
@Builtin(name = "name", minNumOfPositionalArgs = 1, parameterNames = {"chr", "default"})
268265
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
269266
@GenerateNodeFactory
270-
public abstract static class NameNode extends PythonBinaryClinicBuiltinNode {
267+
abstract static class NameNode extends PythonBinaryClinicBuiltinNode {
271268

272269
@Specialization
273270
static Object name(int cp, Object defaultValue,
@@ -294,7 +291,7 @@ protected ArgumentClinicProvider getArgumentClinic() {
294291
@Builtin(name = "bidirectional", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"chr"})
295292
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
296293
@GenerateNodeFactory
297-
public abstract static class BidirectionalNode extends PythonUnaryClinicBuiltinNode {
294+
abstract static class BidirectionalNode extends PythonUnaryClinicBuiltinNode {
298295
@Specialization
299296
static TruffleString bidirectional(int chr,
300297
@Cached FromJavaStringNode fromJavaStringNode) {
@@ -316,7 +313,7 @@ protected ArgumentClinicProvider getArgumentClinic() {
316313
@Builtin(name = "category", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"chr"})
317314
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
318315
@GenerateNodeFactory
319-
public abstract static class CategoryNode extends PythonUnaryClinicBuiltinNode {
316+
abstract static class CategoryNode extends PythonUnaryClinicBuiltinNode {
320317
@Specialization
321318
static TruffleString category(int chr,
322319
@Cached FromJavaStringNode fromJavaStringNode) {
@@ -336,57 +333,90 @@ protected ArgumentClinicProvider getArgumentClinic() {
336333

337334
// unicodedata.combining(chr)
338335
@Builtin(name = "combining", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"chr"})
336+
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
339337
@GenerateNodeFactory
340-
public abstract static class CombiningNode extends PythonUnaryBuiltinNode {
338+
abstract static class CombiningNode extends PythonUnaryClinicBuiltinNode {
341339

342340
@Specialization
343341
@TruffleBoundary
344-
static Object combining(Object object,
345-
@Bind Node inliningTarget) {
346-
final TruffleString chr;
342+
static Object combining(int codepoint) {
343+
return UCharacter.getCombiningClass(codepoint);
344+
}
347345

348-
try {
349-
chr = CastToTruffleStringNode.getUncached().execute(inliningTarget, object);
350-
} catch (CannotCastException e) {
351-
throw PRaiseNode.raiseStatic(inliningTarget, TypeError, ErrorMessages.S_ARG_MUST_BE_S_NOT_P, "combining()", "a unicode character", object);
352-
}
346+
@Override
347+
protected ArgumentClinicProvider getArgumentClinic() {
348+
return UnicodeDataModuleBuiltinsClinicProviders.CombiningNodeClinicProviderGen.INSTANCE;
349+
}
350+
}
353351

354-
if (CodePointLengthNode.getUncached().execute(chr, TS_ENCODING) != 1) {
355-
throw PRaiseNode.raiseStatic(inliningTarget, TypeError, ErrorMessages.S_ARG_MUST_BE_S_NOT_P, "combining()", "a unicode character", object);
352+
// unicodedata.decomposition(chr)
353+
@Builtin(name = "decomposition", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"chr"})
354+
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
355+
@GenerateNodeFactory
356+
abstract static class DecompositionNode extends PythonUnaryClinicBuiltinNode {
357+
@Specialization
358+
@TruffleBoundary
359+
static TruffleString decomposition(int codepoint) {
360+
int type = UCharacter.getIntPropertyValue(codepoint, UProperty.DECOMPOSITION_TYPE);
361+
String prefix = getDecompositionPrefix(type);
362+
String decomposition = Normalizer2.getNFKDInstance().getDecomposition(codepoint);
363+
364+
StringBuilder sb = new StringBuilder();
365+
if (prefix != null) {
366+
sb.append(prefix);
367+
}
368+
if (decomposition != null) {
369+
int cp;
370+
for (int i = 0; i < decomposition.length(); i += Character.charCount(cp)) {
371+
if (!sb.isEmpty()) {
372+
sb.append(' ');
373+
}
374+
cp = decomposition.codePointAt(i);
375+
sb.append(String.format("%04x", cp));
376+
}
356377
}
357378

358-
int codepoint = CodePointAtByteIndexNode.getUncached().execute(chr, 0, TS_ENCODING);
359-
return UCharacter.getCombiningClass(codepoint);
379+
return FromJavaStringNode.getUncached().execute(sb.toString(), TS_ENCODING);
380+
}
381+
382+
private static String getDecompositionPrefix(int type) {
383+
return switch (type) {
384+
case DecompositionType.NOBREAK -> "<noBreak>";
385+
case DecompositionType.COMPAT -> "<compat>";
386+
case DecompositionType.SUPER -> "<super>";
387+
case DecompositionType.FRACTION -> "<fraction>";
388+
case DecompositionType.SUB -> "<sub>";
389+
case DecompositionType.FONT -> "<font>";
390+
case DecompositionType.CIRCLE -> "<circle>";
391+
case DecompositionType.WIDE -> "<wide>";
392+
case DecompositionType.VERTICAL -> "<vertical>";
393+
case DecompositionType.SQUARE -> "<square>";
394+
case DecompositionType.ISOLATED -> "<isolated>";
395+
case DecompositionType.FINAL -> "<final>";
396+
case DecompositionType.INITIAL -> "<initial>";
397+
case DecompositionType.MEDIAL -> "<medial>";
398+
case DecompositionType.SMALL -> "<small>";
399+
case DecompositionType.NARROW -> "<narrow>";
400+
default -> null;
401+
};
402+
}
403+
404+
@Override
405+
protected ArgumentClinicProvider getArgumentClinic() {
406+
return UnicodeDataModuleBuiltinsClinicProviders.DecompositionNodeClinicProviderGen.INSTANCE;
360407
}
361408
}
362409

363410
// unicodedata.east_asian_width(chr)
364411
@Builtin(name = "east_asian_width", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"chr"})
412+
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
365413
@GenerateNodeFactory
366-
public abstract static class EastAsianWidthNode extends PythonUnaryBuiltinNode {
414+
abstract static class EastAsianWidthNode extends PythonUnaryClinicBuiltinNode {
367415
@Specialization
368416
@TruffleBoundary
369-
static TruffleString eastAsianWidth(Object object,
370-
@Bind Node inliningTarget,
371-
@Cached CastToTruffleStringNode castToTruffleStringNode,
372-
@Cached CodePointLengthNode codePointLengthNode,
373-
@Cached CodePointAtByteIndexNode codePointAtByteIndexNode,
374-
@Cached FromJavaStringNode fromJavaStringNode) {
375-
final TruffleString chr;
376-
377-
try {
378-
chr = CastToTruffleStringNode.getUncached().execute(inliningTarget, object);
379-
} catch (CannotCastException e) {
380-
throw PRaiseNode.raiseStatic(inliningTarget, TypeError, ErrorMessages.S_ARG_MUST_BE_S_NOT_P, "east_asian_width()", "a unicode character", object);
381-
}
382-
383-
if (CodePointLengthNode.getUncached().execute(chr, TS_ENCODING) != 1) {
384-
throw PRaiseNode.raiseStatic(inliningTarget, TypeError, ErrorMessages.S_ARG_MUST_BE_S_NOT_P, "east_asian_width()", "a unicode character", object);
385-
}
386-
387-
int codepoint = CodePointAtByteIndexNode.getUncached().execute(chr, 0, TS_ENCODING);
417+
static TruffleString eastAsianWidth(int codepoint) {
388418
String widthName = getWidthName(codepoint);
389-
return fromJavaStringNode.execute(widthName, TS_ENCODING);
419+
return FromJavaStringNode.getUncached().execute(widthName, TS_ENCODING);
390420
}
391421

392422
@TruffleBoundary
@@ -406,5 +436,10 @@ private static String getWidthName(int codepoint) {
406436

407437
return widthName;
408438
}
439+
440+
@Override
441+
protected ArgumentClinicProvider getArgumentClinic() {
442+
return UnicodeDataModuleBuiltinsClinicProviders.EastAsianWidthNodeClinicProviderGen.INSTANCE;
443+
}
409444
}
410445
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/function/builtins/clinic/CodePointConversionNode.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ int doOthers(Object value,
9696
} catch (CannotCastException ex) {
9797
// handled below
9898
}
99-
throw raiseNode.raise(inliningTarget, TypeError, ErrorMessages.S_BRACKETS_ARG_MUST_BE_S_NOT_P, builtinName, "unicode character", value);
99+
throw raiseNode.raise(inliningTarget, TypeError, ErrorMessages.S_BRACKETS_ARG_MUST_BE_S_NOT_P, builtinName, "a unicode character", value);
100100
}
101101

102102
@ClinicConverterFactory

0 commit comments

Comments
 (0)