-
-
Notifications
You must be signed in to change notification settings - Fork 875
ICU-11443 WIP: Link detection according to UTS#58. #3878
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
arnt
wants to merge
1
commit into
unicode-org:main
Choose a base branch
from
arnt:uts58-link-detection
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| // © 2025 and later: Unicode, Inc. and others. | ||
| // License & terms of use: https://www.unicode.org/copyright.html | ||
|
|
||
| // linkemailprops.h | ||
| // created: 2025 for UTS #58 / Unicode 17.0 | ||
|
|
||
| #ifndef LINKEMAILPROPS_H | ||
| #define LINKEMAILPROPS_H | ||
|
|
||
| #include "unicode/utypes.h" | ||
| #include "unicode/ucptrie.h" | ||
| #include "unicode/uobject.h" | ||
|
|
||
| U_NAMESPACE_BEGIN | ||
|
|
||
| /** | ||
| * Constants describing the binary data item for the Link_Email property: | ||
| * the layout of its indexes[] array and the identifiers used to name and validate it. | ||
| * A code point has Link_Email=Yes (1) if it is allowed in an email local part. | ||
| * All other code points default to No (0). | ||
| * This class declares only an enum and static constants; it has no instance members. | ||
| */ | ||
| class LinkEmailProps : public UMemory { | ||
| public: | ||
| /** | ||
| * Indexes into the binary data indexes[] array. | ||
| * indexes[IX_COUNT] holds the number of entries in indexes[]; the IX_..._TOP and | ||
| * IX_TOTAL_SIZE entries hold byte offsets from the start of the indexes[] array. | ||
| */ | ||
| enum { | ||
| IX_COUNT, // 0: length of indexes[] in entries (== IX_LINK_EMAIL_COUNT) | ||
| IX_CPTRIE_TOP, // 1: limit offset of the Link_Email UCPTrie | ||
| IX_TRIE2_TOP, // 2: reserved for a second future trie (= IX_CPTRIE_TOP until used) | ||
| IX_TRIE3_TOP, // 3: reserved for a third future trie (= IX_TRIE2_TOP until used) | ||
| IX_TOTAL_SIZE, // 4: total data size (= limit of last trie) | ||
| // 5..7: reserved | ||
| IX_LINK_EMAIL_COUNT = 8 // number of entries in indexes[], including the reserved ones | ||
| }; | ||
|
|
||
| // Type and base name of the data item; together they name ulinkemail.icu. | ||
| static constexpr char DATA_TYPE[] = "icu"; | ||
| static constexpr char DATA_NAME[] = "ulinkemail"; | ||
| // Four-byte data format signature ("LnkE") and the format version of the data. | ||
| static constexpr uint8_t DATA_FORMAT[4] = { 'L', 'n', 'k', 'E' }; | ||
| static constexpr uint8_t FORMAT_VERSION[4] = { 1, 0, 0, 0 }; | ||
| }; | ||
|
|
||
| U_NAMESPACE_END | ||
|
|
||
| #endif // LINKEMAILPROPS_H |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| // © 2025 and later: Unicode, Inc. and others. | ||
| // License & terms of use: https://www.unicode.org/copyright.html | ||
|
|
||
| // linktermprops.h | ||
| // created: 2025 for UTS #58 / Unicode 17.0 | ||
|
|
||
| #ifndef LINKTERMPROPS_H | ||
| #define LINKTERMPROPS_H | ||
|
|
||
| #include "unicode/utypes.h" | ||
| #include "unicode/ucptrie.h" | ||
| #include "unicode/uobject.h" | ||
|
|
||
| /** | ||
| * Values of the Link_Term property (UTS #58 / proposed Unicode 19.0). | ||
| * The default value for unlisted code points is ULINK_TERM_HARD. | ||
| * The Java class LinkTermProps declares integer constants with the same numeric | ||
| * values and returns them from its trie lookup, so the two must be kept in sync. | ||
| * NOTE(review): the header comment of this file says "Unicode 17.0" while this | ||
| * comment says "proposed Unicode 19.0" -- confirm which version is intended. | ||
| */ | ||
| typedef enum ULinkTerm { | ||
| ULINK_TERM_HARD = 0, /**< Terminates a URL unconditionally. Default. */ | ||
| ULINK_TERM_INCLUDE = 1, /**< May appear in a URL (letters, digits, …). */ | ||
| ULINK_TERM_SOFT = 2, /**< Terminates only when followed by Hard. */ | ||
| ULINK_TERM_CLOSE = 3, /**< Closing bracket; terminates if unmatched. */ | ||
| ULINK_TERM_OPEN = 4, /**< Opening bracket. */ | ||
| ULINK_TERM_COUNT /**< Number of enumerators above (5). */ | ||
| } ULinkTerm; | ||
|
|
||
| U_NAMESPACE_BEGIN | ||
|
|
||
| /** | ||
| * Constants describing the binary data item for the Link_Term property: | ||
| * the layout of its indexes[] array and the identifiers used to name and validate it. | ||
| * This class declares only an enum and static constants; it has no instance members. | ||
| */ | ||
| class LinkTermProps : public UMemory { | ||
| public: | ||
| /** | ||
| * Indexes into the binary data indexes[] array. | ||
| * indexes[IX_COUNT] holds the number of entries in indexes[]; the IX_..._TOP and | ||
| * IX_TOTAL_SIZE entries hold byte offsets from the start of the indexes[] array. | ||
| */ | ||
| enum { | ||
| IX_COUNT, // 0: length of indexes[] in entries (== IX_LINK_TERM_COUNT) | ||
| IX_CPTRIE_TOP, // 1: limit offset of the Link_Term UCPTrie | ||
| IX_TRIE2_TOP, // 2: reserved for a second future trie (= IX_CPTRIE_TOP until used) | ||
| IX_TRIE3_TOP, // 3: reserved for a third future trie (= IX_TRIE2_TOP until used) | ||
| IX_TOTAL_SIZE, // 4: total data size (= limit of last trie) | ||
| // 5..7: reserved | ||
| IX_LINK_TERM_COUNT = 8 // number of entries in indexes[], including the reserved ones | ||
| }; | ||
|
|
||
| // Type and base name of the data item; together they name ulinkterm.icu. | ||
| static constexpr char DATA_TYPE[] = "icu"; | ||
| static constexpr char DATA_NAME[] = "ulinkterm"; | ||
| // Four-byte data format signature ("LnkT") and the format version of the data. | ||
| static constexpr uint8_t DATA_FORMAT[4] = { 'L', 'n', 'k', 'T' }; | ||
| static constexpr uint8_t FORMAT_VERSION[4] = { 1, 0, 0, 0 }; | ||
| }; | ||
|
|
||
| U_NAMESPACE_END | ||
|
|
||
| #endif // LINKTERMPROPS_H |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
icu4j/main/core/src/main/java/com/ibm/icu/impl/LinkEmailProps.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| // © 2025 and later: Unicode, Inc. and others. | ||
| // License & terms of use: https://www.unicode.org/copyright.html | ||
|
|
||
| package com.ibm.icu.impl; | ||
|
|
||
| import com.ibm.icu.util.CodePointTrie; | ||
| import com.ibm.icu.util.ICUUncheckedIOException; | ||
| import java.io.IOException; | ||
| import java.nio.ByteBuffer; | ||
|
|
||
| /** | ||
| * Link_Email binary property loaded from ulinkemail.icu. | ||
| * Implements the Link_Email property (UTS #58 / Unicode 17.0). | ||
| * | ||
| * <p>A code point has Link_Email=Yes if it may appear in an email local part. | ||
| * All other code points have Link_Email=No (the default, stored as 0). | ||
| */ | ||
| public final class LinkEmailProps { | ||
|
|
||
| // Indexes into the binary data indexes[] array (see linkemailprops.h). | ||
| private static final int IX_COUNT = 0; | ||
| private static final int IX_CPTRIE_TOP = 1; | ||
|
|
||
| // "LnkE" | ||
| private static final int DATA_FORMAT = 0x4C6E6B45; | ||
|
|
||
| private static final ICUBinary.Authenticate IS_ACCEPTABLE = | ||
| version -> version[0] == 1; | ||
|
|
||
| public static final LinkEmailProps INSTANCE = new LinkEmailProps(); | ||
|
|
||
| private final CodePointTrie.Fast8 cpTrie; | ||
|
|
||
| private LinkEmailProps() { | ||
| ByteBuffer bytes = ICUBinary.getRequiredData("ulinkemail.icu"); | ||
| try { | ||
| ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); | ||
| int startPos = bytes.position(); | ||
|
|
||
| // indexes[0] = number of entries in the indexes array. | ||
| int indexCount = bytes.getInt(); | ||
| if (indexCount < 2) { | ||
| throw new ICUUncheckedIOException("ulinkemail.icu: indexes too short"); | ||
| } | ||
| int[] inIndexes = new int[indexCount]; | ||
| inIndexes[IX_COUNT] = indexCount; | ||
| for (int i = 1; i < indexCount; i++) { | ||
| inIndexes[i] = bytes.getInt(); | ||
| } | ||
|
|
||
| // The UCPTrie starts immediately after the indexes[] array and | ||
| // ends at inIndexes[IX_CPTRIE_TOP] (a byte offset from startPos). | ||
| cpTrie = CodePointTrie.Fast8.fromBinary(bytes); | ||
| int pos = bytes.position() - startPos; | ||
| ICUBinary.skipBytes(bytes, inIndexes[IX_CPTRIE_TOP] - pos); | ||
| } catch (IOException e) { | ||
| throw new ICUUncheckedIOException(e); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns true if the code point has Link_Email=Yes, | ||
| * i.e., it is allowed in an email local part. | ||
| * | ||
| * @param c a Unicode code point | ||
| * @return true if {@code c} has Link_Email=Yes | ||
| */ | ||
| public boolean contains(int c) { | ||
| return cpTrie.get(c) != 0; | ||
| } | ||
| } | ||
84 changes: 84 additions & 0 deletions
84
icu4j/main/core/src/main/java/com/ibm/icu/impl/LinkTermProps.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| // © 2025 and later: Unicode, Inc. and others. | ||
| // License & terms of use: https://www.unicode.org/copyright.html | ||
|
|
||
| package com.ibm.icu.impl; | ||
|
|
||
| import com.ibm.icu.util.CodePointTrie; | ||
| import com.ibm.icu.util.ICUUncheckedIOException; | ||
| import java.io.IOException; | ||
| import java.nio.ByteBuffer; | ||
|
|
||
| /** | ||
| * Runtime access to the Link_Term property of UTS #58, | ||
| * backed by the ulinkterm.icu data file. | ||
| * | ||
| * <p>The integer values are the same as in the C-side ULinkTerm enum: | ||
| * <ul> | ||
| * <li>{@link #HARD} = 0 (default for all unlisted code points)</li> | ||
| * <li>{@link #INCLUDE} = 1</li> | ||
| * <li>{@link #SOFT} = 2</li> | ||
| * <li>{@link #CLOSE} = 3</li> | ||
| * <li>{@link #OPEN} = 4</li> | ||
| * </ul> | ||
| */ | ||
| public final class LinkTermProps { | ||
|
|
||
| // Link_Term property values; keep in sync with ULinkTerm in linktermprops.h. | ||
| public static final int HARD = 0; | ||
| public static final int INCLUDE = 1; | ||
| public static final int SOFT = 2; | ||
| public static final int CLOSE = 3; | ||
| public static final int OPEN = 4; | ||
|
|
||
| // Slots of the indexes[] array at the start of the data (see linktermprops.h). | ||
| private static final int IX_COUNT = 0; | ||
| private static final int IX_CPTRIE_TOP = 1; | ||
|
|
||
| // Data format signature: the ASCII bytes "LnkT". | ||
| private static final int DATA_FORMAT = 0x4C6E6B54; | ||
|
|
||
| // Accepts data whose first version byte is 1. | ||
| // Kept above INSTANCE: the constructor reads it while the class is being initialized. | ||
| private static final ICUBinary.Authenticate IS_ACCEPTABLE = | ||
| versionBytes -> versionBytes[0] == 1; | ||
|
|
||
| public static final LinkTermProps INSTANCE = new LinkTermProps(); | ||
|
|
||
| private final CodePointTrie.Fast8 cpTrie; | ||
|
|
||
| private LinkTermProps() { | ||
| ByteBuffer data = ICUBinary.getRequiredData("ulinkterm.icu"); | ||
| try { | ||
| ICUBinary.readHeaderAndDataVersion(data, DATA_FORMAT, IS_ACCEPTABLE); | ||
| final int indexesStart = data.position(); | ||
|
|
||
| // The first int is the number of entries in the indexes[] array itself. | ||
| final int length = data.getInt(); | ||
| if (length < 2) { | ||
| throw new ICUUncheckedIOException("ulinkterm.icu: indexes too short"); | ||
| } | ||
| final int[] indexes = new int[length]; | ||
| indexes[IX_COUNT] = length; | ||
| int slot = 1; | ||
| while (slot < length) { | ||
| indexes[slot] = data.getInt(); | ||
| slot++; | ||
| } | ||
|
|
||
| // The trie follows the indexes; its limit is a byte offset relative to | ||
| // indexesStart, so skip whatever lies between the trie's end and that limit. | ||
| cpTrie = CodePointTrie.Fast8.fromBinary(data); | ||
| final int consumed = data.position() - indexesStart; | ||
| final int trieLimit = indexes[IX_CPTRIE_TOP]; | ||
| ICUBinary.skipBytes(data, trieLimit - consumed); | ||
| } catch (IOException e) { | ||
| throw new ICUUncheckedIOException(e); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Looks up the Link_Term value of a code point. | ||
| * | ||
| * @param c a Unicode code point | ||
| * @return one of {@link #HARD}, {@link #INCLUDE}, {@link #SOFT}, | ||
| * {@link #CLOSE}, {@link #OPEN} | ||
| */ | ||
| public int get(int c) { | ||
| final int value = cpTrie.get(c); | ||
| return value; | ||
| } | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most Unicode properties are supported more directly in ICU, so that additional files and parsing code are not necessary. Need to check with @markusicu as to whether the UTS58 properties are or will be.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right. If they are, I assume most or all of this can be dropped.