Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions icu4c/source/common/linkemailprops.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// © 2025 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// linkemailprops.h
// created: 2025 for UTS #58 / Unicode 17.0

#ifndef LINKEMAILPROPS_H
#define LINKEMAILPROPS_H

#include "unicode/utypes.h"
#include "unicode/ucptrie.h"
#include "unicode/uobject.h"

U_NAMESPACE_BEGIN

/**
 * Constants for the Link_Email binary property and its binary data file.
 * Link_Email=Yes (1) marks a code point that is allowed in an email local part;
 * every other code point defaults to No (0).
 */
class LinkEmailProps : public UMemory {
public:
    /**
     * Slots of the indexes[] array stored in the binary data.
     * The stored values are byte offsets measured from the start of the
     * indexes[] array (IX_COUNT itself holds the length of indexes[]).
     */
    enum {
        /** 0: length of indexes[] (== IX_LINK_EMAIL_COUNT) */
        IX_COUNT = 0,
        /** 1: limit offset of the Link_Email UCPTrie */
        IX_CPTRIE_TOP = 1,
        /** 2: reserved for a second future trie (= IX_CPTRIE_TOP until used) */
        IX_TRIE2_TOP = 2,
        /** 3: reserved for a third future trie (= IX_TRIE2_TOP until used) */
        IX_TRIE3_TOP = 3,
        /** 4: total data size (= limit of last trie) */
        IX_TOTAL_SIZE = 4,
        // Slots 5..7 are reserved.
        IX_LINK_EMAIL_COUNT = 8
    };

    /** Data item type (file extension). */
    static constexpr char DATA_TYPE[] = "icu";
    /** Data item name. */
    static constexpr char DATA_NAME[] = "ulinkemail";
    /** Four-byte data format signature, "LnkE". */
    static constexpr uint8_t DATA_FORMAT[4] = { 'L', 'n', 'k', 'E' };
    /** Format version written into the data header. */
    static constexpr uint8_t FORMAT_VERSION[4] = { 1, 0, 0, 0 };
};

U_NAMESPACE_END

#endif // LINKEMAILPROPS_H
53 changes: 53 additions & 0 deletions icu4c/source/common/linktermprops.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// © 2025 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// linktermprops.h
// created: 2025 for UTS #58 / Unicode 17.0

#ifndef LINKTERMPROPS_H
#define LINKTERMPROPS_H

#include "unicode/utypes.h"
#include "unicode/ucptrie.h"
#include "unicode/uobject.h"

/**
 * Values of the Link_Term property (UTS #58 / proposed Unicode 19.0).
 * Code points that are not listed have the default value ULINK_TERM_HARD.
 */
typedef enum ULinkTerm {
    /** Terminates a URL unconditionally. Default. */
    ULINK_TERM_HARD = 0,
    /** May appear in a URL (letters, digits, …). */
    ULINK_TERM_INCLUDE = 1,
    /** Terminates only when followed by Hard. */
    ULINK_TERM_SOFT = 2,
    /** Closing bracket; terminates if unmatched. */
    ULINK_TERM_CLOSE = 3,
    /** Opening bracket. */
    ULINK_TERM_OPEN = 4,
    /** One more than the highest Link_Term value. */
    ULINK_TERM_COUNT
} ULinkTerm;

U_NAMESPACE_BEGIN

/**
 * Constants for the Link_Term property binary data file.
 */
class LinkTermProps : public UMemory {
public:
    /**
     * Slots of the indexes[] array stored in the binary data.
     * The stored values are byte offsets measured from the start of the
     * indexes[] array (IX_COUNT itself holds the length of indexes[]).
     */
    enum {
        /** 0: length of indexes[] (== IX_LINK_TERM_COUNT) */
        IX_COUNT = 0,
        /** 1: limit offset of the Link_Term UCPTrie */
        IX_CPTRIE_TOP = 1,
        /** 2: reserved for a second future trie (= IX_CPTRIE_TOP until used) */
        IX_TRIE2_TOP = 2,
        /** 3: reserved for a third future trie (= IX_TRIE2_TOP until used) */
        IX_TRIE3_TOP = 3,
        /** 4: total data size (= limit of last trie) */
        IX_TOTAL_SIZE = 4,
        // Slots 5..7 are reserved.
        IX_LINK_TERM_COUNT = 8
    };

    /** Data item type (file extension). */
    static constexpr char DATA_TYPE[] = "icu";
    /** Data item name. */
    static constexpr char DATA_NAME[] = "ulinkterm";
    /** Four-byte data format signature, "LnkT". */
    static constexpr uint8_t DATA_FORMAT[4] = { 'L', 'n', 'k', 'T' };
    /** Format version written into the data header. */
    static constexpr uint8_t FORMAT_VERSION[4] = { 1, 0, 0, 0 };
};

U_NAMESPACE_END

#endif // LINKTERMPROPS_H
1,296 changes: 1,296 additions & 0 deletions icu4c/source/data/unidata/LinkEmail.txt

Large diffs are not rendered by default.

2,156 changes: 2,156 additions & 0 deletions icu4c/source/data/unidata/LinkTerm.txt

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions icu4j/main/core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@

<build>
<plugins>
<!--
Runs src/main/scripts/generate-iana-tlds.py with python3 during the
generate-sources phase of this module's build.
NOTE(review): this makes a python3 executable on the PATH a build requirement.
NOTE(review): no plugin version is declared here; confirm that one is managed
by a parent POM.
-->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>generate-iana-tlds</id>
<phase>generate-sources</phase>
<goals><goal>exec</goal></goals>
<configuration>
<executable>python3</executable>
<arguments>
<argument>${project.basedir}/src/main/scripts/generate-iana-tlds.py</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<executions>
Expand Down
71 changes: 71 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/impl/LinkEmailProps.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// © 2025 and later: Unicode, Inc. and others.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most Unicode properties are supported more directly in ICU, so that additional files and parsing code are not necessary. Need to check with @markusicu as to whether the UTS58 properties are or will be.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. If they are, I assume most or all of this can be dropped.

// License & terms of use: https://www.unicode.org/copyright.html

package com.ibm.icu.impl;

import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ICUUncheckedIOException;
import java.io.IOException;
import java.nio.ByteBuffer;

/**
 * Accessor for the Link_Email binary property (UTS #58 / Unicode 17.0),
 * backed by the {@code ulinkemail.icu} data file.
 *
 * <p>Link_Email=Yes marks a code point that may appear in an email local part.
 * Every other code point has Link_Email=No, which is the default and is stored as 0.
 */
public final class LinkEmailProps {

    // Slots of the indexes[] array in the binary data (see linkemailprops.h).
    private static final int IX_COUNT = 0;
    private static final int IX_CPTRIE_TOP = 1;

    // Data format signature: "LnkE"
    private static final int DATA_FORMAT = 0x4C6E6B45;

    // Accepts data whose first version byte is 1.
    private static final ICUBinary.Authenticate IS_ACCEPTABLE =
            versionBytes -> 1 == versionBytes[0];

    // Must stay below the constants above: the constructor reads them.
    public static final LinkEmailProps INSTANCE = new LinkEmailProps();

    private final CodePointTrie.Fast8 cpTrie;

    private LinkEmailProps() {
        cpTrie = loadTrie(ICUBinary.getRequiredData("ulinkemail.icu"));
    }

    /**
     * Parses the header, the indexes[] array and the code point trie from the data.
     *
     * @param data the contents of ulinkemail.icu, positioned at the start
     * @return the Link_Email trie
     * @throws ICUUncheckedIOException if the header is rejected or indexes[] is too short
     */
    private static CodePointTrie.Fast8 loadTrie(ByteBuffer data) {
        try {
            ICUBinary.readHeaderAndDataVersion(data, DATA_FORMAT, IS_ACCEPTABLE);
            final int indexesStart = data.position();

            // The first int is the number of entries in indexes[], itself included.
            final int length = data.getInt();
            if (length < 2) {
                throw new ICUUncheckedIOException("ulinkemail.icu: indexes too short");
            }
            final int[] indexes = new int[length];
            indexes[IX_COUNT] = length;
            int slot = 1;
            while (slot < length) {
                indexes[slot++] = data.getInt();
            }

            // The trie follows indexes[] directly; its limit is indexes[IX_CPTRIE_TOP],
            // a byte offset relative to indexesStart.
            final CodePointTrie.Fast8 trie = CodePointTrie.Fast8.fromBinary(data);
            final int consumed = data.position() - indexesStart;
            ICUBinary.skipBytes(data, indexes[IX_CPTRIE_TOP] - consumed);
            return trie;
        } catch (IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }

    /**
     * Tests whether a code point has Link_Email=Yes,
     * that is, whether it is allowed in an email local part.
     *
     * @param c a Unicode code point
     * @return true if {@code c} has Link_Email=Yes
     */
    public boolean contains(int c) {
        final int value = cpTrie.get(c);
        return value != 0;
    }
}
84 changes: 84 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/impl/LinkTermProps.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// © 2025 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

package com.ibm.icu.impl;

import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ICUUncheckedIOException;
import java.io.IOException;
import java.nio.ByteBuffer;

/**
 * Accessor for the Link_Term property (UTS #58 / proposed Unicode 19.0),
 * backed by the {@code ulinkterm.icu} data file.
 *
 * <p>The values are the same as in the C-side ULinkTerm enum:
 * <ul>
 *   <li>{@link #HARD} = 0 (default for all unlisted code points)</li>
 *   <li>{@link #INCLUDE} = 1</li>
 *   <li>{@link #SOFT} = 2</li>
 *   <li>{@link #CLOSE} = 3</li>
 *   <li>{@link #OPEN} = 4</li>
 * </ul>
 */
public final class LinkTermProps {

    // Link_Term property values; keep in sync with ULinkTerm in linktermprops.h.
    public static final int HARD = 0;
    public static final int INCLUDE = 1;
    public static final int SOFT = 2;
    public static final int CLOSE = 3;
    public static final int OPEN = 4;

    // Slots of the indexes[] array in the binary data (see linktermprops.h).
    private static final int IX_COUNT = 0;
    private static final int IX_CPTRIE_TOP = 1;

    // Data format signature: "LnkT"
    private static final int DATA_FORMAT = 0x4C6E6B54;

    // Accepts data whose first version byte is 1.
    private static final ICUBinary.Authenticate IS_ACCEPTABLE =
            versionBytes -> 1 == versionBytes[0];

    // Must stay below the constants above: the constructor reads them.
    public static final LinkTermProps INSTANCE = new LinkTermProps();

    private final CodePointTrie.Fast8 cpTrie;

    private LinkTermProps() {
        cpTrie = loadTrie(ICUBinary.getRequiredData("ulinkterm.icu"));
    }

    /**
     * Parses the header, the indexes[] array and the code point trie from the data.
     *
     * @param data the contents of ulinkterm.icu, positioned at the start
     * @return the Link_Term trie
     * @throws ICUUncheckedIOException if the header is rejected or indexes[] is too short
     */
    private static CodePointTrie.Fast8 loadTrie(ByteBuffer data) {
        try {
            ICUBinary.readHeaderAndDataVersion(data, DATA_FORMAT, IS_ACCEPTABLE);
            final int indexesStart = data.position();

            // The first int is the number of entries in indexes[], itself included.
            final int length = data.getInt();
            if (length < 2) {
                throw new ICUUncheckedIOException("ulinkterm.icu: indexes too short");
            }
            final int[] indexes = new int[length];
            indexes[IX_COUNT] = length;
            int slot = 1;
            while (slot < length) {
                indexes[slot++] = data.getInt();
            }

            // The trie follows indexes[] directly; its limit is indexes[IX_CPTRIE_TOP],
            // a byte offset relative to indexesStart.
            final CodePointTrie.Fast8 trie = CodePointTrie.Fast8.fromBinary(data);
            final int consumed = data.position() - indexesStart;
            ICUBinary.skipBytes(data, indexes[IX_CPTRIE_TOP] - consumed);
            return trie;
        } catch (IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }

    /**
     * Looks up the Link_Term value of a code point.
     *
     * @param c a Unicode code point
     * @return one of {@link #HARD}, {@link #INCLUDE}, {@link #SOFT},
     *     {@link #CLOSE}, {@link #OPEN}
     */
    public int get(int c) {
        final int value = cpTrie.get(c);
        return value;
    }
}
Loading