diff --git a/unicodetools/data/linkification/dev/LinkFormattingTest.txt b/unicodetools/data/linkification/dev/LinkFormattingTest.txt index a13bc8cb4..eb3ff5169 100644 --- a/unicodetools/data/linkification/dev/LinkFormattingTest.txt +++ b/unicodetools/data/linkification/dev/LinkFormattingTest.txt @@ -1,5 +1,5 @@ # LinkFormattingTest.txt -# Date: 2026-05-20, 15:27:01 GMT +# Date: 2026-05-20, 21:41:06 GMT # © 2026 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -15,8 +15,12 @@ # # Comments are lines that begin with a #. # -# The fully-escaped field percent-escapes all literal syntax characters and all characters above ASCII. +# The fully-escaped field percent-escapes all code points based on https://url.spec.whatwg.org/#percent-encoded-bytes. +# This means all literal syntax characters in each Part and all code points above ASCII. +# It also percent-escapes the last character, if it is Link_Term=Soft. +# # The minimally-escaped field is the more readable format described in UTS #58. +# # Each pair also has a comment line for the internal structure of the URL. # 𝑺 = the schema # 𝑯 = the host (typically just a domain name) the internal structure is not broken down. diff --git a/unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java b/unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java index 25159fd42..db520439b 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java +++ b/unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java @@ -206,8 +206,12 @@ public static void main(String[] args) throws IOException { "", "Comments are lines that begin with a #.", "", - "The fully-escaped field percent-escapes all literal syntax characters and all characters above ASCII.", + "The fully-escaped field percent-escapes all code points based on https://url.spec.whatwg.org/#percent-encoded-bytes.", + "This means all literal syntax characters in each Part and all code points above ASCII.", + "It also percent-escapes the last character, if it is Link_Term=Soft.", + "", "The minimally-escaped field is the more readable format described in UTS #58.", + "", "Each pair also has a comment line for the internal structure of the URL.", "𝑺 = the schema", "𝑯 = the host (typically just a domain name) the internal structure is not broken down.",