-
-
Notifications
You must be signed in to change notification settings - Fork 63
Regenerate linkification data in MakeUnicodeFiles #1286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ae40ac4
cfcac12
f4d3fdd
e3845f0
4be4bb4
94e0151
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
eggrobin marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,7 @@ | |
| import org.unicode.props.BagFormatter; | ||
| import org.unicode.props.UcdProperty; | ||
| import org.unicode.props.UcdPropertyValues; | ||
| import org.unicode.text.utility.DiffingPrintWriter; | ||
| import org.unicode.utilities.LinkUtilities; | ||
| import org.unicode.utilities.LinkUtilities.LinkTermination; | ||
| import org.unicode.utilities.LinkUtilities.Part; | ||
|
|
@@ -45,7 +46,7 @@ | |
| * | ||
| * @throws IOException | ||
| */ | ||
| class GenerateLinkData { | ||
| public class GenerateLinkData { | ||
|
|
||
| private static final Transliterator FIX_ODD = | ||
| Transliterator.createFromRules( | ||
|
|
@@ -75,9 +76,13 @@ class GenerateLinkData { | |
| ""); | ||
|
|
||
| public static void main(String[] args) throws IOException { | ||
| generatePropertyData(); | ||
| generateDetectionTestData(); | ||
| generateFormattingTestData(); | ||
| System.out.println("TLDs=\t" + Joiner.on(' ').join(LinkUtilities.TLDS)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is `TLDs`?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I haven’t the faintest idea; this print statement was at the beginning of generatePropertyData and wasn’t part of the generation of any of the three files, so I lifted it here to preserve the behaviour of this tool. I guess it has something to do with top-level domains? |
||
| final String copyrightYear = dty.format(now); | ||
| generateLinkTerm(copyrightYear); | ||
| generateLinkEmail(copyrightYear); | ||
| generateLinkBracket(copyrightYear); | ||
| generateDetectionTestData(copyrightYear); | ||
| generateFormattingTestData(copyrightYear); | ||
| } | ||
|
|
||
| static final Instant now = Instant.now(); | ||
|
|
@@ -213,21 +218,24 @@ static void writePropHeader( | |
| SimpleFormatter simpleFormatter, | ||
| String filename, | ||
| String propertyName, | ||
| String missingValue) { | ||
| String missingValue, | ||
| String copyrightYear) { | ||
| out.println( | ||
| simpleFormatter.format( | ||
| filename, dt.format(now), dty.format(now), propertyName, missingValue)); | ||
| filename, dt.format(now), copyrightYear, propertyName, missingValue)); | ||
| } | ||
|
|
||
| static void writeTestHeader( | ||
| PrintWriter out, SimpleFormatter simpleFormatter, String filename, String testName) { | ||
| out.println(simpleFormatter.format(filename, dt.format(now), dty.format(now), testName)); | ||
| PrintWriter out, | ||
| SimpleFormatter simpleFormatter, | ||
| String filename, | ||
| String testName, | ||
| String copyrightYear) { | ||
| out.println(simpleFormatter.format(filename, dt.format(now), copyrightYear, testName)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Curious / probably for later: are `dt.format(now)` and `copyrightYear` intentionally taken from different sources?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `dt.format(now)` is for the Date: line; that one is actually the time of regeneration, but it is ignored for diffing. `copyrightYear` is not ignored for diffing, so when using MakeUnicodeFiles it is taken from MakeUnicodeFiles.txt so that we don’t break on the 1st of January. (Of course the emoji do break on the 1st of January, see #1273. I should fix that; time bombs in CI are annoying.) |
||
| } | ||
|
|
||
| /** Generate property data for the UTS */ | ||
| static void generatePropertyData() { | ||
| System.out.println("TLDs=\t" + Joiner.on(' ').join(LinkUtilities.TLDS)); | ||
|
|
||
| public static void generateLinkTerm(String copyrightYear) { | ||
| BagFormatter bf = new BagFormatter(LinkUtilities.IUP).setLineSeparator("\n"); | ||
| bf.setShowLiteral(FIX_ODD); | ||
|
|
||
|
|
@@ -236,62 +244,81 @@ static void generatePropertyData() { | |
| bf.setValueSource(LinkTermination.PROPERTY); | ||
| bf.setLabelSource(LinkUtilities.IUP.getProperty(UcdProperty.Age)); | ||
|
|
||
| try (final PrintWriter out = | ||
| FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR_DEV, "LinkTerm.txt"); ) { | ||
| writePropHeader(out, HEADER_PROP_TERM, "LinkTerm", "Link_Term", "Hard"); | ||
| try (final var out = new DiffingPrintWriter(LinkUtilities.DATA_DIR_DEV, "LinkTerm.txt"); ) { | ||
| writePropHeader( | ||
| out.tempPrintWriter, | ||
| HEADER_PROP_TERM, | ||
| "LinkTerm", | ||
| "Link_Term", | ||
| "Hard", | ||
| copyrightYear); | ||
| for (LinkTermination propValue : LinkTermination.NON_MISSING) { | ||
| bf.showSetNames(out, propValue.base); | ||
| bf.showSetNames(out.tempPrintWriter, propValue.base); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When do we use `out` vs. `out.tempPrintWriter`? |
||
| out.println(""); | ||
| out.flush(); | ||
| System.out.println(propValue + "=\t" + propValue.base.toPattern(false)); | ||
| } | ||
|
|
||
| } catch (IOException e) { | ||
| throw new UncheckedIOException(e); | ||
| } | ||
| } | ||
|
|
||
| public static void generateLinkEmail(String copyrightYear) { | ||
| BagFormatter bf = new BagFormatter(LinkUtilities.IUP).setLineSeparator("\n"); | ||
| bf.setShowLiteral(FIX_ODD); | ||
| bf.setLabelSource(LinkUtilities.IUP.getProperty(UcdProperty.Age)); | ||
| // LinkEmail.txt | ||
| bf.setValueSource(UnicodeLabel.NULL); | ||
| try (final PrintWriter out = | ||
| FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR_DEV, "LinkEmail.txt"); ) { | ||
| try (final var out = | ||
| new DiffingPrintWriter(LinkUtilities.DATA_DIR_DEV, "LinkEmail.txt"); ) { | ||
| writePropHeader( | ||
| out, | ||
| out.tempPrintWriter, | ||
| HEADER_PROP_BINARY, | ||
| "LinkEmail", | ||
| "Link_Email", | ||
| UcdPropertyValues.Binary.No.toString()); | ||
| UcdPropertyValues.Binary.No.toString(), | ||
| copyrightYear); | ||
| UnicodeSet linkEmailSet = LinkUtilities.LinkEmail.getSet(UcdPropertyValues.Binary.Yes); | ||
| bf.showSetNames(out, linkEmailSet); | ||
| bf.showSetNames(out.tempPrintWriter, linkEmailSet); | ||
| System.out.println("LinkEmail=\t" + linkEmailSet.toPattern(false)); | ||
| } catch (IOException e) { | ||
| throw new UncheckedIOException(e); | ||
| } | ||
| } | ||
|
|
||
| public static void generateLinkBracket(String copyrightYear) { | ||
| BagFormatter bf = new BagFormatter(LinkUtilities.IUP).setLineSeparator("\n"); | ||
| bf.setShowLiteral(FIX_ODD); | ||
| bf.setLabelSource(LinkUtilities.IUP.getProperty(UcdProperty.Age)); | ||
| // LinkBracket.txt | ||
| bf.setValueSource(LinkUtilities.getLinkBracket()); | ||
| bf.setHexValue(true); | ||
| bf.setShowDehexedValue(true); | ||
| try (final PrintWriter out = | ||
| FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR_DEV, "LinkBracket.txt"); ) { | ||
| writePropHeader(out, HEADER_PROP_STRING, "LinkBracket", "Link_Bracket", "<none>"); | ||
| bf.showSetNames(out, LinkTermination.Close.base); | ||
| } catch (IOException e) { | ||
| throw new UncheckedIOException(e); | ||
| try (final var out = | ||
| new DiffingPrintWriter(LinkUtilities.DATA_DIR_DEV, "LinkBracket.txt"); ) { | ||
| writePropHeader( | ||
| out.tempPrintWriter, | ||
| HEADER_PROP_STRING, | ||
| "LinkBracket", | ||
| "Link_Bracket", | ||
| "<none>", | ||
| copyrightYear); | ||
| bf.showSetNames(out.tempPrintWriter, LinkTermination.Close.base); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * The format of the test file sources are: source<TAB>expected OR just source If there is an | ||
| * expected value, then it is checked against what is generated. | ||
| */ | ||
| static void generateDetectionTestData() { | ||
| public static void generateDetectionTestData(String copyrightYear) { | ||
|
|
||
| OutputInt errorCount = new OutputInt(); | ||
|
|
||
| try (final PrintWriter out = | ||
| FileUtilities.openUTF8Writer( | ||
| LinkUtilities.DATA_DIR_DEV, "LinkDetectionTest.txt"); ) { | ||
| writeTestHeader(out, HEADER_DETECT_TEST, "LinkDetectionTest", "LinkDetectionTest"); | ||
| try (final var out = | ||
| new DiffingPrintWriter(LinkUtilities.DATA_DIR_DEV, "LinkDetectionTest.txt"); ) { | ||
| writeTestHeader( | ||
| out.tempPrintWriter, | ||
| HEADER_DETECT_TEST, | ||
| "LinkDetectionTest", | ||
| "LinkDetectionTest", | ||
| copyrightYear); | ||
|
|
||
| out.println("\n# Misc. test cases\n"); | ||
|
|
||
|
|
@@ -364,16 +391,20 @@ static void generateDetectionTestData() { | |
| } | ||
| } | ||
|
|
||
| static void generateFormattingTestData() { | ||
| public static void generateFormattingTestData(String copyrightYear) { | ||
|
|
||
| OutputInt errorCount = new OutputInt(); | ||
|
|
||
| try (final PrintWriter out = | ||
| FileUtilities.openUTF8Writer( | ||
| LinkUtilities.DATA_DIR_DEV, "LinkFormattingTest.txt"); ) { | ||
| writeTestHeader(out, HEADER_FORMAT_TEST, "LinkFormattingTest", "LinkFormattingTest"); | ||
| try (final var out = | ||
| new DiffingPrintWriter(LinkUtilities.DATA_DIR_DEV, "LinkFormattingTest.txt"); ) { | ||
| writeTestHeader( | ||
| out.tempPrintWriter, | ||
| HEADER_FORMAT_TEST, | ||
| "LinkFormattingTest", | ||
| "LinkFormattingTest", | ||
| copyrightYear); | ||
|
|
||
| out.println("\n# Selected test cases\n"); | ||
| out.tempPrintWriter.println("\n# Selected test cases\n"); | ||
|
|
||
| List<String> comments = new ArrayList<>(); | ||
| Output<Integer> lineCount = new Output<>(0); | ||
|
|
@@ -439,7 +470,7 @@ static void generateFormattingTestData() { | |
| comments.clear(); | ||
| return; | ||
| } | ||
| outputTestCase(out, comments, internals, actual); | ||
| outputTestCase(out.tempPrintWriter, comments, internals, actual); | ||
| }); | ||
|
|
||
| out.println("\n# Wikipedia test cases\n"); | ||
|
|
@@ -480,7 +511,7 @@ static void generateFormattingTestData() { | |
|
|
||
| String actual = internals.minimalEscape(true, null); | ||
|
|
||
| outputTestCase(out, comments, internals, actual); | ||
| outputTestCase(out.tempPrintWriter, comments, internals, actual); | ||
| }); | ||
| } catch (IOException e) { | ||
| throw new UncheckedIOException(e); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should definitely write 2026 now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but we should make this change consistently and globally by updating MakeUnicodeFiles.txt. This will rewrite the whole UCD. Let’s do that in another PR.