-
-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Add #12810: Implement escaping for keyword separators (FKA #12888) #13583
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7b5505c
44257ad
e6c52dd
d2ad73d
465955a
4928536
2ccc24e
1111794
83198c0
cc5fa8b
87d1ff6
3d2785b
1b455d3
2b85d0c
4b31cd0
d15f2ef
d77bb5e
02c252d
027dde0
ea690d6
b6f728f
a72b614
4c5181b
b02aeea
eef54b6
ffeb78d
1f321c2
3e2636f
634d302
b2d902f
42710fa
4b2a8d8
a2031ed
af3c50e
e955fd1
a3d69f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
*/ | ||
public class Keyword extends ChainNode<Keyword> implements Comparable<Keyword> { | ||
|
||
// Note: {@link org.jabref.model.entry.KeywordList#parse(java.lang.String, java.lang.Character, java.lang.Character) offers configuration, which is not available here | ||
public static Character DEFAULT_HIERARCHICAL_DELIMITER = '>'; | ||
private final String keyword; | ||
|
||
|
@@ -84,9 +85,25 @@ private String getSubchainAsString(Character hierarchicalDelimiter) { | |
.orElse(""); | ||
} | ||
|
||
/* | ||
* Used for BibTex export, where we need to escape the delimiter with \ | ||
*/ | ||
Comment on lines
+88
to
+90
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The spelling 'BibTex' in the comment is incorrect according to project standards. It should be 'BibTeX' for consistency in documentation and comments. |
||
public String getSubchainAsStringWithEscaping(Character delimiter) { | ||
return getEscaped(delimiter) + | ||
getChild().map(child -> " " + DEFAULT_HIERARCHICAL_DELIMITER + " " + child.getSubchainAsStringWithEscaping(DEFAULT_HIERARCHICAL_DELIMITER)) | ||
.orElse(""); | ||
} | ||
|
||
/* | ||
* This ensures that delimiters within keyword values are not misinterpreted as separators. | ||
*/ | ||
private String getEscaped(Character delimiter) { | ||
return keyword.replace(delimiter.toString(), "\\" + delimiter); | ||
} | ||
/** | ||
* Gets the keyword of this node in the chain. | ||
*/ | ||
|
||
public String get() { | ||
return keyword; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
public class KeywordList implements Iterable<Keyword> { | ||
|
||
private final List<Keyword> keywordChains; | ||
private boolean spaceAfterDelimiter; | ||
|
||
public KeywordList() { | ||
keywordChains = new ArrayList<>(); | ||
|
@@ -52,6 +53,52 @@ public static KeywordList parse(String keywordString, Character delimiter, Chara | |
Objects.requireNonNull(hierarchicalDelimiter); | ||
|
||
KeywordList keywordList = new KeywordList(); | ||
List<String> hierarchy = new ArrayList<>(); | ||
StringBuilder currentToken = new StringBuilder(); | ||
boolean isEscaping = false; | ||
|
||
keywordList.spaceAfterDelimiter = keywordString.contains(delimiter + " "); | ||
|
||
for (int i = 0; i < keywordString.length(); i++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note this is #12888 (comment) :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for your review! As I dived into the project I saw that this loop had not been addressed. So I used @ungerts proposed comment, since it is readable and solves the issue. Am I missing something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is very OK. I like links to provide context. Other reviewers might think: why a for loop etc. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ooh ok perfect :) |
||
char currentChar = keywordString.charAt(i); | ||
|
||
if (isEscaping && currentChar == delimiter) { // we only escape the keyword delimiter | ||
currentToken.append(currentChar); | ||
isEscaping = false; | ||
} else if (currentChar == '\\') { | ||
isEscaping = true; | ||
} else if (currentChar == hierarchicalDelimiter) { | ||
hierarchy.add(currentToken.toString().trim()); | ||
currentToken.setLength(0); | ||
} else if (currentChar == delimiter) { | ||
hierarchy.add(currentToken.toString()); | ||
currentToken.setLength(0); | ||
keywordList.add(Keyword.of(hierarchy.toArray(new String[0]))); | ||
hierarchy.clear(); | ||
} else { | ||
currentToken.append(currentChar); | ||
} | ||
} | ||
|
||
// Handle the final token | ||
if (!currentToken.isEmpty() || !hierarchy.isEmpty()) { | ||
hierarchy.add(currentToken.toString().trim()); | ||
keywordList.add(Keyword.of(hierarchy.toArray(new String[0]))); | ||
} | ||
|
||
return keywordList; | ||
} | ||
|
||
public static KeywordList oldParse(String keywordString, Character delimiter, Character hierarchicalDelimiter) { | ||
if (StringUtil.isBlank(keywordString)) { | ||
return new KeywordList(); | ||
} | ||
|
||
Objects.requireNonNull(delimiter); | ||
Objects.requireNonNull(hierarchicalDelimiter); | ||
|
||
KeywordList keywordList = new KeywordList(); | ||
keywordList.spaceAfterDelimiter = keywordString.contains(delimiter + " "); | ||
|
||
StringTokenizer tok = new StringTokenizer(keywordString, delimiter.toString()); | ||
calixtus marked this conversation as resolved.
Show resolved
Hide resolved
|
||
while (tok.hasMoreTokens()) { | ||
|
@@ -77,6 +124,13 @@ public static String serialize(List<Keyword> keywords, Character delimiter) { | |
return keywords.stream().map(Keyword::get).collect(Collectors.joining(delimiter.toString())); | ||
} | ||
|
||
// This method serializes Keywords supporting escaping of the delimiter for BibTeX Serialization (Issue #12810, #12532) | ||
public String bibtexSerialize(Character delimiter) { | ||
// If the keywords contain ", " (as in PubMed records) we keep the space. | ||
String joiner = spaceAfterDelimiter ? delimiter + " " : delimiter.toString(); | ||
return keywordChains.stream().map(keyword -> keyword.getSubchainAsStringWithEscaping(delimiter)).collect(Collectors.joining(joiner)); | ||
} | ||
|
||
public static KeywordList merge(String keywordStringA, String keywordStringB, Character delimiter) { | ||
KeywordList keywordListA = parse(keywordStringA, delimiter); | ||
KeywordList keywordListB = parse(keywordStringB, delimiter); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,12 @@ | ||
package org.jabref.model.entry; | ||
|
||
import java.util.stream.Stream; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.Arguments; | ||
import org.junit.jupiter.params.provider.MethodSource; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
|
@@ -16,6 +21,17 @@ void setUp() { | |
keywords.add("keywordTwo"); | ||
} | ||
|
||
private static Stream<Arguments> provideParseKeywordCases() { | ||
return Stream.of( | ||
Arguments.of("keyword\\,one, keywordTwo", new KeywordList("keyword,one", "keywordTwo")), | ||
Arguments.of("keywordOne\\,, keywordTwo", new KeywordList("keywordOne,", "keywordTwo")), | ||
Arguments.of("keyword\\\\, keywordTwo", new KeywordList("keyword\\", "keywordTwo")), | ||
Arguments.of("keyword\\,one > sub", new KeywordList(Keyword.of("keyword,one", "sub"))), | ||
Arguments.of("one\\,two\\,three, four", new KeywordList("one,two,three", "four")), | ||
Arguments.of("keywordOne\\\\", new KeywordList("keywordOne\\")) | ||
); | ||
} | ||
|
||
@Test | ||
void parseEmptyStringReturnsEmptyList() { | ||
assertEquals(new KeywordList(), KeywordList.parse("", ',')); | ||
|
@@ -115,4 +131,21 @@ void mergeTwoDistinctKeywordsShouldReturnTheTwoKeywordsMerged() { | |
void mergeTwoListsOfKeywordsShouldReturnTheKeywordsMerged() { | ||
assertEquals(new KeywordList("Figma", "Adobe", "JabRef", "Eclipse", "JetBrains"), KeywordList.merge("Figma, Adobe, JetBrains, Eclipse", "Adobe, JabRef", ',')); | ||
} | ||
|
||
@ParameterizedTest | ||
@MethodSource("provideParseKeywordCases") | ||
void parseKeywordWithEscapedDelimiterDoesNotSplitKeyword(String input, KeywordList expected) { | ||
assertEquals(expected, KeywordList.parse(input, ',', '>')); | ||
} | ||
|
||
// TODO: We need to redefine the roundtrip test depending on the context GUI or BibTex, | ||
// we want the user to type in escaping character but see the "clean" String as in: | ||
// keyword1\,keyword2, keyword3 --> "keyword1,keyword2", "keyword3" | ||
@ParameterizedTest | ||
@MethodSource("provideParseKeywordCases") | ||
void roundTripPreservesStructure(String original) { | ||
KeywordList parsed = KeywordList.oldParse(original, ',', '>'); | ||
// We need to test the toString() functionality | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment restates what is obvious from the code and doesn't provide additional value. Such comments should be removed as they create maintenance overhead. |
||
assertEquals(original, parsed.bibtexSerialize(',')); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment merely states what is visible from the code and doesn't provide additional value or reasoning. It should be removed or enhanced with actual implementation rationale.