Skip to content

Commit 982db03

Browse files
committed
ICU-23297 Do not use lookupMatcher in UnicodeSet parsing
1 parent 27b8fb1 commit 982db03

File tree

6 files changed

+172
-226
lines changed

6 files changed

+172
-226
lines changed

icu4c/source/common/rbbirb.h

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,10 @@ class RBBISymbolTable : public UMemory, public SymbolTable {
6565
UHashtable *fHashTable;
6666
RBBIRuleScanner *fRuleScanner;
6767

68-
// These next two fields are part of the mechanism for passing references to
69-
// already-constructed UnicodeSets back to the UnicodeSet constructor
70-
// when the pattern includes $variable references.
71-
const UnicodeString ffffString; // = "/uffff"
72-
UnicodeSet *fCachedSetLookup;
73-
7468
public:
7569
// API inherited from class SymbolTable
7670
virtual const UnicodeString* lookup(const UnicodeString& s) const override;
71+
virtual const UnicodeSet* lookupSet(const UnicodeString& s) const override;
7772
virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override;
7873
virtual UnicodeString parseReference(const UnicodeString& text,
7974
ParsePosition& pos, int32_t limit) const override;
@@ -91,7 +86,7 @@ class RBBISymbolTable : public UMemory, public SymbolTable {
9186
// A do-nothing inline function for non-debug builds. Member funcs can't be empty
9287
// or the call sites won't compile.
9388
int32_t fFakeField;
94-
#define rbbiSymtablePrint() fFakeField=0;
89+
#define rbbiSymtablePrint() fFakeField=0;
9590
#endif
9691

9792
private:
@@ -220,7 +215,7 @@ typedef std::pair<int32_t, int32_t> IntPair;
220215
#define RBBIDebugPrintf printf
221216
#define RBBIDebugPuts puts
222217
#else
223-
#undef RBBIDebugPrintf
218+
#undef RBBIDebugPrintf
224219
#define RBBIDebugPuts(arg)
225220
#endif
226221

icu4c/source/common/rbbistbl.cpp

Lines changed: 33 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,10 @@ U_CDECL_END
4141
U_NAMESPACE_BEGIN
4242

4343
RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status)
44-
: fRules(rules), fRuleScanner(rs), ffffString(static_cast<char16_t>(0xffff))
44+
: fRules(rules), fRuleScanner(rs)
4545
{
4646
fHashTable = nullptr;
47-
fCachedSetLookup = nullptr;
48-
47+
4948
fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, &status);
5049
// uhash_open checks status
5150
if (U_FAILURE(status)) {
@@ -71,60 +70,47 @@ RBBISymbolTable::~RBBISymbolTable()
7170
//
7271
const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const
7372
{
74-
RBBISymbolTableEntry *el;
75-
RBBINode *varRefNode;
76-
RBBINode *exprNode;
77-
RBBINode *usetNode;
78-
const UnicodeString *retString;
79-
RBBISymbolTable *This = const_cast<RBBISymbolTable*>(this); // cast off const
80-
81-
el = static_cast<RBBISymbolTableEntry*>(uhash_get(fHashTable, &s));
73+
const RBBISymbolTableEntry* const el =
74+
static_cast<const RBBISymbolTableEntry*>(uhash_get(fHashTable, &s));
8275
if (el == nullptr) {
8376
return nullptr;
8477
}
78+
const RBBINode& exprNode = *el->val->fLeftChild; // Root node of expression for variable
79+
// Return the original source string for the expression.
80+
// Note that for set-valued variables used in UnicodeSet expressions, this would be rejected by
81+
// the UnicodeSet parser if the source itself contains variable references. For instance, with
82+
// $CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
83+
// $Cased = [[:Upper_Case:][:Lower_Case:][:Lt:] - $CaseIgnorable];
84+
// If lookupSet were not overridden, when parsing the right-hand side of
85+
// $NotCased = [[^ $Cased] - $CaseIgnorable];
86+
// there would be a call to lookup("Cased") which would return
87+
// "[[:Upper_Case:][:Lower_Case:][:Lt:]-$CaseIgnorable]". This contains a variable, which is
88+
// disallowed by the UnicodeSet parser inside a variable expansion.
89+
// However, set-valued variables are pre-parsed, and returned by lookupSet instead, so this call
90+
// to lookup() never happens; instead, lookupSet("CaseIgnorable") is called when computing
91+
// $Cased and returns the non-null value of $CaseIgnorable, and then when computing $NotCased,
92+
// lookupSet("Cased") returns the value computed for $Cased.
93+
return &exprNode.fText;
94+
}
8595

86-
varRefNode = el->val;
87-
exprNode = varRefNode->fLeftChild; // Root node of expression for variable
88-
if (exprNode->fType == RBBINode::setRef) {
89-
// The $variable refers to a single UnicodeSet
90-
// return the ffffString, which will subsequently be interpreted as a
91-
// stand-in character for the set by RBBISymbolTable::lookupMatcher()
92-
usetNode = exprNode->fLeftChild;
93-
This->fCachedSetLookup = usetNode->fInputSet;
94-
retString = &ffffString;
96+
const UnicodeSet* RBBISymbolTable::lookupSet(const UnicodeString& s) const {
97+
const RBBISymbolTableEntry* const el = static_cast<const RBBISymbolTableEntry*>(uhash_get(fHashTable, &s));
98+
if (el == nullptr) {
99+
return nullptr;
95100
}
96-
else
97-
{
98-
// The variable refers to something other than just a set.
99-
// return the original source string for the expression
100-
retString = &exprNode->fText;
101-
This->fCachedSetLookup = nullptr;
101+
const RBBINode& exprNode = *el->val->fLeftChild;
102+
if (exprNode.fType == RBBINode::setRef) {
103+
return exprNode.fLeftChild->fInputSet;
104+
} else {
105+
return nullptr;
102106
}
103-
return retString;
104107
}
105108

106109

107110

108-
//
109-
// RBBISymbolTable::lookupMatcher This function from the abstract symbol table
110-
// interface maps a single stand-in character to a
111-
// pointer to a Unicode Set. The Unicode Set code uses this
112-
// mechanism to get all references to the same $variable
113-
// name to refer to a single common Unicode Set instance.
114-
//
115-
// This implementation cheats a little, and does not maintain a map of stand-in chars
116-
// to sets. Instead, it takes advantage of the fact that the UnicodeSet
117-
// constructor will always call this function right after calling lookup(),
118-
// and we just need to remember what set to return between these two calls.
119-
const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const
120-
{
121-
UnicodeSet *retVal = nullptr;
122-
RBBISymbolTable *This = const_cast<RBBISymbolTable*>(this); // cast off const
123-
if (ch == 0xffff) {
124-
retVal = fCachedSetLookup;
125-
This->fCachedSetLookup = nullptr;
126-
}
127-
return retVal;
111+
// No longer used, see ICU-23297.
112+
const UnicodeFunctor* RBBISymbolTable::lookupMatcher(UChar32 /*ch*/) const {
113+
return nullptr;
128114
}
129115

130116
//

icu4c/source/common/unicode/symtable.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919
#include "unicode/uobject.h"
2020

2121
/**
22-
* \file
22+
* \file
2323
* \brief C++ API: An interface that defines both lookup protocol and parsing of
2424
* symbolic names.
2525
*/
26-
26+
2727
U_NAMESPACE_BEGIN
2828

2929
class ParsePosition;
@@ -82,6 +82,16 @@ class U_COMMON_API SymbolTable /* not : public UObject because this is an interf
8282
*/
8383
virtual const UnicodeString* lookup(const UnicodeString& s) const = 0;
8484

85+
/**
86+
* Returns a pointer to a pre-parsed set associated with the variable with
87+
* the given name, or null if there is no such set (as is the case for an
88+
* undefined or string-valued variable).
89+
* @internal
90+
*/
91+
virtual const UnicodeSet* lookupSet(const UnicodeString& /*name*/) const {
92+
return nullptr;
93+
}
94+
8595
/**
8696
* Lookup the UnicodeMatcher associated with the given character, and
8797
* return it. Return <tt>nullptr</tt> if not found.

icu4c/source/common/uniset_props.cpp

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -475,11 +475,23 @@ class UnicodeSet::Lexer {
475475
}
476476
if (first == u'$' && symbols_ != nullptr) {
477477
auto nameEnd = parsePosition_;
478+
// The SymbolTable defines the lexing of variable names past the $.
478479
if (UnicodeString name = symbols_->parseReference(pattern_, nameEnd, pattern_.length());
479480
!name.isEmpty()) {
480481
chars_.jumpahead(nameEnd.getIndex() - (start + 1));
481482
const std::u16string_view source =
482483
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start);
484+
const UnicodeSet *precomputedSet = symbols_->lookupSet(name);
485+
if (precomputedSet != nullptr) {
486+
return LexicalElement(LexicalElement::VARIABLE, {}, getPos(), U_ZERO_ERROR,
487+
precomputedSet, /*set=*/{}, source);
488+
}
489+
// The variable was not a precomputed set. Use the old-fashioned `lookup`, which
490+
// should give us its source text; if that parses as a single set or element, use
491+
// it. Note that variables are not allowed in that expansion.
492+
// Implementers of higher-level syntaxes that pre-parse UnicodeSet-valued variables
493+
// can use variables in their variable definitions, but those that simply use the
494+
// source text substitution API cannot.
483495
const UnicodeString *const expression = symbols_->lookup(name);
484496
if (expression == nullptr) {
485497
return LexicalElement(
@@ -636,22 +648,6 @@ class UnicodeSet::Lexer {
636648
}
637649
switch (variableToken.category_) {
638650
case LexicalElement::LITERAL_ELEMENT:
639-
// Until ICU-23297 is fixed, a variable may expand to a single (normally private-use)
640-
// code point that stands for a precomputed set. As a change from ICU 78 and earlier,
641-
// this mechanism is not used outside of variable expansion, and is effectively an
642-
// implementation detail to make a variable represent a set without reparsing that set
643-
// every time the variable is substituted.
644-
if (U_SUCCESS(variableToken.errorCode())) {
645-
UnicodeSet const *standIn = dynamic_cast<const UnicodeSet *>(
646-
symbols_->lookupMatcher(*variableToken.codePoint()));
647-
if (standIn != nullptr) {
648-
return LexicalElement(LexicalElement::VARIABLE, {}, getPos(), U_ZERO_ERROR,
649-
/*precomputedSet=*/standIn, {}, source);
650-
}
651-
}
652-
// The character was not a stand-in, this variable actually expands to a literal-element;
653-
// fall through to the case where we bubble up the lexical element.
654-
[[fallthrough]];
655651
case LexicalElement::ESCAPED_ELEMENT:
656652
case LexicalElement::NAMED_ELEMENT:
657653
case LexicalElement::BRACKETED_ELEMENT:

0 commit comments

Comments
 (0)