@@ -1107,12 +1107,10 @@ function getFastStartsWithQuery(lhs: AQLFragment, rhsValue: string): AQLFragment
11071107 return aql `IS_STRING(${ lhs } )` ;
11081108 }
11091109
1110- // this works as long as the highest possible code point is also the last one in the collation
1111- const maxChar = String . fromCodePoint ( 0x10ffff ) ;
1112- const maxStr = rhsValue + maxChar ;
1113-
1114- // UPPER is used to get the "smallest" representation of the value case-sensitive, LOWER for the "largest".
1115- // the ordering looks like this:
1110+ // this function assumes that a collator is used that places upper and lower case of characters next to each other.
1111+ // It also assumes that strings are always placed immediately after all case-variants of its one-character shorter prefix.
1112+ // It also assumes that U+10FFFF (the highest Unicode code point) is always the last character in the collation order.
1113+ // For example, this is the sort behavior if arangodb is started with --default-language=en_US:
11161114 // [
11171115 // "A",
11181116 // "a",
@@ -1127,13 +1125,33 @@ function getFastStartsWithQuery(lhs: AQLFragment, rhsValue: string): AQLFragment
11271125 // "B",
11281126 // "b"
11291127 // ]
1130- // This means that if the actual value is longer than the given prefix (i.e. it's a real prefix and not the whole
1131- // string), the match will be case-insensitive. However, if the remaining suffix if empty, the search would
1132- // sometimes be case-sensitive: If you search for the prefix a, A will not be found (because A < a), but a will
1133- // match the prefix filter A. In order to avoid this, one needs to convert the given string to the lowest value
1134- // within its case-sensitivity category. For ASCII characters, that's simply UPPER(), but that will not always be
1135- // the case. The same thing applies to the upper bound.
1136- return aql `(${ lhs } >= UPPER(${ rhsValue } ) && ${ lhs } < LOWER(${ maxStr } ))` ;
1128+ // value >= "A" && value <= "a" therefore is identical to LOWER(value) == LOWER("a")
1129+ // value >= "A" && value < "a\u10ffff" is identical to STARTS_WITH(LOWER(value), LOWER("a"))
1130+
1131+ // IMPORTANT: these assumptions are not true for the en_US_POSIX languages (neither with --icu-language, nor
1132+ // with --default-language). en_US_POSIX is very close to the code points, so it places all uppercase characters
1133+ // together, followed by all lowercase characters ("A", "B", ..., "Z", ..., "a", "b", ...)
1134+ // so we currently do not support the en_US_POSIX language.
1135+
1136+ // Other collators have lowercase first, then uppercase, e.g. --icu-language=en_US.
1137+ // To support both variants, we compute the MIN and MAX of UPPER and LOWER
1138+ // arangodb will evaluate those MIN/MAX expressions statically so it can still use them as index ranges
1139+ // (remove-unnecessary-calculations-2)
1140+ // rhs is always a string literal (and usually rather short, since it's just a filter condition),
1141+ // so it's not really a problem to repeat it
1142+
1143+ // this works as long as the highest possible code point is also the last one in the collation
1144+ const maxChar = String . fromCodePoint ( 0x10ffff ) ;
1145+ const maxStr = rhsValue + maxChar ;
1146+
1147+ // if the prefix e.g. only consists of digits, it's even easier because we don't need to case-convert it
1148+ if ( isStringCaseInsensitive ( rhsValue ) ) {
1149+ return aql `(${ lhs } >= ${ rhsValue } && ${ lhs } < ${ maxStr } )` ;
1150+ }
1151+
1152+ const lowerBoundFrag = aql `MIN([UPPER(${ rhsValue } ), LOWER(${ rhsValue } )])` ;
1153+ const upperBoundFrag = aql `MAX([LOWER(${ maxStr } ), UPPER(${ maxStr } )])` ;
1154+ return aql `(${ lhs } >= ${ lowerBoundFrag } && ${ lhs } < ${ upperBoundFrag } )` ;
11371155}
11381156
11391157function getEqualsIgnoreCaseQuery ( lhs : AQLFragment , rhsValue : string ) : AQLFragment {
@@ -1143,8 +1161,8 @@ function getEqualsIgnoreCaseQuery(lhs: AQLFragment, rhsValue: string): AQLFragme
11431161 }
11441162
11451163 // w.r.t. UPPER/LOWER, see the comment in getFastStartsWithQuery
1146- const lowerBoundFrag = aql `UPPER(${ rhsValue } )` ;
1147- const upperBoundFrag = aql `LOWER(${ rhsValue } )` ;
1164+ const lowerBoundFrag = aql `MIN([ UPPER(${ rhsValue } ), LOWER( ${ rhsValue } )] )` ;
1165+ const upperBoundFrag = aql `MAX([ LOWER(${ rhsValue } ), UPPER( ${ rhsValue } )] )` ;
11481166 return aql `(${ lhs } >= ${ lowerBoundFrag } && ${ lhs } <= ${ upperBoundFrag } )` ;
11491167}
11501168
0 commit comments