Skip to content

Commit 1f44cc5

Browse files
authored
Add comma escape character (#104274)
* Add escape character for comma in CsvTestsDataLoader * Handle escaped commas in .csv-spec values as well * Change the way the indices are populated when running the loader from command line
1 parent ca919d9 commit 1f44cc5

File tree

2 files changed

+93
-22
lines changed

2 files changed

+93
-22
lines changed

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ public final class CsvTestUtils {
6161
private static final int MAX_WIDTH = 20;
6262
private static final CsvPreference CSV_SPEC_PREFERENCES = new CsvPreference.Builder('"', '|', "\r\n").build();
6363
private static final String NULL_VALUE = "null";
64+
private static final char ESCAPE_CHAR = '\\';
65+
public static final String COMMA_ESCAPING_REGEX = "(?<!\\" + ESCAPE_CHAR + "),";
66+
public static final String ESCAPED_COMMA_SEQUENCE = ESCAPE_CHAR + ",";
6467

6568
private CsvTestUtils() {}
6669

@@ -229,6 +232,8 @@ public void close() {
229232
* Takes a csv String and converts it to a String array. Also, it recognizes an opening bracket "[" in one string and a closing "]"
230233
* in another string and it creates a single concatenated comma-separated String of all the values between the opening bracket entry
231234
* and the closing bracket entry. In other words, entries enclosed by "[]" are returned as a single element.
235+
*
236+
* Commas can be escaped with \ (backslash) character.
232237
*/
233238
static String[] multiValuesAwareCsvToStringArray(String csvLine, int lineNumber) {
234239
var mvCompressedEntries = new ArrayList<String>();
@@ -237,14 +242,20 @@ static String[] multiValuesAwareCsvToStringArray(String csvLine, int lineNumber)
237242

238243
int pos = 0; // current position in the csv String
239244
int commaPos; // current "," character position
245+
int previousCommaPos = 0;
240246
while ((commaPos = csvLine.indexOf(",", pos)) != -1 || pos <= csvLine.length()) {
247+
if (commaPos > 0 && csvLine.charAt(commaPos - 1) == ESCAPE_CHAR) {// skip the escaped comma
248+
pos = commaPos + 1;// moving on to the next character after comma
249+
continue;
250+
}
251+
241252
boolean isLastElement = commaPos == -1;
242-
String entry = csvLine.substring(pos, isLastElement ? csvLine.length() : commaPos).trim();
253+
String entry = csvLine.substring(previousCommaPos, isLastElement ? csvLine.length() : commaPos).trim();
243254
if (entry.startsWith("[")) {
244255
if (previousMvValue != null || (isLastElement && entry.endsWith("]") == false)) {
245256
String message = "Error line [{}:{}]: Unexpected start of a multi-value field value; current token [{}], "
246257
+ (isLastElement ? "no closing point" : "previous token [{}]");
247-
throw new IllegalArgumentException(format(message, lineNumber, pos, entry, previousMvValue));
258+
throw new IllegalArgumentException(format(message, lineNumber, previousCommaPos, entry, previousMvValue));
248259
}
249260
if (entry.endsWith("]")) {
250261
if (entry.length() > 2) {// single-valued multivalue field :shrug:
@@ -263,7 +274,7 @@ static String[] multiValuesAwareCsvToStringArray(String csvLine, int lineNumber)
263274
format(
264275
"Error line [{}:{}]: Unexpected end of a multi-value field value (no previous starting point); found [{}]",
265276
lineNumber,
266-
pos,
277+
previousCommaPos,
267278
entry
268279
)
269280
);
@@ -279,8 +290,8 @@ static String[] multiValuesAwareCsvToStringArray(String csvLine, int lineNumber)
279290
format(
280291
"Error line [{}:{}]: Unexpected missing value in a multi-value column; found [{}]",
281292
lineNumber,
282-
pos,
283-
csvLine.substring(pos - 1)
293+
previousCommaPos,
294+
csvLine.substring(previousCommaPos - 1)
284295
)
285296
);
286297
}
@@ -290,12 +301,22 @@ static String[] multiValuesAwareCsvToStringArray(String csvLine, int lineNumber)
290301
}
291302
}
292303
pos = 1 + (isLastElement ? csvLine.length() : commaPos);// break out of the loop if it reached its last element
304+
previousCommaPos = pos;
293305
}
294306
return mvCompressedEntries.toArray(String[]::new);
295307
}
296308

297309
public record ExpectedResults(List<String> columnNames, List<Type> columnTypes, List<List<Object>> values) {}
298310

311+
/**
312+
* The method loads a section of a .csv-spec file representing the results of executing the query of that section.
313+
* It reads both the schema (field names and their types) and the row values.
314+
* Values starting with an opening square bracket and ending with a closing square bracket are considered multi-values. Inside
315+
* these multi-values, commas separate the individual values and escaped commas are allowed with a prefixed \
316+
* default \ (backslash) character.
317+
* @param csv a string representing the header and row values of a single query execution result
318+
* @return data structure with column names, their types and values
319+
*/
299320
public static ExpectedResults loadCsvSpecValues(String csv) {
300321
List<String> columnNames;
301322
List<Type> columnTypes;
@@ -338,13 +359,21 @@ public static ExpectedResults loadCsvSpecValues(String csv) {
338359
if (value.startsWith("[") ^ value.endsWith("]")) {
339360
throw new IllegalArgumentException("Incomplete multi-value (opening and closing square brackets) found " + value);
340361
}
341-
if (value.contains(",") && value.startsWith("[")) {// commas outside a multi-value should be ok
342-
List<Object> listOfMvValues = new ArrayList<>();
343-
for (String mvValue : delimitedListToStringArray(value.substring(1, value.length() - 1), ",")) {
344-
listOfMvValues.add(columnTypes.get(i).convert(mvValue.trim()));
362+
if (value.contains(",") && value.startsWith("[")) {
363+
// split on commas but ignoring escaped commas
364+
String[] multiValues = value.substring(1, value.length() - 1).split(COMMA_ESCAPING_REGEX);
365+
if (multiValues.length > 0) {
366+
List<Object> listOfMvValues = new ArrayList<>();
367+
for (String mvValue : multiValues) {
368+
listOfMvValues.add(columnTypes.get(i).convert(mvValue.trim().replace(ESCAPED_COMMA_SEQUENCE, ",")));
369+
}
370+
rowValues.add(listOfMvValues);
371+
} else {
372+
rowValues.add(columnTypes.get(i).convert(value.replace(ESCAPED_COMMA_SEQUENCE, ",")));
345373
}
346-
rowValues.add(listOfMvValues);
347374
} else {
375+
// The value considered here is the one where any potential escaped comma is kept as is (with the escape char)
376+
// TODO if we'd want escaped commas outside multi-values fields, we'd have to adjust this value here as well
348377
rowValues.add(columnTypes.get(i).convert(value));
349378
}
350379
}

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,9 @@
4242
import java.util.Map;
4343
import java.util.Set;
4444

45-
import static org.elasticsearch.common.Strings.delimitedListToStringArray;
4645
import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
46+
import static org.elasticsearch.xpack.esql.CsvTestUtils.COMMA_ESCAPING_REGEX;
47+
import static org.elasticsearch.xpack.esql.CsvTestUtils.ESCAPED_COMMA_SEQUENCE;
4748
import static org.elasticsearch.xpack.esql.CsvTestUtils.multiValuesAwareCsvToStringArray;
4849

4950
public class CsvTestsDataLoader {
@@ -137,17 +138,33 @@ public static void main(String[] args) throws IOException {
137138
}
138139

139140
try (RestClient client = builder.build()) {
140-
loadDataSetIntoEs(client);
141+
loadDataSetIntoEs(client, (restClient, indexName, indexMapping) -> {
142+
Request request = new Request("PUT", "/" + indexName);
143+
request.setJsonEntity("{\"mappings\":" + indexMapping + "}");
144+
restClient.performRequest(request);
145+
});
141146
}
142147
}
143148

149+
private static void loadDataSetIntoEs(RestClient client, IndexCreator indexCreator) throws IOException {
150+
loadDataSetIntoEs(client, LogManager.getLogger(CsvTestsDataLoader.class), indexCreator);
151+
}
152+
144153
public static void loadDataSetIntoEs(RestClient client) throws IOException {
145-
loadDataSetIntoEs(client, LogManager.getLogger(CsvTestsDataLoader.class));
154+
loadDataSetIntoEs(client, (restClient, indexName, indexMapping) -> {
155+
ESRestTestCase.createIndex(restClient, indexName, null, indexMapping, null);
156+
});
146157
}
147158

148159
public static void loadDataSetIntoEs(RestClient client, Logger logger) throws IOException {
160+
loadDataSetIntoEs(client, logger, (restClient, indexName, indexMapping) -> {
161+
ESRestTestCase.createIndex(restClient, indexName, null, indexMapping, null);
162+
});
163+
}
164+
165+
private static void loadDataSetIntoEs(RestClient client, Logger logger, IndexCreator indexCreator) throws IOException {
149166
for (var dataSet : CSV_DATASET_MAP.values()) {
150-
load(client, dataSet.indexName, "/" + dataSet.mappingFileName, "/" + dataSet.dataFileName, logger);
167+
load(client, dataSet.indexName, "/" + dataSet.mappingFileName, "/" + dataSet.dataFileName, logger, indexCreator);
151168
}
152169
forceMerge(client, CSV_DATASET_MAP.keySet(), logger);
153170
for (var policy : ENRICH_POLICIES) {
@@ -169,7 +186,14 @@ private static void loadEnrichPolicy(RestClient client, String policyName, Strin
169186
client.performRequest(request);
170187
}
171188

172-
private static void load(RestClient client, String indexName, String mappingName, String dataName, Logger logger) throws IOException {
189+
private static void load(
190+
RestClient client,
191+
String indexName,
192+
String mappingName,
193+
String dataName,
194+
Logger logger,
195+
IndexCreator indexCreator
196+
) throws IOException {
173197
URL mapping = CsvTestsDataLoader.class.getResource(mappingName);
174198
if (mapping == null) {
175199
throw new IllegalArgumentException("Cannot find resource " + mappingName);
@@ -178,14 +202,10 @@ private static void load(RestClient client, String indexName, String mappingName
178202
if (data == null) {
179203
throw new IllegalArgumentException("Cannot find resource " + dataName);
180204
}
181-
createTestIndex(client, indexName, readTextFile(mapping));
205+
indexCreator.createIndex(client, indexName, readTextFile(mapping));
182206
loadCsvData(client, indexName, data, CsvTestsDataLoader::createParser, logger);
183207
}
184208

185-
private static void createTestIndex(RestClient client, String indexName, String mapping) throws IOException {
186-
ESRestTestCase.createIndex(client, indexName, null, mapping, null);
187-
}
188-
189209
public static String readTextFile(URL resource) throws IOException {
190210
try (BufferedReader reader = TestUtils.reader(resource)) {
191211
StringBuilder b = new StringBuilder();
@@ -198,6 +218,20 @@ public static String readTextFile(URL resource) throws IOException {
198218
}
199219

200220
@SuppressWarnings("unchecked")
221+
/**
222+
* Loads a classic csv file in an ES cluster using a RestClient.
223+
* The structure of the file is as follows:
224+
* - commented lines should start with "//"
225+
* - the first non-comment line from the file is the schema line (comma separated field_name:ES_data_type elements)
226+
* - sub-fields should be placed after the root field using a dot notation for the name:
227+
* root_field:long,root_field.sub_field:integer
228+
* - a special _id field can be used in the schema and the values of this field will be used in the bulk request as actual doc ids
229+
* - all subsequent non-comment lines represent the values that will be used to build the _bulk request
230+
* - an empty string "" refers to a null value
231+
* - a value starting with an opening square bracket "[" and ending with a closing square bracket "]" refers to a multi-value field
232+
* - multi-values are comma separated
233+
* - commas inside multivalue fields can be escaped with \ (backslash) character
234+
*/
201235
private static void loadCsvData(
202236
RestClient client,
203237
String indexName,
@@ -278,9 +312,11 @@ private static void loadCsvData(
278312
if (i > 0 && row.length() > 0) {
279313
row.append(",");
280314
}
281-
if (entries[i].contains(",")) {// multi-value
315+
// split on comma ignoring escaped commas
316+
String[] multiValues = entries[i].split(COMMA_ESCAPING_REGEX);
317+
if (multiValues.length > 0) {// multi-value
282318
StringBuilder rowStringValue = new StringBuilder("[");
283-
for (String s : delimitedListToStringArray(entries[i], ",")) {
319+
for (String s : multiValues) {
284320
rowStringValue.append("\"" + s + "\",");
285321
}
286322
// remove the last comma and put a closing bracket instead
@@ -289,6 +325,8 @@ private static void loadCsvData(
289325
} else {
290326
entries[i] = "\"" + entries[i] + "\"";
291327
}
328+
// replace any escaped commas with single comma
329+
entries[i].replace(ESCAPED_COMMA_SEQUENCE, ",");
292330
row.append("\"" + columns[i] + "\":" + entries[i]);
293331
} catch (Exception e) {
294332
throw new IllegalArgumentException(
@@ -356,4 +394,8 @@ private static XContentParser createParser(XContent xContent, InputStream data)
356394
public record TestsDataset(String indexName, String mappingFileName, String dataFileName) {}
357395

358396
public record EnrichConfig(String policyName, String policyFileName) {}
397+
398+
private interface IndexCreator {
399+
void createIndex(RestClient client, String indexName, String mapping) throws IOException;
400+
}
359401
}

0 commit comments

Comments
 (0)