Skip to content

Commit b40039b

Browse files
authored
Merge pull request #502 from marklogic/CSV-196-master
CSV-196-TrackBytePositions
2 parents dd7b4b3 + d403084 commit b40039b

File tree

10 files changed

+321
-7
lines changed

10 files changed

+321
-7
lines changed

pom.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,8 @@
245245
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
246246
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
247247
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
248+
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
249+
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
248250
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
249251
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
250252
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
155155
private CSVFormat format;
156156
private long characterOffset;
157157
private long recordNumber = 1;
158+
private boolean enableByteTracking;
158159

159160
/**
160161
* Constructs a new instance.
@@ -166,7 +167,7 @@ protected Builder() {
166167
@SuppressWarnings("resource")
167168
@Override
168169
public CSVParser get() throws IOException {
169-
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
170+
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking);
170171
}
171172

172173
/**
@@ -202,6 +203,18 @@ public Builder setRecordNumber(final long recordNumber) {
202203
return asThis();
203204
}
204205

206+
/**
207+
* Sets whether to enable byte tracking for the parser.
208+
*
209+
* @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
210+
* @return this instance.
211+
* @since 1.13.0
212+
*/
213+
public Builder setEnableByteTracking(final boolean enableByteTracking) {
214+
this.enableByteTracking = enableByteTracking;
215+
return asThis();
216+
}
217+
205218
}
206219

207220
final class CSVRecordIterator implements Iterator<CSVRecord> {
@@ -510,11 +523,43 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
510523
@Deprecated
511524
@SuppressWarnings("resource")
512525
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
526+
throws IOException {
527+
this(reader, format, characterOffset, recordNumber, null, false);
528+
}
529+
530+
/**
531+
* Constructs a new instance using the given {@link CSVFormat}
532+
*
533+
* <p>
534+
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
535+
* unless you close the {@code reader}.
536+
* </p>
537+
*
538+
* @param reader
539+
* a Reader containing CSV-formatted input. Must not be null.
540+
* @param format
541+
* the CSVFormat used for CSV parsing. Must not be null.
542+
* @param characterOffset
543+
* Lexer offset when the parser does not start parsing at the beginning of the source.
544+
* @param recordNumber
545+
* The next record number to assign.
546+
* @param charset
547+
* The character encoding to be used for the reader when enableByteTracking is true.
548+
* @param enableByteTracking
549+
* {@code true} to enable byte tracking for the parser; {@code false} to disable it.
550+
* @throws IllegalArgumentException
551+
* If the parameters of the format are inconsistent or if either the reader or format is null.
552+
* @throws IOException
553+
* If there is a problem reading the header or skipping the first record.
554+
* @throws CSVException Thrown on invalid input.
555+
*/
556+
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
557+
final Charset charset, final boolean enableByteTracking)
513558
throws IOException {
514559
Objects.requireNonNull(reader, "reader");
515560
Objects.requireNonNull(format, "format");
516561
this.format = format.copy();
517-
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
562+
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking));
518563
this.csvRecordIterator = new CSVRecordIterator();
519564
this.headers = createHeaders();
520565
this.characterOffset = characterOffset;
@@ -841,6 +886,7 @@ CSVRecord nextRecord() throws IOException {
841886
recordList.clear();
842887
StringBuilder sb = null;
843888
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
889+
final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
844890
do {
845891
reusableToken.reset();
846892
lexer.nextToken(reusableToken);
@@ -878,7 +924,7 @@ CSVRecord nextRecord() throws IOException {
878924
recordNumber++;
879925
final String comment = Objects.toString(sb, null);
880926
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
881-
recordNumber, startCharPosition);
927+
recordNumber, startCharPosition, startBytePosition);
882928
}
883929
return result;
884930
}

src/main/java/org/apache/commons/csv/CSVRecord.java

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
5050
*/
5151
private final long characterPosition;
5252

53+
/**
54+
* The starting position of this record in the source stream, measured in bytes.
55+
*/
56+
private final long bytePosition;
57+
5358
/** The accumulated comments (if any) */
5459
private final String comment;
5560

@@ -62,15 +67,15 @@ public final class CSVRecord implements Serializable, Iterable<String> {
6267
/** The parser that originates this record. This is not serialized. */
6368
private final transient CSVParser parser;
6469

65-
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
66-
final long characterPosition) {
70+
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
71+
final long characterPosition, final long bytePosition) {
6772
this.recordNumber = recordNumber;
6873
this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
6974
this.parser = parser;
7075
this.comment = comment;
7176
this.characterPosition = characterPosition;
77+
this.bytePosition = bytePosition;
7278
}
73-
7479
/**
7580
* Returns a value by {@link Enum}.
7681
*
@@ -146,6 +151,16 @@ public long getCharacterPosition() {
146151
return characterPosition;
147152
}
148153

154+
/**
155+
* Returns the starting position of this record in the source stream, measured in bytes.
156+
*
157+
* @return the byte position of this record in the source stream.
158+
* @since 1.13.0
159+
*/
160+
public long getBytePosition() {
161+
return bytePosition;
162+
}
163+
149164
/**
150165
* Returns the comment for this record, if any.
151166
* Note that comments are attached to the following record.

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626

2727
import java.io.IOException;
2828
import java.io.Reader;
29+
import java.nio.CharBuffer;
30+
import java.nio.charset.CharacterCodingException;
31+
import java.nio.charset.Charset;
32+
import java.nio.charset.CharsetEncoder;
2933

3034
import org.apache.commons.io.IOUtils;
3135
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -51,13 +55,36 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
5155
private long position;
5256
private long positionMark;
5357

58+
/** The number of bytes read so far. */
59+
private long bytesRead;
60+
private long bytesReadMark;
61+
62+
/** Encoder for calculating the number of bytes for each character read. */
63+
private CharsetEncoder encoder;
64+
5465
/**
5566
* Constructs a new instance using the default buffer size.
5667
*/
5768
ExtendedBufferedReader(final Reader reader) {
5869
super(reader);
5970
}
6071

72+
/**
73+
* Constructs a new instance with the specified reader, character set,
74+
* and byte tracking option. Initializes an encoder if byte tracking is enabled
75+
* and a character set is provided.
76+
*
77+
* @param reader the reader supports a look-ahead option.
78+
* @param charset the character set for encoding, or {@code null} if not applicable.
79+
* @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
80+
*/
81+
ExtendedBufferedReader(final Reader reader, Charset charset, boolean enableByteTracking) {
82+
super(reader);
83+
if (charset != null && enableByteTracking) {
84+
encoder = charset.newEncoder();
85+
}
86+
}
87+
6188
/**
6289
* Closes the stream.
6390
*
@@ -110,6 +137,7 @@ public void mark(final int readAheadLimit) throws IOException {
110137
lineNumberMark = lineNumber;
111138
lastCharMark = lastChar;
112139
positionMark = position;
140+
bytesReadMark = bytesRead;
113141
super.mark(readAheadLimit);
114142
}
115143

@@ -120,11 +148,59 @@ public int read() throws IOException {
120148
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
121149
lineNumber++;
122150
}
151+
if (encoder != null) {
152+
this.bytesRead += getEncodedCharLength(current);
153+
}
123154
lastChar = current;
124155
position++;
125156
return lastChar;
126157
}
127158

159+
/**
160+
* Gets the byte length of the given character based on the the original Unicode
161+
* specification, which defined characters as fixed-width 16-bit entities.
162+
* <p>
163+
* The Unicode characters are divided into two main ranges:
164+
* <ul>
165+
* <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
166+
* <ul>
167+
* <li>Represented using a single 16-bit {@code char}.</li>
168+
* <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
169+
* </ul>
170+
* </li>
171+
* <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
172+
* <ul>
173+
* <li>Represented as a pair of {@code char}s:</li>
174+
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
175+
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
176+
* <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
177+
* </ul>
178+
* </li>
179+
* </ul>
180+
*
181+
* @param current the current character to process.
182+
* @return the byte length of the character.
183+
* @throws CharacterCodingException if the character cannot be encoded.
184+
*/
185+
private int getEncodedCharLength(int current) throws CharacterCodingException {
186+
final char cChar = (char) current;
187+
final char lChar = (char) lastChar;
188+
if (!Character.isSurrogate(cChar)) {
189+
return encoder.encode(
190+
CharBuffer.wrap(new char[] {cChar})).limit();
191+
} else {
192+
if (Character.isHighSurrogate(cChar)) {
193+
// Move on to the next char (low surrogate)
194+
return 0;
195+
} else if (Character.isSurrogatePair(lChar, cChar)) {
196+
return encoder.encode(
197+
CharBuffer.wrap(new char[] {lChar, cChar})).limit();
198+
} else {
199+
throw new CharacterCodingException();
200+
}
201+
}
202+
}
203+
128204
@Override
129205
public int read(final char[] buf, final int offset, final int length) throws IOException {
130206
if (length == 0) {
@@ -189,7 +265,17 @@ public void reset() throws IOException {
189265
lineNumber = lineNumberMark;
190266
lastChar = lastCharMark;
191267
position = positionMark;
268+
bytesRead = bytesReadMark;
192269
super.reset();
193270
}
194271

272+
/**
273+
* Gets the number of bytes read by the reader.
274+
*
275+
* @return the number of bytes read by the read
276+
*/
277+
long getBytesRead() {
278+
return this.bytesRead;
279+
}
280+
195281
}

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,15 @@ long getCharacterPosition() {
105105
return reader.getPosition();
106106
}
107107

108+
/**
109+
* Gets the number of bytes read
110+
*
111+
* @return the number of bytes read
112+
*/
113+
long getBytesRead() {
114+
return reader.getBytesRead();
115+
}
116+
108117
/**
109118
* Returns the current line number
110119
*

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,76 @@ public void testGetHeaderComment_NoComment3() throws IOException {
703703
}
704704
}
705705

706+
@Test
707+
public void testGetRecordThreeBytesRead() throws Exception {
708+
final String code = "id,date,val5,val4\n" +
709+
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
710+
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
711+
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
712+
final CSVFormat format = CSVFormat.Builder.create()
713+
.setDelimiter(',')
714+
.setQuote('\'')
715+
.get();
716+
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) {
717+
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
718+
719+
assertEquals(0, parser.getRecordNumber());
720+
assertNotNull(record = parser.nextRecord());
721+
assertEquals(1, record.getRecordNumber());
722+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
723+
assertEquals(record.getBytePosition(), record.getCharacterPosition());
724+
725+
assertNotNull(record = parser.nextRecord());
726+
assertEquals(2, record.getRecordNumber());
727+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
728+
assertEquals(record.getBytePosition(), record.getCharacterPosition());
729+
730+
assertNotNull(record = parser.nextRecord());
731+
assertEquals(3, record.getRecordNumber());
732+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
733+
assertEquals(record.getBytePosition(), 95);
734+
735+
assertNotNull(record = parser.nextRecord());
736+
assertEquals(4, record.getRecordNumber());
737+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
738+
assertEquals(record.getBytePosition(), 154);
739+
}
740+
}
741+
742+
@Test
743+
public void testGetRecordFourBytesRead() throws Exception {
744+
final String code = "id,a,b,c\n" +
745+
"1,😊,🤔,😂\n" +
746+
"2,😊,🤔,😂\n" +
747+
"3,😊,🤔,😂\n";
748+
final CSVFormat format = CSVFormat.Builder.create()
749+
.setDelimiter(',')
750+
.setQuote('\'')
751+
.get();
752+
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) {
753+
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
754+
755+
assertEquals(0, parser.getRecordNumber());
756+
assertNotNull(record = parser.nextRecord());
757+
assertEquals(1, record.getRecordNumber());
758+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
759+
assertEquals(record.getBytePosition(), record.getCharacterPosition());
760+
761+
assertNotNull(record = parser.nextRecord());
762+
assertEquals(2, record.getRecordNumber());
763+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
764+
assertEquals(record.getBytePosition(), record.getCharacterPosition());
765+
assertNotNull(record = parser.nextRecord());
766+
assertEquals(3, record.getRecordNumber());
767+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
768+
assertEquals(record.getBytePosition(), 26);
769+
assertNotNull(record = parser.nextRecord());
770+
assertEquals(4, record.getRecordNumber());
771+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
772+
assertEquals(record.getBytePosition(), 43);
773+
}
774+
}
775+
706776
@Test
707777
public void testGetHeaderMap() throws Exception {
708778
try (CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {

src/test/java/org/apache/commons/csv/CSVRecordTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ record = parser.iterator().next();
8787
@Test
8888
public void testCSVRecordNULLValues() throws IOException {
8989
try (CSVParser parser = CSVParser.parse("A,B\r\nONE,TWO", CSVFormat.DEFAULT.withHeader())) {
90-
final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L);
90+
final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L, 0L);
9191
assertEquals(0, csvRecord.size());
9292
assertThrows(IllegalArgumentException.class, () -> csvRecord.get("B"));
9393
}

0 commit comments

Comments
 (0)