Skip to content

Commit 3599f5b

Browse files
authored
Add support in Commons CSV for tracking byte positions during parsing (#12)
Add support in Commons CSV for tracking byte positions during parsing
1 parent b244cb1 commit 3599f5b

File tree

6 files changed

+110
-122
lines changed

6 files changed

+110
-122
lines changed

pom.xml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
<url>https://commons.apache.org/proper/commons-csv/</url>
2929
<inceptionYear>2005</inceptionYear>
3030
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
31-
<packaging>jar</packaging>
3231

3332
<dependencies>
3433
<dependency>

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,30 +2097,6 @@ public CSVParser parse(final Reader reader) throws IOException {
20972097
return CSVParser.builder().setReader(reader).setFormat(this).get();
20982098
}
20992099

2100-
/**
2101-
* Parses the specified content.
2102-
*
2103-
* <p>
2104-
* This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number,
2105-
* using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
2106-
* </p>
2107-
*
2108-
* <p>
2109-
* For additional parsing options, see the various static parse methods available on {@link CSVParser}.
2110-
* </p>
2111-
*
2112-
* @param reader the input stream
2113-
* @param characterOffset the character offset to start parsing from
2114-
* @param recordNumber the initial record number to start counting from
2115-
* @param encoding the character encoding of the input stream
2116-
* @return a parser over a stream of {@link CSVRecord}s.
2117-
* @throws IOException If an I/O error occurs
2118-
* @throws CSVException Thrown on invalid input.
2119-
*/
2120-
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
2121-
return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
2122-
}
2123-
21242100
/**
21252101
* Prints to the specified output.
21262102
*

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
153153
private CSVFormat format;
154154
private long characterOffset;
155155
private long recordNumber = 1;
156+
private Charset charset;
156157

157158
/**
158159
* Constructs a new instance.
@@ -164,7 +165,7 @@ protected Builder() {
164165
@SuppressWarnings("resource")
165166
@Override
166167
public CSVParser get() throws IOException {
167-
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
168+
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, charset);
168169
}
169170

170171
/**
@@ -200,6 +201,16 @@ public Builder setRecordNumber(final long recordNumber) {
200201
return asThis();
201202
}
202203

204+
/**
205+
* Sets the character encoding used to calculate the number of bytes consumed by each character read from the reader.
206+
*
207+
* @param charset the character encoding.
208+
* @return this instance.
209+
*/
210+
public Builder setCharset(final Charset charset) {
211+
this.charset = charset;
212+
return asThis();
213+
}
203214
}
204215

205216
final class CSVRecordIterator implements Iterator<CSVRecord> {
@@ -510,7 +521,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
510521
this(reader, format, characterOffset, recordNumber, null);
511522
}
512523

513-
/**
524+
/**
514525
* Constructs a new instance using the given {@link CSVFormat}
515526
*
516527
* <p>
@@ -525,21 +536,22 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
525536
* @param characterOffset
526537
* Lexer offset when the parser does not start parsing at the beginning of the source.
527538
* @param recordNumber
528-
* The next record number to assign
529-
* @param encoding
530-
* The encoding to use for the reader
539+
* The next record number to assign.
540+
* @param charset
541+
* The character encoding to be used for the reader.
531542
* @throws IllegalArgumentException
532543
* If the parameters of the format are inconsistent or if either the reader or format is null.
533544
* @throws IOException
534-
* If there is a problem reading the header or skipping the first record
545+
* If there is a problem reading the header or skipping the first record.
535546
* @throws CSVException Thrown on invalid input.
547+
* @since 1.13.0
536548
*/
537-
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
538-
String encoding) throws IOException {
549+
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset)
550+
throws IOException {
539551
Objects.requireNonNull(reader, "reader");
540552
Objects.requireNonNull(format, "format");
541553
this.format = format.copy();
542-
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
554+
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset));
543555
this.csvRecordIterator = new CSVRecordIterator();
544556
this.headers = createHeaders();
545557
this.characterOffset = characterOffset;

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,12 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
5353
private long position;
5454
private long positionMark;
5555

56-
/** The number of bytes read so far */
56+
/** The number of bytes read so far. */
5757
private long bytesRead;
5858
private long bytesReadMark;
5959

60-
/** Encoder used to calculate the bytes of characters */
61-
CharsetEncoder encoder;
60+
/** Encoder for calculating the number of bytes for each character read. */
61+
private CharsetEncoder encoder;
6262

6363
/**
6464
* Constructs a new instance using the default buffer size.
@@ -67,10 +67,10 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
6767
super(reader);
6868
}
6969

70-
ExtendedBufferedReader(final Reader reader, String encoding) {
70+
ExtendedBufferedReader(final Reader reader, Charset charset) {
7171
super(reader);
72-
if (encoding != null) {
73-
encoder = Charset.forName(encoding).newEncoder();
72+
if (charset != null) {
73+
encoder = charset.newEncoder();
7474
}
7575
}
7676

@@ -146,20 +146,30 @@ public int read() throws IOException {
146146
}
147147

148148
/**
149-
* In Java, a char data type are based on the original Unicode
150-
* specification, which defined characters as fixed-width 16-bit entities.
151-
* U+0000 to U+FFFF:
152-
* - BMP, represented using 1 16-bit char
153-
* - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars
154-
* U+10000 to U+10FFFF:
155-
* - Supplementary characters, represented as a pair of characters,
156-
* the first char from the high-surrogates range (\uD800-\uDBFF),
157-
* and the second char from the low-surrogates range (uDC00-\uDFFF).
158-
* - Consists of UTF-8 some 3-byte chars and 4-byte chars
149+
* In Java, the {@code char} data type is based on the original Unicode
150+
* specification, which defined characters as fixed-width 16-bit entities.
151+
* <p>
152+
* The Unicode characters are divided into two main ranges:
153+
* <ul>
154+
* <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
155+
* <ul>
156+
* <li>Represented using a single 16-bit {@code char}.</li>
157+
* <li>Covers every code point that UTF-8 encodes in 1, 2, or 3 bytes.</li>
158+
* </ul>
159+
* </li>
160+
* <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
161+
* <ul>
162+
* <li>Represented as a pair of {@code char}s:</li>
163+
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
164+
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
165+
* <li>Each of these code points is encoded in UTF-8 as exactly 4 bytes.</li>
166+
* </ul>
167+
* </li>
168+
* </ul>
159169
*/
160170
private long getCharBytes(int current) throws CharacterCodingException {
161-
char cChar = (char) current;
162-
char lChar = (char) lastChar;
171+
final char cChar = (char) current;
172+
final char lChar = (char) lastChar;
163173
if (!Character.isSurrogate(cChar)) {
164174
return encoder.encode(
165175
CharBuffer.wrap(new char[] {cChar})).limit();

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 46 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -707,38 +707,34 @@ public void testGetRecordThreeBytesRead() throws Exception {
707707
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
708708
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
709709
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
710-
// String code = "'1',4";
711-
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
712710
final CSVFormat format = CSVFormat.Builder.create()
713-
.setDelimiter(',')
714-
.setQuote('\'')
715-
.build();
716-
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
717-
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");
718-
719-
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
720-
assertEquals(0, parser.getRecordNumber());
721-
assertNotNull(record = parser.nextRecord());
722-
assertEquals(1, record.getRecordNumber());
723-
assertEquals(code.indexOf('i'), record.getCharacterPosition());
724-
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
711+
.setDelimiter(',')
712+
.setQuote('\'')
713+
.get();
714+
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get() ) {
715+
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
725716

726-
assertNotNull(record = parser.nextRecord());
727-
assertEquals(2, record.getRecordNumber());
728-
assertEquals(code.indexOf('1'), record.getCharacterPosition());
729-
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
717+
assertEquals(0, parser.getRecordNumber());
718+
assertNotNull(record = parser.nextRecord());
719+
assertEquals(1, record.getRecordNumber());
720+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
721+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
730722

731-
assertNotNull(record = parser.nextRecord());
732-
assertEquals(3, record.getRecordNumber());
733-
assertEquals(code.indexOf('2'), record.getCharacterPosition());
734-
assertEquals(record.getCharacterByte(), 95);
723+
assertNotNull(record = parser.nextRecord());
724+
assertEquals(2, record.getRecordNumber());
725+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
726+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
735727

736-
assertNotNull(record = parser.nextRecord());
737-
assertEquals(4, record.getRecordNumber());
738-
assertEquals(code.indexOf('3'), record.getCharacterPosition());
739-
assertEquals(record.getCharacterByte(), 154);
728+
assertNotNull(record = parser.nextRecord());
729+
assertEquals(3, record.getRecordNumber());
730+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
731+
assertEquals(record.getCharacterByte(), 95);
740732

741-
parser.close();
733+
assertNotNull(record = parser.nextRecord());
734+
assertEquals(4, record.getRecordNumber());
735+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
736+
assertEquals(record.getCharacterByte(), 154);
737+
};
742738

743739
}
744740

@@ -748,35 +744,32 @@ public void testGetRecordFourBytesRead() throws Exception {
748744
"1,😊,🤔,😂\n" +
749745
"2,😊,🤔,😂\n" +
750746
"3,😊,🤔,😂\n";
751-
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
752747
final CSVFormat format = CSVFormat.Builder.create()
753748
.setDelimiter(',')
754749
.setQuote('\'')
755-
.build();
756-
757-
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
758-
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");
759-
760-
CSVRecord record;
761-
assertEquals(0, parser.getRecordNumber());
762-
assertNotNull(record = parser.nextRecord());
763-
assertEquals(1, record.getRecordNumber());
764-
assertEquals(code.indexOf('i'), record.getCharacterPosition());
765-
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
766-
767-
assertNotNull(record = parser.nextRecord());
768-
assertEquals(2, record.getRecordNumber());
769-
assertEquals(code.indexOf('1'), record.getCharacterPosition());
770-
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
771-
assertNotNull(record = parser.nextRecord());
772-
assertEquals(3, record.getRecordNumber());
773-
assertEquals(code.indexOf('2'), record.getCharacterPosition());
774-
assertEquals(record.getCharacterByte(), 26);
775-
assertNotNull(record = parser.nextRecord());
776-
assertEquals(4, record.getRecordNumber());
777-
assertEquals(code.indexOf('3'), record.getCharacterPosition());
778-
assertEquals(record.getCharacterByte(), 43);
779-
parser.close();
750+
.get();
751+
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) {
752+
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
753+
754+
assertEquals(0, parser.getRecordNumber());
755+
assertNotNull(record = parser.nextRecord());
756+
assertEquals(1, record.getRecordNumber());
757+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
758+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
759+
760+
assertNotNull(record = parser.nextRecord());
761+
assertEquals(2, record.getRecordNumber());
762+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
763+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
764+
assertNotNull(record = parser.nextRecord());
765+
assertEquals(3, record.getRecordNumber());
766+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
767+
assertEquals(record.getCharacterByte(), 26);
768+
assertNotNull(record = parser.nextRecord());
769+
assertEquals(4, record.getRecordNumber());
770+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
771+
assertEquals(record.getCharacterByte(), 43);
772+
}
780773
}
781774

782775
@Test

src/test/java/org/apache/commons/csv/JiraCsv196Test.java

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,24 +21,23 @@
2121
import java.io.IOException;
2222
import java.io.InputStreamReader;
2323
import java.io.Reader;
24-
24+
import java.nio.charset.StandardCharsets;
2525

2626
import org.junit.jupiter.api.Test;
2727

2828

2929
public class JiraCsv196Test {
3030
@Test
3131
public void parseThreeBytes() throws IOException {
32-
33-
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
3432
final CSVFormat format = CSVFormat.Builder.create()
35-
.setDelimiter(',')
36-
.setQuote('\'')
37-
.build();
38-
// CSVParser parser = new CSVParser(getTestInput(
39-
// "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8");
40-
CSVParser parser = format.parse(getTestInput(
41-
"org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8");
33+
.setDelimiter(',')
34+
.setQuote('\'')
35+
.get();
36+
CSVParser parser = new CSVParser.Builder()
37+
.setFormat(format)
38+
.setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv"))
39+
.setCharset(StandardCharsets.UTF_8)
40+
.get();
4241
long[] charByteKey = {0, 89, 242, 395};
4342
int idx = 0;
4443
for (CSVRecord record : parser) {
@@ -50,15 +49,15 @@ public void parseThreeBytes() throws IOException {
5049

5150
@Test
5251
public void parseFourBytes() throws IOException {
53-
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
5452
final CSVFormat format = CSVFormat.Builder.create()
5553
.setDelimiter(',')
5654
.setQuote('\'')
57-
.build();
58-
59-
CSVParser parser = format.parse(getTestInput(
60-
"org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8");
61-
55+
.get();
56+
CSVParser parser = new CSVParser.Builder()
57+
.setFormat(format)
58+
.setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv"))
59+
.setCharset(StandardCharsets.UTF_8)
60+
.get();
6261
long[] charByteKey = {0, 84, 701, 1318, 1935};
6362
int idx = 0;
6463
for (CSVRecord record : parser) {
@@ -67,7 +66,6 @@ public void parseFourBytes() throws IOException {
6766
parser.close();
6867
}
6968

70-
7169
private Reader getTestInput(String path) {
7270
return new InputStreamReader(
7371
ClassLoader.getSystemClassLoader().getResourceAsStream(path));

0 commit comments

Comments
 (0)