Skip to content

Commit 9ffe6b9

Browse files
sideshowbarker authored and hsivonen committed
Enable control for tokenizer buffer size
This change adds an optional bufferSize parameter to the tokenize(), parse(), and parseFragment() methods of sax.HtmlParser instances. That bufferSize parameter controls the size of the buffer which gets fed to the tokenizer. The control provided by that parameter allows the tokenizer buffer to be set, for example, to "1" — and that is particularly useful for emulating the behavior of the Firefox HTML parser, which feeds the tokenizer one single code unit at a time. Otherwise, without this change, the tokenizer buffer size for HtmlParser instances is hardcoded to 2048.
1 parent 6b65d84 commit 9ffe6b9

File tree

2 files changed

+76
-7
lines changed

2 files changed

+76
-7
lines changed

src/nu/validator/htmlparser/io/Driver.java

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -173,6 +173,24 @@ public boolean isCheckingNormalization() {
173173
* if the stream threw
174174
*/
175175
public void tokenize(InputSource is) throws SAXException, IOException {
176+
int bufferSize = 2048;
177+
tokenize(is, bufferSize);
178+
}
179+
/**
180+
* Runs the tokenization. This is the main entry point.
181+
*
182+
* @param is
183+
* the input source
184+
* @param bufferSize
185+
* the size of the buffer to feed to the tokenizer
186+
* @throws SAXException
187+
* on fatal error (if configured to treat XML violations as
188+
* fatal) or if the token handler threw
189+
* @throws IOException
190+
* if the stream threw
191+
*/
192+
public void tokenize(InputSource is, int bufferSize)
193+
throws SAXException, IOException {
176194
if (is == null) {
177195
throw new IllegalArgumentException("InputSource was null.");
178196
}
@@ -216,7 +234,7 @@ public void tokenize(InputSource is) throws SAXException, IOException {
216234
CharacterHandler ch = characterHandlers[i];
217235
ch.start();
218236
}
219-
runStates();
237+
runStates(bufferSize);
220238
break;
221239
} catch (ReparseException e) {
222240
if (rewindableInputStream == null) {
@@ -270,8 +288,8 @@ void dontSwallowBom() {
270288
swallowBom = false;
271289
}
272290

273-
private void runStates() throws SAXException, IOException {
274-
char[] buffer = new char[2048];
291+
private void runStates(int bufferSize) throws SAXException, IOException {
292+
char[] buffer = new char[bufferSize];
275293
UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
276294
boolean lastWasCR = false;
277295
int len = -1;

src/nu/validator/htmlparser/sax/HtmlParser.java

Lines changed: 55 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -402,10 +402,17 @@ public Object getProperty(String name) throws SAXNotRecognizedException,
402402
* @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
403403
*/
404404
public void parse(InputSource input) throws IOException, SAXException {
405+
parse(input, -1);
406+
}
407+
408+
/**
409+
* @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
410+
*/
411+
public void parse(InputSource input, int bufferSize) throws IOException, SAXException {
405412
lazyInit();
406413
try {
407414
treeBuilder.setFragmentContext(null);
408-
tokenize(input);
415+
tokenize(input, bufferSize);
409416
} finally {
410417
if (saxTreeBuilder != null) {
411418
Document document = saxTreeBuilder.getDocument();
@@ -426,10 +433,27 @@ public void parse(InputSource input) throws IOException, SAXException {
426433
*/
427434
public void parseFragment(InputSource input, String context)
428435
throws IOException, SAXException {
436+
parseFragment(input, context, -1);
437+
}
438+
/**
439+
* Parses a fragment with HTML context.
440+
*
441+
* @param input the input to parse
442+
* @param context the name of the context element (HTML namespace assumed)
443+
* @param bufferSize the size of the buffer to feed to the tokenizer
444+
* @throws IOException
445+
* @throws SAXException
446+
*/
447+
public void parseFragment(InputSource input, String context, int bufferSize)
448+
throws IOException, SAXException {
429449
lazyInit();
430450
try {
431451
treeBuilder.setFragmentContext(context.intern());
432-
tokenize(input);
452+
if (bufferSize == -1) {
453+
tokenize(input);
454+
} else {
455+
tokenize(input, bufferSize);
456+
}
433457
} finally {
434458
if (saxTreeBuilder != null) {
435459
DocumentFragment fragment = saxTreeBuilder.getDocumentFragment();
@@ -449,10 +473,29 @@ public void parseFragment(InputSource input, String context)
449473
*/
450474
public void parseFragment(InputSource input, String contextLocal, String contextNamespace)
451475
throws IOException, SAXException {
476+
parseFragment(input, contextLocal, contextNamespace, -1);
477+
}
478+
/**
479+
* Parses a fragment.
480+
*
481+
* @param input the input to parse
482+
* @param contextLocal the local name of the context element
483+
* @param contextNamespace the namespace of the context element
484+
* @param bufferSize the size of the buffer to feed to the tokenizer
485+
* @throws IOException
486+
* @throws SAXException
487+
*/
488+
public void parseFragment(InputSource input, String contextLocal,
489+
String contextNamespace, int bufferSize)
490+
throws IOException, SAXException {
452491
lazyInit();
453492
try {
454493
treeBuilder.setFragmentContext(contextLocal.intern(), contextNamespace.intern(), null, false);
455-
tokenize(input);
494+
if (bufferSize == -1) {
495+
tokenize(input);
496+
} else {
497+
tokenize(input, bufferSize);
498+
}
456499
} finally {
457500
if (saxTreeBuilder != null) {
458501
DocumentFragment fragment = saxTreeBuilder.getDocumentFragment();
@@ -468,6 +511,10 @@ public void parseFragment(InputSource input, String contextLocal, String context
468511
* @throws MalformedURLException
469512
*/
470513
private void tokenize(InputSource is) throws SAXException, IOException, MalformedURLException {
514+
tokenize(is, -1);
515+
}
516+
private void tokenize(InputSource is, int bufferSize) throws SAXException,
517+
IOException, MalformedURLException {
471518
if (is == null) {
472519
throw new IllegalArgumentException("Null input.");
473520
}
@@ -485,7 +532,11 @@ private void tokenize(InputSource is) throws SAXException, IOException, Malforme
485532
is.setByteStream(new URL(systemId).openStream());
486533
}
487534
}
488-
driver.tokenize(is);
535+
if (bufferSize == -1) {
536+
driver.tokenize(is);
537+
} else {
538+
driver.tokenize(is, bufferSize);
539+
}
489540
}
490541

491542
/**

0 commit comments

Comments
 (0)