Skip to content

Commit 563687a

Browse files
authored
Merge pull request #146 from tgalopin/remove-input-streams
Improve performance by relying on a native string instead of InputStream
2 parents 7453ab0 + 321ed96 commit 563687a

14 files changed

+298
-627
lines changed

README.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ Here is how you use the high-level `HTML5` library API:
5050
<?php
5151
// Assuming you installed from Composer:
5252
require "vendor/autoload.php";
53-
use Masterminds\HTML5;
5453

54+
use Masterminds\HTML5;
5555

5656
// An example HTML document:
5757
$html = <<< 'HERE'
@@ -115,8 +115,6 @@ The following options are supported:
115115
This library provides the following low-level APIs that you can use to
116116
create more customized HTML5 tools:
117117

118-
- An `InputStream` abstraction that can work with different kinds of
119-
input source (not just files and strings).
120118
- A SAX-like event-based parser that you can hook into for special kinds
121119
of parsing.
122120
- A flexible error-reporting mechanism that can be tuned to document
@@ -130,7 +128,6 @@ is well-documented.
130128

131129
The parser is designed as follows:
132130

133-
- The `InputStream` portion handles direct I/O.
134131
- The `Scanner` handles scanning on behalf of the parser.
135132
- The `Tokenizer` requests data off of the scanner, parses it, classifies
136133
it, and sends it to an `EventHandler`. It is a *recursive descent parser.*

src/HTML5.php

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public function getOptions()
5555
*
5656
* The rules governing parsing are set out in the HTML 5 spec.
5757
*
58-
* @param string $file
58+
* @param string|resource $file
5959
* The path to the file to parse. If this is a resource, it is
6060
* assumed to be an open stream whose pointer is set to the first
6161
* byte of input.
@@ -68,13 +68,10 @@ public function load($file, array $options = array())
6868
{
6969
// Handle the case where file is a resource.
7070
if (is_resource($file)) {
71-
// FIXME: We need a StreamInputStream class.
72-
return $this->loadHTML(stream_get_contents($file), $options);
71+
return $this->parse(stream_get_contents($file), $options);
7372
}
7473

75-
$input = new FileInputStream($file);
76-
77-
return $this->parse($input, $options);
74+
return $this->parse(file_get_contents($file), $options);
7875
}
7976

8077
/**
@@ -92,9 +89,7 @@ public function load($file, array $options = array())
9289
*/
9390
public function loadHTML($string, array $options = array())
9491
{
95-
$input = new StringInputStream($string);
96-
97-
return $this->parse($input, $options);
92+
return $this->parse($string, $options);
9893
}
9994

10095
/**
@@ -121,19 +116,15 @@ public function loadHTMLFile($file, array $options = array())
121116
/**
122117
* Parse a HTML fragment from a string.
123118
*
124-
* @param string $string
125-
* The html5 fragment as a string.
126-
* @param array $options
127-
* Configuration options when parsing the HTML
119+
* @param string $string The HTML5 fragment as a string.
120+
* @param array $options Configuration options when parsing the HTML
128121
*
129122
* @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
130123
* almost all distributions of PHP.
131124
*/
132125
public function loadHTMLFragment($string, array $options = array())
133126
{
134-
$input = new StringInputStream($string);
135-
136-
return $this->parseFragment($input, $options);
127+
return $this->parseFragment($string, $options);
137128
}
138129

139130
/**
@@ -162,12 +153,12 @@ public function hasErrors()
162153
* Lower-level loading function. This requires the input as a string instead
163154
* of a file or resource.
164155
*
165-
* @param InputStream $input
156+
* @param string $input
166157
* @param array $options
167158
*
168159
* @return \DOMDocument
169160
*/
170-
public function parse(InputStream $input, array $options = array())
161+
public function parse($input, array $options = array())
171162
{
172163
$this->errors = array();
173164
$options = array_merge($this->getOptions(), $options);
@@ -187,14 +178,12 @@ public function parse(InputStream $input, array $options = array())
187178
* Lower-level loading function. This requires the input as a string instead
188179
* of a file or resource.
189180
*
190-
* @param InputStream $input
191-
* The input data to parse in the form of a InputStream instance.
192-
* @param array $options
193-
* An array of options
181+
* @param string $input The input data to parse in the form of a string.
182+
* @param array $options An array of options
194183
*
195184
* @return \DOMDocumentFragment
196185
*/
197-
public function parseFragment(InputStream $input, array $options = array())
186+
public function parseFragment($input, array $options = array())
198187
{
199188
$options = array_merge($this->getOptions(), $options);
200189
$events = new DOMTreeBuilder(true, $options);

src/HTML5/Parser/FileInputStream.php

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,10 @@
1111
* really like to rewrite this class to efficiently handle lower level
1212
* stream reads (and thus efficiently handle large documents).
1313
*
14-
* @todo A buffered input stream would be useful.
14+
* @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
1515
*/
1616
class FileInputStream extends StringInputStream implements InputStream
1717
{
18-
1918
/**
2019
* Load a file input stream.
2120
*

src/HTML5/Parser/InputStream.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
<?php
2+
23
namespace Masterminds\HTML5\Parser;
34

45
/**
@@ -9,10 +10,11 @@
910
*
1011
* Currently provided InputStream implementations include
1112
* FileInputStream and StringInputStream.
13+
*
14+
* @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
1215
*/
1316
interface InputStream extends \Iterator
1417
{
15-
1618
/**
1719
* Returns the current line that is being consumed.
1820
*

0 commit comments

Comments
 (0)