Skip to content

Commit 7453ab0

Browse files
authored
Merge pull request #145 from tgalopin/phpdoc-improvements
Add more extensions on composer.json, improve phpdocs and remove dead code
2 parents fb50d43 + 80b8e91 commit 7453ab0

File tree

9 files changed

+105
-38
lines changed

9 files changed

+105
-38
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,6 @@ print $html5->saveHTML($dom);
7575

7676
// Or save it to a file:
7777
$html5->save($dom, 'out.html');
78-
79-
?>
8078
```
8179

8280
The `$dom` created by the parser is a full `DOMDocument` object. And the

composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
}
2121
],
2222
"require" : {
23+
"ext-ctype": "*",
2324
"ext-dom": "*",
2425
"ext-libxml" : "*",
2526
"php" : ">=5.3.0"

src/HTML5.php

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
namespace Masterminds;
33

44
use Masterminds\HTML5\Parser\FileInputStream;
5+
use Masterminds\HTML5\Parser\InputStream;
56
use Masterminds\HTML5\Parser\StringInputStream;
67
use Masterminds\HTML5\Parser\DOMTreeBuilder;
78
use Masterminds\HTML5\Parser\Scanner;
@@ -160,8 +161,13 @@ public function hasErrors()
160161
*
161162
* Lower-level loading function. This requires an input stream instead
162163
* of a string, file, or resource.
164+
*
165+
* @param InputStream $input
166+
* @param array $options
167+
*
168+
* @return \DOMDocument
163169
*/
164-
public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
170+
public function parse(InputStream $input, array $options = array())
165171
{
166172
$this->errors = array();
167173
$options = array_merge($this->getOptions(), $options);
@@ -180,8 +186,15 @@ public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $optio
180186
*
181187
* Lower-level loading function. This requires an input stream instead
182188
* of a string, file, or resource.
189+
*
190+
* @param InputStream $input
191+
* The input data to parse in the form of an InputStream instance.
192+
* @param array $options
193+
* An array of options
194+
*
195+
* @return \DOMDocumentFragment
183196
*/
184-
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
197+
public function parseFragment(InputStream $input, array $options = array())
185198
{
186199
$options = array_merge($this->getOptions(), $options);
187200
$events = new DOMTreeBuilder(true, $options);

src/HTML5/Parser/DOMTreeBuilder.php

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ class DOMTreeBuilder implements EventHandler
136136
protected $stack = array();
137137

138138
protected $current; // Pointer in the tag hierarchy.
139+
protected $rules;
139140
protected $doc;
140141

141142
protected $frag;
@@ -216,7 +217,7 @@ public function document()
216217
*
217218
* @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
218219
*
219-
* @return \DOMFragmentDocumentFragment
220+
* @return \DOMDocumentFragment
220221
*/
221222
public function fragment()
222223
{
@@ -650,15 +651,19 @@ protected function quirksTreeResolver($name)
650651

651652
/**
652653
* Automatically climb the tree and close the closest node with the matching $tag.
654+
*
655+
* @param string $tagName
656+
*
657+
* @return bool
653658
*/
654-
protected function autoclose($tag)
659+
protected function autoclose($tagName)
655660
{
656661
$working = $this->current;
657662
do {
658663
if ($working->nodeType != XML_ELEMENT_NODE) {
659664
return false;
660665
}
661-
if ($working->tagName == $tag) {
666+
if ($working->tagName == $tagName) {
662667
$this->current = $working->parentNode;
663668

664669
return true;
@@ -672,12 +677,16 @@ protected function autoclose($tag)
672677
*
673678
* If $this->current or anything above $this->current matches the given tag
674679
* name, this returns true.
680+
*
681+
* @param string $tagName
682+
*
683+
* @return bool
675684
*/
676-
protected function isAncestor($tagname)
685+
protected function isAncestor($tagName)
677686
{
678687
$candidate = $this->current;
679688
while ($candidate->nodeType === XML_ELEMENT_NODE) {
680-
if ($candidate->tagName == $tagname) {
689+
if ($candidate->tagName == $tagName) {
681690
return true;
682691
}
683692
$candidate = $candidate->parentNode;
@@ -688,9 +697,13 @@ protected function isAncestor($tagname)
688697

689698
/**
690699
* Returns true if the immediate parent element is of the given tagname.
700+
*
701+
* @param string $tagName
702+
*
703+
* @return bool
691704
*/
692-
protected function isParent($tagname)
705+
protected function isParent($tagName)
693706
{
694-
return $this->current->tagName == $tagname;
707+
return $this->current->tagName == $tagName;
695708
}
696709
}

src/HTML5/Parser/FileInputStream.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ class FileInputStream extends StringInputStream implements InputStream
1919
/**
2020
* Load a file input stream.
2121
*
22-
* @param string $data
23-
* The file or url path to load.
22+
* @param string $data The file or url path to load.
23+
* @param string $encoding The encoding to use for the data.
24+
* @param string $debug An fprintf format to use to echo the data on stdout.
2425
*/
2526
public function __construct($data, $encoding = 'UTF-8', $debug = '')
2627
{

src/HTML5/Parser/StringInputStream.php

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,21 @@ class StringInputStream implements InputStream
6565
/**
6666
* Create a new InputStream wrapper.
6767
*
68-
* @param $data Data
69-
* to parse
68+
* @param string $data Data to parse
69+
* @param string $encoding The encoding to use for the data.
70+
* @param string $debug An fprintf format to use to echo the data on stdout.
7071
*/
7172
public function __construct($data, $encoding = 'UTF-8', $debug = '')
7273
{
7374
$data = UTF8Utils::convertToUTF8($data, $encoding);
74-
if ($debug)
75+
if ($debug) {
7576
fprintf(STDOUT, $debug, $data, strlen($data));
77+
}
7678

77-
// There is good reason to question whether it makes sense to
78-
// do this here, since most of these checks are done during
79-
// parsing, and since this check doesn't actually *do* anything.
79+
// There is good reason to question whether it makes sense to
80+
// do this here, since most of these checks are done during
81+
// parsing, and since this check doesn't actually *do* anything.
8082
$this->errors = UTF8Utils::checkForIllegalCodepoints($data);
81-
// if (!empty($e)) {
82-
// throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e));
83-
// }
8483

8584
$data = $this->replaceLinefeeds($data);
8685

@@ -95,7 +94,11 @@ public function __construct($data, $encoding = 'UTF-8', $debug = '')
9594
protected function replaceLinefeeds($data)
9695
{
9796
/*
98-
* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. Any CR characters that are followed by LF characters must be removed, and any CR characters not followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are represented by LF characters, and there are never any CR characters in the input to the tokenization stage.
97+
* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
98+
* Any CR characters that are followed by LF characters must be removed, and any CR characters not
99+
* followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
100+
* represented by LF characters, and there are never any CR characters in the input to the tokenization
101+
* stage.
99102
*/
100103
$crlfTable = array(
101104
"\0" => "\xEF\xBF\xBD",
@@ -126,7 +129,7 @@ public function currentLine()
126129
*/
127130
public function getCurrentLine()
128131
{
129-
return currentLine();
132+
return $this->currentLine();
130133
}
131134

132135
/**
@@ -281,6 +284,8 @@ public function charsUntil($bytes, $max = null)
281284
* substring.
282285
* @param int $max
283286
* The max number of chars to read.
287+
*
288+
* @return string
284289
*/
285290
public function charsWhile($bytes, $max = null)
286291
{

src/HTML5/Parser/Tokenizer.php

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,12 @@ protected function isTagEnd(&$selfClose)
431431

432432
/**
433433
* Parse attributes from inside of a tag.
434+
*
435+
* @param string[] $attributes
436+
*
437+
* @return bool
438+
*
439+
* @throws ParseError
434440
*/
435441
protected function attribute(&$attributes)
436442
{
@@ -489,6 +495,8 @@ protected function attribute(&$attributes)
489495
/**
490496
* Consume an attribute value.
491497
* 8.2.4.37 and after.
498+
*
499+
* @return string|null
492500
*/
493501
protected function attributeValue()
494502
{
@@ -590,6 +598,8 @@ protected function unquotedAttributeValue()
590598
* Prepend any leading characters. This essentially
591599
* negates the need to backtrack, but it's sort of
592600
* a hack.
601+
*
602+
* @return bool
593603
*/
594604
protected function bogusComment($leading = '')
595605
{
@@ -614,6 +624,8 @@ protected function bogusComment($leading = '')
614624
* Read a comment.
615625
*
616626
* Expects the first tok to be inside of the comment.
627+
*
628+
* @return bool
617629
*/
618630
protected function comment()
619631
{
@@ -645,6 +657,8 @@ protected function comment()
645657

646658
/**
647659
* Check if the scanner has reached the end of a comment.
660+
*
661+
* @return bool
648662
*/
649663
protected function isCommentEnd()
650664
{
@@ -679,6 +693,8 @@ protected function isCommentEnd()
679693
* not Quirksmode is enabled on the event handler.
680694
*
681695
* @todo This method is a little long. Should probably refactor.
696+
*
697+
* @return bool
682698
*/
683699
protected function doctype()
684700
{
@@ -701,13 +717,9 @@ protected function doctype()
701717
return $this->eof();
702718
}
703719

704-
$doctypeName = '';
705-
706720
// NULL char: convert.
707721
if ($tok === "\0") {
708722
$this->parseError("Unexpected null character in DOCTYPE.");
709-
$doctypeName .= UTF8::FFFD;
710-
$tok = $this->scanner->next();
711723
}
712724

713725
$stop = " \n\f>";
@@ -792,6 +804,7 @@ protected function doctype()
792804
* @param string $stopchars
793805
* Characters (in addition to a close-quote) that should stop the string.
794806
* E.g. sometimes '>' is higher precedence than '"' or "'".
807+
*
795808
* @return mixed String if one is found (quotations omitted)
796809
*/
797810
protected function quotedString($stopchars)
@@ -813,6 +826,8 @@ protected function quotedString($stopchars)
813826

814827
/**
815828
* Handle a CDATA section.
829+
*
830+
* @return bool
816831
*/
817832
protected function cdataSection()
818833
{
@@ -856,6 +871,8 @@ protected function cdataSection()
856871
* treated as "bogus comments". However, since we're not a user
857872
* agent, we allow them. We consume until ?> and then issue a
858873
* EventListener::processingInstruction() event.
874+
*
875+
* @return bool
859876
*/
860877
protected function processingInstruction()
861878
{
@@ -900,6 +917,10 @@ protected function processingInstruction()
900917
/**
901918
* Read from the input stream until we get to the desired sequence
902919
* or hit the end of the input stream.
920+
*
921+
* @param string $sequence
922+
*
923+
* @return string
903924
*/
904925
protected function readUntilSequence($sequence)
905926
{
@@ -935,6 +956,11 @@ protected function readUntilSequence($sequence)
935956
* Example: $this->sequenceMatches('</script>') will
936957
* see if the input stream is at the start of a
937958
* '</script>' string.
959+
*
960+
* @param string $sequence
961+
* @param bool $caseSensitive
962+
*
963+
* @return bool
938964
*/
939965
protected function sequenceMatches($sequence, $caseSensitive = true)
940966
{
@@ -976,6 +1002,8 @@ protected function flushBuffer()
9761002
* Add text to the temporary buffer.
9771003
*
9781004
* @see flushBuffer()
1005+
*
1006+
* @param string $str
9791007
*/
9801008
protected function buffer($str)
9811009
{
@@ -987,6 +1015,10 @@ protected function buffer($str)
9871015
*
9881016
* A parse error always returns false because it never consumes any
9891017
* characters.
1018+
*
1019+
* @param string $msg
1020+
*
1021+
* @return bool
9901022
*/
9911023
protected function parseError($msg)
9921024
{
@@ -1009,9 +1041,11 @@ protected function parseError($msg)
10091041
* Returns false if the entity could not be found. If $inAttribute is set
10101042
* to true, a bare & will be returned as-is.
10111043
*
1012-
* @param boolean $inAttribute
1044+
* @param bool $inAttribute
10131045
* Set to true if the text is inside of an attribute value.
10141046
* false otherwise.
1047+
*
1048+
* @return bool|string
10151049
*/
10161050
protected function decodeCharacterReference($inAttribute = false)
10171051
{
@@ -1023,7 +1057,6 @@ protected function decodeCharacterReference($inAttribute = false)
10231057

10241058
// Next char after &.
10251059
$tok = $this->scanner->next();
1026-
$entity = '';
10271060
$start = $this->scanner->position();
10281061

10291062
if ($tok == false) {

src/HTML5/Parser/TreeBuildingRules.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ protected function handleRT($ele, $current)
127127

128128
protected function closeIfCurrentMatches($ele, $current, $match)
129129
{
130-
$tname = $current->tagName;
131130
if (in_array($current->tagName, $match)) {
132131
$current->parentNode->appendChild($ele);
133132
} else {

0 commit comments

Comments
 (0)