Skip to content

Commit 55c612c

Browse files
committed
Merge pull request #12 from caxy/feature/html_tag_isolation
Feature - html tag isolation
2 parents 7846bd7 + 30a3da8 commit 55c612c

File tree

4 files changed

+401
-286
lines changed

4 files changed

+401
-286
lines changed

demo/demo.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ <h2>Compared HTML <span ng-show="loading || waiting">- {{ loading ? 'Loading' :
3838
</div>
3939
</div>
4040
</body>
41-
</html>
41+
</html>

demo/index.php

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,25 @@
22

33
use Caxy\HtmlDiff\HtmlDiff;
44

5-
require __DIR__.'/../lib/Caxy/HtmlDiff/HtmlDiff.php';
6-
require __DIR__.'/../lib/Caxy/HtmlDiff/Match.php';
7-
require __DIR__.'/../lib/Caxy/HtmlDiff/Operation.php';
5+
ini_set('display_errors', 1);
6+
error_reporting(E_ERROR);
7+
8+
$classes = array(
9+
'Caxy/HtmlDiff/AbstractDiff',
10+
'Caxy/HtmlDiff/HtmlDiff',
11+
'Caxy/HtmlDiff/Match',
12+
'Caxy/HtmlDiff/Operation',
13+
);
14+
15+
foreach ($classes as $class) {
16+
require __DIR__.'/../lib/'.$class.'.php';
17+
}
818

919
$input = file_get_contents('php://input');
1020

1121
if ($input) {
1222
$data = json_decode($input, true);
13-
$diff = new HtmlDiff($data['oldText'], $data['newText']);
23+
$diff = new HtmlDiff($data['oldText'], $data['newText'], 'UTF-8', array());
1424
$diff->build();
1525

1626
header('Content-Type: application/json');

lib/Caxy/HtmlDiff/AbstractDiff.php

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
<?php
2+
3+
namespace Caxy\HtmlDiff;
4+
5+
/**
 * Shared state and tokenising helpers for HTML diff implementations.
 *
 * Holds the two HTML inputs, the configuration of "special case" tags and
 * characters, and the logic that splits HTML into a list of words (tags,
 * whitespace runs and text tokens). Concrete subclasses produce the diff and
 * store it in $content.
 */
abstract class AbstractDiff
{
    /** Tag names that are treated as special cases unless the caller overrides them. */
    public static $defaultSpecialCaseTags = array('strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p');

    /** Punctuation that may be kept inside a word when it is followed by a word character. */
    public static $defaultSpecialCaseChars = array('.', ',', '(', ')', '\'');

    /** Default value of the "group diffs" flag. */
    public static $defaultGroupDiffs = true;

    protected $content;
    protected $oldText;
    protected $newText;
    protected $oldWords = array();
    protected $newWords = array();
    protected $encoding;
    protected $specialCaseOpeningTags = array();
    protected $specialCaseClosingTags = array();
    protected $specialCaseTags = array();
    protected $specialCaseChars = array();
    protected $groupDiffs;

    /**
     * @param string     $oldText         Original HTML; trimmed before being stored.
     * @param string     $newText         Revised HTML; trimmed before being stored.
     * @param string     $encoding        Encoding name, stored for use by subclasses.
     * @param array|null $specialCaseTags Tag names to treat specially; null selects
     *                                    static::$defaultSpecialCaseTags.
     * @param bool|null  $groupDiffs      Grouping flag; null selects
     *                                    static::$defaultGroupDiffs.
     */
    public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
    {
        if ($specialCaseTags === null) {
            $specialCaseTags = static::$defaultSpecialCaseTags;
        }

        if ($groupDiffs === null) {
            $groupDiffs = static::$defaultGroupDiffs;
        }

        $this->oldText = $this->purifyHtml(trim($oldText));
        $this->newText = $this->purifyHtml(trim($newText));
        $this->encoding = $encoding;
        $this->content = '';
        $this->groupDiffs = $groupDiffs;
        $this->setSpecialCaseTags($specialCaseTags);
        $this->setSpecialCaseChars(static::$defaultSpecialCaseChars);
    }

    /**
     * Replaces the list of special case characters.
     *
     * @param array $chars
     */
    public function setSpecialCaseChars(array $chars)
    {
        $this->specialCaseChars = $chars;
    }

    /**
     * @return array The current special case characters.
     */
    public function getSpecialCaseChars()
    {
        return $this->specialCaseChars;
    }

    /**
     * Adds a special case character unless it is already registered.
     *
     * @param string $char
     */
    public function addSpecialCaseChar($char)
    {
        if (!in_array($char, $this->specialCaseChars, true)) {
            $this->specialCaseChars[] = $char;
        }
    }

    /**
     * Removes a special case character if it is registered.
     *
     * @param string $char
     */
    public function removeSpecialCaseChar($char)
    {
        $key = array_search($char, $this->specialCaseChars, true);
        if ($key !== false) {
            unset($this->specialCaseChars[$key]);
            // Keep the property a gap-free list after the removal.
            $this->specialCaseChars = array_values($this->specialCaseChars);
        }
    }

    /**
     * Replaces the special case tags and rebuilds the derived opening-tag
     * patterns and closing-tag strings.
     *
     * @param array $tags Tag names without angle brackets, e.g. 'strong'.
     */
    public function setSpecialCaseTags(array $tags = array())
    {
        $this->specialCaseTags = $tags;

        // Start from a clean slate so that entries derived from a previous
        // tag list do not linger after the list is replaced.
        $this->specialCaseOpeningTags = array();
        $this->specialCaseClosingTags = array();

        foreach ($tags as $tag) {
            $this->addSpecialCaseTag($tag);
        }
    }

    /**
     * Registers a special case tag together with its opening pattern and
     * closing tag. Entries that already exist are not duplicated.
     *
     * @param string $tag Tag name without angle brackets.
     */
    public function addSpecialCaseTag($tag)
    {
        if (!in_array($tag, $this->specialCaseTags, true)) {
            $this->specialCaseTags[] = $tag;
        }

        $opening = $this->getOpeningTag($tag);
        $closing = $this->getClosingTag($tag);

        if (!in_array($opening, $this->specialCaseOpeningTags, true)) {
            $this->specialCaseOpeningTags[] = $opening;
        }
        if (!in_array($closing, $this->specialCaseClosingTags, true)) {
            $this->specialCaseClosingTags[] = $closing;
        }
    }

    /**
     * Unregisters a special case tag and the opening pattern / closing tag
     * derived from it. Does nothing when the tag is not registered.
     *
     * @param string $tag Tag name without angle brackets.
     */
    public function removeSpecialCaseTag($tag)
    {
        $key = array_search($tag, $this->specialCaseTags, true);
        if ($key === false) {
            return;
        }

        unset($this->specialCaseTags[$key]);
        $this->specialCaseTags = array_values($this->specialCaseTags);

        $key = array_search($this->getOpeningTag($tag), $this->specialCaseOpeningTags, true);
        if ($key !== false) {
            unset($this->specialCaseOpeningTags[$key]);
            $this->specialCaseOpeningTags = array_values($this->specialCaseOpeningTags);
        }

        $key = array_search($this->getClosingTag($tag), $this->specialCaseClosingTags, true);
        if ($key !== false) {
            unset($this->specialCaseClosingTags[$key]);
            $this->specialCaseClosingTags = array_values($this->specialCaseClosingTags);
        }
    }

    /**
     * @return array The registered special case tag names.
     */
    public function getSpecialCaseTags()
    {
        return $this->specialCaseTags;
    }

    /**
     * @return string The stored (trimmed) original HTML.
     */
    public function getOldHtml()
    {
        return $this->oldText;
    }

    /**
     * @return string The stored (trimmed) revised HTML.
     */
    public function getNewHtml()
    {
        return $this->newText;
    }

    /**
     * @return string The diff content; an empty string until a subclass fills it.
     */
    public function getDifference()
    {
        return $this->content;
    }

    /**
     * @param bool $boolean New value of the "group diffs" flag.
     */
    public function setGroupDiffs($boolean)
    {
        $this->groupDiffs = $boolean;
    }

    /**
     * @return bool The "group diffs" flag.
     */
    public function isGroupDiffs()
    {
        return $this->groupDiffs;
    }

    /**
     * Builds a case-insensitive regular expression matching the opening tag
     * of the given element.
     *
     * The tag name is quoted and must be followed by whitespace, "/", ">" or
     * the end of the subject, so the pattern for "b" matches "<b>" and
     * "<b class='x'>" but not "<big>", "<br>" or "<body>".
     *
     * @param string $tag Tag name without angle brackets.
     *
     * @return string A PCRE pattern including delimiters and flags.
     */
    protected function getOpeningTag($tag)
    {
        return '/<'.preg_quote($tag, '/').'(?=[\s\/>]|$)[^>]*/i';
    }

    /**
     * @param string $tag Tag name without angle brackets.
     *
     * @return string The literal closing tag, e.g. "</strong>".
     */
    protected function getClosingTag($tag)
    {
        return "</".$tag.">";
    }

    /**
     * Returns the text found after the first occurrence of $start and before
     * the last occurrence of $end that follows it.
     *
     * @param string $str
     * @param string $start
     * @param string $end
     *
     * @return string The enclosed text, or '' when either delimiter is empty
     *                or cannot be found.
     */
    protected function getStringBetween($str, $start, $end)
    {
        // explode() rejects an empty delimiter, so treat it as "not found".
        if ($start === '' || $end === '') {
            return '';
        }

        $parts = explode($start, $str, 2);
        if (count($parts) > 1) {
            $parts = explode($end, $parts[1]);
            if (count($parts) > 1) {
                // Drop whatever follows the last $end and glue the rest back.
                array_pop($parts);

                return implode($end, $parts);
            }
        }

        return '';
    }

    /**
     * Hook for cleaning up the HTML before it is diffed.
     *
     * Currently returns the input unchanged. The previous Tidy-based clean-up
     * was hard-disabled (its condition ended in "&& false") and called
     * getStringBetween() with a missing argument, so that unreachable branch
     * has been removed.
     *
     * @param string     $html
     * @param array|null $tags Unused; kept for signature compatibility.
     *
     * @return string
     */
    protected function purifyHtml($html, $tags = null)
    {
        return $html;
    }

    /**
     * Tokenises both stored HTML strings into $oldWords and $newWords.
     */
    protected function splitInputsToWords()
    {
        $this->oldWords = $this->convertHtmlToListOfWords($this->explode($this->oldText));
        $this->newWords = $this->convertHtmlToListOfWords($this->explode($this->newText));
    }

    /**
     * Tells whether $text, once the special case characters are stripped,
     * consists solely of letters and digits.
     *
     * @param string $text
     *
     * @return bool
     */
    protected function isPartOfWord($text)
    {
        return $this->isAlphaNumeric(str_replace($this->specialCaseChars, '', $text));
    }

    /**
     * Tells whether $text is a non-empty run of letters and digits.
     *
     * ASCII input is handled by ctype_alnum(); the Unicode-aware pattern
     * additionally accepts non-ASCII letters and digits such as "é", so that
     * accented words are not split in the middle.
     *
     * @param string $text
     *
     * @return bool
     */
    protected function isAlphaNumeric($text)
    {
        return ctype_alnum($text) || preg_match('/\A[\p{L}\p{N}]+\z/u', $text) === 1;
    }

    /**
     * Groups a list of single characters into words.
     *
     * A word is a complete tag ("<...>"), a run of whitespace, or a text
     * token. Letters and digits accumulate into one token; a special case
     * character is attached to the token only when the next character is part
     * of a word; any other character starts a new token.
     *
     * @param array $characterString List of characters as returned by explode().
     *
     * @return array List of non-empty words in input order.
     */
    protected function convertHtmlToListOfWords($characterString)
    {
        $mode = 'character';
        $currentWord = '';
        $words = array();

        foreach ($characterString as $i => $character) {
            switch ($mode) {
                case 'character':
                    if ($this->isStartOfTag($character)) {
                        if ($currentWord !== '') {
                            $words[] = $currentWord;
                        }
                        $currentWord = '<';
                        $mode = 'tag';
                    } elseif ($this->isWhiteSpace($character)) {
                        if ($currentWord !== '') {
                            $words[] = $currentWord;
                        }
                        $currentWord = $character;
                        $mode = 'whitespace';
                    } elseif (
                        ($this->isAlphaNumeric($character) && ($currentWord === '' || $this->isPartOfWord($currentWord))) ||
                        (in_array($character, $this->specialCaseChars, true) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
                    ) {
                        $currentWord .= $character;
                    } else {
                        // Never emit an empty token when a new word starts.
                        if ($currentWord !== '') {
                            $words[] = $currentWord;
                        }
                        $currentWord = $character;
                    }
                    break;

                case 'tag':
                    if ($this->isEndOfTag($character)) {
                        $words[] = $currentWord.'>';
                        $currentWord = '';
                        // The closing ">" is never whitespace, so text
                        // scanning always resumes in character mode.
                        $mode = 'character';
                    } else {
                        $currentWord .= $character;
                    }
                    break;

                case 'whitespace':
                    if ($this->isStartOfTag($character)) {
                        if ($currentWord !== '') {
                            $words[] = $currentWord;
                        }
                        $currentWord = '<';
                        $mode = 'tag';
                    } elseif ($this->isWhiteSpace($character)) {
                        $currentWord .= $character;
                    } else {
                        if ($currentWord !== '') {
                            $words[] = $currentWord;
                        }
                        $currentWord = $character;
                        $mode = 'character';
                    }
                    break;
            }
        }

        if ($currentWord !== '') {
            $words[] = $currentWord;
        }

        return $words;
    }

    /**
     * @param string $val
     *
     * @return bool True when $val is "<".
     */
    protected function isStartOfTag($val)
    {
        return $val == "<";
    }

    /**
     * @param string $val
     *
     * @return bool True when $val is ">".
     */
    protected function isEndOfTag($val)
    {
        return $val == ">";
    }

    /**
     * Tells whether $value is a non-empty string made only of whitespace.
     *
     * The former pattern '[^\s]' was parsed by PCRE as the expression "^\s"
     * wrapped in bracket delimiters, which made this method return true for
     * values that do NOT start with whitespace.
     *
     * @param string $value
     *
     * @return bool
     */
    protected function isWhiteSpace($value)
    {
        return preg_match('/\A\s+\z/', $value) === 1;
    }

    /**
     * Splits a string into a list of characters.
     *
     * UTF-8 input is split per code point, without the empty leading and
     * trailing entries preg_split() would otherwise produce. Input that is
     * not valid UTF-8 is split per byte instead of failing.
     *
     * @param string $value
     *
     * @return array Zero-indexed list of characters.
     */
    protected function explode($value)
    {
        $value = (string) $value;

        // as suggested by @onassar
        $characters = preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY);
        if ($characters === false) {
            // preg_split() failed, which only happens for malformed UTF-8.
            $characters = str_split($value);
        }

        return $characters;
    }
}

0 commit comments

Comments
 (0)