Skip to content

Commit 3f40c5e

Browse files
authored
Merge pull request #1273 from troosan/various_html_parsing_fixes
Various html parsing fixes, fixes for #1252 and #1254
2 parents 604e60c + 5b381bc commit 3f40c5e

File tree

6 files changed

+146
-12
lines changed

6 files changed

+146
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@ v0.15.0 (?? ??? 2018)
88
### Added
99
- Parsing of "align" HTML attribute - @troosan #1231
1010
- Parse formatting inside HTML lists - @troosan @samimussbach #1239 #945 #1215 #508
11+
- Parsing of CSS `direction` instruction, HTML `lang` attribute, formatting inside table cell - @troosan #1273 #1252 #1254
1112
- Add support for Track changes @Cip @troosan #354 #1262
1213

1314
### Fixed
1415
- fix reading of docx default style - @troosan #1238
16+
- fix the size unit when parsing HTML images - @troosan #1254
17+
- fixed HTML parsing of nested lists - @troosan #1265
18+
- Save PNG alpha information when using remote images. @samsullivan #779
1519

1620

1721

samples/Sample_26_Html.php

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77

88
$section = $phpWord->addSection();
99
$html = '<h1>Adding element via HTML</h1>';
10-
$html .= '<p>Some well formed HTML snippet needs to be used</p>';
10+
$html .= '<p>Some well-formed HTML snippet needs to be used</p>';
1111
$html .= '<p>With for example <strong>some<sup>1</sup> <em>inline</em> formatting</strong><sub>1</sub></p>';
1212

1313
$html .= '<p>A link to <a href="http://phpword.readthedocs.io/">Read the docs</a></p>';
1414

15+
$html .= '<p lang="he-IL" style="text-align: right; direction: rtl">היי, זה פסקה מימין לשמאל</p>';
16+
1517
$html .= '<p style="margin-top: 240pt;">Unordered (bulleted) list:</p>';
1618
$html .= '<ul><li>Item 1</li><li>Item 2</li><ul><li>Item 2.1</li><li>Item 2.1</li></ul></ul>';
1719

@@ -29,10 +31,12 @@
2931
<ol>
3032
<li>List 2 item 1</li>
3133
<li>List 2 item 2</li>
32-
<ol>
33-
<li>sub list 1</li>
34-
<li>sub list 2</li>
35-
</ol>
34+
<li>
35+
<ol>
36+
<li>sub list 1</li>
37+
<li>sub list 2</li>
38+
</ol>
39+
</li>
3640
<li>List 2 item 3</li>
3741
<ol>
3842
<li>sub list 1, restarts with a</li>
@@ -65,10 +69,20 @@
6569
</thead>
6670
<tbody>
6771
<tr><td style="border-style: dotted;">1</td><td colspan="2">2</td></tr>
68-
<tr><td>4</td><td>5</td><td>6</td></tr>
72+
<tr><td>This is <b>bold</b> text</td><td></td><td>6</td></tr>
6973
</tbody>
7074
</table>';
7175

76+
$html .= '<p style="margin-top: 240pt;">Table inside another table:</p>';
77+
$html .= '<table align="center" style="width: 80%; border: 6px #0000FF double;">
78+
<tr><td>
79+
<table style="width: 100%; border: 4px #FF0000 dotted;">
80+
<tr><td>column 1</td><td>column 2</td></tr>
81+
</table>
82+
</td></tr>
83+
<tr><td style="text-align: center;">Cell in parent table</td></tr>
84+
</table>';
85+
7286
\PhpOffice\PhpWord\Shared\Html::addHtml($section, $html, false, false);
7387

7488
// Save file

src/PhpWord/Element/AbstractElement.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,13 @@ abstract class AbstractElement
9393
*/
9494
private $nestedLevel = 0;
9595

96+
/**
97+
* A reference to the parent
98+
*
99+
* @var \PhpOffice\PhpWord\Element\AbstractElement
100+
*/
101+
private $parent;
102+
96103
/**
97104
* changed element info
98105
*
@@ -328,6 +335,11 @@ public function setCommentRangeEnd(Comment $value)
328335
$this->commentRangeEnd->setEndElement($this);
329336
}
330337

338+
public function getParent()
339+
{
340+
return $this->parent;
341+
}
342+
331343
/**
332344
* Set parent container
333345
*
@@ -338,6 +350,7 @@ public function setCommentRangeEnd(Comment $value)
338350
public function setParentContainer(AbstractElement $container)
339351
{
340352
$this->parentContainer = substr(get_class($container), strrpos(get_class($container), '\\') + 1);
353+
$this->parent = $container;
341354

342355
// Set nested level
343356
$this->nestedLevel = $container->getNestedLevel();

src/PhpWord/Shared/Html.php

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
class Html
3232
{
3333
private static $listIndex = 0;
34+
private static $xpath;
3435

3536
/**
3637
* Add HTML parts.
@@ -65,6 +66,7 @@ public static function addHtml($element, $html, $fullHTML = false, $preserveWhit
6566
$dom = new \DOMDocument();
6667
$dom->preserveWhiteSpace = $preserveWhiteSpace;
6768
$dom->loadXML($html);
69+
self::$xpath = new \DOMXpath($dom);
6870
$node = $dom->getElementsByTagName('body');
6971

7072
self::parseNode($node->item(0), $element);
@@ -89,6 +91,10 @@ protected static function parseInlineStyle($node, $styles = array())
8991
break;
9092
case 'align':
9193
$styles['alignment'] = self::mapAlign($attribute->value);
94+
break;
95+
case 'lang':
96+
$styles['lang'] = $attribute->value;
97+
break;
9298
}
9399
}
94100
}
@@ -333,7 +339,7 @@ private static function parseRow($node, $element, &$styles)
333339
* @param \DOMNode $node
334340
* @param \PhpOffice\PhpWord\Element\Table $element
335341
* @param array &$styles
336-
* @return \PhpOffice\PhpWord\Element\Cell $element
342+
* @return \PhpOffice\PhpWord\Element\Cell|\PhpOffice\PhpWord\Element\TextRun $element
337343
*/
338344
private static function parseCell($node, $element, &$styles)
339345
{
@@ -343,8 +349,29 @@ private static function parseCell($node, $element, &$styles)
343349
if (!empty($colspan)) {
344350
$cellStyles['gridSpan'] = $colspan - 0;
345351
}
352+
$cell = $element->addCell(null, $cellStyles);
353+
354+
if (self::shouldAddTextRun($node)) {
355+
return $cell->addTextRun(self::parseInlineStyle($node, $styles['paragraph']));
356+
}
346357

347-
return $element->addCell(null, $cellStyles);
358+
return $cell;
359+
}
360+
361+
/**
362+
* Checks whether the content of $node can be added to a TextRun, i.e. $node has no nested table and no direct p, ul or ol child
363+
*
364+
* @param \DOMNode $node
365+
* @return bool Returns false if the node contains a block element (nested table, or direct p/ul/ol child) that cannot be added to a TextRun, true otherwise
366+
*/
367+
private static function shouldAddTextRun(\DOMNode $node)
368+
{
369+
$containsBlockElement = self::$xpath->query('.//table|./p|./ul|./ol', $node)->length > 0;
370+
if ($containsBlockElement) {
371+
return false;
372+
}
373+
374+
return true;
348375
}
349376

350377
/**
@@ -375,14 +402,17 @@ private static function recursiveParseStylesInHierarchy(\DOMNode $node, array $s
375402
*/
376403
private static function parseList($node, $element, &$styles, &$data)
377404
{
378-
$isOrderedList = $node->nodeName == 'ol';
405+
$isOrderedList = $node->nodeName === 'ol';
379406
if (isset($data['listdepth'])) {
380407
$data['listdepth']++;
381408
} else {
382409
$data['listdepth'] = 0;
383410
$styles['list'] = 'listStyle_' . self::$listIndex++;
384411
$element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
385412
}
413+
if ($node->parentNode->nodeName === 'li') {
414+
return $element->getParent();
415+
}
386416
}
387417

388418
private static function getListStyle($isOrderedList)
@@ -469,6 +499,9 @@ private static function parseStyle($attribute, $styles)
469499
case 'text-align':
470500
$styles['alignment'] = self::mapAlign($cValue);
471501
break;
502+
case 'direction':
503+
$styles['rtl'] = $cValue === 'rtl';
504+
break;
472505
case 'font-size':
473506
$styles['size'] = Converter::cssToPoint($cValue);
474507
break;
@@ -556,10 +589,12 @@ private static function parseImage($node, $element)
556589
case 'width':
557590
$width = $attribute->value;
558591
$style['width'] = $width;
592+
$style['unit'] = \PhpOffice\PhpWord\Style\Image::UNIT_PX;
559593
break;
560594
case 'height':
561595
$height = $attribute->value;
562596
$style['height'] = $height;
597+
$style['unit'] = \PhpOffice\PhpWord\Style\Image::UNIT_PX;
563598
break;
564599
case 'style':
565600
$styleattr = explode(';', $attribute->value);

src/PhpWord/Writer/Word2007/Style/Font.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ private function writeStyle()
9090
$xmlWriter->writeAttributeIf($language->getLatin() !== null, 'w:val', $language->getLatin());
9191
$xmlWriter->writeAttributeIf($language->getEastAsia() !== null, 'w:eastAsia', $language->getEastAsia());
9292
$xmlWriter->writeAttributeIf($language->getBidirectional() !== null, 'w:bidi', $language->getBidirectional());
93+
// If bidi is not set but we are writing RTL, reuse the Latin language as the value of the w:bidi attribute
94+
if ($style->isRTL() && $language->getBidirectional() === null && $language->getLatin() !== null) {
95+
$xmlWriter->writeAttribute('w:bidi', $language->getLatin());
96+
}
9397
$xmlWriter->endElement();
9498
}
9599

tests/PhpWord/Shared/HtmlTest.php

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,33 @@ public function testParseFontSize()
150150
$this->assertEquals('15', $doc->getElementAttribute('/w:document/w:body/w:p[2]/w:r/w:rPr/w:sz', 'w:val'));
151151
}
152152

153+
/**
154+
* Test direction style
155+
*/
156+
public function testParseTextDirection()
157+
{
158+
$phpWord = new \PhpOffice\PhpWord\PhpWord();
159+
$section = $phpWord->addSection();
160+
Html::addHtml($section, '<span style="direction: rtl">test</span>');
161+
162+
$doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');
163+
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:r/w:rPr/w:rtl'));
164+
}
165+
166+
/**
167+
* Test html lang
168+
*/
169+
public function testParseLang()
170+
{
171+
$phpWord = new \PhpOffice\PhpWord\PhpWord();
172+
$section = $phpWord->addSection();
173+
Html::addHtml($section, '<span lang="fr-BE">test</span>');
174+
175+
$doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');
176+
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:r/w:rPr/w:lang'));
177+
$this->assertEquals('fr-BE', $doc->getElementAttribute('/w:document/w:body/w:p/w:r/w:rPr/w:lang', 'w:val'));
178+
}
179+
153180
/**
154181
* Test font-family style
155182
*/
@@ -199,7 +226,7 @@ public function testParseTable()
199226
</thead>
200227
<tbody>
201228
<tr><td style="border-style: dotted;">1</td><td colspan="2">2</td></tr>
202-
<tr><td>4</td><td>5</td><td>6</td></tr>
229+
<tr><td>This is <b>bold</b> text</td><td>5</td><td><p>6</p></td></tr>
203230
</tbody>
204231
</table>';
205232
Html::addHtml($section, $html);
@@ -272,6 +299,43 @@ public function testOrderedListNumbering()
272299
$this->assertNotEquals($firstListnumId, $secondListnumId);
273300
}
274301

302+
/**
303+
* Tests parsing of nested ul/li
304+
*/
305+
public function testOrderedNestedListNumbering()
306+
{
307+
$phpWord = new \PhpOffice\PhpWord\PhpWord();
308+
$section = $phpWord->addSection();
309+
$html = '<ol>
310+
<li>List 1 item 1</li>
311+
<li>List 1 item 2</li>
312+
</ol>
313+
<p>Some Text</p>
314+
<ol>
315+
<li>List 2 item 1</li>
316+
<li>
317+
<ol>
318+
<li>sub list 1</li>
319+
<li>sub list 2</li>
320+
</ol>
321+
</li>
322+
</ol>';
323+
Html::addHtml($section, $html, false, false);
324+
325+
$doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');
326+
327+
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:pPr/w:numPr/w:numId'));
328+
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:r/w:t'));
329+
330+
$this->assertEquals('List 1 item 1', $doc->getElement('/w:document/w:body/w:p[1]/w:r/w:t')->nodeValue);
331+
$this->assertEquals('List 2 item 1', $doc->getElement('/w:document/w:body/w:p[4]/w:r/w:t')->nodeValue);
332+
333+
$firstListnumId = $doc->getElementAttribute('/w:document/w:body/w:p[1]/w:pPr/w:numPr/w:numId', 'w:val');
334+
$secondListnumId = $doc->getElementAttribute('/w:document/w:body/w:p[4]/w:pPr/w:numPr/w:numId', 'w:val');
335+
336+
$this->assertNotEquals($firstListnumId, $secondListnumId);
337+
}
338+
275339
/**
276340
* Tests parsing of ul/li
277341
*/
@@ -336,8 +400,8 @@ public function testParseImage()
336400

337401
$baseXpath = '/w:document/w:body/w:p/w:r';
338402
$this->assertTrue($doc->elementExists($baseXpath . '/w:pict/v:shape'));
339-
$this->assertStringMatchesFormat('%Swidth:150pt%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
340-
$this->assertStringMatchesFormat('%Sheight:200pt%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
403+
$this->assertStringMatchesFormat('%Swidth:150px%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
404+
$this->assertStringMatchesFormat('%Sheight:200px%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
341405
$this->assertStringMatchesFormat('%Smso-position-horizontal:right%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
342406
$this->assertStringMatchesFormat('%Smso-position-horizontal:left%S', $doc->getElementAttribute($baseXpath . '[2]/w:pict/v:shape', 'style'));
343407
}

0 commit comments

Comments
 (0)