Skip to content
This repository was archived by the owner on May 26, 2022. It is now read-only.

Commit dbdf5f7

Browse files
committed
[ODS] Add support for whitespaces inside <text:span>
The `<text:p>` node can contain the string value directly or contain child elements. In that case, whitespace characters contained in the child elements should be replaced by their XML equivalents: - space => `<text:s />` - tab => `<text:tab />` - line break => `<text:line-break />` @see https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415200_253892949
1 parent 9f4c094 commit dbdf5f7

File tree

3 files changed

+80
-18
lines changed

3 files changed

+80
-18
lines changed

src/Spout/Reader/ODS/Helper/CellValueFormatter.php

Lines changed: 69 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ class CellValueFormatter
2222

2323
/** Definition of XML nodes names used to parse data */
2424
const XML_NODE_P = 'p';
25-
const XML_NODE_S = 'text:s';
26-
const XML_NODE_A = 'text:a';
27-
const XML_NODE_SPAN = 'text:span';
25+
const XML_NODE_TEXT_A = 'text:a';
26+
const XML_NODE_TEXT_SPAN = 'text:span';
27+
const XML_NODE_TEXT_S = 'text:s';
28+
const XML_NODE_TEXT_TAB = 'text:tab';
29+
const XML_NODE_TEXT_LINE_BREAK = 'text:line-break';
2830

2931
/** Definition of XML attributes used to parse data */
3032
const XML_ATTRIBUTE_TYPE = 'office:value-type';
@@ -41,6 +43,13 @@ class CellValueFormatter
4143
/** @var \Box\Spout\Common\Helper\Escaper\ODS Used to unescape XML data */
4244
protected $escaper;
4345

46+
/** @var array List of XML nodes representing whitespaces and their corresponding value */
47+
private static $WHITESPACE_XML_NODES = [
48+
self::XML_NODE_TEXT_S => ' ',
49+
self::XML_NODE_TEXT_TAB => "\t",
50+
self::XML_NODE_TEXT_LINE_BREAK => "\n",
51+
];
52+
4453
/**
4554
* @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings
4655
* @param \Box\Spout\Common\Helper\Escaper\ODS $escaper Used to unescape XML data
@@ -96,21 +105,7 @@ protected function formatStringCellValue($node)
96105
$pNodes = $node->getElementsByTagName(self::XML_NODE_P);
97106

98107
foreach ($pNodes as $pNode) {
99-
$currentPValue = '';
100-
101-
foreach ($pNode->childNodes as $childNode) {
102-
if ($childNode instanceof \DOMText) {
103-
$currentPValue .= $childNode->nodeValue;
104-
} elseif ($childNode->nodeName === self::XML_NODE_S) {
105-
$spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C);
106-
$numSpaces = (!empty($spaceAttribute)) ? (int) $spaceAttribute : 1;
107-
$currentPValue .= str_repeat(' ', $numSpaces);
108-
} elseif ($childNode->nodeName === self::XML_NODE_A || $childNode->nodeName === self::XML_NODE_SPAN) {
109-
$currentPValue .= $childNode->nodeValue;
110-
}
111-
}
112-
113-
$pNodeValues[] = $currentPValue;
108+
$pNodeValues[] = $this->extractTextValueFromNode($pNode);
114109
}
115110

116111
$escapedCellValue = implode("\n", $pNodeValues);
@@ -119,6 +114,62 @@ protected function formatStringCellValue($node)
119114
return $cellValue;
120115
}
121116

117+
/**
118+
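* @param \DOMNode $pNode Node whose text content (direct text, whitespace nodes and nested a/span children) is extracted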
119+
* @return string
120+
*/
121+
private function extractTextValueFromNode($pNode) // builds the text of a node by walking its direct children, recursing into a/span children
122+
{
123+
$textValue = ''; // accumulator; stays empty when the node has no supported children
124+
125+
foreach ($pNode->childNodes as $childNode) {
126+
if ($childNode instanceof \DOMText) {
127+
$textValue .= $childNode->nodeValue; // raw text node: appended as-is
128+
} elseif ($this->isWhitespaceNode($childNode->nodeName)) {
129+
$textValue .= $this->transformWhitespaceNode($childNode); // whitespace element: appended as the character(s) it maps to
130+
} elseif ($childNode->nodeName === self::XML_NODE_TEXT_A || $childNode->nodeName === self::XML_NODE_TEXT_SPAN) {
131+
$textValue .= $this->extractTextValueFromNode($childNode); // recurse so whitespace elements nested in a/span are converted too
132+
} // any other child node type is ignored (there is no else branch)
133+
}
134+
135+
return $textValue;
136+
}
137+
138+
/**
139+
* Returns whether the given node is a whitespace node. It must be one of these:
140+
* - <text:s />
141+
* - <text:tab />
142+
* - <text:line-break />
143+
*
144+
* @param string $nodeName
145+
* @return bool
146+
*/
147+
private function isWhitespaceNode($nodeName)
148+
{
149+
return isset(self::$WHITESPACE_XML_NODES[$nodeName]); // true only when the name is a key of the whitespace-node map
150+
}
151+
152+
/**
153+
* The "<text:p>" node can contain the string value directly
154+
* or contain child elements. In that case, whitespace characters contained in
155+
* the child elements should be replaced by their XML equivalents:
156+
* - space => <text:s />
157+
* - tab => <text:tab />
158+
* - line break => <text:line-break />
159+
*
160+
* @see https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415200_253892949
161+
*
162+
* @param \DOMNode $node The XML node representing a whitespace
163+
* @return string The corresponding whitespace value
164+
*/
165+
private function transformWhitespaceNode($node)
166+
{
167+
$countAttribute = $node->getAttribute(self::XML_ATTRIBUTE_C); // repeat count; only defined for "<text:s>" (XML_ATTRIBUTE_C is declared outside this view - presumably "text:c", TODO confirm)
168+
$numWhitespaces = (!empty($countAttribute)) ? (int) $countAttribute : 1; // empty() also matches '' and '0', so a missing or zero count yields a single whitespace
169+
170+
return str_repeat(self::$WHITESPACE_XML_NODES[$node->nodeName], $numWhitespaces); // looks up the character for this node name and repeats it; callers are expected to have checked isWhitespaceNode() first
171+
}
172+
122173
/**
123174
* Returns the cell Numeric value from the given node.
124175
*

tests/Spout/Reader/ODS/ReaderTest.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,17 @@ public function testReadShouldPreserveSpacing()
277277
$this->assertEquals([$expectedRow], $allRows);
278278
}
279279

280+
/**
281+
* @return void
282+
*/
283+
public function testReadShouldSupportWhitespaceAsXML()
284+
{
285+
$allRows = $this->getAllRowsForFile('sheet_with_whitespaces_as_xml.ods'); // fixture name suggests whitespace stored as XML elements; the binary fixture is not visible here
286+
287+
$expectedRow = ["Lorem ipsum\tdolor sit amet"]; // double-quoted so "\t" is a real tab character in the expected value
288+
$this->assertEquals([$expectedRow], $allRows); // the file is expected to contain exactly one row with one cell
289+
}
290+
280291
/**
281292
* @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
282293
*
2.38 KB
Binary file not shown.

0 commit comments

Comments
 (0)