Skip to content
This repository was archived by the owner on May 26, 2022. It is now read-only.

Commit a19231f

Browse files
authored
Introduce XMLProcessor to reduce ODS,XLSX readers' complexity (#342)
1 parent 73d5d0e commit a19231f

File tree

3 files changed

+246
-96
lines changed

3 files changed

+246
-96
lines changed
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
<?php
2+
3+
namespace Box\Spout\Reader\Common;
4+
5+
use Box\Spout\Reader\Wrapper\XMLReader;
6+
7+
/**
 * Class XMLProcessor
 * Helps process XML files: it pulls nodes from an XMLReader one at a time and
 * invokes the callback registered for the (node name, node type) pair that was read.
 * Callbacks decide, through their return value, whether reading continues or pauses.
 *
 * @package Box\Spout\Reader\Common
 */
class XMLProcessor
{
    /* Node types */
    const NODE_TYPE_START = XMLReader::ELEMENT;
    const NODE_TYPE_END = XMLReader::END_ELEMENT;

    /* Keys associated to reflection attributes to invoke a callback */
    const CALLBACK_REFLECTION_METHOD = 'reflectionMethod';
    const CALLBACK_REFLECTION_OBJECT = 'reflectionObject';

    /* Values returned by the callbacks to indicate what the processor should do next */
    const PROCESSING_CONTINUE = 1;
    const PROCESSING_STOP = 2;


    /** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
    protected $xmlReader;

    /** @var array Registered callbacks, indexed by the key built in getCallbackKey() */
    private $callbacks = [];


    /**
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object
     */
    public function __construct($xmlReader)
    {
        $this->xmlReader = $xmlReader;
    }

    /**
     * Registers the callback to run when a node with the given name and type is read.
     * Registering a second callback for the same name/type pair replaces the first one.
     *
     * @param string $nodeName A callback may be triggered when a node with this name is read
     * @param int $nodeType Type of the node [NODE_TYPE_START || NODE_TYPE_END]
     * @param callable $callback Callback to execute when the read node has the given name and type: [OBJECT, METHOD_NAME]
     * @return XMLProcessor The current instance, to allow chaining
     * @throws \InvalidArgumentException If the callback is not of the form [OBJECT, METHOD_NAME]
     */
    public function registerCallback($nodeName, $nodeType, $callback)
    {
        $callbackKey = $this->getCallbackKey($nodeName, $nodeType);
        $this->callbacks[$callbackKey] = $this->getInvokableCallbackData($callback);

        return $this;
    }

    /**
     * Builds the key under which the callback for a name/type pair is stored.
     *
     * @param string $nodeName Name of the node
     * @param int $nodeType Type of the node [NODE_TYPE_START || NODE_TYPE_END]
     * @return string Key used to store the associated callback
     */
    private function getCallbackKey($nodeName, $nodeType)
    {
        // A separator is required: plain concatenation maps different pairs to the
        // same key (e.g. "c" + 15 and "c1" + 5 both give "c15"). "|" is not a legal
        // character in an XML name, so the name part can never contain it.
        return $nodeName . '|' . $nodeType;
    }

    /**
     * Because the callback can be a "protected" function, we don't want to use call_user_func() directly
     * but instead invoke the callback using Reflection. This allows the invocation of "protected" functions.
     * Since some functions can be called a lot, we pre-process the callback to only return the elements that
     * will be needed to invoke the callback later.
     *
     * @param callable $callback Array reference to a callback: [OBJECT, METHOD_NAME]
     * @return array Associative array containing the elements needed to invoke the callback using Reflection
     * @throws \InvalidArgumentException If the callback is not of the form [OBJECT, METHOD_NAME]
     */
    private function getInvokableCallbackData($callback)
    {
        // Only the [object, method name] form can be invoked through Reflection below;
        // fail early with an explicit message instead of an obscure engine error.
        if (!is_array($callback) || !isset($callback[0], $callback[1]) || !is_object($callback[0])) {
            throw new \InvalidArgumentException('The callback must be an array of the form [OBJECT, METHOD_NAME].');
        }

        $callbackObject = $callback[0];
        $callbackMethodName = $callback[1];

        $reflectionMethod = new \ReflectionMethod(get_class($callbackObject), $callbackMethodName);
        // Lets non-public methods be invoked
        $reflectionMethod->setAccessible(true);

        return [
            self::CALLBACK_REFLECTION_METHOD => $reflectionMethod,
            self::CALLBACK_REFLECTION_OBJECT => $callbackObject,
        ];
    }

    /**
     * Resumes the reading of the XML file where it was left off.
     * Stops whenever a callback indicates that reading should stop or at the end of the file.
     *
     * @return void
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException
     */
    public function readUntilStopped()
    {
        while ($this->xmlReader->read()) {
            $callbackData = $this->getRegisteredCallbackData(
                $this->xmlReader->name,
                $this->xmlReader->localName,
                $this->xmlReader->nodeType
            );

            if ($callbackData === null) {
                // no callback registered for this node: keep reading
                continue;
            }

            $callbackResponse = $this->invokeCallback($callbackData, [$this->xmlReader]);

            // Any response other than PROCESSING_STOP means "continue"
            if ($callbackResponse === self::PROCESSING_STOP) {
                break;
            }
        }
    }

    /**
     * @param string $nodeNamePossiblyWithPrefix Name of the node, possibly prefixed
     * @param string $nodeNameWithoutPrefix Name of the same node, un-prefixed
     * @param int $nodeType Type of the node [NODE_TYPE_START || NODE_TYPE_END]
     * @return array|null Callback data to be used for execution when a node of the given name/type is read or NULL if none found
     */
    private function getRegisteredCallbackData($nodeNamePossiblyWithPrefix, $nodeNameWithoutPrefix, $nodeType)
    {
        // With prefixed nodes, we should match if (by order of preference):
        //  1. the callback was registered with the prefixed node name (e.g. "x:worksheet")
        //  2. the callback was registered with the un-prefixed node name (e.g. "worksheet")
        $hasPrefix = ($nodeNamePossiblyWithPrefix !== $nodeNameWithoutPrefix);

        // Using isset here because it is way faster than array_key_exists...
        if ($hasPrefix) {
            $callbackKeyForPrefixedName = $this->getCallbackKey($nodeNamePossiblyWithPrefix, $nodeType);
            if (isset($this->callbacks[$callbackKeyForPrefixedName])) {
                return $this->callbacks[$callbackKeyForPrefixedName];
            }
        }

        $callbackKeyForUnPrefixedName = $this->getCallbackKey($nodeNameWithoutPrefix, $nodeType);

        return isset($this->callbacks[$callbackKeyForUnPrefixedName])
            ? $this->callbacks[$callbackKeyForUnPrefixedName]
            : null;
    }

    /**
     * @param array $callbackData Associative array containing data to invoke the callback using Reflection
     * @param array $args Arguments to pass to the callback
     * @return int Callback response
     */
    private function invokeCallback($callbackData, $args)
    {
        $reflectionMethod = $callbackData[self::CALLBACK_REFLECTION_METHOD];
        $callbackObject = $callbackData[self::CALLBACK_REFLECTION_OBJECT];

        return $reflectionMethod->invokeArgs($callbackObject, $args);
    }
}

src/Spout/Reader/ODS/RowIterator.php

Lines changed: 45 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use Box\Spout\Reader\IteratorInterface;
99
use Box\Spout\Reader\ODS\Helper\CellValueFormatter;
1010
use Box\Spout\Reader\Wrapper\XMLReader;
11+
use Box\Spout\Reader\Common\XMLProcessor;
1112

1213
/**
1314
* Class RowIterator
@@ -29,6 +30,9 @@ class RowIterator implements IteratorInterface
2930
/** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
3031
protected $xmlReader;
3132

33+
/** @var \Box\Spout\Reader\Common\XMLProcessor Helper Object to process XML nodes */
34+
protected $xmlProcessor;
35+
3236
/** @var bool Whether empty rows should be returned or skipped */
3337
protected $shouldPreserveEmptyRows;
3438

@@ -38,6 +42,9 @@ class RowIterator implements IteratorInterface
3842
/** @var bool Whether the iterator has already been rewound once */
3943
protected $hasAlreadyBeenRewound = false;
4044

45+
/** @var array Contains the data for the currently processed row (key = cell index, value = cell value) */
46+
protected $currentlyProcessedRowData = [];
47+
4148
/** @var array|null Buffer used to store the row data, while checking if there are more rows to read */
4249
protected $rowDataBuffer = null;
4350

@@ -72,6 +79,13 @@ public function __construct($xmlReader, $options)
7279
$this->xmlReader = $xmlReader;
7380
$this->shouldPreserveEmptyRows = $options->shouldPreserveEmptyRows();
7481
$this->cellValueFormatter = new CellValueFormatter($options->shouldFormatDates());
82+
83+
// Register all callbacks to process different nodes when reading the XML file
84+
$this->xmlProcessor = new XMLProcessor($this->xmlReader);
85+
$this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']);
86+
$this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']);
87+
$this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']);
88+
$this->xmlProcessor->registerCallback(self::XML_NODE_TABLE, XMLProcessor::NODE_TYPE_END, [$this, 'processTableEndingNode']);
7589
}
7690

7791
/**
@@ -122,7 +136,7 @@ public function valid()
122136
public function next()
123137
{
124138
if ($this->doesNeedDataForNextRowToBeProcessed()) {
125-
$this->readDataForNextRow($this->xmlReader);
139+
$this->readDataForNextRow();
126140
}
127141

128142
$this->lastRowIndexProcessed++;
@@ -148,54 +162,26 @@ protected function doesNeedDataForNextRowToBeProcessed()
148162
}
149163

150164
/**
151-
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object
152165
* @return void
153166
* @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
154167
* @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML
155168
*/
156-
protected function readDataForNextRow($xmlReader)
169+
protected function readDataForNextRow()
157170
{
158-
$rowData = [];
171+
$this->currentlyProcessedRowData = [];
159172

160173
try {
161-
while ($xmlReader->read()) {
162-
if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) {
163-
$this->processRowStartingNode($xmlReader);
164-
165-
} else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) {
166-
$rowData = $this->processCellStartingNode($xmlReader, $rowData);
167-
168-
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) {
169-
$isEmptyRow = $this->isEmptyRow($rowData, $this->lastProcessedCellValue);
170-
171-
// if the fetched row is empty and we don't want to preserve it...
172-
if (!$this->shouldPreserveEmptyRows && $isEmptyRow) {
173-
// ... skip it
174-
continue;
175-
}
176-
177-
$rowData = $this->processRowEndingNode($rowData, $isEmptyRow);
178-
179-
// at this point, we have all the data we need for the row
180-
// so that we can populate the buffer
181-
break;
182-
183-
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_TABLE)) {
184-
$this->processTableEndingNode();
185-
break;
186-
}
187-
}
188-
174+
$this->xmlProcessor->readUntilStopped();
189175
} catch (XMLProcessingException $exception) {
190176
throw new IOException("The sheet's data cannot be read. [{$exception->getMessage()}]");
191177
}
192178

193-
$this->rowDataBuffer = $rowData;
179+
$this->rowDataBuffer = $this->currentlyProcessedRowData;
194180
}
195181

196182
/**
197183
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<table:table-row>" starting node
198-
* @return void
184+
* @return int A return code that indicates what action should the processor take next
199185
*/
200186
protected function processRowStartingNode($xmlReader)
201187
{
@@ -204,14 +190,15 @@ protected function processRowStartingNode($xmlReader)
204190
$this->lastProcessedCellValue = null;
205191
$this->numColumnsRepeated = 1;
206192
$this->numRowsRepeated = $this->getNumRowsRepeatedForCurrentNode($xmlReader);
193+
194+
return XMLProcessor::PROCESSING_CONTINUE;
207195
}
208196

209197
/**
210198
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<table:table-cell>" starting node
211-
* @param array $rowData Data of all cells read so far
212-
* @return array Original row data + data for the cell that was just read
199+
* @return int A return code that indicates what action should the processor take next
213200
*/
214-
protected function processCellStartingNode($xmlReader, $rowData)
201+
protected function processCellStartingNode($xmlReader)
215202
{
216203
$currentNumColumnsRepeated = $this->getNumColumnsRepeatedForCurrentNode($xmlReader);
217204

@@ -221,53 +208,63 @@ protected function processCellStartingNode($xmlReader, $rowData)
221208
// process cell N only after having read cell N+1 (see below why)
222209
if ($this->hasAlreadyReadOneCellInCurrentRow) {
223210
for ($i = 0; $i < $this->numColumnsRepeated; $i++) {
224-
$rowData[] = $this->lastProcessedCellValue;
211+
$this->currentlyProcessedRowData[] = $this->lastProcessedCellValue;
225212
}
226213
}
227214

228215
$this->hasAlreadyReadOneCellInCurrentRow = true;
229216
$this->lastProcessedCellValue = $currentCellValue;
230217
$this->numColumnsRepeated = $currentNumColumnsRepeated;
231218

232-
return $rowData;
219+
return XMLProcessor::PROCESSING_CONTINUE;
233220
}
234221

235222
/**
236-
* @param array $rowData Data of all cells read so far
237-
* @param bool $isEmptyRow Whether the given row is empty
238-
* @return array
223+
* @return int A return code that indicates what action should the processor take next
239224
*/
240-
protected function processRowEndingNode($rowData, $isEmptyRow)
225+
protected function processRowEndingNode()
241226
{
227+
$isEmptyRow = $this->isEmptyRow($this->currentlyProcessedRowData, $this->lastProcessedCellValue);
228+
229+
// if the fetched row is empty and we don't want to preserve it...
230+
if (!$this->shouldPreserveEmptyRows && $isEmptyRow) {
231+
// ... skip it
232+
return XMLProcessor::PROCESSING_CONTINUE;
233+
}
234+
242235
// if the row is empty, we don't want to return more than one cell
243236
$actualNumColumnsRepeated = (!$isEmptyRow) ? $this->numColumnsRepeated : 1;
244237

245238
// Only add the value if the last read cell is not a trailing empty cell repeater in Excel.
246-
// The current count of read columns is determined by counting the values in $rowData.
239+
// The current count of read columns is determined by counting the values in "$this->currentlyProcessedRowData".
247240
// This is to avoid creating a lot of empty cells, as Excel adds a last empty "<table:table-cell>"
248241
// with a number-columns-repeated value equals to the number of (supported columns - used columns).
249242
// In Excel, the number of supported columns is 16384, but we don't want to returns rows with
250243
// always 16384 cells.
251-
if ((count($rowData) + $actualNumColumnsRepeated) !== self::MAX_COLUMNS_EXCEL) {
244+
if ((count($this->currentlyProcessedRowData) + $actualNumColumnsRepeated) !== self::MAX_COLUMNS_EXCEL) {
252245
for ($i = 0; $i < $actualNumColumnsRepeated; $i++) {
253-
$rowData[] = $this->lastProcessedCellValue;
246+
$this->currentlyProcessedRowData[] = $this->lastProcessedCellValue;
254247
}
255248
}
256249

257250
// If we are processing row N and the row is repeated M times,
258251
// then the next row to be processed will be row (N+M).
259252
$this->nextRowIndexToBeProcessed += $this->numRowsRepeated;
260253

261-
return $rowData;
254+
// at this point, we have all the data we need for the row
255+
// so that we can populate the buffer
256+
return XMLProcessor::PROCESSING_STOP;
262257
}
263258

264259
/**
265-
* @return void
260+
* @return int A return code that indicates what action should the processor take next
266261
*/
267262
protected function processTableEndingNode()
268263
{
269264
// The closing "</table:table>" marks the end of the file
270265
$this->hasReachedEndOfFile = true;
266+
267+
return XMLProcessor::PROCESSING_STOP;
271268
}
272269

273270
/**

0 commit comments

Comments
 (0)