Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
 
3
declare(strict_types=1);
4
 
5
namespace OpenSpout\Reader\XLSX;
6
 
7
use DOMElement;
8
use OpenSpout\Common\Entity\Cell;
9
use OpenSpout\Common\Entity\Row;
10
use OpenSpout\Common\Exception\InvalidArgumentException;
11
use OpenSpout\Common\Exception\IOException;
12
use OpenSpout\Reader\Common\Manager\RowManager;
13
use OpenSpout\Reader\Common\XMLProcessor;
14
use OpenSpout\Reader\Exception\SharedStringNotFoundException;
15
use OpenSpout\Reader\RowIteratorInterface;
16
use OpenSpout\Reader\Wrapper\XMLReader;
17
use OpenSpout\Reader\XLSX\Helper\CellHelper;
18
use OpenSpout\Reader\XLSX\Helper\CellValueFormatter;
19
 
20
final class RowIterator implements RowIteratorInterface
{
    /**
     * Definition of XML nodes names used to parse data.
     */
    public const XML_NODE_DIMENSION = 'dimension';
    public const XML_NODE_WORKSHEET = 'worksheet';
    public const XML_NODE_ROW = 'row';
    public const XML_NODE_CELL = 'c';

    /**
     * Definition of XML attributes used to parse data.
     */
    public const XML_ATTRIBUTE_REF = 'ref';
    public const XML_ATTRIBUTE_SPANS = 'spans';
    public const XML_ATTRIBUTE_ROW_INDEX = 'r';
    public const XML_ATTRIBUTE_CELL_INDEX = 'r';

    /** @var string Path of the sheet data XML file as in [Content_Types].xml, without the leading slash */
    private readonly string $sheetDataXMLFilePath;

    /**
     * TODO: This variable can be deleted when row indices get preserved.
     *
     * @var int Number of read rows
     */
    private int $numReadRows = 0;

    /** @var Row Contains the row currently processed */
    private Row $currentlyProcessedRow;

    /** @var null|Row Buffer used to store the current row, while checking if there are more rows to read */
    private ?Row $rowBuffer = null;

    /** @var bool Indicates whether all rows have been read */
    private bool $hasReachedEndOfFile = false;

    /** @var int The number of columns the sheet has (0 meaning undefined) */
    private int $numColumns = 0;

    /** @var int Last row index processed (one-based) */
    private int $lastRowIndexProcessed = 0;

    /** @var int Row index to be processed next (one-based) */
    private int $nextRowIndexToBeProcessed = 0;

    /** @var int Last column index processed (zero-based) */
    private int $lastColumnIndexProcessed = -1;

    /**
     * Stores the collaborators and registers the XML node callbacks.
     *
     * @param string             $filePath                Path of the XLSX file being read
     * @param string             $sheetDataXMLFilePath    Path of the sheet data XML file as in [Content_Types].xml
     * @param bool               $shouldPreserveEmptyRows Whether empty rows should be preserved
     * @param XMLReader          $xmlReader               XML Reader
     * @param XMLProcessor       $xmlProcessor            Helper to process XML files
     * @param CellValueFormatter $cellValueFormatter      Helper to format cell values
     * @param RowManager         $rowManager              Manages rows
     */
    public function __construct(
        private readonly string $filePath,
        string $sheetDataXMLFilePath,
        private readonly bool $shouldPreserveEmptyRows,
        private readonly XMLReader $xmlReader,
        private readonly XMLProcessor $xmlProcessor,
        private readonly CellValueFormatter $cellValueFormatter,
        private readonly RowManager $rowManager,
    ) {
        $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);

        // Register all callbacks to process different nodes when reading the XML file.
        // NOTE(review): the targets are private and handed over as [object, 'methodName'] pairs,
        // so XMLProcessor presumably resolves them by name; keep the array form unless
        // registerCallback() is confirmed to accept closures.
        $this->xmlProcessor->registerCallback(self::XML_NODE_DIMENSION, XMLProcessor::NODE_TYPE_START, [$this, 'processDimensionStartingNode']);
        $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']);
        $this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']);
        $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']);
        $this->xmlProcessor->registerCallback(self::XML_NODE_WORKSHEET, XMLProcessor::NODE_TYPE_END, [$this, 'processWorksheetEndingNode']);
    }

    /**
     * Rewind the Iterator to the first element.
     * Initializes the XMLReader object that reads the associated sheet data.
     * The XMLReader is configured to be safe from billion laughs attack.
     *
     * @see http://php.net/manual/en/iterator.rewind.php
     *
     * @throws IOException If the sheet data XML cannot be read
     */
    public function rewind(): void
    {
        // Drop any previously opened stream before re-opening the sheet from the start
        $this->xmlReader->close();

        if (false === $this->xmlReader->openFileInZip($this->filePath, $this->sheetDataXMLFilePath)) {
            throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\".");
        }

        // Reset the whole iteration state so that a second pass behaves like the first one
        $this->numReadRows = 0;
        $this->lastRowIndexProcessed = 0;
        $this->nextRowIndexToBeProcessed = 0;
        $this->rowBuffer = null;
        $this->hasReachedEndOfFile = false;
        $this->numColumns = 0;

        // Position the iterator on the first row
        $this->next();
    }

    /**
     * Checks if current position is valid.
     * As a side effect, the XMLReader is closed once the end of the sheet has been reached.
     *
     * @see http://php.net/manual/en/iterator.valid.php
     */
    public function valid(): bool
    {
        if ($this->hasReachedEndOfFile) {
            $this->xmlReader->close();

            return false;
        }

        return true;
    }

    /**
     * Move forward to next element. Reads data describing the next unprocessed row.
     *
     * @see http://php.net/manual/en/iterator.next.php
     *
     * @throws SharedStringNotFoundException If a shared string was not found
     * @throws IOException                   If unable to read the sheet data XML
     */
    public function next(): void
    {
        ++$this->nextRowIndexToBeProcessed;

        if ($this->doesNeedDataForNextRowToBeProcessed()) {
            $this->readDataForNextRow();
        }
    }

    /**
     * Return the current element, either an empty row or from the buffer.
     *
     * @see http://php.net/manual/en/iterator.current.php
     */
    public function current(): Row
    {
        // When we need to preserve empty rows, we will either return an empty row
        // or the last row read. A mismatch between the index of the last row that
        // was read and the index of the row to be returned means that the requested
        // row lies in a gap, so an empty row stands in for it.
        if (
            $this->shouldPreserveEmptyRows
            && $this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed
        ) {
            return new Row([], null);
        }

        $rowToBeProcessed = $this->rowBuffer;
        \assert(null !== $rowToBeProcessed);

        return $rowToBeProcessed;
    }

    /**
     * Return the key of the current element. Here, the row index.
     *
     * @see http://php.net/manual/en/iterator.key.php
     */
    public function key(): int
    {
        // TODO: This should return $this->nextRowIndexToBeProcessed
        //       but to avoid a breaking change, the return value for
        //       this function has been kept as the number of rows read.
        return $this->shouldPreserveEmptyRows
            ? $this->nextRowIndexToBeProcessed
            : $this->numReadRows;
    }

    /**
     * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
     *
     * @return string path of the XML file containing the sheet data,
     *                without the leading slash
     */
    private function normalizeSheetDataXMLFilePath(string $sheetDataXMLFilePath): string
    {
        return ltrim($sheetDataXMLFilePath, '/');
    }

    /**
     * Returns whether we need data for the next row to be processed.
     * We don't need to read data if:
     *   we have already read at least one row
     *     AND
     *   we need to preserve empty rows
     *     AND
     *   the last row that was read is not the row that need to be processed
     *   (i.e. if we need to return empty rows).
     *
     * @return bool whether we need data for the next row to be processed
     */
    private function doesNeedDataForNextRowToBeProcessed(): bool
    {
        $hasReadAtLeastOneRow = (0 !== $this->lastRowIndexProcessed);

        return
            !$hasReadAtLeastOneRow
            || !$this->shouldPreserveEmptyRows
            || $this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed;
    }

    /**
     * Lets the XML processor run until one of the callbacks stops it, then
     * publishes the row built in the meantime as the buffered row.
     *
     * @throws SharedStringNotFoundException If a shared string was not found
     * @throws IOException                   If unable to read the sheet data XML
     */
    private function readDataForNextRow(): void
    {
        // The callbacks fill this row while the processor walks through the XML
        $this->currentlyProcessedRow = new Row([], null);

        $this->xmlProcessor->readUntilStopped();

        $this->rowBuffer = $this->currentlyProcessedRow;
    }

    /**
     * Derives the number of columns of the sheet from the "ref" attribute, when it describes a range.
     *
     * @param XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node
     *
     * @return int A return code that indicates what action should the processor take next
     */
    private function processDimensionStartingNode(XMLReader $xmlReader): int
    {
        // Read dimensions of the sheet
        $dimensionRef = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)

        // A missing "ref" attribute is treated like a reference that is not a range:
        // the number of columns stays undefined (0) instead of passing null to
        // preg_match(), which is a TypeError under strict_types.
        if (null !== $dimensionRef && 1 === preg_match('/[A-Z]+\d+:([A-Z]+\d+)/', $dimensionRef, $matches)) {
            // Column indexes are zero-based, hence the "+ 1" to get a count
            $this->numColumns = CellHelper::getColumnIndexFromCellIndex($matches[1]) + 1;
        }

        return XMLProcessor::PROCESSING_CONTINUE;
    }

    /**
     * Starts a new row: records its index and pre-fills it with empty cells.
     *
     * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node
     *
     * @return int A return code that indicates what action should the processor take next
     */
    private function processRowStartingNode(XMLReader $xmlReader): int
    {
        // Reset index of the last processed column
        $this->lastColumnIndexProcessed = -1;

        // Mark the last processed row as the one currently being read
        $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader);

        // Default to the sheet-wide column count; spans info overrides it if present
        $numberOfColumnsForRow = $this->numColumns;
        $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
        if (null !== $spans && '' !== $spans) {
            $spanBounds = explode(':', $spans);
            // A value without an upper bound (no ':') is ignored rather than
            // triggering an "undefined array key" warning and zeroing the count.
            if (isset($spanBounds[1])) {
                // array_fill() rejects a negative count, so clamp to 0
                $numberOfColumnsForRow = max(0, (int) $spanBounds[1]);
            }
        }

        // Every slot initially holds the same empty cell instance; real cells
        // replace the placeholders as "<c>" nodes get processed.
        $cells = array_fill(0, $numberOfColumnsForRow, Cell::fromValue(''));
        $this->currentlyProcessedRow->setCells($cells);

        return XMLProcessor::PROCESSING_CONTINUE;
    }

    /**
     * Extracts the value of the current cell and stores it in the row being built.
     *
     * @param XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node
     *
     * @return int A return code that indicates what action should the processor take next
     */
    private function processCellStartingNode(XMLReader $xmlReader): int
    {
        $currentColumnIndex = $this->getColumnIndex($xmlReader);

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $node = $xmlReader->expand();
        \assert($node instanceof DOMElement);
        $cell = $this->cellValueFormatter->extractAndFormatNodeValue($node);

        $this->currentlyProcessedRow->setCellAtIndex($cell, $currentColumnIndex);
        // Remembered so that a following cell without "r" attribute can be positioned
        $this->lastColumnIndexProcessed = $currentColumnIndex;

        return XMLProcessor::PROCESSING_CONTINUE;
    }

    /**
     * Finalizes the row being built, unless it is empty and empty rows must be skipped.
     *
     * @return int A return code that indicates what action should the processor take next
     */
    private function processRowEndingNode(): int
    {
        // if the fetched row is empty and we don't want to preserve it..,
        if (!$this->shouldPreserveEmptyRows && $this->currentlyProcessedRow->isEmpty()) {
            // ... skip it
            return XMLProcessor::PROCESSING_CONTINUE;
        }

        ++$this->numReadRows;

        // If needed, we fill the empty cells: without a known column count,
        // the row was not pre-filled and may contain gaps.
        if (0 === $this->numColumns) {
            $this->rowManager->fillMissingIndexesWithEmptyCells($this->currentlyProcessedRow);
        }

        // at this point, we have all the data we need for the row
        // so that we can populate the buffer
        return XMLProcessor::PROCESSING_STOP;
    }

    /**
     * @return int A return code that indicates what action should the processor take next
     */
    private function processWorksheetEndingNode(): int
    {
        // The closing "</worksheet>" marks the end of the file
        $this->hasReachedEndOfFile = true;

        return XMLProcessor::PROCESSING_STOP;
    }

    /**
     * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" node
     *
     * @return int Row index: the "r" attribute cast to int when present,
     *             otherwise the index following the last processed row
     */
    private function getRowIndex(XMLReader $xmlReader): int
    {
        // Get "r" attribute if present (from something like <row r="3"...>
        $currentRowIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX);

        if (null === $currentRowIndex) {
            return $this->lastRowIndexProcessed + 1;
        }

        return (int) $currentRowIndex;
    }

    /**
     * @param XMLReader $xmlReader XMLReader object, positioned on a "<c>" node
     *
     * @return int Column index: derived from the "r" attribute when present,
     *             otherwise the index following the last processed column
     *
     * @throws InvalidArgumentException When the given cell index is invalid
     */
    private function getColumnIndex(XMLReader $xmlReader): int
    {
        // Get "r" attribute if present (from something like <c r="A1"...>
        $currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);

        if (null === $currentCellIndex) {
            return $this->lastColumnIndexProcessed + 1;
        }

        return CellHelper::getColumnIndexFromCellIndex($currentCellIndex);
    }
}