Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
 
3
declare(strict_types=1);
4
 
5
namespace OpenSpout\Reader\XLSX\Manager;
6
 
7
use DOMElement;
8
use OpenSpout\Common\Exception\IOException;
9
use OpenSpout\Reader\Exception\SharedStringNotFoundException;
10
use OpenSpout\Reader\Exception\XMLProcessingException;
11
use OpenSpout\Reader\Wrapper\XMLReader;
12
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactoryInterface;
13
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
14
use OpenSpout\Reader\XLSX\Options;
15
 
16
/**
 * Reads the shared strings XML file of an XLSX archive and serves the strings by index,
 * delegating storage to a caching strategy chosen from the announced number of strings.
 *
 * @internal
 */
final class SharedStringsManager
{
    /**
     * Definition of XML nodes names used to parse data.
     */
    public const XML_NODE_SST = 'sst';
    public const XML_NODE_SI = 'si';
    public const XML_NODE_R = 'r';
    public const XML_NODE_T = 't';

    /**
     * Definition of XML attributes used to parse data.
     */
    public const XML_ATTRIBUTE_COUNT = 'count';
    public const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    public const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    public const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /**
     * @var CachingStrategyInterface The caching strategy storing the shared strings.
     *                               Only initialized once extractSharedStrings() has selected one.
     */
    private CachingStrategyInterface $cachingStrategy;

    /**
     * @param string                          $filePath                     Path of the XLSX file being read
     * @param Options                         $options                      Reader options (provides the temp folder handed to the caching strategy factory)
     * @param WorkbookRelationshipsManager    $workbookRelationshipsManager Helps retrieving workbook relationships
     * @param CachingStrategyFactoryInterface $cachingStrategyFactory       Factory to create shared strings caching strategies
     */
    public function __construct(
        private readonly string $filePath,
        private readonly Options $options,
        private readonly WorkbookRelationshipsManager $workbookRelationshipsManager,
        private readonly CachingStrategyFactoryInterface $cachingStrategyFactory,
    ) {}

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     */
    public function hasSharedStrings(): bool
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads every "<si>" item of the shared strings XML file and stores its text in the
     * selected caching strategy, keyed by the item's zero-based position in the file.
     * The sheet data later refers to these strings through that index.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * the file is streamed through an XML reader instead of being loaded at once.
     *
     * @throws IOException If shared strings XML file can't be opened, or is invalid and can't be read
     */
    public function extractSharedStrings(): void
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = new XMLReader();

        if (false === $xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath)) {
            throw new IOException('Could not open "'.$sharedStringsXMLFilePath.'".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            $sharedStringIndex = 0;
            while (self::XML_NODE_SI === $xmlReader->getCurrentNodeName()) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                ++$sharedStringIndex;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception attached so the root cause stays visible in traces.
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Release the reader on every exit path, not only on success.
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * The caching strategy is only set by extractSharedStrings(), which must therefore run first.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     *
     * @return string The shared string at the given index
     *
     * @throws SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex(int $sharedStringIndex): string
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Clears the cache of the caching strategy, if one has been selected.
     * Does nothing when no shared strings were extracted.
     */
    public function cleanup(): void
    {
        if (isset($this->cachingStrategy)) {
            $this->cachingStrategy->clearCache();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * @param XMLReader $xmlReader XMLReader instance
     *
     * @return null|int Number of unique shared strings announced by the file,
     *                  or NULL when neither "uniqueCount" nor "count" is present
     */
    private function getSharedStringsUniqueCount(XMLReader $xmlReader): ?int
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while (self::XML_NODE_SST === $xmlReader->getCurrentNodeName() && XMLReader::ELEMENT !== $xmlReader->nodeType) {
            $xmlReader->read();
        }

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT)
            ?? $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);

        return (null !== $uniqueCount) ? (int) $uniqueCount : null;
    }

    /**
     * Asks the factory for the caching strategy best suited to the given number of strings.
     *
     * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     */
    private function getBestSharedStringsCachingStrategy(?int $sharedStringsUniqueCount): CachingStrategyInterface
    {
        return $this->cachingStrategyFactory
            ->createBestCachingStrategy($sharedStringsUniqueCount, $this->options->getTempFolder())
        ;
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on:
     * concatenates the values of its relevant "<t>" descendants and stores the result.
     *
     * @param XMLReader $xmlReader         XML Reader positioned on a "<si>" node
     * @param int       $sharedStringIndex Index of the processed shared strings item
     */
    private function processSharedStringsItem(XMLReader $xmlReader, int $sharedStringIndex): void
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        \assert($siNode instanceof DOMElement);
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if (!$this->shouldExtractTextNodeValue($textNode)) {
                continue;
            }

            $textNodeValue = $textNode->nodeValue;
            \assert(null !== $textNodeValue);

            // Surrounding whitespace is only kept when the node explicitly asks for it.
            $sharedStringValue .= $this->shouldPreserveWhitespace($textNode)
                ? $textNodeValue
                : trim($textNodeValue);
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param DOMElement $textNode Text node to check
     *
     * @return bool Whether the given text node's value must be extracted
     */
    private function shouldExtractTextNodeValue(DOMElement $textNode): bool
    {
        $parentNode = $textNode->parentNode;
        \assert(null !== $parentNode);
        $parentTagName = $parentNode->localName;

        return self::XML_NODE_SI === $parentTagName || self::XML_NODE_R === $parentTagName;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     *
     * @return bool Whether whitespace should be preserved
     */
    private function shouldPreserveWhitespace(DOMElement $textNode): bool
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return self::XML_ATTRIBUTE_VALUE_PRESERVE === $spaceValue;
    }
}