1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
declare(strict_types=1);
|
|
|
4 |
|
|
|
5 |
namespace OpenSpout\Reader\XLSX\Manager;
|
|
|
6 |
|
|
|
7 |
use DOMElement;
|
|
|
8 |
use OpenSpout\Common\Exception\IOException;
|
|
|
9 |
use OpenSpout\Reader\Exception\SharedStringNotFoundException;
|
|
|
10 |
use OpenSpout\Reader\Exception\XMLProcessingException;
|
|
|
11 |
use OpenSpout\Reader\Wrapper\XMLReader;
|
|
|
12 |
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactoryInterface;
|
|
|
13 |
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
|
|
|
14 |
use OpenSpout\Reader\XLSX\Options;
|
|
|
15 |
|
|
|
16 |
/**
 * Extracts the shared strings of an XLSX file and serves them by index.
 *
 * The strings are read once from the shared strings XML file (streamed from
 * inside the XLSX archive) and handed to a caching strategy, which is later
 * queried through getStringAtIndex().
 *
 * @internal
 */
final class SharedStringsManager
{
    /**
     * Definition of XML nodes names used to parse data.
     */
    public const XML_NODE_SST = 'sst';
    public const XML_NODE_SI = 'si';
    public const XML_NODE_R = 'r';
    public const XML_NODE_T = 't';

    /**
     * Definition of XML attributes used to parse data.
     */
    public const XML_ATTRIBUTE_COUNT = 'count';
    public const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    public const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    public const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /**
     * @var CachingStrategyInterface The caching strategy chosen for storing shared strings.
     *                               Only initialized once extractSharedStrings() has run.
     */
    private CachingStrategyInterface $cachingStrategy;

    /**
     * @param string                          $filePath                     Path of the XLSX file being read
     * @param Options                         $options                      Reader options (provides the temp folder)
     * @param WorkbookRelationshipsManager    $workbookRelationshipsManager Helps retrieving workbook relationships
     * @param CachingStrategyFactoryInterface $cachingStrategyFactory       Factory to create shared strings caching strategies
     */
    public function __construct(
        private readonly string $filePath,
        private readonly Options $options,
        private readonly WorkbookRelationshipsManager $workbookRelationshipsManager,
        private readonly CachingStrategyFactoryInterface $cachingStrategyFactory
    ) {
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     */
    public function hasSharedStrings(): bool
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads every shared string of the workbook and stores it, keyed by its
     * zero-based position, in the selected caching strategy.
     * All the strings are stored in a XML file, usually located at 'xl/sharedStrings.xml'.
     * They are then accessed by the sheet data, via the string index.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     *
     * @throws IOException If shared strings XML file can't be read
     */
    public function extractSharedStrings(): void
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = new XMLReader();

        if (false === $xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath)) {
            throw new IOException('Could not open "'.$sharedStringsXMLFilePath.'".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            $sharedStringIndex = 0;
            while (self::XML_NODE_SI === $xmlReader->getCurrentNodeName()) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                ++$sharedStringIndex;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception attached so the root cause is not lost.
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Release the reader on both the success and the failure path.
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     *
     * @return string The shared string at the given index
     *
     * @throws SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex(int $sharedStringIndex): string
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Clears the cache of the caching strategy, if one has been created.
     * Does nothing when extractSharedStrings() never selected a strategy.
     */
    public function cleanup(): void
    {
        if (isset($this->cachingStrategy)) {
            $this->cachingStrategy->clearCache();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * NOTE(review): the reader calls below presumably raise XMLProcessingException
     * on malformed XML (the caller catches that type) - confirm against XMLReader.
     *
     * @param XMLReader $xmlReader XMLReader instance
     *
     * @return null|int Number of unique shared strings in the sharedStrings.xml file,
     *                  or NULL when neither "uniqueCount" nor "count" is present
     */
    private function getSharedStringsUniqueCount(XMLReader $xmlReader): ?int
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while (self::XML_NODE_SST === $xmlReader->getCurrentNodeName() && XMLReader::ELEMENT !== $xmlReader->nodeType) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if (null === $uniqueCount) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return (null !== $uniqueCount) ? (int) $uniqueCount : null;
    }

    /**
     * Asks the factory for the caching strategy best suited to the given
     * number of shared strings, using the temp folder from the reader options.
     *
     * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     */
    private function getBestSharedStringsCachingStrategy(?int $sharedStringsUniqueCount): CachingStrategyInterface
    {
        return $this->cachingStrategyFactory
            ->createBestCachingStrategy($sharedStringsUniqueCount, $this->options->getTempFolder())
        ;
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on:
     * concatenates the values of its relevant "<t>" descendants and stores the result
     * in the caching strategy under the given index.
     *
     * @param XMLReader $xmlReader         XML Reader positioned on a "<si>" node
     * @param int       $sharedStringIndex Index of the processed shared strings item
     */
    private function processSharedStringsItem(XMLReader $xmlReader, int $sharedStringIndex): void
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        \assert($siNode instanceof DOMElement);
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if (!$this->shouldExtractTextNodeValue($textNode)) {
                continue;
            }

            $textNodeValue = $textNode->nodeValue;
            \assert(null !== $textNodeValue);

            // Whitespace is only kept when the node explicitly asks for it.
            $sharedStringValue .= $this->shouldPreserveWhitespace($textNode)
                ? $textNodeValue
                : trim($textNodeValue);
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param DOMElement $textNode Text node to check
     *
     * @return bool Whether the given text node's value must be extracted
     */
    private function shouldExtractTextNodeValue(DOMElement $textNode): bool
    {
        $parentNode = $textNode->parentNode;
        \assert(null !== $parentNode);
        $parentTagName = $parentNode->localName;

        return self::XML_NODE_SI === $parentTagName || self::XML_NODE_R === $parentTagName;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     *
     * @return bool Whether whitespace should be preserved
     */
    private function shouldPreserveWhitespace(DOMElement $textNode): bool
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return self::XML_ATTRIBUTE_VALUE_PRESERVE === $spaceValue;
    }
}
|