1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
declare(strict_types=1);
|
|
|
4 |
|
|
|
5 |
namespace OpenSpout\Common\Helper;
|
|
|
6 |
|
|
|
7 |
use Error;
|
|
|
8 |
use OpenSpout\Common\Exception\EncodingConversionException;
|
|
|
9 |
|
|
|
10 |
/**
 * Detects byte order marks (BOM) at the start of a stream and converts
 * strings between encodings, using iconv when it is available and mbstring otherwise.
 *
 * @internal
 */
final class EncodingHelper
{
    /**
     * Definition of the encodings that can have a BOM.
     */
    public const ENCODING_UTF8 = 'UTF-8';
    public const ENCODING_UTF16_LE = 'UTF-16LE';
    public const ENCODING_UTF16_BE = 'UTF-16BE';
    public const ENCODING_UTF32_LE = 'UTF-32LE';
    public const ENCODING_UTF32_BE = 'UTF-32BE';

    /**
     * Definition of the BOMs for the different encodings.
     */
    public const BOM_UTF8 = "\xEF\xBB\xBF";
    public const BOM_UTF16_LE = "\xFF\xFE";
    public const BOM_UTF16_BE = "\xFE\xFF";
    public const BOM_UTF32_LE = "\xFF\xFE\x00\x00";
    public const BOM_UTF32_BE = "\x00\x00\xFE\xFF";

    /**
     * Map representing the encodings supporting BOMs (key) and their associated BOM (value).
     *
     * @var array<string, string>
     */
    private const SUPPORTED_ENCODINGS_WITH_BOM = [
        self::ENCODING_UTF8 => self::BOM_UTF8,
        self::ENCODING_UTF16_LE => self::BOM_UTF16_LE,
        self::ENCODING_UTF16_BE => self::BOM_UTF16_BE,
        self::ENCODING_UTF32_LE => self::BOM_UTF32_LE,
        self::ENCODING_UTF32_BE => self::BOM_UTF32_BE,
    ];

    /**
     * @param bool $canUseIconv    Whether conversions may be delegated to iconv (takes precedence over mbstring)
     * @param bool $canUseMbString Whether conversions may be delegated to mbstring when iconv is not usable
     */
    public function __construct(
        private readonly bool $canUseIconv,
        private readonly bool $canUseMbString,
    ) {
    }

    /**
     * Creates a helper whose capabilities reflect the conversion functions
     * ("iconv", "mb_convert_encoding") that exist in the running PHP process.
     */
    public static function factory(): self
    {
        return new self(
            \function_exists('iconv'),
            \function_exists('mb_convert_encoding'),
        );
    }

    /**
     * Returns the number of bytes to use as offset in order to skip the BOM.
     *
     * The file pointer is rewound and then read from while checking for the BOM,
     * so its position is not preserved by this call.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string   $encoding    Encoding of the file to check
     *
     * @return int Bytes offset to apply to skip the BOM (0 means no BOM)
     */
    public function getBytesOffsetToSkipBOM($filePointer, string $encoding): int
    {
        if (!$this->hasBOM($filePointer, $encoding)) {
            return 0;
        }

        // hasBOM() only returns true for encodings present in the map,
        // so the lookup is safe; we skip the N first bytes
        return \strlen(self::SUPPORTED_ENCODINGS_WITH_BOM[$encoding]);
    }

    /**
     * Attempts to convert a non UTF-8 string into UTF-8.
     *
     * @param null|string $string         Non UTF-8 string to be converted; null is returned unchanged
     * @param string      $sourceEncoding The encoding used to encode the source string
     *
     * @return null|string The converted, UTF-8 string (null only when null was given)
     *
     * @throws EncodingConversionException If conversion is not supported or if the conversion failed
     */
    public function attemptConversionToUTF8(?string $string, string $sourceEncoding): ?string
    {
        return $this->attemptConversion($string, $sourceEncoding, self::ENCODING_UTF8);
    }

    /**
     * Attempts to convert a UTF-8 string into the given encoding.
     *
     * @param null|string $string         UTF-8 string to be converted; null is returned unchanged
     * @param string      $targetEncoding The encoding the string should be re-encoded into
     *
     * @return null|string The converted string, encoded with the given encoding (null only when null was given)
     *
     * @throws EncodingConversionException If conversion is not supported or if the conversion failed
     */
    public function attemptConversionFromUTF8(?string $string, string $targetEncoding): ?string
    {
        return $this->attemptConversion($string, self::ENCODING_UTF8, $targetEncoding);
    }

    /**
     * Returns whether the file identified by the given pointer has a BOM.
     *
     * The pointer is always rewound first. For an encoding that supports a BOM,
     * the leading bytes are then read, which moves the pointer forward.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string   $encoding    Encoding of the file to check
     *
     * @return bool TRUE if the file has a BOM, FALSE otherwise
     */
    private function hasBOM($filePointer, string $encoding): bool
    {
        rewind($filePointer);

        $potentialBom = self::SUPPORTED_ENCODINGS_WITH_BOM[$encoding] ?? null;
        if (null === $potentialBom) {
            return false;
        }

        // fgets() stops at a line ending, but none of the known BOMs contains
        // a line-ending byte, so a real BOM is always read in full.
        return fgets($filePointer, \strlen($potentialBom) + 1) === $potentialBom;
    }

    /**
     * Attempts to convert the given string to the given encoding.
     * Depending on what is installed on the server, we will try to iconv or mbstring.
     *
     * @param null|string $string         String to be converted; null is returned unchanged
     * @param string      $sourceEncoding The encoding used to encode the source string
     * @param string      $targetEncoding The encoding the string should be re-encoded into
     *
     * @return null|string The converted string, encoded with the given encoding (null only when null was given)
     *
     * @throws EncodingConversionException If conversion is not supported or if the conversion failed
     */
    private function attemptConversion(?string $string, string $sourceEncoding, string $targetEncoding): ?string
    {
        // nothing to convert, or source and target encodings are the same: it's a no-op
        if (null === $string || $sourceEncoding === $targetEncoding) {
            return $string;
        }

        if ($this->canUseIconv) {
            $convertedString = $this->convertWithIconv($string, $sourceEncoding, $targetEncoding);
        } elseif ($this->canUseMbString) {
            $convertedString = $this->convertWithMbString($string, $sourceEncoding, $targetEncoding);
        } else {
            throw new EncodingConversionException("The conversion from {$sourceEncoding} to {$targetEncoding} is not supported. Please install \"iconv\" or \"mbstring\".");
        }

        if (false === $convertedString) {
            throw new EncodingConversionException("The conversion from {$sourceEncoding} to {$targetEncoding} failed.");
        }

        return $convertedString;
    }

    /**
     * Converts the string with iconv, returning FALSE instead of emitting PHP errors on failure.
     */
    private function convertWithIconv(string $string, string $sourceEncoding, string $targetEncoding): string|false
    {
        // Silence the PHP errors iconv raises; failure is detected through its FALSE return value.
        set_error_handler(static fn (): bool => true);

        try {
            return iconv($sourceEncoding, $targetEncoding, $string);
        } finally {
            // Always put the previous handler back, whatever happened above.
            restore_error_handler();
        }
    }

    /**
     * Converts the string with mbstring, returning FALSE when it raised a PHP error or threw an Error.
     */
    private function convertWithMbString(string $string, string $sourceEncoding, string $targetEncoding): string|false
    {
        $errorMessage = null;
        set_error_handler(static function (int $nr, string $message) use (&$errorMessage): bool {
            $errorMessage = $message; // @codeCoverageIgnore

            return true; // @codeCoverageIgnore
        });

        try {
            $convertedString = mb_convert_encoding($string, $targetEncoding, $sourceEncoding);
        } catch (Error $error) {
            $errorMessage = $error->getMessage();
            $convertedString = false;
        } finally {
            // Always put the previous handler back, whatever happened above.
            restore_error_handler();
        }

        // Any reported error means the result cannot be trusted.
        return null !== $errorMessage ? false : $convertedString;
    }
}
|