Proyectos de Subversion Moodle

Rev

| Última modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
 
3
namespace PhpXmlRpc\Helper;
4
 
5
use PhpXmlRpc\Exception\ValueErrorException;
6
use PhpXmlRpc\PhpXmlRpc;
7
use PhpXmlRpc\Traits\DeprecationLogger;
8
 
9
/**
10
 * @todo implement an interface
11
 */
12
class Charset
13
{
14
    use DeprecationLogger;
15
 
16
    // tables used for transcoding different charsets into us-ascii xml
17
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
18
 
19
    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());
20
 
21
    protected $charset_supersets = array(
22
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
23
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
24
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
25
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
26
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
27
    );
28
 
29
    /** @var Charset $instance */
30
    protected static $instance = null;
31
 
32
    /**
     * Returns the shared helper object, creating it on first use.
     *
     * This class is used as a singleton for performance reasons: the object is created once via `new static()`
     * and then cached in `self::$instance` for all later calls.
     *
     * @return Charset
     *
     * @todo should we just make $xml_iso88591_Entities a static variable instead ?
     */
    public static function instance()
    {
        // fast path: the instance was already created by an earlier call
        if (null !== self::$instance) {
            return self::$instance;
        }

        self::$instance = new static();

        return self::$instance;
    }
47
 
48
    /**
     * Kept non-public to force usage of the class as a singleton.
     */
    protected function __construct()
    {
        // nothing to set up
    }
54
 
55
    /**
     * Fills in, on demand, one of the tables used for transcoding. Calling it for a table which has already been
     * filled in does nothing.
     *
     * The only table supported so far is 'xml_iso88591_Entities': its 'in' list holds the single-byte characters
     * 0-31 and 160-255, its 'out' list the matching numeric character references ('&#0;' ... '&#255;').
     *
     * @param string $tableName
     * @return void
     *
     * @throws ValueErrorException for unsupported $tableName
     *
     * @todo add support for cp1252 as well as latin-2 .. latin-10
     *       Optimization creep: instead of building all those tables on load, keep them ready-made php files
     *       which are not even included until needed
     * @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
     *       Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
     *       (though no luck when receiving them...)
     *       Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
     *       IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
     *       "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes
     *       from ISO/IEC 6429." Check what mbstring/iconv do by default with those?
     */
    protected function buildConversionTable($tableName)
    {
        switch ($tableName) {
            case 'xml_iso88591_Entities':
                if (count($this->xml_iso88591_Entities['in']) > 0) {
                    // the table was filled in by a previous call
                    return;
                }

                // control characters first, then the upper half of latin-1; the order matters, as callers slice
                // the table by position
                /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                $codes = array_merge(range(0, 31), range(160, 255));
                foreach ($codes as $code) {
                    $this->xml_iso88591_Entities["in"][] = chr($code);
                    $this->xml_iso88591_Entities["out"][] = "&#{$code};";
                }
                break;

            /*case 'xml_cp1252_Entities':
                if (count($this->xml_cp1252_Entities['in'])) {
                    return;
                }
                for ($i = 128; $i < 160; $i++)
                {
                    $this->xml_cp1252_Entities['in'][] = chr($i);
                }
                $this->xml_cp1252_Entities['out'] = array(
                    '&#x20AC;', '?',        '&#x201A;', '&#x0192;',
                    '&#x201E;', '&#x2026;', '&#x2020;', '&#x2021;',
                    '&#x02C6;', '&#x2030;', '&#x0160;', '&#x2039;',
                    '&#x0152;', '?',        '&#x017D;', '?',
                    '?',        '&#x2018;', '&#x2019;', '&#x201C;',
                    '&#x201D;', '&#x2022;', '&#x2013;', '&#x2014;',
                    '&#x02DC;', '&#x2122;', '&#x0161;', '&#x203A;',
                    '&#x0153;', '?',        '&#x017E;', '&#x0178;'
                );
                $this->buildConversionTable('xml_iso88591_Entities');
                break;*/

            default:
                throw new ValueErrorException('Unsupported table: ' . $tableName);
        }
    }
117
 
118
    /**
     * Convert a string to the correct XML representation in a target charset.
     * This involves:
     * - character transformation for all characters which have a different representation in source and dest charsets
     * - using 'charset entity' representation for all characters which are outside the target charset
     *
     * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending
     * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars
     * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are
     * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.
     *
     * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit
     * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)
     *
     * Handling of malformed UTF-8 input (when converting from UTF-8 to US-ASCII or ISO-8859-1):
     * - a multi-byte sequence cut short by the end of the string is decoded as if the missing continuation bytes
     *   carried only zero bits; the string is never read past its end
     * - bytes which are neither ASCII nor a valid lead byte (stray continuation bytes, 0xF8 and above) are dropped
     * - continuation bytes are not validated
     *
     * @param string $data
     * @param string $srcEncoding when empty, PhpXmlRpc::$xmlrpc_internalencoding is used
     * @param string $destEncoding when empty, 'US-ASCII' is used
     * @return string the escaped string; an empty string when the conversion is not supported or fails (in which case
     *                an error is logged)
     *
     * @todo do a bit of basic benchmarking: strtr vs. str_replace, str_replace vs htmlspecialchars, hand-coded conversion
     *       vs mbstring when that is enabled
     * @todo make use of iconv when it is available and mbstring is not
     * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),
     *       but then take those into account as well in other methods, ie. isValidCharset)
     * @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not
     * @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159
     * @todo add support for escaping using CDATA sections? (add cdata start and end tokens, replace only ']]>' with ']]]]><![CDATA[>')
     */
    public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')
    {
        if ($srcEncoding == '') {
            // lame, but we know no better...
            $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;
        }

        if ($destEncoding == '') {
            $destEncoding = 'US-ASCII';
        }

        // in case there is transcoding going on, let's upscale to UTF8
        /// @todo we should do this as well when $srcEncoding == $destEncoding and the encoding is not supported by
        ///       htmlspecialchars
        if (!in_array($srcEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')) && $srcEncoding != $destEncoding &&
            function_exists('mb_convert_encoding')) {
            $data = mb_convert_encoding($data, 'UTF-8', str_replace('US-ASCII', 'ASCII', $srcEncoding));
            $srcEncoding = 'UTF-8';
        }

        $conversion = strtoupper($srcEncoding . '_' . $destEncoding);

        // list ordered with (expected) most common scenarios first
        switch ($conversion) {
            case 'UTF-8_UTF-8':
            case 'ISO-8859-1_ISO-8859-1':
            case 'US-ASCII_UTF-8':
            case 'US-ASCII_US-ASCII':
            case 'US-ASCII_ISO-8859-1':
            //case 'CP1252_CP1252':
                // no transcoding needed: only escape the characters which are special in xml
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                break;

            case 'UTF-8_US-ASCII':
            case 'UTF-8_ISO-8859-1':
                // NB: invalid UTF-8 is not rejected - see the method's documentation for how it is dealt with
                $escapedData = '';
                // be kind to users creating string xml-rpc values out of different php types
                $data = (string)$data;
                $ns = strlen($data);
                for ($nn = 0; $nn < $ns; $nn++) {
                    $ch = $data[$nn];
                    $ii = ord($ch);
                    // 7 bits in 1 byte: 0bbbbbbb (127)
                    if ($ii < 32) {
                        // control chars: escaped for ascii output, passed through as-is for latin-1 output
                        if ($conversion == 'UTF-8_US-ASCII') {
                            $escapedData .= sprintf('&#%d;', $ii);
                        } else {
                            $escapedData .= $ch;
                        }
                    } elseif ($ii < 128) {
                        /// @todo shall we replace this with a (supposedly) faster str_replace?
                        /// @todo to be 'print safe', should we encode as well character 127 (DEL) ?
                        switch ($ii) {
                            case 34:
                                $escapedData .= '&quot;';
                                break;
                            case 38:
                                $escapedData .= '&amp;';
                                break;
                            case 39:
                                $escapedData .= '&apos;';
                                break;
                            case 60:
                                $escapedData .= '&lt;';
                                break;
                            case 62:
                                $escapedData .= '&gt;';
                                break;
                            default:
                                $escapedData .= $ch;
                        } // switch
                    } else {
                        // multi-byte sequence: the lead byte tells how many continuation bytes follow, and carries
                        // the most significant bits of the code point
                        if ($ii >> 5 == 6) {
                            // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)
                            $codePoint = ($ii & 31);
                            $trailing = 1;
                        } elseif ($ii >> 4 == 14) {
                            // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
                            $codePoint = ($ii & 15);
                            $trailing = 2;
                        } elseif ($ii >> 3 == 30) {
                            // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
                            $codePoint = ($ii & 7);
                            $trailing = 3;
                        } else {
                            // not a valid lead byte (stray continuation byte, or 0xF8 and up): it produces no output
                            continue;
                        }

                        // each continuation byte contributes 6 more bits. Never index past the end of the string:
                        // doing so triggers an 'Uninitialized string offset' notice/warning; a missing byte counts
                        // as zero bits, which is the value such an out-of-range read used to decode to
                        for ($kk = 1; $kk <= $trailing; $kk++) {
                            $pos = $nn + $kk;
                            $bits = ($pos < $ns) ? (ord($data[$pos]) & 63) : 0;
                            $codePoint = ($codePoint * 64) + $bits;
                        }

                        $escapedData .= sprintf('&#%d;', $codePoint);
                        $nn += $trailing;
                    }
                }

                // when converting to latin-1, do not be so eager with using entities for characters 160-255
                if ($conversion == 'UTF-8_ISO-8859-1') {
                    $this->buildConversionTable('xml_iso88591_Entities');
                    // the first 32 table entries are the control chars 0-31: skip them, and swap the entities for
                    // chars 160-255 back to single latin-1 bytes
                    $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
                }
                break;

            case 'ISO-8859-1_UTF-8':
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo if on php >= 8.2, prefer using mbstring or iconv. Also: suppress the warning!
                if (function_exists('mb_convert_encoding')) {
                    $escapedData = mb_convert_encoding($escapedData, 'UTF-8', 'ISO-8859-1');
                } else {
                    $escapedData = utf8_encode($escapedData);
                }
                break;

            case 'ISO-8859-1_US-ASCII':
                $this->buildConversionTable('xml_iso88591_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                break;

            /*
            case 'CP1252_US-ASCII':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            case 'CP1252_UTF-8':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                $escapedData = utf8_encode($escapedData);
                break;
            case 'CP1252_ISO-8859-1':
                $this->buildConversionTable('xml_cp1252_Entities');
                $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data);
                // we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
                $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
                break;
            */

            default:
                if (function_exists('mb_convert_encoding')) {
                    // If reaching here, there are only 2 cases possible: UTF8->XXX or XXX->XXX
                    // If src is UTF8, we run htmlspecialchars before converting to the target charset, as
                    // htmlspecialchars has limited charset support, but it groks utf8
                    if ($srcEncoding === 'UTF-8') {
                        $data = htmlspecialchars($data,  defined('ENT_XML1') ? ENT_XML1 | ENT_QUOTES : ENT_QUOTES, 'UTF-8');
                    }
                    if ($srcEncoding !== $destEncoding) {
                        try {
                            // php 7.4 and lower: a warning is generated. php 8.0 and up: an Error is thrown. So much for BC...
                            $data = @mb_convert_encoding($data, str_replace('US-ASCII', 'ASCII', $destEncoding), str_replace('US-ASCII', 'ASCII', $srcEncoding));
                        } catch (\ValueError $e) {
                            // normalize the php 8 failure mode to the php 7 one
                            $data = false;
                        }
                    }
                    if ($data === false) {
                        $escapedData = '';
                        $this->getLogger()->error('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding via mbstring: failed...");
                    } else {
                        if ($srcEncoding === 'UTF-8') {
                            // escaping was already done, before the charset conversion
                            $escapedData = $data;
                        } else {
                            $escapedData = htmlspecialchars($data, defined('ENT_XML1') ? ENT_XML1 | ENT_QUOTES : ENT_QUOTES, $destEncoding);
                        }
                    }
                } else {
                    $escapedData = '';
                    $this->getLogger()->error('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");
                }
        }

        return $escapedData;
    }
325
 
326
    /**
     * Lists the names of the known charsets: the three always-available ones plus, when the mbstring extension is
     * loaded, the encodings it reports, minus a fixed list of non-charset pseudo-encodings.
     *
     * @return string[]
     */
    public function knownCharsets()
    {
        $builtIn = array('UTF-8', 'ISO-8859-1', 'US-ASCII');

        if (!function_exists('mb_list_encodings')) {
            return $builtIn;
        }

        // Names which mbstring handles, but which are junk not found in the IANA registry at
        // http://www.iana.org/assignments/character-sets/character-sets.xhtml
        $notInRegistry = array(
            'pass', 'auto', 'wchar', 'BASE64', 'UUENCODE', 'ASCII', 'HTML-ENTITIES', 'Quoted-Printable',
            '7bit','8bit', 'byte2be', 'byte2le', 'byte4be', 'byte4le'
        );
        $fromMbstring = array_diff(mb_list_encodings(), $notInRegistry);

        return array_unique(array_merge($builtIn, $fromMbstring));
    }
342
 
343
    // *** BC layer ***
344
 
345
    /**
     * Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding
     * in the list.
     * Charset names are compared case-insensitively, and whitespace around them is ignored (so that lists such as
     * 'UTF-8, ISO-8859-1' work as expected).
     * @deprecated kept around for BC, as it is not in use by the lib
     *
     * @param string $encoding charset to be tested
     * @param string|array $validList comma separated list of valid charsets (or array of charsets)
     * @return bool
     */
    public function isValidCharset($encoding, $validList)
    {
        $this->logDeprecation('Method ' . __METHOD__ . ' is deprecated');

        if (is_string($validList)) {
            $validList = explode(',', $validList);
        }

        // Normalize both sides once, so that the direct match and the superset lookup below agree with each other:
        // the keys and values of $this->charset_supersets are all in upper case
        $encoding = strtoupper(trim($encoding));
        $validList = array_map('strtoupper', array_map('trim', $validList));

        if (in_array($encoding, $validList, true)) {
            return true;
        }

        // no direct match: accept the encoding if any of the allowed charsets is a superset of it
        if (array_key_exists($encoding, $this->charset_supersets)) {
            foreach ($validList as $allowed) {
                if (in_array($allowed, $this->charset_supersets[$encoding], true)) {
                    return true;
                }
            }
        }

        return false;
    }
375
 
376
    /**
     * Used only for backwards compatibility (the .inc shims).
     * @deprecated
     *
     * @param string $charset only 'iso88591' is supported
     * @return array the conversion table, with keys 'in' (characters) and 'out' (the matching xml entities)
     * @throws ValueErrorException for unknown/unsupported charsets
     */
    public function getEntities($charset)
    {
        $this->logDeprecation('Method ' . __METHOD__ . ' is deprecated');

        switch ($charset)
        {
            case 'iso88591':
                // the table is filled in lazily: make sure it is populated before handing it out, otherwise the
                // caller gets empty 'in' and 'out' lists unless a latin-1 conversion happened to run earlier
                $this->buildConversionTable('xml_iso88591_Entities');
                return $this->xml_iso88591_Entities;
            default:
                throw new ValueErrorException('Unsupported charset: ' . $charset);
        }
    }
396
}