WebSVN – Moodle – Autoría – /lib/phpxmlrpc/Helper/Charset.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`namespace PhpXmlRpc\Helper;`
		4
		5	`use PhpXmlRpc\Exception\ValueErrorException;`
		6	`use PhpXmlRpc\PhpXmlRpc;`
		7	`use PhpXmlRpc\Traits\DeprecationLogger;`
		8
		9	`/**`
		10	`* @todo implement an interface`
		11	`*/`
		12	`class Charset`
		13	`{`
		14	`use DeprecationLogger;`
		15
    // tables used for transcoding different charsets into us-ascii xml
    // 'in' holds raw single-byte characters, 'out' holds the numeric character reference for the character found at
    // the same index in 'in'. Both lists start out empty and are filled on demand by buildConversionTable()
    protected $xml_iso88591_Entities = array("in" => array(), "out" => array());

    //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());

    // For each charset used as key, the list of charsets accepted in its place by isValidCharset()
    // NOTE(review): 'EUC-' looks like a truncated charset name (possibly 'EUC-TW') - confirm before relying on it
    protected $charset_supersets = array(
        'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8',
            'ISO-8859-9', 'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-12',
            'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'UTF-8',
            'EUC-JP', 'EUC-', 'EUC-KR', 'EUC-CN',),
    );

    /** @var Charset $instance the shared object handed out by instance(); null until the first call */
    protected static $instance = null;
		31
		32	`/**`
		33	`* This class is singleton for performance reasons.`
		34	`*`
		35	`* @return Charset`
		36	`*`
		37	`* @todo should we just make $xml_iso88591_Entities a static variable instead ?`
		38	`*/`
		39	`public static function instance()`
		40	`{`
		41	`if (self::$instance === null) {`
		42	`self::$instance = new static();`
		43	`}`
		44
		45	`return self::$instance;`
		46	`}`
		47
		48	`/**`
		49	`* Force usage as singleton.`
		50	`*/`
		51	`protected function __construct()`
		52	`{`
		53	`}`
		54
		55	`/**`
		56	`* @param string $tableName`
		57	`* @return void`
		58	`*`
		59	`* @throws ValueErrorException for unsupported $tableName`
		60	`*`
		61	`* @todo add support for cp1252 as well as latin-2 .. latin-10`
		62	`* Optimization creep: instead of building all those tables on load, keep them ready-made php files`
		63	`* which are not even included until needed`
		64	`* @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?`
		65	`* Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk`
		66	`* (though no luck when receiving them...)`
		67	`* Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,`
		68	`* IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:`
		69	`* "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes`
		70	`* from ISO/IEC 6429." Check what mbstring/iconv do by default with those?`
		71	`*/`
		72	`protected function buildConversionTable($tableName)`
		73	`{`
		74	`switch ($tableName) {`
		75	`case 'xml_iso88591_Entities':`
		76	`if (count($this->xml_iso88591_Entities['in'])) {`
		77	`return;`
		78	`}`
		79	`for ($i = 0; $i < 32; $i++) {`
		80	`$this->xml_iso88591_Entities["in"][] = chr($i);`
		81	`$this->xml_iso88591_Entities["out"][] = "&#{$i};";`
		82	`}`
		83
		84	`/// @todo to be 'print safe', should we encode as well character 127 (DEL) ?`
		85
		86	`for ($i = 160; $i < 256; $i++) {`
		87	`$this->xml_iso88591_Entities["in"][] = chr($i);`
		88	`$this->xml_iso88591_Entities["out"][] = "&#{$i};";`
		89	`}`
		90	`break;`
		91
		92	`/*case 'xml_cp1252_Entities':`
		93	`if (count($this->xml_cp1252_Entities['in'])) {`
		94	`return;`
		95	`}`
		96	`for ($i = 128; $i < 160; $i++)`
		97	`{`
		98	`$this->xml_cp1252_Entities['in'][] = chr($i);`
		99	`}`
		100	`$this->xml_cp1252_Entities['out'] = array(`
		101	`'€', '?', '‚', 'ƒ',`
		102	`'„', '…', '†', '‡',`
		103	`'ˆ', '‰', 'Š', '‹',`
		104	`'Œ', '?', 'Ž', '?',`
		105	`'?', '‘', '’', '“',`
		106	`'”', '•', '–', '—',`
		107	`'˜', '™', 'š', '›',`
		108	`'œ', '?', 'ž', 'Ÿ'`
		109	`);`
		110	`$this->buildConversionTable('xml_iso88591_Entities');`
		111	`break;*/`
		112
		113	`default:`
		114	`throw new ValueErrorException('Unsupported table: ' . $tableName);`
		115	`}`
		116	`}`
		117
		118	`/**`
		119	`* Convert a string to the correct XML representation in a target charset.`
		120	`* This involves:`
		121	`* - character transformation for all characters which have a different representation in source and dest charsets`
		122	`* - using 'charset entity' representation for all characters which are outside the target charset`
		123	`*`
		124	`* To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending`
		125	`* requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars`
		126	`* present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are`
		127	`* independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them.`
		128	`*`
		129	`* Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit`
		130	`* strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...)`
		131	`*`
		132	`* @param string $data`
		133	`* @param string $srcEncoding`
		134	`* @param string $destEncoding`
		135	`* @return string`
		136	`*`
		137	`* @todo do a bit of basic benchmarking: strtr vs. str_replace, str_replace vs htmlspecialchars, hand-coded conversion`
		138	`* vs mbstring when that is enabled`
		139	`* @todo make use of iconv when it is available and mbstring is not`
		140	`* @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list),`
		141	`* but then take those into account as well in other methods, ie. isValidCharset)`
		142	`* @todo when converting to ASCII, allow to choose whether to escape the range 0-31,127 (non-print chars) or not`
		143	`* @todo allow picking different strategies to deal w. invalid chars? eg. source in latin-1 and chars 128-159`
		144	`* @todo add support for escaping using CDATA sections? (add cdata start and end tokens, replace only ']]>' with ']]]]><![CDATA[>')`
		145	`*/`
		146	`public function encodeEntities($data, $srcEncoding = '', $destEncoding = '')`
		147	`{`
		148	`if ($srcEncoding == '') {`
		149	`// lame, but we know no better...`
		150	`$srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding;`
		151	`}`
		152
		153	`if ($destEncoding == '') {`
		154	`$destEncoding = 'US-ASCII';`
		155	`}`
		156
		157	`// in case there is transcoding going on, let's upscale to UTF8`
		158	`/// @todo we should do this as well when $srcEncoding == $destEncoding and the encoding is not supported by`
		159	`/// htmlspecialchars`
		160	`if (!in_array($srcEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')) && $srcEncoding != $destEncoding &&`
		161	`function_exists('mb_convert_encoding')) {`
		162	`$data = mb_convert_encoding($data, 'UTF-8', str_replace('US-ASCII', 'ASCII', $srcEncoding));`
		163	`$srcEncoding = 'UTF-8';`
		164	`}`
		165
		166	`$conversion = strtoupper($srcEncoding . '_' . $destEncoding);`
		167
		168	`// list ordered with (expected) most common scenarios first`
		169	`switch ($conversion) {`
		170	`case 'UTF-8_UTF-8':`
		171	`case 'ISO-8859-1_ISO-8859-1':`
		172	`case 'US-ASCII_UTF-8':`
		173	`case 'US-ASCII_US-ASCII':`
		174	`case 'US-ASCII_ISO-8859-1':`
		175	`//case 'CP1252_CP1252':`
		176	`$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);`
		177	`break;`
		178
		179	`case 'UTF-8_US-ASCII':`
		180	`case 'UTF-8_ISO-8859-1':`
		181	`// NB: this will choke on invalid UTF-8, going most likely beyond EOF`
		182	`$escapedData = '';`
		183	`// be kind to users creating string xml-rpc values out of different php types`
		184	`$data = (string)$data;`
		185	`$ns = strlen($data);`
		186	`for ($nn = 0; $nn < $ns; $nn++) {`
		187	`$ch = $data[$nn];`
		188	`$ii = ord($ch);`
		189	`// 7 bits in 1 byte: 0bbbbbbb (127)`
		190	`if ($ii < 32) {`
		191	`if ($conversion == 'UTF-8_US-ASCII') {`
		192	`$escapedData .= sprintf('&#%d;', $ii);`
		193	`} else {`
		194	`$escapedData .= $ch;`
		195	`}`
		196	`}`
		197	`else if ($ii < 128) {`
		198	`/// @todo shall we replace this with a (supposedly) faster str_replace?`
		199	`/// @todo to be 'print safe', should we encode as well character 127 (DEL) ?`
		200	`switch ($ii) {`
		201	`case 34:`
		202	`$escapedData .= '"';`
		203	`break;`
		204	`case 38:`
		205	`$escapedData .= '&';`
		206	`break;`
		207	`case 39:`
		208	`$escapedData .= ''';`
		209	`break;`
		210	`case 60:`
		211	`$escapedData .= '<';`
		212	`break;`
		213	`case 62:`
		214	`$escapedData .= '>';`
		215	`break;`
		216	`default:`
		217	`$escapedData .= $ch;`
		218	`} // switch`
		219	`} // 11 bits in 2 bytes: 110bbbbb 10bbbbbb (2047)`
		220	`elseif ($ii >> 5 == 6) {`
		221	`$b1 = ($ii & 31);`
		222	`$b2 = (ord($data[$nn + 1]) & 63);`
		223	`$ii = ($b1 * 64) + $b2;`
		224	`$escapedData .= sprintf('&#%d;', $ii);`
		225	`$nn += 1;`
		226	`} // 16 bits in 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb`
		227	`elseif ($ii >> 4 == 14) {`
		228	`$b1 = ($ii & 15);`
		229	`$b2 = (ord($data[$nn + 1]) & 63);`
		230	`$b3 = (ord($data[$nn + 2]) & 63);`
		231	`$ii = ((($b1 * 64) + $b2) * 64) + $b3;`
		232	`$escapedData .= sprintf('&#%d;', $ii);`
		233	`$nn += 2;`
		234	`} // 21 bits in 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb`
		235	`elseif ($ii >> 3 == 30) {`
		236	`$b1 = ($ii & 7);`
		237	`$b2 = (ord($data[$nn + 1]) & 63);`
		238	`$b3 = (ord($data[$nn + 2]) & 63);`
		239	`$b4 = (ord($data[$nn + 3]) & 63);`
		240	`$ii = ((((($b1 * 64) + $b2) * 64) + $b3) * 64) + $b4;`
		241	`$escapedData .= sprintf('&#%d;', $ii);`
		242	`$nn += 3;`
		243	`}`
		244	`}`
		245
		246	`// when converting to latin-1, do not be so eager with using entities for characters 160-255`
		247	`if ($conversion == 'UTF-8_ISO-8859-1') {`
		248	`$this->buildConversionTable('xml_iso88591_Entities');`
		249	`$escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);`
		250	`}`
		251	`break;`
		252
		253	`case 'ISO-8859-1_UTF-8':`
		254	`$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);`
		255	`/// @todo if on php >= 8.2, prefer using mbstring or iconv. Also: suppress the warning!`
		256	`if (function_exists('mb_convert_encoding')) {`
		257	`$escapedData = mb_convert_encoding($escapedData, 'UTF-8', 'ISO-8859-1');`
		258	`} else {`
		259	`$escapedData = utf8_encode($escapedData);`
		260	`}`
		261	`break;`
		262
		263	`case 'ISO-8859-1_US-ASCII':`
		264	`$this->buildConversionTable('xml_iso88591_Entities');`
		265	`$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);`
		266	`$escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);`
		267	`break;`
		268
		269	`/*`
		270	`case 'CP1252_US-ASCII':`
		271	`$this->buildConversionTable('xml_cp1252_Entities');`
		272	`$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);`
		273	`$escapedData = str_replace($this->xml_iso88591_Entities']['in'], $this->xml_iso88591_Entities['out'], $escapedData);`
		274	`$escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);`
		275	`break;`
		276	`case 'CP1252_UTF-8':`
		277	`$this->buildConversionTable('xml_cp1252_Entities');`
		278	`$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);`
		279	`/// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all alone will NOT convert them)`
		280	`$escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);`
		281	`$escapedData = utf8_encode($escapedData);`
		282	`break;`
		283	`case 'CP1252_ISO-8859-1':`
		284	`$this->buildConversionTable('xml_cp1252_Entities');`
		285	`$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);`
		286	`// we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...`
		287	`$escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);`
		288	`break;`
		289	`*/`
		290
		291	`default:`
		292	`if (function_exists('mb_convert_encoding')) {`
		293	`// If reaching where, there are only 2 cases possible: UTF8->XXX or XXX->XXX`
		294	`// If src is UTF8, we run htmlspecialchars before converting to the target charset, as`
		295	`// htmlspecialchars has limited charset support, but it groks utf8`
		296	`if ($srcEncoding === 'UTF-8') {`
		297	`$data = htmlspecialchars($data, defined('ENT_XML1') ? ENT_XML1 \| ENT_QUOTES : ENT_QUOTES, 'UTF-8');`
		298	`}`
		299	`if ($srcEncoding !== $destEncoding) {`
		300	`try {`
		301	`// php 7.4 and lower: a warning is generated. php 8.0 and up: an Error is thrown. So much for BC...`
		302	`$data = @mb_convert_encoding($data, str_replace('US-ASCII', 'ASCII', $destEncoding), str_replace('US-ASCII', 'ASCII', $srcEncoding));`
		303	`} catch (\ValueError $e) {`
		304	`$data = false;`
		305	`}`
		306	`}`
		307	`if ($data === false) {`
		308	`$escapedData = '';`
		309	`$this->getLogger()->error('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding via mbstring: failed...");`
		310	`} else {`
		311	`if ($srcEncoding === 'UTF-8') {`
		312	`$escapedData = $data;`
		313	`} else {`
		314	`$escapedData = htmlspecialchars($data, defined('ENT_XML1') ? ENT_XML1 \| ENT_QUOTES : ENT_QUOTES, $destEncoding);`
		315	`}`
		316	`}`
		317	`} else {`
		318	`$escapedData = '';`
		319	`$this->getLogger()->error('XML-RPC: ' . __METHOD__ . ": Converting from $srcEncoding to $destEncoding: not supported...");`
		320	`}`
		321	`}`
		322
		323	`return $escapedData;`
		324	`}`
		325
		326	`/**`
		327	`* @return string[]`
		328	`*/`
		329	`public function knownCharsets()`
		330	`{`
		331	`$knownCharsets = array('UTF-8', 'ISO-8859-1', 'US-ASCII');`
		332	`// Add all charsets which mbstring can handle, but remove junk not found in IANA registry at`
		333	`// http://www.iana.org/assignments/character-sets/character-sets.xhtml`
		334	`if (function_exists('mb_list_encodings')) {`
		335	`$knownCharsets = array_unique(array_merge($knownCharsets, array_diff(mb_list_encodings(), array(`
		336	`'pass', 'auto', 'wchar', 'BASE64', 'UUENCODE', 'ASCII', 'HTML-ENTITIES', 'Quoted-Printable',`
		337	`'7bit','8bit', 'byte2be', 'byte2le', 'byte4be', 'byte4le'`
		338	`))));`
		339	`}`
		340	`return $knownCharsets;`
		341	`}`
		342
		343	`// * BC layer *`
		344
		345	`/**`
		346	`* Checks if a given charset encoding is present in a list of encodings or if it is a valid subset of any encoding`
		347	`* in the list.`
		348	`* @deprecated kept around for BC, as it is not in use by the lib`
		349	`*`
		350	`* @param string $encoding charset to be tested`
		351	`* @param string\|array $validList comma separated list of valid charsets (or array of charsets)`
		352	`* @return bool`
		353	`*/`
		354	`public function isValidCharset($encoding, $validList)`
		355	`{`
		356	`$this->logDeprecation('Method ' . __METHOD__ . ' is deprecated');`
		357
		358	`if (is_string($validList)) {`
		359	`$validList = explode(',', $validList);`
		360	`}`
		361	`if (in_array(strtoupper($encoding), $validList)) {`
		362	`return true;`
		363	`} else {`
		364	`if (array_key_exists($encoding, $this->charset_supersets)) {`
		365	`foreach ($validList as $allowed) {`
		366	`if (in_array($allowed, $this->charset_supersets[$encoding])) {`
		367	`return true;`
		368	`}`
		369	`}`
		370	`}`
		371
		372	`return false;`
		373	`}`
		374	`}`
		375
		376	`/**`
		377	`* Used only for backwards compatibility (the .inc shims).`
		378	`* @deprecated`
		379	`*`
		380	`* @param string $charset`
		381	`* @return array`
		382	`* @throws ValueErrorException for unknown/unsupported charsets`
		383	`*/`
		384	`public function getEntities($charset)`
		385	`{`
		386	`$this->logDeprecation('Method ' . __METHOD__ . ' is deprecated');`
		387
		388	`switch ($charset)`
		389	`{`
		390	`case 'iso88591':`
		391	`return $this->xml_iso88591_Entities;`
		392	`default:`
		393	`throw new ValueErrorException('Unsupported charset: ' . $charset);`
		394	`}`
		395	`}`
		396	`}`

Proyectos de Subversion Moodle

(root)/lib/phpxmlrpc/Helper/Charset.php – Rev 1