WebSVN – Moodle – Autoría – /admin/tool/messageinbound/roundcube/rcube_charset.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`+-----------------------------------------------------------------------+`
		5	`\| This file is part of the Roundcube Webmail client \|`
		6	`\| \|`
		7	`\| Copyright (C) The Roundcube Dev Team \|`
		8	`\| Copyright (C) Kolab Systems AG \|`
		9	`\| Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> \|`
		10	`\| \|`
		11	`\| Licensed under the GNU General Public License version 3 or \|`
		12	`\| any later version with exceptions for skins & plugins. \|`
		13	`\| See the README file for a full license statement. \|`
		14	`\| \|`
		15	`\| PURPOSE: \|`
		16	`\| Provide charset conversion functionality \|`
		17	`+-----------------------------------------------------------------------+`
		18	`\| Author: Thomas Bruederli <roundcube@gmail.com> \|`
		19	`\| Author: Aleksander Machniak <alec@alec.pl> \|`
		20	`\| Author: Edmund Grimley Evans <edmundo@rano.org> \|`
		21	`+-----------------------------------------------------------------------+`
		22	`*/`
		23
		24	`/**`
		25	`* Character sets conversion functionality`
		26	`*`
		27	`* @package Framework`
		28	`* @subpackage Core`
		29	`*/`
		30	`class rcube_charset`
		31	`{`
		32	`/**`
		33	`* Character set aliases (some of them from HTML5 spec.)`
		34	`*`
		35	`* @var array`
		36	`*/`
		37	`static public $aliases = [`
		38	`'USASCII' => 'WINDOWS-1252',`
		39	`'ANSIX31101983' => 'WINDOWS-1252',`
		40	`'ANSIX341968' => 'WINDOWS-1252',`
		41	`'UNKNOWN8BIT' => 'ISO-8859-15',`
		42	`'UNKNOWN' => 'ISO-8859-15',`
		43	`'USERDEFINED' => 'ISO-8859-15',`
		44	`'KSC56011987' => 'EUC-KR',`
		45	`'GB2312' => 'GBK',`
		46	`'GB231280' => 'GBK',`
		47	`'UNICODE' => 'UTF-8',`
		48	`'UTF7IMAP' => 'UTF7-IMAP',`
		49	`'TIS620' => 'WINDOWS-874',`
		50	`'ISO88599' => 'WINDOWS-1254',`
		51	`'ISO885911' => 'WINDOWS-874',`
		52	`'MACROMAN' => 'MACINTOSH',`
		53	`'77' => 'MAC',`
		54	`'128' => 'SHIFT-JIS',`
		55	`'129' => 'CP949',`
		56	`'130' => 'CP1361',`
		57	`'134' => 'GBK',`
		58	`'136' => 'BIG5',`
		59	`'161' => 'WINDOWS-1253',`
		60	`'162' => 'WINDOWS-1254',`
		61	`'163' => 'WINDOWS-1258',`
		62	`'177' => 'WINDOWS-1255',`
		63	`'178' => 'WINDOWS-1256',`
		64	`'186' => 'WINDOWS-1257',`
		65	`'204' => 'WINDOWS-1251',`
		66	`'222' => 'WINDOWS-874',`
		67	`'238' => 'WINDOWS-1250',`
		68	`'MS950' => 'CP950',`
		69	`'WINDOWS31J' => 'CP932',`
		70	`'WINDOWS949' => 'UHC',`
		71	`'WINDOWS1257' => 'ISO-8859-13',`
		72	`'ISO2022JP' => 'ISO-2022-JP-MS',`
		73	`];`
		74
		75	`/**`
		76	`* Windows codepages`
		77	`*`
		78	`* @var array`
		79	`*/`
		80	`static public $windows_codepages = [`
		81	`37 => 'IBM037', // IBM EBCDIC US-Canada`
		82	`437 => 'IBM437', // OEM United States`
		83	`500 => 'IBM500', // IBM EBCDIC International`
		84	`708 => 'ASMO-708', // Arabic (ASMO 708)`
		85	`720 => 'DOS-720', // Arabic (Transparent ASMO); Arabic (DOS)`
		86	`737 => 'IBM737', // OEM Greek (formerly 437G); Greek (DOS)`
		87	`775 => 'IBM775', // OEM Baltic; Baltic (DOS)`
		88	`850 => 'IBM850', // OEM Multilingual Latin 1; Western European (DOS)`
		89	`852 => 'IBM852', // OEM Latin 2; Central European (DOS)`
		90	`855 => 'IBM855', // OEM Cyrillic (primarily Russian)`
		91	`857 => 'IBM857', // OEM Turkish; Turkish (DOS)`
		92	`858 => 'IBM00858', // OEM Multilingual Latin 1 + Euro symbol`
		93	`860 => 'IBM860', // OEM Portuguese; Portuguese (DOS)`
		94	`861 => 'IBM861', // OEM Icelandic; Icelandic (DOS)`
		95	`862 => 'DOS-862', // OEM Hebrew; Hebrew (DOS)`
		96	`863 => 'IBM863', // OEM French Canadian; French Canadian (DOS)`
		97	`864 => 'IBM864', // OEM Arabic; Arabic (864)`
		98	`865 => 'IBM865', // OEM Nordic; Nordic (DOS)`
		99	`866 => 'cp866', // OEM Russian; Cyrillic (DOS)`
		100	`869 => 'IBM869', // OEM Modern Greek; Greek, Modern (DOS)`
		101	`870 => 'IBM870', // IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2`
		102	`874 => 'windows-874', // ANSI/OEM Thai (ISO 8859-11); Thai (Windows)`
		103	`875 => 'cp875', // IBM EBCDIC Greek Modern`
		104	`932 => 'shift_jis', // ANSI/OEM Japanese; Japanese (Shift-JIS)`
		105	`936 => 'gb2312', // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)`
		106	`950 => 'big5', // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)`
		107	`1026 => 'IBM1026', // IBM EBCDIC Turkish (Latin 5)`
		108	`1047 => 'IBM01047', // IBM EBCDIC Latin 1/Open System`
		109	`1140 => 'IBM01140', // IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)`
		110	`1141 => 'IBM01141', // IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)`
		111	`1142 => 'IBM01142', // IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)`
		112	`1143 => 'IBM01143', // IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)`
		113	`1144 => 'IBM01144', // IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)`
		114	`1145 => 'IBM01145', // IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)`
		115	`1146 => 'IBM01146', // IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)`
		116	`1147 => 'IBM01147', // IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)`
		117	`1148 => 'IBM01148', // IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)`
		118	`1149 => 'IBM01149', // IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)`
		119	`1200 => 'UTF-16', // Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications`
		120	`1201 => 'UTF-16BE', // Unicode UTF-16, big endian byte order; available only to managed applications`
		121	`1250 => 'windows-1250', // ANSI Central European; Central European (Windows)`
		122	`1251 => 'windows-1251', // ANSI Cyrillic; Cyrillic (Windows)`
		123	`1252 => 'windows-1252', // ANSI Latin 1; Western European (Windows)`
		124	`1253 => 'windows-1253', // ANSI Greek; Greek (Windows)`
		125	`1254 => 'windows-1254', // ANSI Turkish; Turkish (Windows)`
		126	`1255 => 'windows-1255', // ANSI Hebrew; Hebrew (Windows)`
		127	`1256 => 'windows-1256', // ANSI Arabic; Arabic (Windows)`
		128	`1257 => 'windows-1257', // ANSI Baltic; Baltic (Windows)`
		129	`1258 => 'windows-1258', // ANSI/OEM Vietnamese; Vietnamese (Windows)`
		130	`10000 => 'macintosh', // MAC Roman; Western European (Mac)`
		131	`12000 => 'UTF-32', // Unicode UTF-32, little endian byte order; available only to managed applications`
		132	`12001 => 'UTF-32BE', // Unicode UTF-32, big endian byte order; available only to managed applications`
		133	`20127 => 'US-ASCII', // US-ASCII (7-bit)`
		134	`20273 => 'IBM273', // IBM EBCDIC Germany`
		135	`20277 => 'IBM277', // IBM EBCDIC Denmark-Norway`
		136	`20278 => 'IBM278', // IBM EBCDIC Finland-Sweden`
		137	`20280 => 'IBM280', // IBM EBCDIC Italy`
		138	`20284 => 'IBM284', // IBM EBCDIC Latin America-Spain`
		139	`20285 => 'IBM285', // IBM EBCDIC United Kingdom`
		140	`20290 => 'IBM290', // IBM EBCDIC Japanese Katakana Extended`
		141	`20297 => 'IBM297', // IBM EBCDIC France`
		142	`20420 => 'IBM420', // IBM EBCDIC Arabic`
		143	`20423 => 'IBM423', // IBM EBCDIC Greek`
		144	`20424 => 'IBM424', // IBM EBCDIC Hebrew`
		145	`20838 => 'IBM-Thai', // IBM EBCDIC Thai`
		146	`20866 => 'koi8-r', // Russian (KOI8-R); Cyrillic (KOI8-R)`
		147	`20871 => 'IBM871', // IBM EBCDIC Icelandic`
		148	`20880 => 'IBM880', // IBM EBCDIC Cyrillic Russian`
		149	`20905 => 'IBM905', // IBM EBCDIC Turkish`
		150	`20924 => 'IBM00924', // IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)`
		151	`20932 => 'EUC-JP', // Japanese (JIS 0208-1990 and 0212-1990)`
		152	`20936 => 'cp20936', // Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)`
		153	`20949 => 'cp20949', // Korean Wansung`
		154	`21025 => 'cp1025', // IBM EBCDIC Cyrillic Serbian-Bulgarian`
		155	`21866 => 'koi8-u', // Ukrainian (KOI8-U); Cyrillic (KOI8-U)`
		156	`28591 => 'iso-8859-1', // ISO 8859-1 Latin 1; Western European (ISO)`
		157	`28592 => 'iso-8859-2', // ISO 8859-2 Central European; Central European (ISO)`
		158	`28593 => 'iso-8859-3', // ISO 8859-3 Latin 3`
		159	`28594 => 'iso-8859-4', // ISO 8859-4 Baltic`
		160	`28595 => 'iso-8859-5', // ISO 8859-5 Cyrillic`
		161	`28596 => 'iso-8859-6', // ISO 8859-6 Arabic`
		162	`28597 => 'iso-8859-7', // ISO 8859-7 Greek`
		163	`28598 => 'iso-8859-8', // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)`
		164	`28599 => 'iso-8859-9', // ISO 8859-9 Turkish`
		165	`28603 => 'iso-8859-13', // ISO 8859-13 Estonian`
		166	`28605 => 'iso-8859-15', // ISO 8859-15 Latin 9`
		167	`38598 => 'iso-8859-8-i', // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)`
		168	`50220 => 'iso-2022-jp', // ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)`
		169	`50221 => 'csISO2022JP', // ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)`
		170	`50222 => 'iso-2022-jp', // ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)`
		171	`50225 => 'iso-2022-kr', // ISO 2022 Korean`
		172	`51932 => 'EUC-JP', // EUC Japanese`
		173	`51936 => 'EUC-CN', // EUC Simplified Chinese; Chinese Simplified (EUC)`
		174	`51949 => 'EUC-KR', // EUC Korean`
		175	`52936 => 'hz-gb-2312', // HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)`
		176	`54936 => 'GB18030', // Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)`
		177	`65000 => 'UTF-7',`
		178	`65001 => 'UTF-8',`
		179	`];`
		180
		181	`/**`
		182	`* Validate character set identifier.`
		183	`*`
		184	`* @param string $input Character set identifier`
		185	`*`
		186	`* @return bool True if valid, False if not valid`
		187	`*/`
		188	`public static function is_valid($input)`
		189	`{`
		190	`return is_string($input) && preg_match('\|^[a-zA-Z0-9_./:#-]{2,32}$\|', $input) > 0;`
		191	`}`
		192
		193	`/**`
		194	`* Parse and validate charset name string.`
		195	`* Sometimes charset string is malformed, there are also charset aliases,`
		196	`* but we need strict names for charset conversion (specially utf8 class)`
		197	`*`
		198	`* @param string $input Input charset name`
		199	`*`
		200	`* @return string The validated charset name`
		201	`*/`
		202	`public static function parse_charset($input)`
		203	`{`
		204	`static $charsets = [];`
		205
		206	`$charset = strtoupper((string) $input);`
		207
		208	`if (isset($charsets[$input])) {`
		209	`return $charsets[$input];`
		210	`}`
		211
		212	`$charset = preg_replace([`
		213	`'/^[^0-9A-Z]+/', // e.g. _ISO-8859-JP$SIO`
		214	`'/\$.*$/', // e.g. _ISO-8859-JP$SIO`
		215	`'/UNICODE-1-1-*/', // RFC1641/1642`
		216	`'/^X-/', // X- prefix (e.g. X-ROMAN8 => ROMAN8)`
		217	`'/\.$/' // lang code according to RFC 2231.5`
		218	`], '', $charset);`
		219
		220	`if ($charset == 'BINARY') {`
		221	`return $charsets[$input] = null;`
		222	`}`
		223
		224	`// allow A-Z and 0-9 only`
		225	`$str = preg_replace('/[^A-Z0-9]/', '', $charset);`
		226
		227	`$result = $charset;`
		228
		229	`if (isset(self::$aliases[$str])) {`
		230	`$result = self::$aliases[$str];`
		231	`}`
		232	`// UTF`
		233	`else if (preg_match('/U[A-Z][A-Z](7\|8\|16\|32)(BE\|LE)*/', $str, $m)) {`
		234	`$result = 'UTF-' . $m[1] . (!empty($m[2]) ? $m[2] : '');`
		235	`}`
		236	`// ISO-8859`
		237	`else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {`
		238	`$iso = 'ISO-8859-' . ($m[1] ?: 1);`
		239	`// some clients sends windows-1252 text as latin1,`
		240	`// it is safe to use windows-1252 for all latin1`
		241	`$result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;`
		242	`}`
		243	`// handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE`
		244	`else if (preg_match('/(WIN\|WINDOWS)([0-9]+)/', $str, $m)) {`
		245	`$result = 'WINDOWS-' . $m[2];`
		246	`}`
		247	`// LATIN`
		248	`else if (preg_match('/LATIN(.*)/', $str, $m)) {`
		249	`$aliases = ['2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,`
		250	`'7' => 13, '8' => 14, '9' => 15, '10' => 16,`
		251	`'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8`
		252	`];`
		253
		254	`// some clients sends windows-1252 text as latin1,`
		255	`// it is safe to use windows-1252 for all latin1`
		256	`if ($m[1] == 1) {`
		257	`$result = 'WINDOWS-1252';`
		258	`}`
		259	`// we need ISO labels`
		260	`else if (!empty($aliases[$m[1]])) {`
		261	`$result = 'ISO-8859-'.$aliases[$m[1]];`
		262	`}`
		263	`}`
		264
		265	`$charsets[$input] = $result;`
		266
		267	`return $result;`
		268	`}`
		269
		270	`/**`
		271	`* Convert a string from one charset to another.`
		272	`*`
		273	`* @param string $str Input string`
		274	`* @param string $from Suspected charset of the input string`
		275	`* @param string $to Target charset to convert to; defaults to RCUBE_CHARSET`
		276	`*`
		277	`* @return string Converted string`
		278	`*/`
		279	`public static function convert($str, $from, $to = null)`
		280	`{`
		281	`static $iconv_options;`
		282
		283	`$to = empty($to) ? RCUBE_CHARSET : self::parse_charset($to);`
		284	`$from = self::parse_charset($from);`
		285
		286	`// It is a common case when UTF-16 charset is used with US-ASCII content (#1488654)`
		287	`// In that case we can just skip the conversion (use UTF-8)`
		288	`if ($from == 'UTF-16' && !preg_match('/[^\x00-\x7F]/', $str)) {`
		289	`$from = 'UTF-8';`
		290	`}`
		291
		292	`if ($from == $to \|\| empty($str) \|\| empty($from)) {`
		293	`return $str;`
		294	`}`
		295
		296	`$out = false;`
		297	`$error_handler = function() { throw new \Exception(); };`
		298
		299	`// Ignore invalid characters`
		300	`$mbstring_sc = mb_substitute_character();`
		301	`mb_substitute_character('none');`
		302
		303	`// If mbstring reports an illegal character in input via E_WARNING.`
		304	`// FIXME: Is this really true with substitute character 'none'?`
		305	`// A warning is thrown in PHP<8 also on unsupported encoding, in PHP>=8 ValueError`
		306	`// is thrown instead (therefore we catch Throwable below)`
		307	`set_error_handler($error_handler, E_WARNING);`
		308
		309	`try {`
		310	`$out = mb_convert_encoding($str, $to, $from);`
		311	`}`
		312	`catch (Throwable $e) {`
		313	`$out = false;`
		314	`}`
		315
		316	`restore_error_handler();`
		317	`mb_substitute_character($mbstring_sc);`
		318
		319	`if ($out !== false) {`
		320	`return $out;`
		321	`}`
		322
		323	`if ($iconv_options === null) {`
		324	`if (function_exists('iconv')) {`
		325	`// ignore characters not available in output charset`
		326	`$iconv_options = '//IGNORE';`
		327	`if (iconv('', $iconv_options, '') === false) {`
		328	`// iconv implementation does not support options`
		329	`$iconv_options = '';`
		330	`}`
		331	`}`
		332	`else {`
		333	`$iconv_options = false;`
		334	`}`
		335	`}`
		336
		337	`// Fallback to iconv module, it is slower, but supports much more charsets than mbstring`
		338	`if ($iconv_options !== false && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP'`
		339	`&& $from !== 'ISO-2022-JP'`
		340	`) {`
		341	`// If iconv reports an illegal character in input it means that input string`
		342	`// has been truncated. It's reported as E_NOTICE.`
		343	`// PHP8 will also throw E_WARNING on unsupported encoding.`
		344	`set_error_handler($error_handler, E_NOTICE \| E_WARNING);`
		345
		346	`try {`
		347	`$out = iconv($from, $to . $iconv_options, $str);`
		348	`}`
		349	`catch (Throwable $e) {`
		350	`$out = false;`
		351	`}`
		352
		353	`restore_error_handler();`
		354
		355	`if ($out !== false) {`
		356	`return $out;`
		357	`}`
		358	`}`
		359
		360	`// return the original string`
		361	`return $str;`
		362	`}`
		363
		364	`/**`
		365	`* Check if the specified input string matches one of the provided charsets.`
		366	`* This includes UTF-32, UTF-16, RCUBE_CHARSET and default_charset.`
		367	`*`
		368	`* @param string $str Input string`
		369	`* @param array $charsets Suspected charsets of the input string`
		370	`*`
		371	`* @return string\|null First matching charset`
		372	`*/`
		373	`public static function check($str, $charsets = [])`
		374	`{`
		375	`$chunk = strlen($str) > 100 * 1024 ? substr($str, 0, 100 * 1024) : $str;`
		376
		377	`// Add dehault charset, system charset and easily detectable charset to the list`
		378	`if (substr($chunk, 0, 4) == "\0\0\xFE\xFF") $charsets[] = 'UTF-32BE';`
		379	`if (substr($chunk, 0, 4) == "\xFF\xFE\0\0") $charsets[] = 'UTF-32LE';`
		380	`if (substr($chunk, 0, 2) == "\xFE\xFF") $charsets[] = 'UTF-16BE';`
		381	`if (substr($chunk, 0, 2) == "\xFF\xFE") $charsets[] = 'UTF-16LE';`
		382
		383	`// heuristics`
		384	`if (preg_match('/\x00\x00\x00[^\x00]/', $chunk)) $charsets[] = 'UTF-32BE';`
		385	`if (preg_match('/[^\x00]\x00\x00\x00/', $chunk)) $charsets[] = 'UTF-32LE';`
		386	`if (preg_match('/\x00[^\x00]\x00[^\x00]/', $chunk)) $charsets[] = 'UTF-16BE';`
		387	`if (preg_match('/[^\x00]\x00[^\x00]\x00/', $chunk)) $charsets[] = 'UTF-16LE';`
		388
		389	`$charsets[] = RCUBE_CHARSET;`
		390	`$charsets[] = (string) rcube::get_instance()->config->get('default_charset');`
		391
		392	`$charsets = array_map(['rcube_charset', 'parse_charset'], $charsets);`
		393	`$charsets = array_unique(array_filter($charsets));`
		394
		395	`foreach ($charsets as $charset) {`
		396	`$ret = self::convert($chunk, $charset);`
		397
		398	`if ($ret === rcube_charset::clean($ret)) {`
		399	`return $charset;`
		400	`}`
		401	`}`
		402	`}`
		403
		404	`/**`
		405	`* Converts string from standard UTF-7 (RFC 2152) to UTF-8.`
		406	`*`
		407	`* @param string $str Input string (UTF-7)`
		408	`*`
		409	`* @return string Converted string (UTF-8)`
		410	`* @deprecated use self::convert()`
		411	`*/`
		412	`public static function utf7_to_utf8($str)`
		413	`{`
		414	`return self::convert($str, 'UTF-7', 'UTF-8');`
		415	`}`
		416
		417	`/**`
		418	`* Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)`
		419	`*`
		420	`* @param string $str Input string`
		421	`*`
		422	`* @return string The converted string`
		423	`* @deprecated use self::convert()`
		424	`*/`
		425	`public static function utf16_to_utf8($str)`
		426	`{`
		427	`return self::convert($str, 'UTF-16BE', 'UTF-8');`
		428	`}`
		429
		430	`/**`
		431	`* Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.`
		432	`* If input data is invalid, return the original input string.`
		433	`* RFC 2060 obviously intends the encoding to be unique (see`
		434	`* point 5 in section 5.1.3), so we reject any non-canonical`
		435	`* form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead`
		436	`* of &AMAAwA-).`
		437	`*`
		438	`* @param string $str Input string (UTF7-IMAP)`
		439	`*`
		440	`* @return string Output string (UTF-8)`
		441	`* @deprecated use self::convert()`
		442	`*/`
		443	`public static function utf7imap_to_utf8($str)`
		444	`{`
		445	`return self::convert($str, 'UTF7-IMAP', 'UTF-8');`
		446	`}`
		447
		448	`/**`
		449	`* Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.`
		450	`* Unicode characters above U+FFFF are replaced by U+FFFE.`
		451	`* If input data is invalid, return an empty string.`
		452	`*`
		453	`* @param string $str Input string (UTF-8)`
		454	`*`
		455	`* @return string Output string (UTF7-IMAP)`
		456	`* @deprecated use self::convert()`
		457	`*/`
		458	`public static function utf8_to_utf7imap($str)`
		459	`{`
		460	`return self::convert($str, 'UTF-8', 'UTF7-IMAP');`
		461	`}`
		462
		463	`/**`
		464	`* A method to guess character set of a string.`
		465	`*`
		466	`* @param string $string String`
		467	`* @param string $failover Default result for failover`
		468	`* @param string $language User language`
		469	`*`
		470	`* @return string Charset name`
		471	`* @deprecated`
		472	`*/`
		473	`public static function detect($string, $failover = null, $language = null)`
		474	`{`
		475	`if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE'; // Big Endian`
		476	`if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE'; // Little Endian`
		477	`if (substr($string, 0, 2) == "\xFE\xFF") return 'UTF-16BE'; // Big Endian`
		478	`if (substr($string, 0, 2) == "\xFF\xFE") return 'UTF-16LE'; // Little Endian`
		479	`if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';`
		480
		481	`// heuristics`
		482	`if (strlen($string) >= 4) {`
		483	`if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';`
		484	`if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';`
		485	`if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';`
		486	`if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';`
		487	`}`
		488
		489	`if (empty($language)) {`
		490	`$rcube = rcube::get_instance();`
		491	`$language = $rcube->get_user_language();`
		492	`}`
		493
		494	`// Prioritize charsets according to the current language (#1485669)`
		495	`$prio = null;`
		496	`switch ($language) {`
		497	`case 'ja_JP':`
		498	`$prio = ['ISO-2022-JP', 'JIS', 'UTF-8', 'EUC-JP', 'eucJP-win', 'SJIS'];`
		499	`break;`
		500
		501	`case 'zh_CN':`
		502	`case 'zh_TW':`
		503	`$prio = ['UTF-8', 'BIG-5', 'EUC-TW', 'GB18030'];`
		504	`break;`
		505
		506	`case 'ko_KR':`
		507	`$prio = ['UTF-8', 'EUC-KR', 'ISO-2022-KR'];`
		508	`break;`
		509
		510	`case 'ru_RU':`
		511	`$prio = ['UTF-8', 'WINDOWS-1251', 'KOI8-R'];`
		512	`break;`
		513
		514	`case 'tr_TR':`
		515	`$prio = ['UTF-8', 'ISO-8859-9', 'WINDOWS-1254'];`
		516	`break;`
		517	`}`
		518
		519	`// mb_detect_encoding() is not reliable for some charsets (#1490135)`
		520	`// use mb_check_encoding() to make charset priority lists really working`
		521	`if (!empty($prio) && function_exists('mb_check_encoding')) {`
		522	`foreach ($prio as $encoding) {`
		523	`if (mb_check_encoding($string, $encoding)) {`
		524	`return $encoding;`
		525	`}`
		526	`}`
		527	`}`
		528
		529	`if (function_exists('mb_detect_encoding')) {`
		530	`$exclude = 'BASE64,UUENCODE,HTML-ENTITIES,Quoted-Printable,'`
		531	`. '7bit,8bit,pass,wchar,byte2be,byte2le,byte4be,byte4le,'`
		532	`. 'UCS-4,UCS-4BE,UCS-4LE,UCS-2,UCS-2BE,UCS-2LE';`
		533
		534	`if (empty($prio)) {`
		535	`$prio = [`
		536	`'UTF-8',`
		537	`'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',`
		538	`'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',`
		539	`'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',`
		540	`'WINDOWS-1252', 'WINDOWS-1251', 'WINDOWS-1254',`
		541	`'EUC-JP', 'EUC-TW', 'KOI8-R', 'BIG-5', 'ISO-2022-KR', 'ISO-2022-JP', 'GB18030',`
		542	`];`
		543	`}`
		544
		545	`// We have to remove unwanted/uncommon encodings from the list.`
		546	`// This is needed especially on PHP >= 8.1`
		547	`$all_encodings = array_diff(mb_list_encodings(), explode(',', $exclude));`
		548
		549	`$encodings = array_unique(array_merge($prio, $all_encodings));`
		550
		551	`if ($encoding = mb_detect_encoding($string, $encodings, true)) {`
		552	`return $encoding;`
		553	`}`
		554	`}`
		555
		556	`// No match, check for UTF-8`
		557	`// from http://w3.org/International/questions/qa-forms-utf-8.html`
		558	`if (preg_match('/\A(`
		559	`[\x09\x0A\x0D\x20-\x7E]`
		560	`\| [\xC2-\xDF][\x80-\xBF]`
		561	`\| \xE0[\xA0-\xBF][\x80-\xBF]`
		562	`\| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}`
		563	`\| \xED[\x80-\x9F][\x80-\xBF]`
		564	`\| \xF0[\x90-\xBF][\x80-\xBF]{2}`
		565	`\| [\xF1-\xF3][\x80-\xBF]{3}`
		566	`\| \xF4[\x80-\x8F][\x80-\xBF]{2}`
		567	`)*\z/xs', substr($string, 0, 2048))`
		568	`) {`
		569	`return 'UTF-8';`
		570	`}`
		571
		572	`return $failover;`
		573	`}`
		574
		575	`/**`
		576	`* Removes non-unicode characters from input.`
		577	`* If the input is an array, both values and keys will be cleaned up.`
		578	`*`
		579	`* @param mixed $input String or array.`
		580	`*`
		581	`* @return mixed String or array`
		582	`*/`
		583	`public static function clean($input)`
		584	`{`
		585	`// handle input of type array`
		586	`if (is_array($input)) {`
		587	`foreach (array_keys($input) as $key) {`
		588	`$k = is_string($key) ? self::clean($key) : $key;`
		589	`$v = self::clean($input[$key]);`
		590
		591	`if ($k !== $key) {`
		592	`unset($input[$key]);`
		593	`if (!array_key_exists($k, $input)) {`
		594	`$input[$k] = $v;`
		595	`}`
		596	`}`
		597	`else {`
		598	`$input[$k] = $v;`
		599	`}`
		600	`}`
		601	`return $input;`
		602	`}`
		603
		604	`if (!is_string($input) \|\| $input == '') {`
		605	`return $input;`
		606	`}`
		607
		608	`$msch = mb_substitute_character();`
		609	`mb_substitute_character('none');`
		610	`$res = mb_convert_encoding($input, 'UTF-8', 'UTF-8');`
		611	`mb_substitute_character($msch);`
		612
		613	`return $res;`
		614	`}`
		615	`}`

Proyectos de Subversion Moodle

(root)/admin/tool/messageinbound/roundcube/rcube_charset.php – Rev 1