WebSVN – Moodle – Autoría – /lib/htmlpurifier/HTMLPurifier/Lexer.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`* Forgivingly lexes HTML (SGML-style) markup into tokens.`
		5	`*`
		6	`* A lexer parses a string of SGML-style markup and converts them into`
		7	`* corresponding tokens. It doesn't check for well-formedness, although its`
		8	`* internal mechanism may make this automatic (such as the case of`
		9	`* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose`
		10	`* from.`
		11	`*`
		12	`* A lexer is HTML-oriented: it might work with XML, but it's not`
		13	`* recommended, as we adhere to a subset of the specification for optimization`
		14	`* reasons. This might change in the future. Also, most tokenizers are not`
		15	`* expected to handle DTDs or PIs.`
		16	`*`
		17	`* This class should not be directly instantiated, but you may use create() to`
		18	`* retrieve a default copy of the lexer. Being a supertype, this class`
		19	`* does not actually define any implementation, but offers commonly used`
		20	`* convenience functions for subclasses.`
		21	`*`
		22	`* @note The unit tests will instantiate this class for testing purposes, as`
		23	`* many of the utility functions require a class to be instantiated.`
		24	`* This means that, even though this class is not runnable, it will`
		25	`* not be declared abstract.`
		26	`*`
		27	`* @par`
		28	`*`
		29	`* @note`
		30	`* We use tokens rather than create a DOM representation because DOM would:`
		31	`*`
		32	`* @par`
		33	`* -# Require more processing and memory to create,`
		34	`* -# Is not streamable, and`
		35	`* -# Has the entire document structure (html and body not needed).`
		36	`*`
		37	`* @par`
		38	`* However, DOM is helpful in that it makes it easy to move around nodes`
		39	`* without a lot of lookaheads to see when a tag is closed. This is a`
		40	`* limitation of the token system and some workarounds would be nice.`
		41	`*/`
		42	`class HTMLPurifier_Lexer`
		43	`{`
		44
		45	`/**`
		46	`* Whether or not this lexer implements line-number/column-number tracking.`
		47	`* If it does, set to true.`
		48	`*/`
		49	`public $tracksLineNumbers = false;`
		50
		51	`/**`
		52	`* @type HTMLPurifier_EntityParser`
		53	`*/`
		54	`private $_entity_parser;`
		55
		56	`// -- STATIC ----------------------------------------------------------`
		57
		58	`/**`
		59	`* Retrieves or sets the default Lexer as a Prototype Factory.`
		60	`*`
		61	`* By default HTMLPurifier_Lexer_DOMLex will be returned. There are`
		62	`* a few exceptions involving special features that only DirectLex`
		63	`* implements.`
		64	`*`
		65	`* @note The behavior of this class has changed, rather than accepting`
		66	`* a prototype object, it now accepts a configuration object.`
		67	`* To specify your own prototype, set %Core.LexerImpl to it.`
		68	`* This change in behavior de-singletonizes the lexer object.`
		69	`*`
		70	`* @param HTMLPurifier_Config $config`
		71	`* @return HTMLPurifier_Lexer`
		72	`* @throws HTMLPurifier_Exception`
		73	`*/`
		74	`public static function create($config)`
		75	`{`
		76	`if (!($config instanceof HTMLPurifier_Config)) {`
		77	`$lexer = $config;`
		78	`trigger_error(`
		79	`"Passing a prototype to`
		80	`HTMLPurifier_Lexer::create() is deprecated, please instead`
		81	`use %Core.LexerImpl",`
		82	`E_USER_WARNING`
		83	`);`
		84	`} else {`
		85	`$lexer = $config->get('Core.LexerImpl');`
		86	`}`
		87
		88	`$needs_tracking =`
		89	`$config->get('Core.MaintainLineNumbers') \|\|`
		90	`$config->get('Core.CollectErrors');`
		91
		92	`$inst = null;`
		93	`if (is_object($lexer)) {`
		94	`$inst = $lexer;`
		95	`} else {`
		96	`if (is_null($lexer)) {`
		97	`do {`
		98	`// auto-detection algorithm`
		99	`if ($needs_tracking) {`
		100	`$lexer = 'DirectLex';`
		101	`break;`
		102	`}`
		103
		104	`if (class_exists('DOMDocument') &&`
		105	`method_exists('DOMDocument', 'loadHTML') &&`
		106	`!extension_loaded('domxml')`
		107	`) {`
		108	`// check for DOM support, because while it's part of the`
		109	`// core, it can be disabled compile time. Also, the PECL`
		110	`// domxml extension overrides the default DOM, and is evil`
		111	`// and nasty and we shan't bother to support it`
		112	`$lexer = 'DOMLex';`
		113	`} else {`
		114	`$lexer = 'DirectLex';`
		115	`}`
		116	`} while (0);`
		117	`} // do..while so we can break`
		118
		119	`// instantiate recognized string names`
		120	`switch ($lexer) {`
		121	`case 'DOMLex':`
		122	`$inst = new HTMLPurifier_Lexer_DOMLex();`
		123	`break;`
		124	`case 'DirectLex':`
		125	`$inst = new HTMLPurifier_Lexer_DirectLex();`
		126	`break;`
		127	`case 'PH5P':`
		128	`$inst = new HTMLPurifier_Lexer_PH5P();`
		129	`break;`
		130	`default:`
		131	`throw new HTMLPurifier_Exception(`
		132	`"Cannot instantiate unrecognized Lexer type " .`
		133	`htmlspecialchars($lexer)`
		134	`);`
		135	`}`
		136	`}`
		137
		138	`if (!$inst) {`
		139	`throw new HTMLPurifier_Exception('No lexer was instantiated');`
		140	`}`
		141
		142	`// once PHP DOM implements native line numbers, or we`
		143	`// hack out something using XSLT, remove this stipulation`
		144	`if ($needs_tracking && !$inst->tracksLineNumbers) {`
		145	`throw new HTMLPurifier_Exception(`
		146	`'Cannot use lexer that does not support line numbers with ' .`
		147	`'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'`
		148	`);`
		149	`}`
		150
		151	`return $inst;`
		152
		153	`}`
		154
		155	`// -- CONVENIENCE MEMBERS ---------------------------------------------`
		156
		157	`public function __construct()`
		158	`{`
		159	`$this->_entity_parser = new HTMLPurifier_EntityParser();`
		160	`}`
		161
		162	`/**`
		163	`* Most common entity to raw value conversion table for special entities.`
		164	`* @type array`
		165	`*/`
		166	`protected $_special_entity2str =`
		167	`array(`
		168	`'"' => '"',`
		169	`'&' => '&',`
		170	`'<' => '<',`
		171	`'>' => '>',`
		172	`''' => "'",`
		173	`''' => "'",`
		174	`''' => "'"`
		175	`);`
		176
		177	`public function parseText($string, $config) {`
		178	`return $this->parseData($string, false, $config);`
		179	`}`
		180
		181	`public function parseAttr($string, $config) {`
		182	`return $this->parseData($string, true, $config);`
		183	`}`
		184
		185	`/**`
		186	`* Parses special entities into the proper characters.`
		187	`*`
		188	`* This string will translate escaped versions of the special characters`
		189	`* into the correct ones.`
		190	`*`
		191	`* @param string $string String character data to be parsed.`
		192	`* @return string Parsed character data.`
		193	`*/`
		194	`public function parseData($string, $is_attr, $config)`
		195	`{`
		196	`// following functions require at least one character`
		197	`if ($string === '') {`
		198	`return '';`
		199	`}`
		200
		201	`// subtracts amps that cannot possibly be escaped`
		202	`$num_amp = substr_count($string, '&') - substr_count($string, '& ') -`
		203	`($string[strlen($string) - 1] === '&' ? 1 : 0);`
		204
		205	`if (!$num_amp) {`
		206	`return $string;`
		207	`} // abort if no entities`
		208	`$num_esc_amp = substr_count($string, '&');`
		209	`$string = strtr($string, $this->_special_entity2str);`
		210
		211	`// code duplication for sake of optimization, see above`
		212	`$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -`
		213	`($string[strlen($string) - 1] === '&' ? 1 : 0);`
		214
		215	`if ($num_amp_2 <= $num_esc_amp) {`
		216	`return $string;`
		217	`}`
		218
		219	`// hmm... now we have some uncommon entities. Use the callback.`
		220	`if ($config->get('Core.LegacyEntityDecoder')) {`
		221	`$string = $this->_entity_parser->substituteSpecialEntities($string);`
		222	`} else {`
		223	`if ($is_attr) {`
		224	`$string = $this->_entity_parser->substituteAttrEntities($string);`
		225	`} else {`
		226	`$string = $this->_entity_parser->substituteTextEntities($string);`
		227	`}`
		228	`}`
		229	`return $string;`
		230	`}`
		231
		232	`/**`
		233	`* Lexes an HTML string into tokens.`
		234	`* @param $string String HTML.`
		235	`* @param HTMLPurifier_Config $config`
		236	`* @param HTMLPurifier_Context $context`
		237	`* @return HTMLPurifier_Token[] array representation of HTML.`
		238	`*/`
		239	`public function tokenizeHTML($string, $config, $context)`
		240	`{`
		241	`trigger_error('Call to abstract class', E_USER_ERROR);`
		242	`}`
		243
		244	`/**`
		245	`* Translates CDATA sections into regular sections (through escaping).`
		246	`* @param string $string HTML string to process.`
		247	`* @return string HTML with CDATA sections escaped.`
		248	`*/`
		249	`protected static function escapeCDATA($string)`
		250	`{`
		251	`return preg_replace_callback(`
		252	`'/<!\[CDATA\[(.+?)\]\]>/s',`
		253	`array('HTMLPurifier_Lexer', 'CDATACallback'),`
		254	`$string`
		255	`);`
		256	`}`
		257
		258	`/**`
		259	`* Special CDATA case that is especially convoluted for <script>`
		260	`* @param string $string HTML string to process.`
		261	`* @return string HTML with CDATA sections escaped.`
		262	`*/`
		263	`protected static function escapeCommentedCDATA($string)`
		264	`{`
		265	`return preg_replace_callback(`
		266	`'#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',`
		267	`array('HTMLPurifier_Lexer', 'CDATACallback'),`
		268	`$string`
		269	`);`
		270	`}`
		271
		272	`/**`
		273	`* Callback function for escapeCDATA() that does the work.`
		274	`*`
		275	`* @warning Though this is public in order to let the callback happen,`
		276	`* calling it directly is not recommended.`
		277	`* @param array $matches PCRE matches array, with index 0 the entire match`
		278	`* and 1 the inside of the CDATA section.`
		279	`* @return string Escaped internals of the CDATA section.`
		280	`*/`
		281	`protected static function CDATACallback($matches)`
		282	`{`
		283	`// not exactly sure why the character set is needed, but whatever`
		284	`return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');`
		285	`}`
		286
		287	`/**`
		288	`* Takes a piece of HTML and normalizes it by converting entities, fixing`
		289	`* encoding, extracting bits, and other good stuff.`
		290	`* @param string $html HTML.`
		291	`* @param HTMLPurifier_Config $config`
		292	`* @param HTMLPurifier_Context $context`
		293	`* @return string`
		294	`* @todo Consider making protected`
		295	`*/`
		296	`public function normalize($html, $config, $context)`
		297	`{`
		298	`// normalize newlines to \n`
		299	`if ($config->get('Core.NormalizeNewlines')) {`
		300	`$html = str_replace("\r\n", "\n", (string)$html);`
		301	`$html = str_replace("\r", "\n", (string)$html);`
		302	`}`
		303
		304	`if ($config->get('HTML.Trusted')) {`
		305	`// escape convoluted CDATA`
		306	`$html = $this->escapeCommentedCDATA($html);`
		307	`}`
		308
		309	`// escape CDATA`
		310	`$html = $this->escapeCDATA($html);`
		311
		312	`// extract body from document if applicable`
		313	`if ($config->get('Core.ConvertDocumentToFragment')) {`
		314	`$e = false;`
		315	`if ($config->get('Core.CollectErrors')) {`
		316	`$e =& $context->get('ErrorCollector');`
		317	`}`
		318	`$new_html = $this->extractBody($html);`
		319	`if ($e && $new_html != $html) {`
		320	`$e->send(E_WARNING, 'Lexer: Extracted body');`
		321	`}`
		322	`$html = $new_html;`
		323	`}`
		324
		325	`// expand entities that aren't the big five`
		326	`if ($config->get('Core.LegacyEntityDecoder')) {`
		327	`$html = $this->_entity_parser->substituteNonSpecialEntities($html);`
		328	`}`
		329
		330	`// clean into wellformed UTF-8 string for an SGML context: this has`
		331	`// to be done after entity expansion because the entities sometimes`
		332	`// represent non-SGML characters (horror, horror!)`
		333	`$html = HTMLPurifier_Encoder::cleanUTF8($html);`
		334
		335	`// if processing instructions are to removed, remove them now`
		336	`if ($config->get('Core.RemoveProcessingInstructions')) {`
		337	`$html = preg_replace('#<\?.+?\?>#s', '', $html);`
		338	`}`
		339
		340	`$hidden_elements = $config->get('Core.HiddenElements');`
		341	`if ($config->get('Core.AggressivelyRemoveScript') &&`
		342	`!($config->get('HTML.Trusted') \|\| !$config->get('Core.RemoveScriptContents')`
		343	`\|\| empty($hidden_elements["script"]))) {`
		344	`$html = preg_replace('#<script[^>]>.?</script>#i', '', $html);`
		345	`}`
		346
		347	`return $html;`
		348	`}`
		349
		350	`/**`
		351	`* Takes a string of HTML (fragment or document) and returns the content`
		352	`* @todo Consider making protected`
		353	`*/`
		354	`public function extractBody($html)`
		355	`{`
		356	`$matches = array();`
		357	`$result = preg_match('\|(.?)<body[^>]>(.*)</body>\|is', $html, $matches);`
		358	`if ($result) {`
		359	`// Make sure it's not in a comment`
		360	`$comment_start = strrpos($matches[1], '<!--');`
		361	`$comment_end = strrpos($matches[1], '-->');`
		362	`if ($comment_start === false \|\|`
		363	`($comment_end !== false && $comment_end > $comment_start)) {`
		364	`return $matches[2];`
		365	`}`
		366	`}`
		367	`return $html;`
		368	`}`
		369	`}`
		370
		371	`// vim: et sw=4 sts=4`

Proyectos de Subversion Moodle

(root)/lib/htmlpurifier/HTMLPurifier/Lexer.php – Rev 1441