WebSVN – Moodle – Autoría – /lib/htmlpurifier/HTMLPurifier/Lexer.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`* Forgivingly lexes HTML (SGML-style) markup into tokens.`
		5	`*`
		6	`* A lexer parses a string of SGML-style markup and converts them into`
		7	`* corresponding tokens. It doesn't check for well-formedness, although its`
		8	`* internal mechanism may make this automatic (such as the case of`
		9	`* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose`
		10	`* from.`
		11	`*`
		12	`* A lexer is HTML-oriented: it might work with XML, but it's not`
		13	`* recommended, as we adhere to a subset of the specification for optimization`
		14	`* reasons. This might change in the future. Also, most tokenizers are not`
		15	`* expected to handle DTDs or PIs.`
		16	`*`
		17	`* This class should not be directly instantiated, but you may use create() to`
		18	`* retrieve a default copy of the lexer. Being a supertype, this class`
		19	`* does not actually define any implementation, but offers commonly used`
		20	`* convenience functions for subclasses.`
		21	`*`
		22	`* @note The unit tests will instantiate this class for testing purposes, as`
		23	`* many of the utility functions require a class to be instantiated.`
		24	`* This means that, even though this class is not runnable, it will`
		25	`* not be declared abstract.`
		26	`*`
		27	`* @par`
		28	`*`
		29	`* @note`
		30	`* We use tokens rather than create a DOM representation because DOM would:`
		31	`*`
		32	`* @par`
		33	`* -# Require more processing and memory to create,`
		34	`* -# Is not streamable, and`
		35	`* -# Has the entire document structure (html and body not needed).`
		36	`*`
		37	`* @par`
		38	`* However, DOM is helpful in that it makes it easy to move around nodes`
		39	`* without a lot of lookaheads to see when a tag is closed. This is a`
		40	`* limitation of the token system and some workarounds would be nice.`
		41	`*/`
		42	`class HTMLPurifier_Lexer`
		43	`{`
		44
		45	`/**`
		46	`* Whether or not this lexer implements line-number/column-number tracking.`
		47	`* If it does, set to true.`
		48	`*/`
		49	`public $tracksLineNumbers = false;`
		50
		51	`/**`
		52	`* @type HTMLPurifier_EntityParser`
		53	`*/`
		54	`private $_entity_parser;`
		55
		56	`// -- STATIC ----------------------------------------------------------`
		57
		58	`/**`
		59	`* Retrieves or sets the default Lexer as a Prototype Factory.`
		60	`*`
		61	`* By default HTMLPurifier_Lexer_DOMLex will be returned. There are`
		62	`* a few exceptions involving special features that only DirectLex`
		63	`* implements.`
		64	`*`
		65	`* @note The behavior of this class has changed, rather than accepting`
		66	`* a prototype object, it now accepts a configuration object.`
		67	`* To specify your own prototype, set %Core.LexerImpl to it.`
		68	`* This change in behavior de-singletonizes the lexer object.`
		69	`*`
		70	`* @param HTMLPurifier_Config $config`
		71	`* @return HTMLPurifier_Lexer`
		72	`* @throws HTMLPurifier_Exception`
		73	`*/`
		74	`public static function create($config)`
		75	`{`
		76	`if (!($config instanceof HTMLPurifier_Config)) {`
		77	`$lexer = $config;`
		78	`trigger_error(`
		79	`"Passing a prototype to`
		80	`HTMLPurifier_Lexer::create() is deprecated, please instead`
		81	`use %Core.LexerImpl",`
		82	`E_USER_WARNING`
		83	`);`
		84	`} else {`
		85	`$lexer = $config->get('Core.LexerImpl');`
		86	`}`
		87
		88	`$needs_tracking =`
		89	`$config->get('Core.MaintainLineNumbers') \|\|`
		90	`$config->get('Core.CollectErrors');`
		91
		92	`$inst = null;`
		93	`if (is_object($lexer)) {`
		94	`$inst = $lexer;`
		95	`} else {`
		96	`if (is_null($lexer)) {`
		97	`do {`
		98	`// auto-detection algorithm`
		99	`if ($needs_tracking) {`
		100	`$lexer = 'DirectLex';`
		101	`break;`
		102	`}`
		103
		104	`if (class_exists('DOMDocument') &&`
		105	`method_exists('DOMDocument', 'loadHTML') &&`
		106	`!extension_loaded('domxml')`
		107	`) {`
		108	`// check for DOM support, because while it's part of the`
		109	`// core, it can be disabled compile time. Also, the PECL`
		110	`// domxml extension overrides the default DOM, and is evil`
		111	`// and nasty and we shan't bother to support it`
		112	`$lexer = 'DOMLex';`
		113	`} else {`
		114	`$lexer = 'DirectLex';`
		115	`}`
		116	`} while (0);`
		117	`} // do..while so we can break`
		118
		119	`// instantiate recognized string names`
		120	`switch ($lexer) {`
		121	`case 'DOMLex':`
		122	`$inst = new HTMLPurifier_Lexer_DOMLex();`
		123	`break;`
		124	`case 'DirectLex':`
		125	`$inst = new HTMLPurifier_Lexer_DirectLex();`
		126	`break;`
		127	`case 'PH5P':`
		128	`$inst = new HTMLPurifier_Lexer_PH5P();`
		129	`break;`
		130	`default:`
		131	`throw new HTMLPurifier_Exception(`
		132	`"Cannot instantiate unrecognized Lexer type " .`
		133	`htmlspecialchars($lexer)`
		134	`);`
		135	`}`
		136	`}`
		137
		138	`if (!$inst) {`
		139	`throw new HTMLPurifier_Exception('No lexer was instantiated');`
		140	`}`
		141
		142	`// once PHP DOM implements native line numbers, or we`
		143	`// hack out something using XSLT, remove this stipulation`
		144	`if ($needs_tracking && !$inst->tracksLineNumbers) {`
		145	`throw new HTMLPurifier_Exception(`
		146	`'Cannot use lexer that does not support line numbers with ' .`
		147	`'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'`
		148	`);`
		149	`}`
		150
		151	`return $inst;`
		152
		153	`}`
		154
		155	`// -- CONVENIENCE MEMBERS ---------------------------------------------`
		156
		157	`public function __construct()`
		158	`{`
		159	`$this->_entity_parser = new HTMLPurifier_EntityParser();`
		160	`}`
		161
		162	`/**`
		163	`* Most common entity to raw value conversion table for special entities.`
		164	`* @type array`
		165	`*/`
		166	`protected $_special_entity2str =`
		167	`array(`
		168	`'"' => '"',`
		169	`'&' => '&',`
		170	`'<' => '<',`
		171	`'>' => '>',`
		172	`''' => "'",`
		173	`''' => "'",`
		174	`''' => "'"`
		175	`);`
		176
		177	`public function parseText($string, $config) {`
		178	`return $this->parseData($string, false, $config);`
		179	`}`
		180
		181	`public function parseAttr($string, $config) {`
		182	`return $this->parseData($string, true, $config);`
		183	`}`
		184
		185	`/**`
		186	`* Parses special entities into the proper characters.`
		187	`*`
		188	`* This string will translate escaped versions of the special characters`
		189	`* into the correct ones.`
		190	`*`
		191	`* @param string $string String character data to be parsed.`
		192	`* @return string Parsed character data.`
		193	`*/`
		194	`public function parseData($string, $is_attr, $config)`
		195	`{`
		196	`// following functions require at least one character`
		197	`if ($string === '') {`
		198	`return '';`
		199	`}`
		200
		201	`// subtracts amps that cannot possibly be escaped`
		202	`$num_amp = substr_count($string, '&') - substr_count($string, '& ') -`
		203	`($string[strlen($string) - 1] === '&' ? 1 : 0);`
		204
		205	`if (!$num_amp) {`
		206	`return $string;`
		207	`} // abort if no entities`
		208	`$num_esc_amp = substr_count($string, '&');`
		209	`$string = strtr($string, $this->_special_entity2str);`
		210
		211	`// code duplication for sake of optimization, see above`
		212	`$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -`
		213	`($string[strlen($string) - 1] === '&' ? 1 : 0);`
		214
		215	`if ($num_amp_2 <= $num_esc_amp) {`
		216	`return $string;`
		217	`}`
		218
		219	`// hmm... now we have some uncommon entities. Use the callback.`
		220	`if ($config->get('Core.LegacyEntityDecoder')) {`
		221	`$string = $this->_entity_parser->substituteSpecialEntities($string);`
		222	`} else {`
		223	`if ($is_attr) {`
		224	`$string = $this->_entity_parser->substituteAttrEntities($string);`
		225	`} else {`
		226	`$string = $this->_entity_parser->substituteTextEntities($string);`
		227	`}`
		228	`}`
		229	`return $string;`
		230	`}`
		231
		232	`/**`
		233	`* Lexes an HTML string into tokens.`
		234	`* @param $string String HTML.`
		235	`* @param HTMLPurifier_Config $config`
		236	`* @param HTMLPurifier_Context $context`
		237	`* @return HTMLPurifier_Token[] array representation of HTML.`
		238	`*/`
		239	`public function tokenizeHTML($string, $config, $context)`
		240	`{`
		241	`trigger_error('Call to abstract class', E_USER_ERROR);`
		242	`}`
		243
		244	`/**`
		245	`* Translates CDATA sections into regular sections (through escaping).`
		246	`* @param string $string HTML string to process.`
		247	`* @return string HTML with CDATA sections escaped.`
		248	`*/`
		249	`protected static function escapeCDATA($string)`
		250	`{`
		251	`return preg_replace_callback(`
		252	`'/<!\[CDATA\[(.+?)\]\]>/s',`
		253	`array('HTMLPurifier_Lexer', 'CDATACallback'),`
		254	`$string`
		255	`);`
		256	`}`
		257
		258	`/**`
		259	`* Special CDATA case that is especially convoluted for <script>`
		260	`* @param string $string HTML string to process.`
		261	`* @return string HTML with CDATA sections escaped.`
		262	`*/`
		263	`protected static function escapeCommentedCDATA($string)`
		264	`{`
		265	`return preg_replace_callback(`
		266	`'#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',`
		267	`array('HTMLPurifier_Lexer', 'CDATACallback'),`
		268	`$string`
		269	`);`
		270	`}`
		271
		272	`/**`
		273	`* Special Internet Explorer conditional comments should be removed.`
		274	`* @param string $string HTML string to process.`
		275	`* @return string HTML with conditional comments removed.`
		276	`*/`
		277	`protected static function removeIEConditional($string)`
		278	`{`
		279	`return preg_replace(`
		280	`'#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings`
		281	`'',`
		282	`$string`
		283	`);`
		284	`}`
		285
		286	`/**`
		287	`* Callback function for escapeCDATA() that does the work.`
		288	`*`
		289	`* @warning Though this is public in order to let the callback happen,`
		290	`* calling it directly is not recommended.`
		291	`* @param array $matches PCRE matches array, with index 0 the entire match`
		292	`* and 1 the inside of the CDATA section.`
		293	`* @return string Escaped internals of the CDATA section.`
		294	`*/`
		295	`protected static function CDATACallback($matches)`
		296	`{`
		297	`// not exactly sure why the character set is needed, but whatever`
		298	`return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');`
		299	`}`
		300
		301	`/**`
		302	`* Takes a piece of HTML and normalizes it by converting entities, fixing`
		303	`* encoding, extracting bits, and other good stuff.`
		304	`* @param string $html HTML.`
		305	`* @param HTMLPurifier_Config $config`
		306	`* @param HTMLPurifier_Context $context`
		307	`* @return string`
		308	`* @todo Consider making protected`
		309	`*/`
		310	`public function normalize($html, $config, $context)`
		311	`{`
		312	`// normalize newlines to \n`
		313	`if ($config->get('Core.NormalizeNewlines')) {`
		314	`$html = str_replace("\r\n", "\n", (string)$html);`
		315	`$html = str_replace("\r", "\n", (string)$html);`
		316	`}`
		317
		318	`if ($config->get('HTML.Trusted')) {`
		319	`// escape convoluted CDATA`
		320	`$html = $this->escapeCommentedCDATA($html);`
		321	`}`
		322
		323	`// escape CDATA`
		324	`$html = $this->escapeCDATA($html);`
		325
		326	`$html = $this->removeIEConditional($html);`
		327
		328	`// extract body from document if applicable`
		329	`if ($config->get('Core.ConvertDocumentToFragment')) {`
		330	`$e = false;`
		331	`if ($config->get('Core.CollectErrors')) {`
		332	`$e =& $context->get('ErrorCollector');`
		333	`}`
		334	`$new_html = $this->extractBody($html);`
		335	`if ($e && $new_html != $html) {`
		336	`$e->send(E_WARNING, 'Lexer: Extracted body');`
		337	`}`
		338	`$html = $new_html;`
		339	`}`
		340
		341	`// expand entities that aren't the big five`
		342	`if ($config->get('Core.LegacyEntityDecoder')) {`
		343	`$html = $this->_entity_parser->substituteNonSpecialEntities($html);`
		344	`}`
		345
		346	`// clean into wellformed UTF-8 string for an SGML context: this has`
		347	`// to be done after entity expansion because the entities sometimes`
		348	`// represent non-SGML characters (horror, horror!)`
		349	`$html = HTMLPurifier_Encoder::cleanUTF8($html);`
		350
		351	`// if processing instructions are to removed, remove them now`
		352	`if ($config->get('Core.RemoveProcessingInstructions')) {`
		353	`$html = preg_replace('#<\?.+?\?>#s', '', $html);`
		354	`}`
		355
		356	`$hidden_elements = $config->get('Core.HiddenElements');`
		357	`if ($config->get('Core.AggressivelyRemoveScript') &&`
		358	`!($config->get('HTML.Trusted') \|\| !$config->get('Core.RemoveScriptContents')`
		359	`\|\| empty($hidden_elements["script"]))) {`
		360	`$html = preg_replace('#<script[^>]>.?</script>#i', '', $html);`
		361	`}`
		362
		363	`return $html;`
		364	`}`
		365
		366	`/**`
		367	`* Takes a string of HTML (fragment or document) and returns the content`
		368	`* @todo Consider making protected`
		369	`*/`
		370	`public function extractBody($html)`
		371	`{`
		372	`$matches = array();`
		373	`$result = preg_match('\|(.?)<body[^>]>(.*)</body>\|is', $html, $matches);`
		374	`if ($result) {`
		375	`// Make sure it's not in a comment`
		376	`$comment_start = strrpos($matches[1], '<!--');`
		377	`$comment_end = strrpos($matches[1], '-->');`
		378	`if ($comment_start === false \|\|`
		379	`($comment_end !== false && $comment_end > $comment_start)) {`
		380	`return $matches[2];`
		381	`}`
		382	`}`
		383	`return $html;`
		384	`}`
		385	`}`
		386
		387	`// vim: et sw=4 sts=4`

Proyectos de Subversion Moodle

(root)/lib/htmlpurifier/HTMLPurifier/Lexer.php – Rev 1