WebSVN – Moodle – Autoría – /lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`* Parser that uses PHP 5's DOM extension (part of the core).`
		5	`*`
		6	`* In PHP 5, the DOM XML extension was revamped into DOM and added to the core.`
		7	`* It gives us a forgiving HTML parser, which we use to transform the HTML`
		8	`* into a DOM, and then into the tokens. It is blazingly fast (for large`
		9	`* documents, it performs twenty times faster than`
		10	`* HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.`
		11	`*`
		12	`* @note Any empty elements will have empty tokens associated with them, even if`
		13	`* this is prohibited by the spec. This cannot be fixed until the spec`
		14	`* comes into play.`
		15	`*`
		16	`* @note PHP's DOM extension does not actually parse any entities, we use`
		17	`* our own function to do that.`
		18	`*`
		19	`* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.`
		20	`* If this is a huge problem, due to the fact that HTML is hand`
		21	`* edited and you are unable to get a parser cache that caches the`
		22	`* output of HTML Purifier while keeping the original HTML lying`
		23	`* around, you may want to run Tidy on the resulting output or use`
		24	`* HTMLPurifier_Lexer_DirectLex.`
		25	`*/`
		26
		27	`class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer`
		28	`{`
		29
		30	`/**`
		31	`* @type HTMLPurifier_TokenFactory`
		32	`*/`
		33	`private $factory;`
		34
		35	`public function __construct()`
		36	`{`
		37	`// setup the factory`
		38	`parent::__construct();`
		39	`$this->factory = new HTMLPurifier_TokenFactory();`
		40	`}`
		41
		42	`/**`
		43	`* @param string $html`
		44	`* @param HTMLPurifier_Config $config`
		45	`* @param HTMLPurifier_Context $context`
		46	`* @return HTMLPurifier_Token[]`
		47	`*/`
		48	`public function tokenizeHTML($html, $config, $context)`
		49	`{`
		50	`$html = $this->normalize($html, $config, $context);`
		51
		52	`// attempt to armor stray angled brackets that cannot possibly`
		53	`// form tags and thus are probably being used as emoticons`
		54	`if ($config->get('Core.AggressivelyFixLt')) {`
		55	`$char = '[^a-z!\/]';`
		56	`$comment = "/<!--(.*?)(-->\|\z)/is";`
		57	`$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);`
		58	`do {`
		59	`$old = $html;`
		60	`$html = preg_replace("/<($char)/i", '<\\1', $html);`
		61	`} while ($html !== $old);`
		62	`$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments`
		63	`}`
		64
		65	`// preprocess html, essential for UTF-8`
		66	`$html = $this->wrapHTML($html, $config, $context);`
		67
		68	`$doc = new DOMDocument();`
		69	`$doc->encoding = 'UTF-8'; // theoretically, the above has this covered`
		70
		71	`$options = 0;`
		72	`if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {`
		73	`$options \|= LIBXML_PARSEHUGE;`
		74	`}`
		75
		76	`set_error_handler(array($this, 'muteErrorHandler'));`
		77	`// loadHTML() fails on PHP 5.3 when second parameter is given`
		78	`if ($options) {`
		79	`$doc->loadHTML($html, $options);`
		80	`} else {`
		81	`$doc->loadHTML($html);`
		82	`}`
		83	`restore_error_handler();`
		84
		85	`$body = $doc->getElementsByTagName('html')->item(0)-> // <html>`
		86	`getElementsByTagName('body')->item(0); // <body>`
		87
		88	`$div = $body->getElementsByTagName('div')->item(0); // <div>`
		89	`$tokens = array();`
		90	`$this->tokenizeDOM($div, $tokens, $config);`
		91	`// If the div has a sibling, that means we tripped across`
		92	`// a premature </div> tag. So remove the div we parsed,`
		93	`// and then tokenize the rest of body. We can't tokenize`
		94	`// the sibling directly as we'll lose the tags in that case.`
		95	`if ($div->nextSibling) {`
		96	`$body->removeChild($div);`
		97	`$this->tokenizeDOM($body, $tokens, $config);`
		98	`}`
		99	`return $tokens;`
		100	`}`
		101
		102	`/**`
		103	`* Iterative function that tokenizes a node, putting it into an accumulator.`
		104	`* To iterate is human, to recurse divine - L. Peter Deutsch`
		105	`* @param DOMNode $node DOMNode to be tokenized.`
		106	`* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.`
		107	`*/`
		108	`protected function tokenizeDOM($node, &$tokens, $config)`
		109	`{`
		110	`$level = 0;`
		111	`$nodes = array($level => new HTMLPurifier_Queue(array($node)));`
		112	`$closingNodes = array();`
		113	`do {`
		114	`while (!$nodes[$level]->isEmpty()) {`
		115	`$node = $nodes[$level]->shift(); // FIFO`
		116	`$collect = $level > 0 ? true : false;`
		117	`$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);`
		118	`if ($needEndingTag) {`
		119	`$closingNodes[$level][] = $node;`
		120	`}`
		121	`if ($node->childNodes && $node->childNodes->length) {`
		122	`$level++;`
		123	`$nodes[$level] = new HTMLPurifier_Queue();`
		124	`foreach ($node->childNodes as $childNode) {`
		125	`$nodes[$level]->push($childNode);`
		126	`}`
		127	`}`
		128	`}`
		129	`$level--;`
		130	`if ($level && isset($closingNodes[$level])) {`
		131	`while ($node = array_pop($closingNodes[$level])) {`
		132	`$this->createEndNode($node, $tokens);`
		133	`}`
		134	`}`
		135	`} while ($level > 0);`
		136	`}`
		137
		138	`/**`
		139	`* Portably retrieve the tag name of a node; deals with older versions`
		140	`* of libxml like 2.7.6`
		141	`* @param DOMNode $node`
		142	`*/`
		143	`protected function getTagName($node)`
		144	`{`
		145	`if (isset($node->tagName)) {`
		146	`return $node->tagName;`
		147	`} else if (isset($node->nodeName)) {`
		148	`return $node->nodeName;`
		149	`} else if (isset($node->localName)) {`
		150	`return $node->localName;`
		151	`}`
		152	`return null;`
		153	`}`
		154
		155	`/**`
		156	`* Portably retrieve the data of a node; deals with older versions`
		157	`* of libxml like 2.7.6`
		158	`* @param DOMNode $node`
		159	`*/`
		160	`protected function getData($node)`
		161	`{`
		162	`if (isset($node->data)) {`
		163	`return $node->data;`
		164	`} else if (isset($node->nodeValue)) {`
		165	`return $node->nodeValue;`
		166	`} else if (isset($node->textContent)) {`
		167	`return $node->textContent;`
		168	`}`
		169	`return null;`
		170	`}`
		171
		172
		173	`/**`
		174	`* @param DOMNode $node DOMNode to be tokenized.`
		175	`* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.`
		176	`* @param bool $collect Says whether or start and close are collected, set to`
		177	`* false at first recursion because it's the implicit DIV`
		178	`* tag you're dealing with.`
		179	`* @return bool if the token needs an endtoken`
		180	`* @todo data and tagName properties don't seem to exist in DOMNode?`
		181	`*/`
		182	`protected function createStartNode($node, &$tokens, $collect, $config)`
		183	`{`
		184	`// intercept non element nodes. WE MUST catch all of them,`
		185	`// but we're not getting the character reference nodes because`
		186	`// those should have been preprocessed`
		187	`if ($node->nodeType === XML_TEXT_NODE) {`
		188	`$data = $this->getData($node); // Handle variable data property`
		189	`if ($data !== null) {`
		190	`$tokens[] = $this->factory->createText($data);`
		191	`}`
		192	`return false;`
		193	`} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {`
		194	`// undo libxml's special treatment of <script> and <style> tags`
		195	`$last = end($tokens);`
		196	`$data = $node->data;`
		197	`// (note $node->tagname is already normalized)`
		198	`if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' \|\| $last->name == 'style')) {`
		199	`$new_data = trim($data);`
		200	`if (substr($new_data, 0, 4) === '<!--') {`
		201	`$data = substr($new_data, 4);`
		202	`if (substr($data, -3) === '-->') {`
		203	`$data = substr($data, 0, -3);`
		204	`} else {`
		205	`// Highly suspicious! Not sure what to do...`
		206	`}`
		207	`}`
		208	`}`
		209	`$tokens[] = $this->factory->createText($this->parseText($data, $config));`
		210	`return false;`
		211	`} elseif ($node->nodeType === XML_COMMENT_NODE) {`
		212	`// this is code is only invoked for comments in script/style in versions`
		213	`// of libxml pre-2.6.28 (regular comments, of course, are still`
		214	`// handled regularly)`
		215	`$tokens[] = $this->factory->createComment($node->data);`
		216	`return false;`
		217	`} elseif ($node->nodeType !== XML_ELEMENT_NODE) {`
		218	`// not-well tested: there may be other nodes we have to grab`
		219	`return false;`
		220	`}`
		221	`$attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();`
		222	`$tag_name = $this->getTagName($node); // Handle variable tagName property`
		223	`if (empty($tag_name)) {`
		224	`return (bool) $node->childNodes->length;`
		225	`}`
		226	`// We still have to make sure that the element actually IS empty`
		227	`if (!$node->childNodes->length) {`
		228	`if ($collect) {`
		229	`$tokens[] = $this->factory->createEmpty($tag_name, $attr);`
		230	`}`
		231	`return false;`
		232	`} else {`
		233	`if ($collect) {`
		234	`$tokens[] = $this->factory->createStart($tag_name, $attr);`
		235	`}`
		236	`return true;`
		237	`}`
		238	`}`
		239
		240	`/**`
		241	`* @param DOMNode $node`
		242	`* @param HTMLPurifier_Token[] $tokens`
		243	`*/`
		244	`protected function createEndNode($node, &$tokens)`
		245	`{`
		246	`$tag_name = $this->getTagName($node); // Handle variable tagName property`
		247	`$tokens[] = $this->factory->createEnd($tag_name);`
		248	`}`
		249
		250	`/**`
		251	`* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.`
		252	`*`
		253	`* @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.`
		254	`* @return array Associative array of attributes.`
		255	`*/`
		256	`protected function transformAttrToAssoc($node_map)`
		257	`{`
		258	`// NamedNodeMap is documented very well, so we're using undocumented`
		259	`// features, namely, the fact that it implements Iterator and`
		260	`// has a ->length attribute`
		261	`if ($node_map->length === 0) {`
		262	`return array();`
		263	`}`
		264	`$array = array();`
		265	`foreach ($node_map as $attr) {`
		266	`$array[$attr->name] = $attr->value;`
		267	`}`
		268	`return $array;`
		269	`}`
		270
		271	`/**`
		272	`* An error handler that mutes all errors`
		273	`* @param int $errno`
		274	`* @param string $errstr`
		275	`*/`
		276	`public function muteErrorHandler($errno, $errstr)`
		277	`{`
		278	`}`
		279
		280	`/**`
		281	`* Callback function for undoing escaping of stray angled brackets`
		282	`* in comments`
		283	`* @param array $matches`
		284	`* @return string`
		285	`*/`
		286	`public function callbackUndoCommentSubst($matches)`
		287	`{`
		288	`return '<!--' . strtr($matches[1], array('&' => '&', '<' => '<')) . $matches[2];`
		289	`}`
		290
		291	`/**`
		292	`* Callback function that entity-izes ampersands in comments so that`
		293	`* callbackUndoCommentSubst doesn't clobber them`
		294	`* @param array $matches`
		295	`* @return string`
		296	`*/`
		297	`public function callbackArmorCommentEntities($matches)`
		298	`{`
		299	`return '<!--' . str_replace('&', '&', $matches[1]) . $matches[2];`
		300	`}`
		301
		302	`/**`
		303	`* Wraps an HTML fragment in the necessary HTML`
		304	`* @param string $html`
		305	`* @param HTMLPurifier_Config $config`
		306	`* @param HTMLPurifier_Context $context`
		307	`* @return string`
		308	`*/`
		309	`protected function wrapHTML($html, $config, $context, $use_div = true)`
		310	`{`
		311	`$def = $config->getDefinition('HTML');`
		312	`$ret = '';`
		313
		314	`if (!empty($def->doctype->dtdPublic) \|\| !empty($def->doctype->dtdSystem)) {`
		315	`$ret .= '<!DOCTYPE html ';`
		316	`if (!empty($def->doctype->dtdPublic)) {`
		317	`$ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';`
		318	`}`
		319	`if (!empty($def->doctype->dtdSystem)) {`
		320	`$ret .= '"' . $def->doctype->dtdSystem . '" ';`
		321	`}`
		322	`$ret .= '>';`
		323	`}`
		324
		325	`$ret .= '<html><head>';`
		326	`$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';`
		327	`// No protection if $html contains a stray </div>!`
		328	`$ret .= '</head><body>';`
		329	`if ($use_div) $ret .= '<div>';`
		330	`$ret .= $html;`
		331	`if ($use_div) $ret .= '</div>';`
		332	`$ret .= '</body></html>';`
		333	`return $ret;`
		334	`}`
		335	`}`
		336
		337	`// vim: et sw=4 sts=4`

Proyectos de Subversion Moodle

(root)/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php – Rev 1