WebSVN – Moodle – Autoría – /lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`* Parser that uses PHP 5's DOM extension (part of the core).`
		5	`*`
		6	`* In PHP 5, the DOM XML extension was revamped into DOM and added to the core.`
		7	`* It gives us a forgiving HTML parser, which we use to transform the HTML`
		8	`* into a DOM, and then into the tokens. It is blazingly fast (for large`
		9	`* documents, it performs twenty times faster than`
		10	`* HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.`
		11	`*`
		12	`* @note Any empty elements will have empty tokens associated with them, even if`
		13	`* this is prohibited by the spec. This cannot be fixed until the spec`
		14	`* comes into play.`
		15	`*`
		16	`* @note PHP's DOM extension does not actually parse any entities, we use`
		17	`* our own function to do that.`
		18	`*`
		19	`* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.`
		20	`* If this is a huge problem, due to the fact that HTML is hand`
		21	`* edited and you are unable to get a parser cache that caches the`
		22	`* output of HTML Purifier while keeping the original HTML lying`
		23	`* around, you may want to run Tidy on the resulting output or use`
		24	`* HTMLPurifier_Lexer_DirectLex`
		25	`*/`
		26
		27	`class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer`
		28	`{`
		29
		30	`/**`
		31	`* @type HTMLPurifier_TokenFactory`
		32	`*/`
		33	`private $factory;`
		34
		35	`public function __construct()`
		36	`{`
		37	`// setup the factory`
		38	`parent::__construct();`
		39	`$this->factory = new HTMLPurifier_TokenFactory();`
		40	`}`
		41
		42	`/**`
		43	`* @param string $html`
		44	`* @param HTMLPurifier_Config $config`
		45	`* @param HTMLPurifier_Context $context`
		46	`* @return HTMLPurifier_Token[]`
		47	`*/`
		48	`public function tokenizeHTML($html, $config, $context)`
		49	`{`
		50	`$html = $this->normalize($html, $config, $context);`
		51
		52	`// attempt to armor stray angled brackets that cannot possibly`
		53	`// form tags and thus are probably being used as emoticons`
		54	`if ($config->get('Core.AggressivelyFixLt')) {`
		55	`$char = '[^a-z!\/]';`
		56	`$comment = "/<!--(.*?)(-->\|\z)/is";`
		57	`$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);`
		58	`do {`
		59	`$old = $html;`
		60	`$html = preg_replace("/<($char)/i", '<\\1', $html);`
		61	`} while ($html !== $old);`
		62	`$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments`
		63	`}`
		64
		65	`// preprocess html, essential for UTF-8`
		66	`$html = $this->wrapHTML($html, $config, $context);`
		67
		68	`$doc = new DOMDocument();`
		69	`$doc->encoding = 'UTF-8'; // theoretically, the above has this covered`
		70
		71	`$options = 0;`
		72	`if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {`
		73	`$options \|= LIBXML_PARSEHUGE;`
		74	`}`
1441	ariadna	75	`if ($config->get('Core.RemoveBlanks') && defined('LIBXML_NOBLANKS')) {`
		76	`$options \|= LIBXML_NOBLANKS;`
		77	`}`
1	efrain	78
		79	`set_error_handler(array($this, 'muteErrorHandler'));`
		80	`// loadHTML() fails on PHP 5.3 when second parameter is given`
		81	`if ($options) {`
		82	`$doc->loadHTML($html, $options);`
		83	`} else {`
		84	`$doc->loadHTML($html);`
		85	`}`
		86	`restore_error_handler();`
		87
		88	`$body = $doc->getElementsByTagName('html')->item(0)-> // <html>`
		89	`getElementsByTagName('body')->item(0); // <body>`
		90
		91	`$div = $body->getElementsByTagName('div')->item(0); // <div>`
		92	`$tokens = array();`
		93	`$this->tokenizeDOM($div, $tokens, $config);`
		94	`// If the div has a sibling, that means we tripped across`
		95	`// a premature </div> tag. So remove the div we parsed,`
		96	`// and then tokenize the rest of body. We can't tokenize`
		97	`// the sibling directly as we'll lose the tags in that case.`
		98	`if ($div->nextSibling) {`
		99	`$body->removeChild($div);`
		100	`$this->tokenizeDOM($body, $tokens, $config);`
		101	`}`
		102	`return $tokens;`
		103	`}`
		104
		105	`/**`
		106	`* Iterative function that tokenizes a node, putting it into an accumulator.`
		107	`* To iterate is human, to recurse divine - L. Peter Deutsch`
		108	`* @param DOMNode $node DOMNode to be tokenized.`
		109	`* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.`
		110	`*/`
		111	`protected function tokenizeDOM($node, &$tokens, $config)`
		112	`{`
		113	`$level = 0;`
		114	`$nodes = array($level => new HTMLPurifier_Queue(array($node)));`
		115	`$closingNodes = array();`
		116	`do {`
		117	`while (!$nodes[$level]->isEmpty()) {`
		118	`$node = $nodes[$level]->shift(); // FIFO`
		119	`$collect = $level > 0 ? true : false;`
		120	`$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);`
		121	`if ($needEndingTag) {`
		122	`$closingNodes[$level][] = $node;`
		123	`}`
		124	`if ($node->childNodes && $node->childNodes->length) {`
		125	`$level++;`
		126	`$nodes[$level] = new HTMLPurifier_Queue();`
		127	`foreach ($node->childNodes as $childNode) {`
		128	`$nodes[$level]->push($childNode);`
		129	`}`
		130	`}`
		131	`}`
		132	`$level--;`
		133	`if ($level && isset($closingNodes[$level])) {`
		134	`while ($node = array_pop($closingNodes[$level])) {`
		135	`$this->createEndNode($node, $tokens);`
		136	`}`
		137	`}`
		138	`} while ($level > 0);`
		139	`}`
		140
		141	`/**`
		142	`* Portably retrieve the tag name of a node; deals with older versions`
		143	`* of libxml like 2.7.6`
		144	`* @param DOMNode $node`
		145	`*/`
		146	`protected function getTagName($node)`
		147	`{`
		148	`if (isset($node->tagName)) {`
		149	`return $node->tagName;`
		150	`} else if (isset($node->nodeName)) {`
		151	`return $node->nodeName;`
		152	`} else if (isset($node->localName)) {`
		153	`return $node->localName;`
		154	`}`
		155	`return null;`
		156	`}`
		157
		158	`/**`
		159	`* Portably retrieve the data of a node; deals with older versions`
		160	`* of libxml like 2.7.6`
		161	`* @param DOMNode $node`
		162	`*/`
		163	`protected function getData($node)`
		164	`{`
		165	`if (isset($node->data)) {`
		166	`return $node->data;`
		167	`} else if (isset($node->nodeValue)) {`
		168	`return $node->nodeValue;`
		169	`} else if (isset($node->textContent)) {`
		170	`return $node->textContent;`
		171	`}`
		172	`return null;`
		173	`}`
		174
		175
		176	`/**`
		177	`* @param DOMNode $node DOMNode to be tokenized.`
		178	`* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.`
		179	`* @param bool $collect Says whether or start and close are collected, set to`
		180	`* false at first recursion because it's the implicit DIV`
		181	`* tag you're dealing with.`
		182	`* @return bool if the token needs an endtoken`
		183	`* @todo data and tagName properties don't seem to exist in DOMNode?`
		184	`*/`
		185	`protected function createStartNode($node, &$tokens, $collect, $config)`
		186	`{`
		187	`// intercept non element nodes. WE MUST catch all of them,`
		188	`// but we're not getting the character reference nodes because`
		189	`// those should have been preprocessed`
		190	`if ($node->nodeType === XML_TEXT_NODE) {`
		191	`$data = $this->getData($node); // Handle variable data property`
		192	`if ($data !== null) {`
		193	`$tokens[] = $this->factory->createText($data);`
		194	`}`
		195	`return false;`
		196	`} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {`
		197	`// undo libxml's special treatment of <script> and <style> tags`
		198	`$last = end($tokens);`
		199	`$data = $node->data;`
		200	`// (note $node->tagname is already normalized)`
		201	`if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' \|\| $last->name == 'style')) {`
		202	`$new_data = trim($data);`
		203	`if (substr($new_data, 0, 4) === '<!--') {`
		204	`$data = substr($new_data, 4);`
		205	`if (substr($data, -3) === '-->') {`
		206	`$data = substr($data, 0, -3);`
		207	`} else {`
		208	`// Highly suspicious! Not sure what to do...`
		209	`}`
		210	`}`
		211	`}`
		212	`$tokens[] = $this->factory->createText($this->parseText($data, $config));`
		213	`return false;`
		214	`} elseif ($node->nodeType === XML_COMMENT_NODE) {`
		215	`// this is code is only invoked for comments in script/style in versions`
		216	`// of libxml pre-2.6.28 (regular comments, of course, are still`
		217	`// handled regularly)`
		218	`$tokens[] = $this->factory->createComment($node->data);`
		219	`return false;`
		220	`} elseif ($node->nodeType !== XML_ELEMENT_NODE) {`
		221	`// not-well tested: there may be other nodes we have to grab`
		222	`return false;`
		223	`}`
		224	`$attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();`
		225	`$tag_name = $this->getTagName($node); // Handle variable tagName property`
		226	`if (empty($tag_name)) {`
		227	`return (bool) $node->childNodes->length;`
		228	`}`
		229	`// We still have to make sure that the element actually IS empty`
		230	`if (!$node->childNodes->length) {`
		231	`if ($collect) {`
		232	`$tokens[] = $this->factory->createEmpty($tag_name, $attr);`
		233	`}`
		234	`return false;`
		235	`} else {`
		236	`if ($collect) {`
		237	`$tokens[] = $this->factory->createStart($tag_name, $attr);`
		238	`}`
		239	`return true;`
		240	`}`
		241	`}`
		242
		243	`/**`
		244	`* @param DOMNode $node`
		245	`* @param HTMLPurifier_Token[] $tokens`
		246	`*/`
		247	`protected function createEndNode($node, &$tokens)`
		248	`{`
		249	`$tag_name = $this->getTagName($node); // Handle variable tagName property`
		250	`$tokens[] = $this->factory->createEnd($tag_name);`
		251	`}`
		252
		253	`/**`
		254	`* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.`
		255	`*`
		256	`* @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.`
		257	`* @return array Associative array of attributes.`
		258	`*/`
		259	`protected function transformAttrToAssoc($node_map)`
		260	`{`
		261	`// NamedNodeMap is documented very well, so we're using undocumented`
		262	`// features, namely, the fact that it implements Iterator and`
		263	`// has a ->length attribute`
		264	`if ($node_map->length === 0) {`
		265	`return array();`
		266	`}`
		267	`$array = array();`
		268	`foreach ($node_map as $attr) {`
		269	`$array[$attr->name] = $attr->value;`
		270	`}`
		271	`return $array;`
		272	`}`
		273
		274	`/**`
		275	`* An error handler that mutes all errors`
		276	`* @param int $errno`
		277	`* @param string $errstr`
		278	`*/`
		279	`public function muteErrorHandler($errno, $errstr)`
		280	`{`
		281	`}`
		282
		283	`/**`
		284	`* Callback function for undoing escaping of stray angled brackets`
		285	`* in comments`
		286	`* @param array $matches`
		287	`* @return string`
		288	`*/`
		289	`public function callbackUndoCommentSubst($matches)`
		290	`{`
		291	`return '<!--' . strtr($matches[1], array('&' => '&', '<' => '<')) . $matches[2];`
		292	`}`
		293
		294	`/**`
		295	`* Callback function that entity-izes ampersands in comments so that`
		296	`* callbackUndoCommentSubst doesn't clobber them`
		297	`* @param array $matches`
		298	`* @return string`
		299	`*/`
		300	`public function callbackArmorCommentEntities($matches)`
		301	`{`
		302	`return '<!--' . str_replace('&', '&', $matches[1]) . $matches[2];`
		303	`}`
		304
		305	`/**`
		306	`* Wraps an HTML fragment in the necessary HTML`
		307	`* @param string $html`
		308	`* @param HTMLPurifier_Config $config`
		309	`* @param HTMLPurifier_Context $context`
		310	`* @return string`
		311	`*/`
		312	`protected function wrapHTML($html, $config, $context, $use_div = true)`
		313	`{`
		314	`$def = $config->getDefinition('HTML');`
		315	`$ret = '';`
		316
		317	`if (!empty($def->doctype->dtdPublic) \|\| !empty($def->doctype->dtdSystem)) {`
		318	`$ret .= '<!DOCTYPE html ';`
		319	`if (!empty($def->doctype->dtdPublic)) {`
		320	`$ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';`
		321	`}`
		322	`if (!empty($def->doctype->dtdSystem)) {`
		323	`$ret .= '"' . $def->doctype->dtdSystem . '" ';`
		324	`}`
		325	`$ret .= '>';`
		326	`}`
		327
		328	`$ret .= '<html><head>';`
		329	`$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';`
		330	`// No protection if $html contains a stray </div>!`
		331	`$ret .= '</head><body>';`
		332	`if ($use_div) $ret .= '<div>';`
		333	`$ret .= $html;`
		334	`if ($use_div) $ret .= '</div>';`
		335	`$ret .= '</body></html>';`
		336	`return $ret;`
		337	`}`
		338	`}`
		339
		340	`// vim: et sw=4 sts=4`

Proyectos de Subversion Moodle

(root)/lib/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php – Rev 1441