WebSVN – Moodle – Autoría – /lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`* Our in-house implementation of a parser.`
		5	`*`
		6	`* A pure PHP parser, DirectLex has absolutely no dependencies, making`
		7	`* it a reasonably good default for PHP4. Written with efficiency in mind,`
		8	`* it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it`
		9	`* pales in comparison to HTMLPurifier_Lexer_DOMLex.`
		10	`*`
		11	`* @todo Reread XML spec and document differences.`
		12	`*/`
		13	`class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer`
		14	`{`
/**
 * Always true for this lexer.
 * NOTE(review): presumably advertises that tokenizeHTML() can stamp tokens
 * with line/column positions; the consumer of this flag is not visible
 * in this file, so confirm against the base lexer.
 * @type bool
 */
public $tracksLineNumbers = true;

/**
 * Whitespace characters for str(c)spn: space (0x20), tab (0x09),
 * carriage return (0x0D) and line feed (0x0A).
 * @type string
 */
protected $_whitespace = "\x20\x09\x0D\x0A";
		25
		26	`/**`
		27	`* Callback function for script CDATA fudge`
		28	`* @param array $matches, in form of array(opening tag, contents, closing tag)`
		29	`* @return string`
		30	`*/`
		31	`protected function scriptCallback($matches)`
		32	`{`
		33	`return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];`
		34	`}`
		35
		36	`/**`
		37	`* @param String $html`
		38	`* @param HTMLPurifier_Config $config`
		39	`* @param HTMLPurifier_Context $context`
		40	`* @return array\|HTMLPurifier_Token[]`
		41	`*/`
		42	`public function tokenizeHTML($html, $config, $context)`
		43	`{`
		44	`// special normalization for script tags without any armor`
		45	`// our "armor" heurstic is a < sign any number of whitespaces after`
		46	`// the first script tag`
		47	`if ($config->get('HTML.Trusted')) {`
		48	`$html = preg_replace_callback(`
		49	`'#(<script[^>]>)(\s[^<].+?)(</script>)#si',`
		50	`array($this, 'scriptCallback'),`
		51	`$html`
		52	`);`
		53	`}`
		54
		55	`$html = $this->normalize($html, $config, $context);`
		56
		57	`$cursor = 0; // our location in the text`
		58	`$inside_tag = false; // whether or not we're parsing the inside of a tag`
		59	`$array = array(); // result array`
		60
		61	`// This is also treated to mean maintain column numbers too`
		62	`$maintain_line_numbers = $config->get('Core.MaintainLineNumbers');`
		63
		64	`if ($maintain_line_numbers === null) {`
		65	`// automatically determine line numbering by checking`
		66	`// if error collection is on`
		67	`$maintain_line_numbers = $config->get('Core.CollectErrors');`
		68	`}`
		69
		70	`if ($maintain_line_numbers) {`
		71	`$current_line = 1;`
		72	`$current_col = 0;`
		73	`$length = strlen($html);`
		74	`} else {`
		75	`$current_line = false;`
		76	`$current_col = false;`
		77	`$length = false;`
		78	`}`
		79	`$context->register('CurrentLine', $current_line);`
		80	`$context->register('CurrentCol', $current_col);`
		81	`$nl = "\n";`
		82	`// how often to manually recalculate. This will ALWAYS be right,`
		83	`// but it's pretty wasteful. Set to 0 to turn off`
		84	`$synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');`
		85
		86	`$e = false;`
		87	`if ($config->get('Core.CollectErrors')) {`
		88	`$e =& $context->get('ErrorCollector');`
		89	`}`
		90
		91	`// for testing synchronization`
		92	`$loops = 0;`
		93
		94	`while (++$loops) {`
		95	`// $cursor is either at the start of a token, or inside of`
		96	`// a tag (i.e. there was a < immediately before it), as indicated`
		97	`// by $inside_tag`
		98
		99	`if ($maintain_line_numbers) {`
		100	`// $rcursor, however, is always at the start of a token.`
		101	`$rcursor = $cursor - (int)$inside_tag;`
		102
		103	`// Column number is cheap, so we calculate it every round.`
		104	`// We're interested at the end of the newline string, so`
		105	`// we need to add strlen($nl) == 1 to $nl_pos before subtracting it`
		106	`// from our "rcursor" position.`
		107	`$nl_pos = strrpos($html, $nl, $rcursor - $length);`
		108	`$current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);`
		109
		110	`// recalculate lines`
		111	`if ($synchronize_interval && // synchronization is on`
		112	`$cursor > 0 && // cursor is further than zero`
		113	`$loops % $synchronize_interval === 0) { // time to synchronize!`
		114	`$current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);`
		115	`}`
		116	`}`
		117
		118	`$position_next_lt = strpos($html, '<', $cursor);`
		119	`$position_next_gt = strpos($html, '>', $cursor);`
		120
		121	`// triggers on "<b>asdf</b>" but not "asdf <b></b>"`
		122	`// special case to set up context`
		123	`if ($position_next_lt === $cursor) {`
		124	`$inside_tag = true;`
		125	`$cursor++;`
		126	`}`
		127
		128	`if (!$inside_tag && $position_next_lt !== false) {`
		129	`// We are not inside tag and there still is another tag to parse`
		130	`$token = new`
		131	`HTMLPurifier_Token_Text(`
		132	`$this->parseText(`
		133	`substr(`
		134	`$html,`
		135	`$cursor,`
		136	`$position_next_lt - $cursor`
		137	`), $config`
		138	`)`
		139	`);`
		140	`if ($maintain_line_numbers) {`
		141	`$token->rawPosition($current_line, $current_col);`
		142	`$current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);`
		143	`}`
		144	`$array[] = $token;`
		145	`$cursor = $position_next_lt + 1;`
		146	`$inside_tag = true;`
		147	`continue;`
		148	`} elseif (!$inside_tag) {`
		149	`// We are not inside tag but there are no more tags`
		150	`// If we're already at the end, break`
		151	`if ($cursor === strlen($html)) {`
		152	`break;`
		153	`}`
		154	`// Create Text of rest of string`
		155	`$token = new`
		156	`HTMLPurifier_Token_Text(`
		157	`$this->parseText(`
		158	`substr(`
		159	`$html,`
		160	`$cursor`
		161	`), $config`
		162	`)`
		163	`);`
		164	`if ($maintain_line_numbers) {`
		165	`$token->rawPosition($current_line, $current_col);`
		166	`}`
		167	`$array[] = $token;`
		168	`break;`
		169	`} elseif ($inside_tag && $position_next_gt !== false) {`
		170	`// We are in tag and it is well formed`
		171	`// Grab the internals of the tag`
		172	`$strlen_segment = $position_next_gt - $cursor;`
		173
		174	`if ($strlen_segment < 1) {`
		175	`// there's nothing to process!`
		176	`$token = new HTMLPurifier_Token_Text('<');`
		177	`$cursor++;`
		178	`continue;`
		179	`}`
		180
		181	`$segment = substr($html, $cursor, $strlen_segment);`
		182
		183	`if ($segment === false) {`
		184	`// somehow, we attempted to access beyond the end of`
		185	`// the string, defense-in-depth, reported by Nate Abele`
		186	`break;`
		187	`}`
		188
		189	`// Check if it's a comment`
		190	`if (substr($segment, 0, 3) === '!--') {`
		191	`// re-determine segment length, looking for -->`
		192	`$position_comment_end = strpos($html, '-->', $cursor);`
		193	`if ($position_comment_end === false) {`
		194	`// uh oh, we have a comment that extends to`
		195	`// infinity. Can't be helped: set comment`
		196	`// end position to end of string`
		197	`if ($e) {`
		198	`$e->send(E_WARNING, 'Lexer: Unclosed comment');`
		199	`}`
		200	`$position_comment_end = strlen($html);`
		201	`$end = true;`
		202	`} else {`
		203	`$end = false;`
		204	`}`
		205	`$strlen_segment = $position_comment_end - $cursor;`
		206	`$segment = substr($html, $cursor, $strlen_segment);`
		207	`$token = new`
		208	`HTMLPurifier_Token_Comment(`
		209	`substr(`
		210	`$segment,`
		211	`3,`
		212	`$strlen_segment - 3`
		213	`)`
		214	`);`
		215	`if ($maintain_line_numbers) {`
		216	`$token->rawPosition($current_line, $current_col);`
		217	`$current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);`
		218	`}`
		219	`$array[] = $token;`
		220	`$cursor = $end ? $position_comment_end : $position_comment_end + 3;`
		221	`$inside_tag = false;`
		222	`continue;`
		223	`}`
		224
		225	`// Check if it's an end tag`
		226	`$is_end_tag = (strpos($segment, '/') === 0);`
		227	`if ($is_end_tag) {`
		228	`$type = substr($segment, 1);`
		229	`$token = new HTMLPurifier_Token_End($type);`
		230	`if ($maintain_line_numbers) {`
		231	`$token->rawPosition($current_line, $current_col);`
		232	`$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);`
		233	`}`
		234	`$array[] = $token;`
		235	`$inside_tag = false;`
		236	`$cursor = $position_next_gt + 1;`
		237	`continue;`
		238	`}`
		239
		240	`// Check leading character is alnum, if not, we may`
		241	`// have accidently grabbed an emoticon. Translate into`
		242	`// text and go our merry way`
		243	`if (!ctype_alpha($segment[0])) {`
		244	`// XML: $segment[0] !== '_' && $segment[0] !== ':'`
		245	`if ($e) {`
		246	`$e->send(E_NOTICE, 'Lexer: Unescaped lt');`
		247	`}`
		248	`$token = new HTMLPurifier_Token_Text('<');`
		249	`if ($maintain_line_numbers) {`
		250	`$token->rawPosition($current_line, $current_col);`
		251	`$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);`
		252	`}`
		253	`$array[] = $token;`
		254	`$inside_tag = false;`
		255	`continue;`
		256	`}`
		257
		258	`// Check if it is explicitly self closing, if so, remove`
		259	`// trailing slash. Remember, we could have a tag like <br>, so`
		260	`// any later token processing scripts must convert improperly`
		261	`// classified EmptyTags from StartTags.`
		262	`$is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);`
		263	`if ($is_self_closing) {`
		264	`$strlen_segment--;`
		265	`$segment = substr($segment, 0, $strlen_segment);`
		266	`}`
		267
		268	`// Check if there are any attributes`
		269	`$position_first_space = strcspn($segment, $this->_whitespace);`
		270
		271	`if ($position_first_space >= $strlen_segment) {`
		272	`if ($is_self_closing) {`
		273	`$token = new HTMLPurifier_Token_Empty($segment);`
		274	`} else {`
		275	`$token = new HTMLPurifier_Token_Start($segment);`
		276	`}`
		277	`if ($maintain_line_numbers) {`
		278	`$token->rawPosition($current_line, $current_col);`
		279	`$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);`
		280	`}`
		281	`$array[] = $token;`
		282	`$inside_tag = false;`
		283	`$cursor = $position_next_gt + 1;`
		284	`continue;`
		285	`}`
		286
		287	`// Grab out all the data`
		288	`$type = substr($segment, 0, $position_first_space);`
		289	`$attribute_string =`
		290	`trim(`
		291	`substr(`
		292	`$segment,`
		293	`$position_first_space`
		294	`)`
		295	`);`
		296	`if ($attribute_string) {`
		297	`$attr = $this->parseAttributeString(`
		298	`$attribute_string,`
		299	`$config,`
		300	`$context`
		301	`);`
		302	`} else {`
		303	`$attr = array();`
		304	`}`
		305
		306	`if ($is_self_closing) {`
		307	`$token = new HTMLPurifier_Token_Empty($type, $attr);`
		308	`} else {`
		309	`$token = new HTMLPurifier_Token_Start($type, $attr);`
		310	`}`
		311	`if ($maintain_line_numbers) {`
		312	`$token->rawPosition($current_line, $current_col);`
		313	`$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);`
		314	`}`
		315	`$array[] = $token;`
		316	`$cursor = $position_next_gt + 1;`
		317	`$inside_tag = false;`
		318	`continue;`
		319	`} else {`
		320	`// inside tag, but there's no ending > sign`
		321	`if ($e) {`
		322	`$e->send(E_WARNING, 'Lexer: Missing gt');`
		323	`}`
		324	`$token = new`
		325	`HTMLPurifier_Token_Text(`
		326	`'<' .`
		327	`$this->parseText(`
		328	`substr($html, $cursor), $config`
		329	`)`
		330	`);`
		331	`if ($maintain_line_numbers) {`
		332	`$token->rawPosition($current_line, $current_col);`
		333	`}`
		334	`// no cursor scroll? Hmm...`
		335	`$array[] = $token;`
		336	`break;`
		337	`}`
		338	`break;`
		339	`}`
		340
		341	`$context->destroy('CurrentLine');`
		342	`$context->destroy('CurrentCol');`
		343	`return $array;`
		344	`}`
		345
		346	`/**`
		347	`* PHP 5.0.x compatible substr_count that implements offset and length`
		348	`* @param string $haystack`
		349	`* @param string $needle`
		350	`* @param int $offset`
		351	`* @param int $length`
		352	`* @return int`
		353	`*/`
		354	`protected function substrCount($haystack, $needle, $offset, $length)`
		355	`{`
		356	`static $oldVersion;`
		357	`if ($oldVersion === null) {`
		358	`$oldVersion = version_compare(PHP_VERSION, '5.1', '<');`
		359	`}`
		360	`if ($oldVersion) {`
		361	`$haystack = substr($haystack, $offset, $length);`
		362	`return substr_count($haystack, $needle);`
		363	`} else {`
		364	`return substr_count($haystack, $needle, $offset, $length);`
		365	`}`
		366	`}`
		367
		368	`/**`
		369	`* Takes the inside of an HTML tag and makes an assoc array of attributes.`
		370	`*`
		371	`* @param string $string Inside of tag excluding name.`
		372	`* @param HTMLPurifier_Config $config`
		373	`* @param HTMLPurifier_Context $context`
		374	`* @return array Assoc array of attributes.`
		375	`*/`
		376	`public function parseAttributeString($string, $config, $context)`
		377	`{`
		378	`$string = (string)$string; // quick typecast`
		379
		380	`if ($string == '') {`
		381	`return array();`
		382	`} // no attributes`
		383
		384	`$e = false;`
		385	`if ($config->get('Core.CollectErrors')) {`
		386	`$e =& $context->get('ErrorCollector');`
		387	`}`
		388
		389	`// let's see if we can abort as quickly as possible`
		390	`// one equal sign, no spaces => one attribute`
		391	`$num_equal = substr_count($string, '=');`
		392	`$has_space = strpos($string, ' ');`
		393	`if ($num_equal === 0 && !$has_space) {`
		394	`// bool attribute`
		395	`return array($string => $string);`
		396	`} elseif ($num_equal === 1 && !$has_space) {`
		397	`// only one attribute`
		398	`list($key, $quoted_value) = explode('=', $string);`
		399	`$quoted_value = trim($quoted_value);`
		400	`if (!$key) {`
		401	`if ($e) {`
		402	`$e->send(E_ERROR, 'Lexer: Missing attribute key');`
		403	`}`
		404	`return array();`
		405	`}`
		406	`if (!$quoted_value) {`
		407	`return array($key => '');`
		408	`}`
		409	`$first_char = @$quoted_value[0];`
		410	`$last_char = @$quoted_value[strlen($quoted_value) - 1];`
		411
		412	`$same_quote = ($first_char == $last_char);`
		413	`$open_quote = ($first_char == '"' \|\| $first_char == "'");`
		414
		415	`if ($same_quote && $open_quote) {`
		416	`// well behaved`
		417	`$value = substr($quoted_value, 1, strlen($quoted_value) - 2);`
		418	`} else {`
		419	`// not well behaved`
		420	`if ($open_quote) {`
		421	`if ($e) {`
		422	`$e->send(E_ERROR, 'Lexer: Missing end quote');`
		423	`}`
		424	`$value = substr($quoted_value, 1);`
		425	`} else {`
		426	`$value = $quoted_value;`
		427	`}`
		428	`}`
		429	`if ($value === false) {`
		430	`$value = '';`
		431	`}`
		432	`return array($key => $this->parseAttr($value, $config));`
		433	`}`
		434
		435	`// setup loop environment`
		436	`$array = array(); // return assoc array of attributes`
		437	`$cursor = 0; // current position in string (moves forward)`
		438	`$size = strlen($string); // size of the string (stays the same)`
		439
		440	`// if we have unquoted attributes, the parser expects a terminating`
		441	`// space, so let's guarantee that there's always a terminating space.`
		442	`$string .= ' ';`
		443
		444	`$old_cursor = -1;`
		445	`while ($cursor < $size) {`
		446	`if ($old_cursor >= $cursor) {`
		447	`throw new Exception("Infinite loop detected");`
		448	`}`
		449	`$old_cursor = $cursor;`
		450
		451	`$cursor += ($value = strspn($string, $this->_whitespace, $cursor));`
		452	`// grab the key`
		453
		454	`$key_begin = $cursor; //we're currently at the start of the key`
		455
		456	`// scroll past all characters that are the key (not whitespace or =)`
		457	`$cursor += strcspn($string, $this->_whitespace . '=', $cursor);`
		458
		459	`$key_end = $cursor; // now at the end of the key`
		460
		461	`$key = substr($string, $key_begin, $key_end - $key_begin);`
		462
		463	`if (!$key) {`
		464	`if ($e) {`
		465	`$e->send(E_ERROR, 'Lexer: Missing attribute key');`
		466	`}`
		467	`$cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop`
		468	`continue; // empty key`
		469	`}`
		470
		471	`// scroll past all whitespace`
		472	`$cursor += strspn($string, $this->_whitespace, $cursor);`
		473
		474	`if ($cursor >= $size) {`
		475	`$array[$key] = $key;`
		476	`break;`
		477	`}`
		478
		479	`// if the next character is an equal sign, we've got a regular`
		480	`// pair, otherwise, it's a bool attribute`
		481	`$first_char = @$string[$cursor];`
		482
		483	`if ($first_char == '=') {`
		484	`// key="value"`
		485
		486	`$cursor++;`
		487	`$cursor += strspn($string, $this->_whitespace, $cursor);`
		488
		489	`if ($cursor === false) {`
		490	`$array[$key] = '';`
		491	`break;`
		492	`}`
		493
		494	`// we might be in front of a quote right now`
		495
		496	`$char = @$string[$cursor];`
		497
		498	`if ($char == '"' \|\| $char == "'") {`
		499	`// it's quoted, end bound is $char`
		500	`$cursor++;`
		501	`$value_begin = $cursor;`
		502	`$cursor = strpos($string, $char, $cursor);`
		503	`$value_end = $cursor;`
		504	`} else {`
		505	`// it's not quoted, end bound is whitespace`
		506	`$value_begin = $cursor;`
		507	`$cursor += strcspn($string, $this->_whitespace, $cursor);`
		508	`$value_end = $cursor;`
		509	`}`
		510
		511	`// we reached a premature end`
		512	`if ($cursor === false) {`
		513	`$cursor = $size;`
		514	`$value_end = $cursor;`
		515	`}`
		516
		517	`$value = substr($string, $value_begin, $value_end - $value_begin);`
		518	`if ($value === false) {`
		519	`$value = '';`
		520	`}`
		521	`$array[$key] = $this->parseAttr($value, $config);`
		522	`$cursor++;`
		523	`} else {`
		524	`// boolattr`
		525	`if ($key !== '') {`
		526	`$array[$key] = $key;`
		527	`} else {`
		528	`// purely theoretical`
		529	`if ($e) {`
		530	`$e->send(E_ERROR, 'Lexer: Missing attribute key');`
		531	`}`
		532	`}`
		533	`}`
		534	`}`
		535	`return $array;`
		536	`}`
		537	`}`
		538
		539	`// vim: et sw=4 sts=4`

Proyectos de Subversion Moodle

(root)/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php – Rev 1