WebSVN – Moodle – Autoría – /mod/hvp/reporting/html-purifier/HtmlReportPurifier.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2
		3	`/**`
		4	`* Class HtmlPurifier`
		5	`* Purify html`
		6	`*`
		7	`* XSS filters copied from drupal 7 common.inc. Some modifications done to`
		8	`* replace Drupal one-liner functions with corresponding flat PHP.`
		9	`*/`
		10	`class HtmlReportPurifier {`
		11
		12	`/**`
		13	`* Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.`
		14	`*`
		15	`* Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.`
		16	`* For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.`
		17	`*`
		18	`* This code does four things:`
		19	`* - Removes characters and constructs that can trick browsers.`
		20	`* - Makes sure all HTML entities are well-formed.`
		21	`* - Makes sure all HTML tags and attributes are well-formed.`
		22	`* - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.`
		23	`* javascript:).`
		24	`*`
		25	`* @param $string`
		26	`* The string with raw HTML in it. It will be stripped of everything that can`
		27	`* cause an XSS attack.`
		28	`* @param array $allowed_tags`
		29	`* An array of allowed tags.`
		30	`*`
		31	`* @param bool $allowedStyles`
		32	`*`
		33	`* @return mixed\|string An XSS safe version of $string, or an empty string if $string is not`
		34	`* An XSS safe version of $string, or an empty string if $string is not`
		35	`* valid UTF-8.`
		36	`* @ingroup sanitation`
		37	`*/`
		38	`public static function filter_xss($string, $allowed_tags = array(`
		39	`'a', 'b', 'br', 'code', 'col', 'colgroup', 'dd', 'div', 'dl',`
		40	`'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3',`
		41	`'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'img', 'ins', 'li',`
		42	`'menu', 'meter', 'nav', 'ol', 'p', 's', 'section', 'span', 'strong',`
		43	`'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',`
		44	`'thead', 'time', 'tr', 'tt', 'u', 'ul'), $allowedStyles = FALSE) {`
		45
		46	`$stylePatterns = false;`
		47	`if ($allowedStyles) {`
		48	`$stylePatterns = array();`
		49	`$stylePatterns[] = '/^color: (#[a-f0-9]{3}[a-f0-9]{3}?\|rgba?$[0-9, ]+$) ;?$/i';`
		50	`$stylePatterns[] = '/^background-color: (#[a-f0-9]{3}[a-f0-9]{3}?\|rgba?$[0-9, ]+$) ;?$/i';`
		51	`$stylePatterns[] = '/^(height:\:?[0-9]{1,8}px; width:\:?[0-9]{1,8}px\|width\:?[0-9]{1,8}px\|height:500px)?$/i';`
		52	`$stylePatterns[] = '/^width\:?[0-9]{1,8}%?$/i';`
		53	`$stylePatterns[] = '/^height\:?[0-9]{1,8}%?$/i';`
		54	`$stylePatterns[] = '/^font-size:\:?[0-9]{1,8}px$/i';`
		55	`}`
		56
		57	`if (strlen($string) == 0) {`
		58	`return $string;`
		59	`}`
		60	`// Only operate on valid UTF-8 strings. This is necessary to prevent cross`
		61	`// site scripting issues on Internet Explorer 6. (Line copied from`
		62	`// drupal_validate_utf8)`
		63	`if (preg_match('/^./us', $string) != 1) {`
		64	`return '';`
		65	`}`
		66
		67	`// Store the text format.`
		68	`self::_filter_xss_split($allowed_tags, TRUE, $stylePatterns);`
		69	`// Remove NULL characters (ignored by some browsers).`
		70	`$string = str_replace(chr(0), '', $string);`
		71	`// Remove Netscape 4 JS entities.`
		72	`$string = preg_replace('%&\s\{[^}](\}\s*;?\|$)%', '', $string);`
		73
		74	`// Defuse all HTML entities.`
		75	`$string = str_replace('&', '&', $string);`
		76	`// Change back only well-formed entities in our whitelist:`
		77	`// Decimal numeric entities.`
		78	`$string = preg_replace('/&#([0-9]+;)/', '&#\1', $string);`
		79	`// Hexadecimal numeric entities.`
		80	`$string = preg_replace('/&#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);`
		81	`// Named entities.`
		82	`$string = preg_replace('/&([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);`
		83	`return preg_replace_callback('%`
		84	`(`
		85	`<(?=[^a-zA-Z!/]) # a lone <`
		86	`\| # or`
		87	`<!--.*?--> # a comment`
		88	`\| # or`
		89	`<[^>]*(>\|$) # a string that starts with a <, up until the > or the end of the string`
		90	`\| # or`
		91	`> # just a >`
		92	`)%x', 'self::_filter_xss_split', $string);`
		93	`}`
		94
		95	`/**`
		96	`* Processes an HTML tag.`
		97	`*`
		98	`* @param $m`
		99	`* An array with various meaning depending on the value of $store.`
		100	`* If $store is TRUE then the array contains the allowed tags.`
		101	`* If $store is FALSE then the array has one element, the HTML tag to process.`
		102	`* @param bool $store`
		103	`* Whether to store $m.`
		104	`* @param bool $allowedStyles Allow styles`
		105	`*`
		106	`* @return string If the element isn't allowed, an empty string. Otherwise, the cleaned up`
		107	`* If the element isn't allowed, an empty string. Otherwise, the cleaned up`
		108	`* version of the HTML element.`
		109	`*/`
		110	`private static function _filter_xss_split($m, $store = FALSE, $n = FALSE) {`
		111	`static $allowed_html;`
		112	`static $allowed_styles;`
		113
		114	`if ($store) {`
		115	`$allowed_html = array_flip($m);`
		116	`$allowed_styles = $n;`
		117	`return $allowed_html;`
		118	`}`
		119
		120	`$string = $m[1];`
		121
		122	`if (substr($string, 0, 1) != '<') {`
		123	`// We matched a lone ">" character.`
		124	`return '>';`
		125	`}`
		126	`elseif (strlen($string) == 1) {`
		127	`// We matched a lone "<" character.`
		128	`return '<';`
		129	`}`
		130
		131	`if (!preg_match('%^<\s(/\s)?([a-zA-Z0-9\-]+)\s([^>])>?\|(<!--.*?-->)$%', $string, $matches)) {`
		132	`// Seriously malformed.`
		133	`return '';`
		134	`}`
		135
		136	`$slash = trim($matches[1]);`
		137	`$elem = &$matches[2];`
		138	`$attrList = &$matches[3];`
		139	`$comment = &$matches[4];`
		140
		141	`if ($comment) {`
		142	`$elem = '!--';`
		143	`}`
		144
		145	`if (!isset($allowed_html[strtolower($elem)])) {`
		146	`// Disallowed HTML element.`
		147	`return '';`
		148	`}`
		149
		150	`if ($comment) {`
		151	`return $comment;`
		152	`}`
		153
		154	`if ($slash != '') {`
		155	`return "</$elem>";`
		156	`}`
		157
		158	`// Is there a closing XHTML slash at the end of the attributes?`
		159	`$attrList = preg_replace('%(\s?)/\s*$%', '\1', $attrList, -1, $count);`
		160	`$xhtml_slash = $count ? ' /' : '';`
		161
		162	`// Clean up attributes.`
		163
		164	`$attr2 = implode(' ', self::_filter_xss_attributes($attrList, $allowed_styles));`
		165	`$attr2 = preg_replace('/[<>]/', '', $attr2);`
		166	`$attr2 = strlen($attr2) ? ' ' . $attr2 : '';`
		167
		168	`return "<$elem$attr2$xhtml_slash>";`
		169	`}`
		170
		171	`/**`
		172	`* Processes a string of HTML attributes.`
		173	`*`
		174	`* @param $attr`
		175	`* @param array\|bool\|object $allowedStyles`
		176	`*`
		177	`* @return array Cleaned up version of the HTML attributes.`
		178	`* Cleaned up version of the HTML attributes.`
		179	`*/`
		180	`private static function _filter_xss_attributes($attr, $allowedStyles = FALSE) {`
		181	`$attrArr = array();`
		182	`$mode = 0;`
		183	`$attrName = '';`
		184	`$skip = FALSE;`
		185
		186	`while (strlen($attr) != 0) {`
		187	`// Was the last operation successful?`
		188	`$working = 0;`
		189	`switch ($mode) {`
		190	`case 0:`
		191	`// Attribute name, href for instance.`
		192	`if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {`
		193	`$attrName = strtolower($match[1]);`
		194	`$skip = (`
		195	`$attrName == 'style' \|\|`
		196	`substr($attrName, 0, 2) == 'on' \|\|`
		197	`substr($attrName, 0, 1) == '-' \|\|`
		198	`// Ignore long attributes to avoid unnecessary processing overhead.`
		199	`strlen($attrName) > 96`
		200	`);`
		201	`$working = $mode = 1;`
		202	`$attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);`
		203	`}`
		204	`break;`
		205
		206	`case 1:`
		207	`// Equals sign or valueless ("selected").`
		208	`if (preg_match('/^\s=\s/', $attr)) {`
		209	`$working = 1;`
		210	`$mode = 2;`
		211	`$attr = preg_replace('/^\s=\s/', '', $attr);`
		212	`break;`
		213	`}`
		214
		215	`if (preg_match('/^\s+/', $attr)) {`
		216	`$working = 1;`
		217	`$mode = 0;`
		218	`if (!$skip) {`
		219	`$attrArr[] = $attrName;`
		220	`}`
		221	`$attr = preg_replace('/^\s+/', '', $attr);`
		222	`}`
		223	`break;`
		224
		225	`case 2:`
		226	`// Attribute value, a URL after href= for instance.`
		227	`if (preg_match('/^"([^"]*)"(\s+\|$)/', $attr, $match)) {`
		228	`if ($allowedStyles && $attrName === 'style') {`
		229	`// Allow certain styles`
		230	`foreach ($allowedStyles as $pattern) {`
		231	`if (preg_match($pattern, $match[1])) {`
		232	`// All patterns are start to end patterns, and CKEditor adds one span per style`
		233	`$attrArr[] = 'style="' . $match[1] . '"';`
		234	`break;`
		235	`}`
		236	`}`
		237	`break;`
		238	`}`
		239
		240	`$thisVal = self::filter_xss_bad_protocol($match[1]);`
		241
		242	`if (!$skip) {`
		243	`$attrArr[] = "$attrName=\"$thisVal\"";`
		244	`}`
		245	`$working = 1;`
		246	`$mode = 0;`
		247	`$attr = preg_replace('/^"[^"]*"(\s+\|$)/', '', $attr);`
		248	`break;`
		249	`}`
		250
		251	`if (preg_match("/^'([^']*)'(\s+\|$)/", $attr, $match)) {`
		252	`$thisVal = self::filter_xss_bad_protocol($match[1]);`
		253
		254	`if (!$skip) {`
		255	`$attrArr[] = "$attrName='$thisVal'";`
		256	`}`
		257	`$working = 1;`
		258	`$mode = 0;`
		259	`$attr = preg_replace("/^'[^']*'(\s+\|$)/", '', $attr);`
		260	`break;`
		261	`}`
		262
		263	`if (preg_match("%^([^\s\"']+)(\s+\|$)%", $attr, $match)) {`
		264	`$thisVal = self::filter_xss_bad_protocol($match[1]);`
		265
		266	`if (!$skip) {`
		267	`$attrArr[] = "$attrName=\"$thisVal\"";`
		268	`}`
		269	`$working = 1;`
		270	`$mode = 0;`
		271	`$attr = preg_replace("%^[^\s\"']+(\s+\|$)%", '', $attr);`
		272	`}`
		273	`break;`
		274	`}`
		275
		276	`if ($working == 0) {`
		277	`// Not well formed; remove and try again.`
		278	`$attr = preg_replace('/`
		279	`^`
		280	`(`
		281	`"[^"]*("\|$) # - a string that starts with a double quote, up until the next double quote or the end of the string`
		282	`\| # or`
		283	`\'[^\']*(\'\|$)\| # - a string that starts with a quote, up until the next quote or the end of the string`
		284	`\| # or`
		285	`\S # - a non-whitespace character`
		286	`)* # any number of the above three`
		287	`\s* # any number of whitespaces`
		288	`/x', '', $attr);`
		289	`$mode = 0;`
		290	`}`
		291	`}`
		292
		293	`// The attribute list ends with a valueless attribute like "selected".`
		294	`if ($mode == 1 && !$skip) {`
		295	`$attrArr[] = $attrName;`
		296	`}`
		297	`return $attrArr;`
		298	`}`
		299
		300	`/**`
		301	`* Processes an HTML attribute value and strips dangerous protocols from URLs.`
		302	`*`
		303	`* @param $string`
		304	`* The string with the attribute value.`
		305	`* @param bool $decode`
		306	`* (deprecated) Whether to decode entities in the $string. Set to FALSE if the`
		307	`* $string is in plain text, TRUE otherwise. Defaults to TRUE. This parameter`
		308	`* is deprecated and will be removed in Drupal 8. To process a plain-text URI,`
		309	`* call _strip_dangerous_protocols() or check_url() instead.`
		310	`*`
		311	`* @return string Cleaned up and HTML-escaped version of $string.`
		312	`* Cleaned up and HTML-escaped version of $string.`
		313	`*/`
		314	`private static function filter_xss_bad_protocol($string, $decode = TRUE) {`
		315	`// Get the plain text representation of the attribute value (i.e. its meaning).`
		316	`// @todo Remove the $decode parameter in Drupal 8, and always assume an HTML`
		317	`// string that needs decoding.`
		318	`if ($decode) {`
		319	`$string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');`
		320	`}`
		321	`return htmlspecialchars(self::_strip_dangerous_protocols($string), ENT_QUOTES, 'UTF-8', FALSE);`
		322	`}`
		323
		324	`/**`
		325	`* Strips dangerous protocols (e.g. 'javascript:') from a URI.`
		326	`*`
		327	`* This function must be called for all URIs within user-entered input prior`
		328	`* to being output to an HTML attribute value. It is often called as part of`
		329	`* check_url() or filter_xss(), but those functions return an HTML-encoded`
		330	`* string, so this function can be called independently when the output needs to`
		331	`* be a plain-text string for passing to t(), l(), drupal_attributes(), or`
		332	`* another function that will call check_plain() separately.`
		333	`*`
		334	`* @param $uri`
		335	`* A plain-text URI that might contain dangerous protocols.`
		336	`*`
		337	`* @return string A plain-text URI stripped of dangerous protocols. As with all plain-text`
		338	`* A plain-text URI stripped of dangerous protocols. As with all plain-text`
		339	`* strings, this return value must not be output to an HTML page without`
		340	`* check_plain() being called on it. However, it can be passed to functions`
		341	`* expecting plain-text strings.`
		342	`* @see check_url()`
		343	`*/`
		344	`private static function _strip_dangerous_protocols($uri) {`
		345	`static $allowed_protocols;`
		346
		347	`if (!isset($allowed_protocols)) {`
		348	`$allowed_protocols = array_flip(array('ftp', 'http', 'https', 'mailto'));`
		349	`}`
		350
		351	`// Iteratively remove any invalid protocol found.`
		352	`do {`
		353	`$before = $uri;`
		354	`$colonPos = strpos($uri, ':');`
		355	`if ($colonPos > 0) {`
		356	`// We found a colon, possibly a protocol. Verify.`
		357	`$protocol = substr($uri, 0, $colonPos);`
		358	`// If a colon is preceded by a slash, question mark or hash, it cannot`
		359	`// possibly be part of the URL scheme. This must be a relative URL, which`
		360	`// inherits the (safe) protocol of the base document.`
		361	`if (preg_match('![/?#]!', $protocol)) {`
		362	`break;`
		363	`}`
		364	`// Check if this is a disallowed protocol. Per RFC2616, section 3.2.3`
		365	`// (URI Comparison) scheme comparison must be case-insensitive.`
		366	`if (!isset($allowed_protocols[strtolower($protocol)])) {`
		367	`$uri = substr($uri, $colonPos + 1);`
		368	`}`
		369	`}`
		370	`} while ($before != $uri);`
		371
		372	`return $uri;`
		373	`}`
		374	`}`

Proyectos de Subversion Moodle

(root)/mod/hvp/reporting/html-purifier/HtmlReportPurifier.php – Rev 1