1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
/**
 * Class HtmlReportPurifier
 *
 * Purifies HTML fragments by removing everything that is not explicitly
 * whitelisted (tags, attributes, URL protocols and, optionally, a small set
 * of inline styles).
 *
 * XSS filters copied from Drupal 7 common.inc. Some modifications done to
 * replace Drupal one-liner functions with corresponding flat PHP.
 */
class HtmlReportPurifier
{
    /**
     * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
     *
     * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
     * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
     *
     * This code does four things:
     * - Removes characters and constructs that can trick browsers.
     * - Makes sure all HTML entities are well-formed.
     * - Makes sure all HTML tags and attributes are well-formed.
     * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
     *   javascript:).
     *
     * @param string $string
     *   The string with raw HTML in it. It will be stripped of everything that
     *   can cause an XSS attack.
     * @param array $allowed_tags
     *   An array of allowed tag names (lower case). Add '!--' to keep comments.
     * @param bool $allowedStyles
     *   When TRUE, double-quoted style attributes whose whole value matches
     *   one of the built-in patterns (color, background-color, width, height,
     *   font-size) are kept. All other style attributes are always removed.
     *
     * @return mixed|string
     *   An XSS safe version of $string. An empty $string is returned as is;
     *   an empty string is returned if $string is not valid UTF-8.
     *
     * @ingroup sanitation
     */
    public static function filter_xss($string, $allowed_tags = array(
        'a', 'b', 'br', 'code', 'col', 'colgroup', 'dd', 'div', 'dl',
        'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3',
        'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'img', 'ins', 'li',
        'menu', 'meter', 'nav', 'ol', 'p', 's', 'section', 'span', 'strong',
        'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'time', 'tr', 'tt', 'u', 'ul'), $allowedStyles = FALSE)
    {
        $stylePatterns = FALSE;
        if ($allowedStyles) {
            // Every pattern is anchored at both ends, so a style attribute is
            // only kept when its complete value is one known-safe declaration.
            // The hex colour accepts 3 or 6 digits: the second triple is an
            // optional group ("{3}?" would only make the quantifier lazy).
            $stylePatterns = array(
                '/^color: *(#[a-f0-9]{3}(?:[a-f0-9]{3})?|rgba?\([0-9, ]+\)) *;?$/i',
                '/^background-color: *(#[a-f0-9]{3}(?:[a-f0-9]{3})?|rgba?\([0-9, ]+\)) *;?$/i',
                '/^(height:\:?[0-9]{1,8}px; width:\:?[0-9]{1,8}px|width\:?[0-9]{1,8}px|height:500px)?$/i',
                '/^width\:?[0-9]{1,8}%?$/i',
                '/^height\:?[0-9]{1,8}%?$/i',
                '/^font-size:\:?[0-9]{1,8}px$/i',
            );
        }

        if (strlen($string) == 0) {
            return $string;
        }
        // Only operate on valid UTF-8 strings. This is necessary to prevent cross
        // site scripting issues on Internet Explorer 6. (Line copied from
        // drupal_validate_utf8)
        if (preg_match('/^./us', $string) != 1) {
            return '';
        }

        // Store the whitelist and style patterns for the tag callback below.
        self::_filter_xss_split($allowed_tags, TRUE, $stylePatterns);
        // Remove NULL characters (ignored by some browsers).
        $string = str_replace(chr(0), '', $string);
        // Remove Netscape 4 JS entities.
        $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

        // Defuse all HTML entities by escaping every ampersand ...
        $string = str_replace('&', '&amp;', $string);
        // ... then change back only well-formed entities in our whitelist:
        // Decimal numeric entities.
        $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
        // Hexadecimal numeric entities.
        $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
        // Named entities.
        $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);

        // Array callable instead of the 'self::method' string form, which is
        // deprecated as of PHP 8.2.
        return preg_replace_callback('%
            (
            <(?=[^a-zA-Z!/])  # a lone <
            |                 # or
            <!--.*?-->        # a comment
            |                 # or
            <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
            |                 # or
            >                 # just a >
            )%x', array(__CLASS__, '_filter_xss_split'), $string);
    }

    /**
     * Processes an HTML tag.
     *
     * The whitelist is kept in static variables between the "store" call and
     * the callback invocations, so nested/re-entrant use of filter_xss() with
     * different settings is not supported.
     *
     * @param array $m
     *   An array with various meaning depending on the value of $store.
     *   If $store is TRUE then the array contains the allowed tags.
     *   If $store is FALSE then $m[1] is the piece of markup to process.
     * @param bool $store
     *   Whether to store $m (and $n) instead of processing a tag.
     * @param array|bool $n
     *   Only used when $store is TRUE: the list of allowed style patterns, or
     *   FALSE when no style attribute may be kept.
     *
     * @return array|string
     *   When storing, the flipped tag whitelist. Otherwise an empty string if
     *   the element isn't allowed, or the cleaned up version of the HTML
     *   element. Lone "<" and ">" characters are returned HTML-escaped.
     */
    private static function _filter_xss_split($m, $store = FALSE, $n = FALSE)
    {
        static $allowed_html;
        static $allowed_styles;

        if ($store) {
            $allowed_html = array_flip($m);
            $allowed_styles = $n;
            return $allowed_html;
        }

        $string = $m[1];

        if (substr($string, 0, 1) != '<') {
            // We matched a lone ">" character. It must be escaped, otherwise
            // stripped tags could be used to reassemble live markup.
            return '&gt;';
        } elseif (strlen($string) == 1) {
            // We matched a lone "<" character.
            return '&lt;';
        }

        if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
            // Seriously malformed.
            return '';
        }

        // preg_match() drops trailing groups that did not take part in the
        // match, so groups 3 and 4 may be absent.
        $slash = trim($matches[1]);
        $elem = $matches[2];
        $attrList = isset($matches[3]) ? $matches[3] : '';
        $comment = isset($matches[4]) ? $matches[4] : '';

        if ($comment) {
            // Comments are whitelisted through the pseudo tag name "!--".
            $elem = '!--';
        }

        if (!isset($allowed_html[strtolower($elem)])) {
            // Disallowed HTML element.
            return '';
        }

        if ($comment) {
            return $comment;
        }

        if ($slash != '') {
            return "</$elem>";
        }

        // Is there a closing XHTML slash at the end of the attributes?
        $attrList = preg_replace('%(\s?)/\s*$%', '\1', $attrList, -1, $count);
        $xhtml_slash = $count ? ' /' : '';

        // Clean up attributes.
        $attr2 = implode(' ', self::_filter_xss_attributes($attrList, $allowed_styles));
        $attr2 = preg_replace('/[<>]/', '', $attr2);
        $attr2 = strlen($attr2) ? ' ' . $attr2 : '';

        return "<$elem$attr2$xhtml_slash>";
    }

    /**
     * Processes a string of HTML attributes.
     *
     * Works as a small state machine: mode 0 expects an attribute name, mode 1
     * an equals sign (or whitespace for a valueless attribute) and mode 2 an
     * attribute value. Anything that does not fit is removed.
     *
     * @param string $attr
     *   The raw attribute list of one tag.
     * @param array|bool $allowedStyles
     *   Regular expressions a double-quoted style value must match to be kept,
     *   or FALSE to drop every style attribute.
     *
     * @return array
     *   Cleaned up version of the HTML attributes, one entry per attribute.
     */
    private static function _filter_xss_attributes($attr, $allowedStyles = FALSE)
    {
        $attrArr = array();
        $mode = 0;
        $attrName = '';
        $skip = FALSE;

        while (strlen($attr) != 0) {
            // Was the last operation successful?
            $working = 0;

            switch ($mode) {
                case 0:
                    // Attribute name, href for instance.
                    if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
                        $attrName = strtolower($match[1]);
                        $skip = (
                            $attrName == 'style' ||
                            substr($attrName, 0, 2) == 'on' ||
                            substr($attrName, 0, 1) == '-' ||
                            // Ignore long attributes to avoid unnecessary processing overhead.
                            strlen($attrName) > 96
                        );
                        $working = $mode = 1;
                        $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
                    }
                    break;

                case 1:
                    // Equals sign or valueless ("selected").
                    if (preg_match('/^\s*=\s*/', $attr)) {
                        $working = 1;
                        $mode = 2;
                        $attr = preg_replace('/^\s*=\s*/', '', $attr);
                        break;
                    }

                    if (preg_match('/^\s+/', $attr)) {
                        $working = 1;
                        $mode = 0;
                        if (!$skip) {
                            $attrArr[] = $attrName;
                        }
                        $attr = preg_replace('/^\s+/', '', $attr);
                    }
                    break;

                case 2:
                    // Attribute value, a URL after href= for instance.
                    if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
                        if ($allowedStyles && $attrName === 'style') {
                            // Allow certain styles. All patterns are start to end
                            // patterns, and CKEditor adds one span per style. A
                            // value matching no pattern is dropped.
                            foreach ($allowedStyles as $pattern) {
                                if (preg_match($pattern, $match[1])) {
                                    $attrArr[] = 'style="' . $match[1] . '"';
                                    break;
                                }
                            }
                        } else {
                            $thisVal = self::filter_xss_bad_protocol($match[1]);

                            if (!$skip) {
                                $attrArr[] = "$attrName=\"$thisVal\"";
                            }
                        }
                        // Consume the value explicitly in both cases instead of
                        // leaving style values to the "malformed" clean-up below.
                        $working = 1;
                        $mode = 0;
                        $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
                        break;
                    }

                    if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
                        $thisVal = self::filter_xss_bad_protocol($match[1]);

                        if (!$skip) {
                            $attrArr[] = "$attrName='$thisVal'";
                        }
                        $working = 1;
                        $mode = 0;
                        $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
                        break;
                    }

                    if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
                        $thisVal = self::filter_xss_bad_protocol($match[1]);

                        if (!$skip) {
                            // Unquoted values are re-emitted with double quotes.
                            $attrArr[] = "$attrName=\"$thisVal\"";
                        }
                        $working = 1;
                        $mode = 0;
                        $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
                    }
                    break;
            }

            if ($working == 0) {
                // Not well formed; remove and try again. The pattern is kept
                // verbatim from the Drupal original.
                $attr = preg_replace('/
                    ^
                    (
                    "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
                    |               # or
                    \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
                    |               # or
                    \S              # - a non-whitespace character
                    )*              # any number of the above three
                    \s*             # any number of whitespaces
                    /x', '', $attr);
                $mode = 0;
            }
        }

        // The attribute list ends with a valueless attribute like "selected".
        if ($mode == 1 && !$skip) {
            $attrArr[] = $attrName;
        }
        return $attrArr;
    }

    /**
     * Processes an HTML attribute value and strips dangerous protocols from URLs.
     *
     * @param string $string
     *   The string with the attribute value.
     * @param bool $decode
     *   (deprecated) Whether to decode entities in the $string. Set to FALSE if the
     *   $string is in plain text, TRUE otherwise. Defaults to TRUE.
     *
     * @return string
     *   Cleaned up and HTML-escaped version of $string.
     */
    private static function filter_xss_bad_protocol($string, $decode = TRUE)
    {
        // Get the plain text representation of the attribute value (i.e. its meaning).
        if ($decode) {
            $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
        }
        // The value is plain text at this point, so it must be escaped in full.
        // Leaving existing entities alone (double_encode = FALSE) would let a
        // doubly encoded value such as "java&amp;#115;cript:" come out as the
        // live entity "java&#115;cript:", which browsers read as "javascript:".
        return htmlspecialchars(self::_strip_dangerous_protocols($string), ENT_QUOTES, 'UTF-8');
    }

    /**
     * Strips dangerous protocols (e.g. 'javascript:') from a URI.
     *
     * Only ftp, http, https and mailto are accepted as schemes. Any other
     * scheme prefix is cut off, repeatedly, until the URI no longer changes.
     *
     * @param string $uri
     *   A plain-text URI that might contain dangerous protocols.
     *
     * @return string
     *   A plain-text URI stripped of dangerous protocols. As with all plain-text
     *   strings, this return value must not be output to an HTML page without
     *   being HTML-escaped first.
     */
    private static function _strip_dangerous_protocols($uri)
    {
        static $allowed_protocols;

        if (!isset($allowed_protocols)) {
            $allowed_protocols = array_flip(array('ftp', 'http', 'https', 'mailto'));
        }

        // Iteratively remove any invalid protocol found.
        do {
            $before = $uri;
            $colonPos = strpos($uri, ':');
            if ($colonPos !== FALSE && $colonPos > 0) {
                // We found a colon, possibly a protocol. Verify.
                $protocol = substr($uri, 0, $colonPos);
                // If a colon is preceded by a slash, question mark or hash, it cannot
                // possibly be part of the URL scheme. This must be a relative URL, which
                // inherits the (safe) protocol of the base document.
                if (preg_match('![/?#]!', $protocol)) {
                    break;
                }
                // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
                // (URI Comparison) scheme comparison must be case-insensitive.
                if (!isset($allowed_protocols[strtolower($protocol)])) {
                    $uri = substr($uri, $colonPos + 1);
                }
            }
        } while ($before !== $uri);

        return $uri;
    }
}
|