Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
 
3
/**
 * Class HtmlReportPurifier
 *
 * Sanitizes HTML fragments so that they can be embedded in a page without
 * opening cross-site-scripting (XSS) holes.
 *
 * XSS filters copied from drupal 7 common.inc. Some modifications done to
 * replace Drupal one-liner functions with corresponding flat PHP.
 *
 * Note: the tag whitelist and the style whitelist of the current filter run
 * are kept in static variables of _filter_xss_split(), so a filter run must
 * not be started from inside another filter run.
 */
class HtmlReportPurifier {

  /**
   * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
   *
   * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
   * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
   *
   * This code does four things:
   * - Removes characters and constructs that can trick browsers.
   * - Makes sure all HTML entities are well-formed.
   * - Makes sure all HTML tags and attributes are well-formed.
   * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
   *   javascript:).
   *
   * @param $string
   *   The string with raw HTML in it. It will be stripped of everything that can
   *   cause an XSS attack.
   * @param array $allowed_tags
   *   An array of allowed tags (lowercase tag names).
   * @param bool $allowedStyles
   *   When truthy, double-quoted style attributes whose whole value matches
   *   one of the built-in whitelist patterns (color, background-color, width,
   *   height, font-size) are kept. Otherwise every style attribute is dropped.
   *
   * @return mixed|string
   *   An XSS safe version of $string, or an empty string if $string is not
   *   valid UTF-8. An empty input is returned unchanged.
   *
   * @ingroup sanitation
   */
  public static function filter_xss($string, $allowed_tags = array(
      'a', 'b', 'br', 'code', 'col', 'colgroup', 'dd', 'div', 'dl',
      'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3',
      'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'img', 'ins', 'li',
      'menu', 'meter', 'nav', 'ol', 'p', 's', 'section', 'span', 'strong',
      'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
      'thead', 'time', 'tr', 'tt', 'u', 'ul'), $allowedStyles = FALSE) {

    // Whitelist of complete style attribute values. Every pattern is anchored
    // at both ends, so a style attribute may carry exactly one declaration.
    // NOTE(review): "{3}?" in the colour patterns is a lazy "exactly three",
    // not an optional group, so only 6-digit hex colours match - confirm that
    // this is intended.
    $stylePatterns = FALSE;
    if ($allowedStyles) {
      $stylePatterns = array(
        '/^color: *(#[a-f0-9]{3}[a-f0-9]{3}?|rgba?\([0-9, ]+\)) *;?$/i',
        '/^background-color: *(#[a-f0-9]{3}[a-f0-9]{3}?|rgba?\([0-9, ]+\)) *;?$/i',
        '/^(height:\:?[0-9]{1,8}px; width:\:?[0-9]{1,8}px|width\:?[0-9]{1,8}px|height:500px)?$/i',
        '/^width\:?[0-9]{1,8}%?$/i',
        '/^height\:?[0-9]{1,8}%?$/i',
        '/^font-size:\:?[0-9]{1,8}px$/i',
      );
    }

    // Nothing to filter. The cast keeps NULL input from raising a deprecation
    // notice in strlen() on PHP 8.1+; the input itself is returned untouched.
    if (strlen((string) $string) == 0) {
      return $string;
    }
    // Only operate on valid UTF-8 strings. This is necessary to prevent cross
    // site scripting issues on Internet Explorer 6. (Line copied from
    // drupal_validate_utf8)
    if (preg_match('/^./us', $string) != 1) {
      return '';
    }

    // Store the tag and style whitelists for the callback below.
    self::_filter_xss_split($allowed_tags, TRUE, $stylePatterns);
    // Remove NULL characters (ignored by some browsers).
    $string = str_replace(chr(0), '', $string);
    // Remove Netscape 4 JS entities.
    $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

    // Defuse all HTML entities.
    $string = str_replace('&', '&amp;', $string);
    // Change back only well-formed entities in our whitelist:
    // Decimal numeric entities.
    $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
    // Hexadecimal numeric entities.
    $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
    // Named entities.
    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);

    // An array callable is used instead of the string 'self::...', which is
    // deprecated as of PHP 8.2. The private method is reachable because the
    // callback is resolved from inside this class.
    return preg_replace_callback('%
      (
      <(?=[^a-zA-Z!/])  # a lone <
      |                 # or
      <!--.*?-->        # a comment
      |                 # or
      <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
      |                 # or
      >                 # just a >
      )%x', array(__CLASS__, '_filter_xss_split'), $string);
  }

  /**
   * Processes an HTML tag.
   *
   * @param $m
   *   An array with various meaning depending on the value of $store.
   *   If $store is TRUE then the array contains the allowed tags.
   *   If $store is FALSE then the array is a regex match whose element 1 is
   *   the HTML tag (or lone "<" / ">") to process.
   * @param bool $store
   *   Whether to store $m (and $n) for subsequent calls.
   * @param array|bool $n
   *   Only used when $store is TRUE: the list of whitelisted style patterns,
   *   or FALSE when style attributes are not allowed.
   *
   * @return string|array
   *   When storing, the flipped tag whitelist. Otherwise an empty string if
   *   the element isn't allowed, or the cleaned up version of the HTML
   *   element.
   */
  private static function _filter_xss_split($m, $store = FALSE, $n = FALSE) {
    static $allowed_html;
    static $allowed_styles;

    if ($store) {
      // Flip so that tag lookups are simple isset() checks.
      $allowed_html = array_flip($m);
      $allowed_styles = $n;
      return $allowed_html;
    }

    $string = $m[1];

    if (substr($string, 0, 1) != '<') {
      // We matched a lone ">" character.
      return '&gt;';
    }
    elseif (strlen($string) == 1) {
      // We matched a lone "<" character.
      return '&lt;';
    }

    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
      // Seriously malformed.
      return '';
    }

    $slash    = trim($matches[1]);
    $elem     = $matches[2];
    $attrList = $matches[3];
    // Group 4 only exists when the comment alternative matched.
    $comment  = isset($matches[4]) ? $matches[4] : NULL;

    if ($comment) {
      // Comments are looked up in the whitelist under the pseudo tag "!--".
      $elem = '!--';
    }

    if (!isset($allowed_html[strtolower($elem)])) {
      // Disallowed HTML element.
      return '';
    }

    if ($comment) {
      return $comment;
    }

    if ($slash != '') {
      // Closing tag: attributes are never kept.
      return "</$elem>";
    }

    // Is there a closing XHTML slash at the end of the attributes?
    $attrList    = preg_replace('%(\s?)/\s*$%', '\1', $attrList, -1, $count);
    $xhtml_slash = $count ? ' /' : '';

    // Clean up attributes.
    $attr2 = implode(' ', self::_filter_xss_attributes($attrList, $allowed_styles));
    $attr2 = preg_replace('/[<>]/', '', $attr2);
    $attr2 = strlen($attr2) ? ' ' . $attr2 : '';

    return "<$elem$attr2$xhtml_slash>";
  }

  /**
   * Processes a string of HTML attributes.
   *
   * The string is consumed from the left by a small state machine:
   * mode 0 expects an attribute name, mode 1 an equals sign (or the end of a
   * valueless attribute) and mode 2 an attribute value.
   *
   * @param $attr
   *   The raw attribute part of a tag.
   * @param array|bool $allowedStyles
   *   List of regular expressions a double-quoted style value must match to
   *   be kept, or FALSE to drop all style attributes.
   *
   * @return array
   *   Cleaned up version of the HTML attributes, one entry per attribute.
   */
  private static function _filter_xss_attributes($attr, $allowedStyles = FALSE) {
    $attrArr  = array();
    $mode     = 0;
    $attrName = '';
    $skip     = FALSE;

    while (strlen($attr) != 0) {
      // Was the last operation successful?
      $working = 0;
      switch ($mode) {
        case 0:
          // Attribute name, href for instance.
          if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
            $attrName = strtolower($match[1]);
            // Style attributes, event handlers and names starting with a dash
            // are dropped (whitelisted styles are re-admitted in mode 2).
            $skip = (
              $attrName == 'style' ||
              substr($attrName, 0, 2) == 'on' ||
              substr($attrName, 0, 1) == '-' ||
              // Ignore long attributes to avoid unnecessary processing overhead.
              strlen($attrName) > 96
            );
            $working  = $mode = 1;
            $attr     = preg_replace('/^[-a-zA-Z]+/', '', $attr);
          }
          break;

        case 1:
          // Equals sign or valueless ("selected").
          if (preg_match('/^\s*=\s*/', $attr)) {
            $working = 1;
            $mode    = 2;
            $attr    = preg_replace('/^\s*=\s*/', '', $attr);
            break;
          }

          if (preg_match('/^\s+/', $attr)) {
            $working = 1;
            $mode    = 0;
            if (!$skip) {
              $attrArr[] = $attrName;
            }
            $attr = preg_replace('/^\s+/', '', $attr);
          }
          break;

        case 2:
          // Attribute value, a URL after href= for instance.
          if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
            if ($allowedStyles && $attrName === 'style') {
              // Allow certain styles. All patterns are start to end patterns,
              // and CKEditor adds one span per style.
              foreach ($allowedStyles as $pattern) {
                if (preg_match($pattern, $match[1])) {
                  $attrArr[] = 'style="' . $match[1] . '"';
                  break;
                }
              }
            }
            elseif (!$skip) {
              $thisVal   = self::filter_xss_bad_protocol($match[1]);
              $attrArr[] = "$attrName=\"$thisVal\"";
            }
            // Consume the value explicitly in both cases instead of relying
            // on the "not well formed" cleanup below to remove it.
            $working = 1;
            $mode    = 0;
            $attr    = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
            break;
          }

          if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
            $thisVal = self::filter_xss_bad_protocol($match[1]);

            if (!$skip) {
              $attrArr[] = "$attrName='$thisVal'";
            }
            $working = 1;
            $mode    = 0;
            $attr    = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
            break;
          }

          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
            $thisVal = self::filter_xss_bad_protocol($match[1]);

            if (!$skip) {
              // Unquoted values are re-emitted with double quotes.
              $attrArr[] = "$attrName=\"$thisVal\"";
            }
            $working = 1;
            $mode    = 0;
            $attr    = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
          }
          break;
      }

      if ($working == 0) {
        // Not well formed; remove and try again.
        $attr = preg_replace('/
          ^
          (
          "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
          |               # or
          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
          |               # or
          \S              # - a non-whitespace character
          )*              # any number of the above three
          \s*             # any number of whitespaces
          /x', '', $attr);
        $mode = 0;
      }
    }

    // The attribute list ends with a valueless attribute like "selected".
    if ($mode == 1 && !$skip) {
      $attrArr[] = $attrName;
    }
    return $attrArr;
  }

  /**
   * Processes an HTML attribute value and strips dangerous protocols from URLs.
   *
   * @param $string
   *   The string with the attribute value.
   * @param bool $decode
   *   (deprecated) Whether to decode entities in the $string. Set to FALSE if the
   *   $string is in plain text, TRUE otherwise. Defaults to TRUE.
   *
   * @return string
   *   Cleaned up and HTML-escaped version of $string.
   */
  private static function filter_xss_bad_protocol($string, $decode = TRUE) {
    // Get the plain text representation of the attribute value (i.e. its meaning).
    if ($decode) {
      $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
    }
    // The value is plain text at this point, so every "&" must be escaped
    // again (double_encode left at its default TRUE). Skipping the
    // re-encoding of entity-looking sequences would let a doubly encoded
    // value such as "&amp;#106;avascript:..." reach the browser as
    // "&#106;avascript:...", which the browser decodes to "javascript:...".
    return htmlspecialchars(self::_strip_dangerous_protocols($string), ENT_QUOTES, 'UTF-8');
  }

  /**
   * Strips dangerous protocols (e.g. 'javascript:') from a URI.
   *
   * Everything up to and including the first colon is removed repeatedly for
   * as long as the text in front of that colon is not one of the allowed
   * protocols (ftp, http, https, mailto). A colon that is preceded by a
   * slash, question mark or hash is considered part of a relative URL and
   * left alone.
   *
   * @param $uri
   *   A plain-text URI that might contain dangerous protocols.
   *
   * @return string
   *   A plain-text URI stripped of dangerous protocols. The value is not
   *   HTML-escaped; it must be escaped before being output to an HTML page.
   */
  private static function _strip_dangerous_protocols($uri) {
    static $allowed_protocols;

    if (!isset($allowed_protocols)) {
      $allowed_protocols = array_flip(array('ftp', 'http', 'https', 'mailto'));
    }

    // Iteratively remove any invalid protocol found.
    do {
      $before   = $uri;
      $colonPos = strpos($uri, ':');
      // strpos() yields FALSE when there is no colon; a colon at position 0
      // has no protocol in front of it. Both cases are skipped here.
      if ($colonPos > 0) {
        // We found a colon, possibly a protocol. Verify.
        $protocol = substr($uri, 0, $colonPos);
        // If a colon is preceded by a slash, question mark or hash, it cannot
        // possibly be part of the URL scheme. This must be a relative URL, which
        // inherits the (safe) protocol of the base document.
        if (preg_match('![/?#]!', $protocol)) {
          break;
        }
        // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
        // (URI Comparison) scheme comparison must be case-insensitive.
        if (!isset($allowed_protocols[strtolower($protocol)])) {
          $uri = substr($uri, $colonPos + 1);
        }
      }
    } while ($before !== $uri);

    return $uri;
  }
}