Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
 
3
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize(); created in the
     * constructor.
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        lexer prototype (object or recognized name) instead is
     *        deprecated and raises an E_USER_WARNING.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, no
     *         lexer could be instantiated, or line tracking is required but
     *         the chosen lexer does not support it.
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            // Deprecated path: the argument is itself the lexer prototype.
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // There is no configuration object to consult here; calling
            // ->get() on the prototype would be a fatal error, so line
            // tracking cannot be requested through this path.
            $needs_tracking = false;
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Creates the entity parser used by the convenience methods.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Parses entities in text (non-attribute) character data.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config)
    {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses entities in attribute-value character data.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config)
    {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr Whether the data is an attribute value; selects
     *        the attribute or text entity substitution when the legacy
     *        decoder is disabled.
     * @param HTMLPurifier_Config $config Only consulted
     *        (%Core.LegacyEntityDecoder) when entities remain after the
     *        fast-path table substitution.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            // abort if no entities
            return $string;
        }
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // every remaining ampersand is accounted for by a decoded &amp;
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } elseif ($is_attr) {
            $string = $this->_entity_parser->substituteAttrEntities($string);
        } else {
            $string = $this->_entity_parser->substituteTextEntities($string);
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; subclasses are
     * expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This is only meant to be used as a PCRE callback from within
     *          this class; calling it directly is not recommended.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // Scripts are stripped up front only for untrusted HTML, and only
        // when script contents are configured to be removed and "script" is
        // listed as a hidden element.
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !$config->get('HTML.Trusted') &&
            $config->get('Core.RemoveScriptContents') &&
            !empty($hidden_elements["script"])
        ) {
            // NOTE(review): the pattern has no "s" modifier, so "." does not
            // match newlines and multi-line scripts are left for the lexer;
            // confirm whether that is intended.
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element, if one is found outside of a comment.
     *
     * @param string $html HTML fragment or document.
     * @return string Contents between the first <body ...> and the last
     *         </body>, or the input unchanged when there is no such pair or
     *         the opening tag appears to sit inside an unclosed comment.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment: look at the last comment
            // opener/closer in the text preceding the <body> tag.
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end   = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}
386
 
387
// vim: et sw=4 sts=4