<?php

/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize().
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        lexer prototype (or lexer name) directly is still tolerated
     *        but deprecated and raises an E_USER_WARNING.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, no
     *         lexer could be instantiated, or line-number tracking is
     *         required but the chosen lexer does not support it.
     */
    public static function create($config)
    {
        if (!($config instanceof HTMLPurifier_Config)) {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // There is no configuration object on this deprecated path, so
            // there are no tracking directives to read. Calling get() on the
            // prototype itself would be a fatal error.
            $needs_tracking = false;
        } else {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            // a ready-made prototype was supplied; use it as-is
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    // this method steers tracking requests to DirectLex
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser shared by the convenience methods.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses entities in character data found in a text context.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config)
    {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses entities in character data found in an attribute value.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config)
    {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr Whether the data comes from an attribute value
     *        (true) or from text content (false); selects which entity
     *        substitution is used when the legacy decoder is disabled.
     * @param HTMLPurifier_Config $config Only consulted when entities other
     *        than the special ones in $_special_entity2str remain.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities
        // each &amp; leaves a literal ampersand behind after translation, so
        // that many ampersands in the result are accounted for
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an error; subclasses are
     * expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This method is protected; it is only meant to be used as the
     *          preg_replace_callback() handler registered from within this
     *          class, not called directly.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // strict comparison: loose != can treat numeric-looking strings
            // as equal
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // only strip <script> blocks up front when script contents would be
        // removed anyway: untrusted HTML, Core.RemoveScriptContents enabled,
        // and "script" listed in Core.HiddenElements
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element, or the input unchanged when no usable body
     * element is found.
     *
     * @param string $html HTML fragment or document.
     * @return string Contents between <body ...> and the last </body>, or
     *         $html itself if there is no such pair or the opening tag
     *         appears to sit inside an unclosed comment.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}

// vim: et sw=4 sts=4