<?php

/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all
 * the nodes and determine whether or not their children conform to the
 * element's definition.  If they do not, the child definition may
 * optionally supply an amended list of elements that is valid or
 * require that the entire node be deleted (and the previous node
 * rescanned).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be separated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.  This is
 *       easier with our new algorithm.
 */

class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Converts the token list to a node tree, validates every element's
     * children in post-order (children before their parent), and flattens
     * the corrected tree back into a token list.
     *
     * For each element node, once all of its element children have been
     * handled:
     *  - if exclusions are enabled and the node's name is in the inherited
     *    exclusion set, the node is marked dead;
     *  - otherwise its non-dead children are passed to the element
     *    definition's child validator, whose result either keeps them
     *    (true), marks the node dead (false), or replaces them (array).
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        //
        // NOTE(review): $is_inline, $node and $token are handed to
        // $context->register() and then reassigned inside the loop below;
        // presumably register() binds them by reference so the context
        // always sees the current values -- confirm against
        // HTMLPurifier_Context before renaming or re-scoping any of them.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // Thus, we will represent a stack frame as
        //      array($node, $is_inline, $excludes, $ix)
        // where $excludes is the exclusion set inherited from the
        // ancestors and $ix is the index of the next child of $node that
        // still has to be visited.

        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0,
            ),
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // an empty stack right after the pop means this frame belongs
            // to the top node, which is described by the parent definition
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // recursive call: find the next element child, re-push the
            // current frame (so we resume after that child) and push a
            // fresh frame for the child on top of it
            $go = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes)
                            ? $excludes
                            : array_merge($excludes, $def->excludes),
                        0,
                    );
                    break;
                }
            }
            if ($go) {
                continue;
            }

            // all children are done; refresh the current token before
            // validating this node
            list($token) = $node->toTokenPair();

            // base case
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
                continue;
            }

            // XXX I suppose it would be slightly more efficient to
            // avoid the allocation here and have children
            // strategies handle it
            $children = array();
            foreach ($node->children as $child) {
                if (!$child->dead) {
                    $children[] = $child;
                }
            }

            $result = $def->child->validateChildren($children, $config, $context);
            if ($result === true) {
                // children are valid as they stand; only the dead ones
                // filtered out above are dropped
                $node->children = $children;
            } elseif ($result === false) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                }
            } else {
                // the validator supplied a replacement child list
                $node->children = $result;
                if ($e) {
                    // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                    if (empty($result) && !empty($children)) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } elseif ($result != $children) {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // the last frame popped is always the top node's, so $node is the
        // root of the tree at this point
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}

// vim: et sw=4 sts=4