1 | // Copyright (C) 2009 Google Inc. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | package com.google.caja.plugin.templates; |
16 | |
17 | import com.google.caja.SomethingWidgyHappenedError; |
18 | import com.google.caja.lang.html.HTML; |
19 | import com.google.caja.lang.html.HtmlSchema; |
20 | import com.google.caja.parser.html.AttribKey; |
21 | import com.google.caja.parser.html.ElKey; |
22 | import com.google.caja.parser.html.Nodes; |
23 | import com.google.caja.plugin.PluginMessageType; |
24 | import com.google.caja.reporting.Message; |
25 | import com.google.caja.reporting.MessageLevel; |
26 | import com.google.caja.reporting.MessagePart; |
27 | import com.google.caja.reporting.MessageQueue; |
28 | import com.google.caja.util.Criterion; |
29 | |
30 | import java.util.ArrayList; |
31 | import java.util.List; |
32 | |
33 | import org.w3c.dom.Attr; |
34 | import org.w3c.dom.Element; |
35 | import org.w3c.dom.NamedNodeMap; |
36 | import org.w3c.dom.Node; |
37 | |
38 | /** |
39 | * Rewrites an IHTML DOM, removing potentially unsafe constructs that |
40 | * can be ignored, and issuing errors if the constructs cannot be removed. |
41 | * |
42 | * @author mikesamuel@gmail.com |
43 | */ |
44 | public final class TemplateSanitizer { |
45 | private final HtmlSchema schema; |
46 | private final MessageQueue mq; |
47 | |
48 | /** |
49 | * @param schema specifies which tags and attributes are allowed, and which |
50 | * attribute values are allowed. |
51 | * @param mq a message queue that will receive errors on unsafe nodes or |
52 | * attributes, and warnings on removed nodes. |
53 | */ |
54 | public TemplateSanitizer(HtmlSchema schema, MessageQueue mq) { |
55 | this.schema = schema; |
56 | this.mq = mq; |
57 | } |
58 | |
59 | /** |
60 | * @param t the node to sanitize. |
61 | * @return true iff the htmlRoot can be safely used. If false, explanatory |
62 | * messages were added to the MessageQueue passed to the constructor. |
63 | */ |
64 | public boolean sanitize(Node t) { |
65 | boolean valid = true; |
66 | switch (t.getNodeType()) { |
67 | case Node.DOCUMENT_FRAGMENT_NODE: |
68 | for (Node child : Nodes.childrenOf(t)) { |
69 | sanitize(child); |
70 | } |
71 | break; |
72 | case Node.ELEMENT_NODE: |
73 | { |
74 | Element el = (Element) t; |
75 | ElKey elKey = ElKey.forElement(el); |
76 | { |
77 | if (!schema.isElementAllowed(elKey)) { |
78 | IhtmlMessageType msgType = schema.lookupElement(elKey) != null |
79 | ? IhtmlMessageType.UNSAFE_TAG |
80 | : IhtmlMessageType.UNKNOWN_TAG; |
81 | |
82 | // Figure out what to do with the disallowed tag. We can remove it |
83 | // from the node, replace it with its children (fold), or error out. |
84 | boolean ignore = false, fold = false; |
85 | Node p = el.getParentNode(); |
86 | if (p != null) { |
87 | if (isElementIgnorable(elKey)) { |
88 | ignore = true; |
89 | } else if (HtmlSchema.isElementFoldable(elKey)) { |
90 | fold = true; |
91 | msgType = IhtmlMessageType.FOLDING_ELEMENT; |
92 | } |
93 | } |
94 | |
95 | MessageLevel msgLevel |
96 | = ignore || fold ? MessageLevel.WARNING : msgType.getLevel(); |
97 | mq.getMessages().add(new Message( |
98 | msgType, msgLevel, Nodes.getFilePositionFor(el), elKey)); |
99 | |
100 | if (ignore) { |
101 | assert p != null; // ignore = true -> p != null above |
102 | p.removeChild(el); |
103 | return valid; // Don't recurse to children if removed. |
104 | } else { |
105 | // According to http://www.w3.org/TR/html401/appendix/notes.html |
106 | // the recommended behavior is to try to render an unrecognized |
107 | // element's contents |
108 | return valid & foldElement(elKey, el); |
109 | } |
110 | } |
111 | valid &= sanitizeAttrs(elKey, el, false); |
112 | } |
113 | // We know by construction of org.w3c.Element that there can only be |
114 | // one attribute with a given name. |
115 | // If that were not the case, passes that only inspect the |
116 | // first occurrence of an attribute could be spoofed. |
117 | break; |
118 | } |
119 | case Node.TEXT_NODE: |
120 | case Node.CDATA_SECTION_NODE: |
121 | case Node.COMMENT_NODE: |
122 | break; |
123 | default: |
124 | throw new SomethingWidgyHappenedError(t.getNodeName()); |
125 | } |
126 | for (Node child : Nodes.childrenOf(t)) { |
127 | valid &= sanitize(child); |
128 | } |
129 | return valid; |
130 | } |
131 | |
132 | private boolean sanitizeAttrs(ElKey elKey, Element el, boolean ignore) { |
133 | boolean valid = true; |
134 | // Iterate in reverse so that removed attributes don't break iteration. |
135 | NamedNodeMap attrs = el.getAttributes(); |
136 | for (int i = attrs.getLength(); --i >= 0;) { |
137 | valid &= sanitizeAttr(elKey, el, (Attr) attrs.item(i), ignore); |
138 | } |
139 | return valid; |
140 | } |
141 | |
142 | private boolean sanitizeAttr( |
143 | ElKey elKey, Element el, Attr attrib, boolean ignore) { |
144 | boolean valid = true; |
145 | AttribKey attrKey = AttribKey.forAttribute(elKey, attrib); |
146 | HTML.Attribute a = schema.lookupAttribute(attrKey); |
147 | if (null == a) { |
148 | if (!ignore) { |
149 | mq.getMessages().add(new Message( |
150 | PluginMessageType.UNKNOWN_ATTRIBUTE, MessageLevel.WARNING, |
151 | Nodes.getFilePositionFor(attrib), attrKey, elKey)); |
152 | } |
153 | valid &= removeBadAttribute(el, attrKey); |
154 | } else if (!schema.isAttributeAllowed(attrKey)) { |
155 | if (!ignore) { |
156 | mq.addMessage( |
157 | PluginMessageType.UNSAFE_ATTRIBUTE, |
158 | Nodes.getFilePositionFor(attrib), attrKey, elKey); |
159 | } |
160 | valid &= removeBadAttribute(el, attrKey); |
161 | } else { |
162 | Criterion<? super String> criteria = a.getValueCriterion(); |
163 | if (!criteria.accept(attrib.getNodeValue())) { |
164 | if (!ignore) { |
165 | mq.addMessage( |
166 | PluginMessageType.DISALLOWED_ATTRIBUTE_VALUE, |
167 | Nodes.getFilePositionForValue(attrib), |
168 | attrKey, MessagePart.Factory.valueOf(attrib.getNodeValue())); |
169 | } |
170 | valid &= removeBadAttribute(el, attrKey); |
171 | } |
172 | } |
173 | return valid; |
174 | } |
175 | |
176 | /** |
177 | * Elements that can be safely removed from the DOM without changing behavior. |
178 | */ |
179 | private static boolean isElementIgnorable(ElKey elKey) { |
180 | if (!elKey.isHtml()) { return false; } |
181 | String lcName = elKey.localName; |
182 | return "noscript".equals(lcName) || "noembed".equals(lcName) |
183 | || "noframes".equals(lcName) || "title".equals(lcName); |
184 | } |
185 | |
186 | /** |
187 | * Fold the children of a {@link HtmlSchema#isElementFoldable foldable} |
188 | * element into that element's parent. |
189 | * |
190 | * <p> |
191 | * This should have the property that:<ul> |
192 | * <li>Every element is processed |
193 | * <li>Elements can recursively fold |
194 | * <li>Folded elements that are implied (such as head when a title |
195 | * is present) don't break cajoling. |
196 | * <li>We don't fold elements that are explicitly allowed by the whitelist. |
197 | * <li>Nothing is removed from the parse tree without a notification |
198 | * to the user. |
199 | * </ul> |
200 | * |
201 | * @param el a tag with a mutable parent which will be modified in place. |
202 | * @return true iff the el's children are transitively valid, and if they |
203 | * could all be folded into the parent. |
204 | */ |
205 | private boolean foldElement(ElKey elKey, Element el) { |
206 | boolean valid = true; |
207 | |
208 | // Recurse to children to ensure that all nodes are processed. |
209 | valid &= sanitizeAttrs(elKey, el, true); |
210 | for (Node child : Nodes.childrenOf(el)) { valid &= sanitize(child); } |
211 | |
212 | for (Attr a : Nodes.attributesOf(el)) { |
213 | mq.addMessage( |
214 | PluginMessageType.CANNOT_FOLD_ATTRIBUTE, Nodes.getFilePositionFor(a), |
215 | MessagePart.Factory.valueOf(a.getNodeName()), |
216 | MessagePart.Factory.valueOf(el.getLocalName())); |
217 | } |
218 | |
219 | // Pick the subset of children to fold in. |
220 | List<Node> foldedChildren = new ArrayList<Node>(); |
221 | for (Node child : Nodes.childrenOf(el)) { |
222 | switch (child.getNodeType()) { |
223 | case Node.ELEMENT_NODE: case Node.TEXT_NODE: |
224 | case Node.CDATA_SECTION_NODE: |
225 | foldedChildren.add(child); |
226 | break; |
227 | default: |
228 | // Ignore. |
229 | } |
230 | } |
231 | |
232 | // Rebuild the sibling list, substituting foldedChildren for any occurrences |
233 | // of el.node. |
234 | Node next = el.getNextSibling(); |
235 | Node parent = el.getParentNode(); |
236 | parent.removeChild(el); |
237 | for (Node n : foldedChildren) { parent.insertBefore(n, next); } |
238 | |
239 | return valid; |
240 | } |
241 | |
242 | private boolean removeBadAttribute(Element el, AttribKey attrKey) { |
243 | el.removeAttributeNS(attrKey.ns.uri, attrKey.localName); |
244 | return true; |
245 | } |
246 | } |