1 | // Copyright (C) 2009 Google Inc. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | package com.google.caja.parser.html; |
16 | |
17 | import com.google.caja.util.Function; |
18 | import com.google.caja.util.Maps; |
19 | import com.google.caja.util.Strings; |
20 | |
21 | import java.util.Map; |
22 | import java.util.regex.Matcher; |
23 | import java.util.regex.Pattern; |
24 | |
25 | import org.w3c.dom.DOMImplementation; |
26 | import org.w3c.dom.DocumentType; |
27 | |
28 | class DoctypeMaker { |
29 | |
30 | public static Function<DOMImplementation, DocumentType> parse(String text) { |
31 | // We recognize a subset of the XML DOCTYPE grammar. Specifically, we |
32 | // do not recognize embedded entity declarations to avoid XXE, or |
33 | // annotations. |
34 | |
35 | // As noted above, we do not recognize the intSubset portion. |
36 | Matcher m = DOCTYPE_PATTERN.matcher(text); |
37 | if (!m.matches()) { return null; } |
38 | |
39 | String name = m.group(1), system2 = dequote(m.group(2)), |
40 | pubid = dequote(m.group(3)), system4 = dequote(m.group(4)); |
41 | final String system = system2 == null ? system4 : system2; |
42 | boolean isHtml = isHtml(name, pubid, system); |
43 | if (isHtml && name.indexOf(':') < 0) { |
44 | name = Strings.toLowerCase(name); |
45 | } |
46 | final String qname = name; |
47 | final String publicId = pubid; |
48 | final String systemId = system; |
49 | return new Function<DOMImplementation, DocumentType>() { |
50 | public DocumentType apply(DOMImplementation impl) { |
51 | return impl.createDocumentType(qname, publicId, systemId); |
52 | } |
53 | }; |
54 | } |
55 | |
56 | /** |
57 | * This implementation is based on the grammar in the |
58 | * <a href="http://www.w3.org/TR/REC-xml/#NT-doctypedecl">XML spec S 2.8</a> |
59 | */ |
60 | private static final Pattern DOCTYPE_PATTERN; |
61 | static { |
62 | // S ::= (#x20 | #x9 | #xD | #xA)+ |
63 | String s = "[ \\t\\r\\n]+"; |
64 | String sStar = "[ \\t\\r\\n]*"; |
65 | // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] |
66 | // | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
67 | // | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
68 | // | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
69 | // | [#x10000-#xEFFFF] |
70 | String nameStartCharSet = ( |
71 | "A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F" |
72 | + "\u1FFF\u200C\u200D\u2070-\u218F\u2C00\u2FEF\u3001\uD7FF\uF900-\uFDCF" |
73 | + "\uFDF0-\uFFFD"); |
74 | String nameStartChar = "[" + nameStartCharSet + "]"; |
75 | // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
76 | // | [#x0300-#x036F] | [#x203F-#x2040] |
77 | String nameChar = ( |
78 | "[" + nameStartCharSet + "\\-.0-9\u0087\u0300-\u036F\u203F-\u2040]"); |
79 | // Name ::= NameStartChar (NameChar)* |
80 | String name = "(?:" + nameStartChar + nameChar + "*)"; |
81 | // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") |
82 | String systemLiteral = "(?:\"[^\"]*\"|'[^']*')"; |
83 | // PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] |
84 | String pubidChar = "[ \\r\\na-zA-Z0-9\\-'()+,./:=?;!*#$_%]"; |
85 | // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" |
86 | String pubidLiteral = ( |
87 | "(?:\"" + pubidChar + "*\"|\'" + pubidChar.replace("'", "\"") + "*')"); |
88 | // ExternalID ::= 'SYSTEM' S SystemLiteral |
89 | // | 'PUBLEIC' S PubidLiteral S SystemLiteral |
90 | String externalId = ( |
91 | "(?:SYSTEM" + s + "(" + systemLiteral + ")" |
92 | + "|PUBLIC" + s + "("+ pubidLiteral + ")" |
93 | // XML does not allow the system id to be omitted, but HTML does. |
94 | // Also, whitespaces between public id and system id can be omitted. |
95 | + "(?:" + sStar + "(" + systemLiteral + "))?)"); |
96 | String intSubset = "[^\\]>]*"; |
97 | // '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' |
98 | // Groups: Name 1, SystemLiteral 2 or 4, PubidLiteral 3. |
99 | DOCTYPE_PATTERN = Pattern.compile( |
100 | "<!DOCTYPE" + s + "(" + name + ")(?:" + s + externalId + ")?" |
101 | + "(?:" + s + ")?(?:\\[" + intSubset + "\\](?:" + s + ")?)?>", |
102 | Pattern.CASE_INSENSITIVE); |
103 | } |
104 | |
105 | private static final Map<String, String> BY_SYSTEM_ID |
106 | = Maps.<String, String>immutableMap() |
107 | .put("http://www.w3.org/TR/html4/*.dtd", Namespaces.HTML_NAMESPACE_URI) |
108 | .put("http://www.w3.org/TR/xhtml1/DTD/*.dtd", |
109 | Namespaces.HTML_NAMESPACE_URI) |
110 | .put("http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/*.dtd", |
111 | Namespaces.SVG_NAMESPACE_URI) |
112 | .put("http://www.w3.org/Graphics/SVG/1.1/DTD/*.dtd", |
113 | Namespaces.SVG_NAMESPACE_URI) |
114 | .create(); |
115 | |
116 | public static String systemIdToNsUri(String systemId) { |
117 | String nsUri = BY_SYSTEM_ID.get(systemId); |
118 | if (nsUri == null && systemId != null) { |
119 | String wildcard = systemId.replaceFirst("/[^/]+\\.dtd$", "/*.dtd"); |
120 | nsUri = BY_SYSTEM_ID.get(wildcard); |
121 | } |
122 | return nsUri; |
123 | } |
124 | |
125 | private static String dequote(String s) { |
126 | if (s == null) { return s; } |
127 | int len = s.length(); |
128 | if (len < 2) { return s; } |
129 | char ch0 = s.charAt(0); |
130 | if (ch0 != '"' && ch0 != '\'') { return s; } |
131 | if (ch0 != s.charAt(len - 1)) { return s; } |
132 | return s.substring(1, len - 1); |
133 | } |
134 | |
135 | static boolean isHtml(String name, String pubid, String systemId) { |
136 | String nsUri = systemIdToNsUri(systemId); |
137 | if (nsUri != null && Namespaces.isHtml(nsUri)) { return true; } |
138 | if (pubid != null) { |
139 | pubid = Strings.toLowerCase(pubid).replaceAll("\\s+", " ").trim(); |
140 | return pubid.startsWith("-//w3c//dtd html ") |
141 | || pubid.startsWith("-//w3c//dtd xhtml ") |
142 | || pubid.startsWith("-//ietf//dtd html"); |
143 | } else if (systemId == null) { |
144 | // <!DOCTYPE html> |
145 | return Strings.equalsIgnoreCase("html", name); |
146 | } |
147 | return false; |
148 | } |
149 | } |