1 | // Copyright (C) 2005 Google Inc. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | package com.google.caja.lexer; |
16 | |
17 | import com.google.caja.util.CajaTestCase; |
18 | import com.google.caja.util.MoreAsserts; |
19 | import com.google.caja.util.TestUtil; |
20 | |
21 | import java.util.ArrayList; |
22 | import java.util.Arrays; |
23 | import java.util.List; |
24 | |
25 | /** |
26 | * |
27 | * @author mikesamuel@gmail.com |
28 | */ |
29 | public class HtmlLexerTest extends CajaTestCase { |
30 | |
31 | public final void testHtmlLexer() throws Exception { |
32 | // Do the lexing. |
33 | CharProducer p = fromResource("htmllexerinput1.html"); |
34 | StringBuilder actual = new StringBuilder(); |
35 | lex(new HtmlLexer(p), actual); |
36 | |
37 | // Get the golden. |
38 | String golden = fromResource("htmllexergolden1.txt").toString(); |
39 | |
40 | // Compare. |
41 | assertEquals(golden, actual.toString()); |
42 | } |
43 | |
44 | public final void testXmlLexer() throws Exception { |
45 | // Do the lexing. |
46 | CharProducer p = fromResource("htmllexerinput2.xml"); |
47 | StringBuilder actual = new StringBuilder(); |
48 | HtmlLexer lexer = new HtmlLexer(p); |
49 | lexer.setTreatedAsXml(true); |
50 | lex(lexer, actual); |
51 | |
52 | // Get the golden. |
53 | String golden = TestUtil.readResource(getClass(), "htmllexergolden2.txt"); |
54 | |
55 | // Compare. |
56 | assertEquals(golden, actual.toString()); |
57 | } |
58 | |
59 | public final void testEofInTag() throws Exception { |
60 | assertTokens("<div", true, "TAGBEGIN: <div"); |
61 | assertTokens("</div", true, "TAGBEGIN: </div"); |
62 | assertTokens("<div\n", true, "TAGBEGIN: <div"); |
63 | assertTokens("</div\n", true, "TAGBEGIN: </div"); |
64 | assertTokens("<div", false, "TAGBEGIN: <div"); |
65 | assertTokens("</div", false, "TAGBEGIN: </div"); |
66 | assertTokens("<div\n", false, "TAGBEGIN: <div"); |
67 | assertTokens("</div\n", false, "TAGBEGIN: </div"); |
68 | } |
69 | |
70 | public final void testPartialTagInCData() throws Exception { |
71 | assertTokens( |
72 | "<script>w('</b')</script>", false, |
73 | "TAGBEGIN: <script", |
74 | "TAGEND: >", |
75 | "UNESCAPED: w('</b')", |
76 | "TAGBEGIN: </script", |
77 | "TAGEND: >"); |
78 | } |
79 | |
80 | public final void testUrlEndingInSlashOutsideQuotes() throws Exception { |
81 | assertTokens( |
82 | "<a href=http://foo.com/>Clicky</a>", false, |
83 | "TAGBEGIN: <a", |
84 | "ATTRNAME: href", |
85 | "ATTRVALUE: http://foo.com/", |
86 | "TAGEND: >", |
87 | "TEXT: Clicky", |
88 | "TAGBEGIN: </a", |
89 | "TAGEND: >"); |
90 | assertTokens( |
91 | "<a href=http://foo.com/>Clicky</a>", true, |
92 | "TAGBEGIN: <a", |
93 | "ATTRNAME: href", |
94 | "ATTRVALUE: http://foo.com/", |
95 | "TAGEND: >", |
96 | "TEXT: Clicky", |
97 | "TAGBEGIN: </a", |
98 | "TAGEND: >"); |
99 | } |
100 | |
101 | public final void testShortTags() throws Exception { |
102 | // See comments in html-sanitizer-test.js as to why we don't bother with |
103 | // short tags. In short, they are not in HTML5 and not implemented properly |
104 | // in existing HTML4 clients. |
105 | assertTokens( |
106 | "<p<a href=\"/\">first part of the text</> second part", false, |
107 | "TAGBEGIN: <p", |
108 | "ATTRNAME: <a", |
109 | "ATTRNAME: href", |
110 | "ATTRVALUE: \"/\"", |
111 | "TAGEND: >", |
112 | "TEXT: first part of the text</> second part"); |
113 | assertTokens( |
114 | "<p/b/", false, |
115 | "TAGBEGIN: <p", |
116 | "ATTRNAME: /", |
117 | "ATTRNAME: b/"); |
118 | assertTokens( |
119 | "<p<b>", false, |
120 | "TAGBEGIN: <p", |
121 | "ATTRNAME: <b", |
122 | "TAGEND: >"); |
123 | } |
124 | |
125 | private void lex(HtmlLexer lexer, Appendable out) throws Exception { |
126 | int maxTypeLength = 0; |
127 | for (HtmlTokenType t : HtmlTokenType.values()) { |
128 | maxTypeLength = Math.max(maxTypeLength, t.name().length()); |
129 | } |
130 | |
131 | while (lexer.hasNext()) { |
132 | Token<HtmlTokenType> t = lexer.next(); |
133 | // Do C style escaping of the token text so that each token in the golden |
134 | // file can fit on one line. |
135 | String escaped = t.text.replace("\\", "\\\\").replace("\n", "\\n"); |
136 | String type = t.type.toString(); |
137 | while (type.length() < maxTypeLength) { type += " "; } |
138 | out.append(type).append(" [").append(escaped).append("] : ") |
139 | .append(t.pos.toString()).append("\n"); |
140 | } |
141 | } |
142 | |
143 | private void assertTokens(String markup, boolean asXml, String... golden) |
144 | throws ParseException { |
145 | HtmlLexer lexer = new HtmlLexer(fromString(markup)); |
146 | lexer.setTreatedAsXml(asXml); |
147 | List<String> actual = new ArrayList<String>(); |
148 | while (lexer.hasNext()) { |
149 | Token<HtmlTokenType> t = lexer.next(); |
150 | actual.add(t.type + ": " + t.text); |
151 | } |
152 | MoreAsserts.assertListsEqual(Arrays.asList(golden), actual); |
153 | } |
154 | } |