1 | // Copyright (C) 2010 Google Inc. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | package com.google.caja.lexer; |
16 | |
17 | import java.io.ByteArrayOutputStream; |
18 | import java.io.IOException; |
19 | import java.io.InputStream; |
20 | import java.io.InputStreamReader; |
21 | import java.io.Reader; |
22 | import java.io.StringReader; |
23 | import java.io.UnsupportedEncodingException; |
24 | import java.nio.charset.Charset; |
25 | import java.nio.charset.IllegalCharsetNameException; |
26 | import java.nio.charset.UnsupportedCharsetException; |
27 | |
28 | import com.google.caja.SomethingWidgyHappenedError; |
29 | import com.google.caja.util.Pair; |
30 | |
31 | import org.mozilla.intl.chardet.nsDetector; |
32 | import org.mozilla.intl.chardet.nsICharsetDetectionObserver; |
33 | import org.mozilla.intl.chardet.nsPSMDetector; |
34 | |
35 | /** |
36 | * Utilities for dealing with converting byte streams with unknown character |
37 | * sets to character streams. |
38 | * |
39 | * @author Mike Samuel <mikesamuel@gmail.com> |
40 | */ |
41 | public final class Chardet { |
42 | private Chardet() { /* uninstantiable */ } |
43 | |
44 | private static final String UTF8 = "UTF-8"; |
45 | private static final String UTF16BE = "UTF-16BE"; |
46 | private static final String UTF16LE = "UTF-16LE"; |
47 | private static final String UTF32BE = "UTF-32BE"; |
48 | private static final String UTF32LE = "UTF-32LE"; |
49 | private static final String UTF7 = "UTF-7"; |
50 | private static final String UTF1 = "UTF-1"; |
51 | private static final String ISO_8859_1 = "ISO-8859-1"; |
52 | |
53 | /** |
54 | * Given a byte stream, figure out an encoding and return a character stream |
55 | * and the encoding used to convert bytes to characters. |
56 | */ |
57 | public static Pair<Reader, String> guessCharset(InputStream in) |
58 | throws IOException { |
59 | ByteArrayOutputStream buffered = new ByteArrayOutputStream(); |
60 | byte[] buf = new byte[1024]; |
61 | boolean isAscii = true; |
62 | int len = in.read(buf); |
63 | if (len <= 0) { return Pair.pair((Reader) new StringReader(""), UTF8); } |
64 | String charset = findCharset(buf, len); |
65 | if (charset != null) { |
66 | // If the charset is specified in the document, use that. |
67 | buffered.write(buf, 0, len); |
68 | // Otherwise, look for a BOM at the start of the content. |
69 | } else if (hasUtf8BOM(buf, len)) { |
70 | charset = UTF8; |
71 | buffered.write(buf, 3, len - 3); |
72 | // Check UTF32 before UTF16 since a little endian UTF16 BOM is a prefix of |
73 | // a little endian UTF32 BOM. |
74 | } else if (hasUtf32BEBOM(buf, len)) { |
75 | charset = UTF32BE; |
76 | buffered.write(buf, 4, len - 4); |
77 | } else if (hasUtf32LEBOM(buf, len)) { |
78 | charset = UTF32LE; |
79 | buffered.write(buf, 4, len - 4); |
80 | } else if (hasUtf16BEBOM(buf, len)) { |
81 | charset = UTF16BE; |
82 | buffered.write(buf, 2, len - 2); |
83 | } else if (hasUtf16LEBOM(buf, len)) { |
84 | charset = UTF16LE; |
85 | buffered.write(buf, 2, len - 2); |
86 | } else if (hasUtf7BOM(buf, len)) { |
87 | charset = UTF7; |
88 | buffered.write(buf, 4, len - 4); |
89 | } else if (hasUtf1BOM(buf, len)) { |
90 | charset = UTF1; |
91 | buffered.write(buf, 3, len - 3); |
92 | } else { |
93 | // Use jchardet which tries a variety of heuristics to choose an encoding. |
94 | nsDetector det = new nsDetector(nsPSMDetector.ALL); |
95 | class Observer implements nsICharsetDetectionObserver { |
96 | String charset; |
97 | public void Notify(String charset) { |
98 | this.charset = charset; |
99 | } |
100 | } |
101 | // The below is adapted from the main method in HtmlCharsetDetector. |
102 | Observer observer = new Observer(); |
103 | det.Init(observer); |
104 | do { |
105 | buffered.write(buf, 0, len); |
106 | if (isAscii) { isAscii = det.isAscii(buf, len); } |
107 | if (!isAscii) { |
108 | if (det.DoIt(buf, len, false)) { break; } |
109 | } |
110 | } while ((len = in.read(buf)) > 0); |
111 | det.DataEnd(); |
112 | charset = observer.charset; |
113 | } |
114 | if (charset != null) { charset = supportedCharsetName(charset); } |
115 | if (charset == null) { charset = UTF8; } |
116 | return Pair.pair( |
117 | joinStreamsWithCharset(buffered.toByteArray(), in, charset), |
118 | charset); |
119 | } |
120 | |
121 | private static final byte[] CHARSET_BYTES; |
122 | private static final byte[] ENCODING_BYTES; |
123 | static { |
124 | try { |
125 | CHARSET_BYTES = "charset".getBytes(ISO_8859_1); |
126 | ENCODING_BYTES = "encoding".getBytes(ISO_8859_1); |
127 | } catch (UnsupportedEncodingException ex) { |
128 | throw new SomethingWidgyHappenedError(ex); |
129 | } |
130 | } |
131 | |
132 | /** |
133 | * Looks for sequences like {@code charset="..."} inside angle brackets to |
134 | * match {@code <meta value="text/html;charset=...">} and after {@code <?} |
135 | * sequences like {@code encoding="..."} to match XML prologs. |
136 | */ |
137 | private static String findCharset(final byte[] buf, final int len) { |
138 | for (int i = 0; i < len; ++i) { |
139 | if ('<' != buf[i]) { continue; } |
140 | byte lastByte = '<'; |
141 | byte[] attrBytes = CHARSET_BYTES; |
142 | // Now we're inside <, so look for attrBytes. |
143 | for (int j = i + 1, n = len; j < n; ++j) { |
144 | byte b = buf[j]; |
145 | if (b == 0) { continue; } |
146 | if (b == '?' && lastByte == '<') { attrBytes = ENCODING_BYTES; } |
147 | if ((b | 0x20) == attrBytes[0] && !isAlnum(lastByte)) { |
148 | int wordLen = attrBytes.length; |
149 | int pos = j + 1, k = 1; |
150 | // Match attrBytes against buf[pos:] |
151 | while (pos < n && k < wordLen) { |
152 | b = buf[pos]; |
153 | if (b == 0 || b == '-') { // Skip over NULs in UTF-16 and UTF-32. |
154 | ++pos; |
155 | } else if ((b | 0x20) == attrBytes[k]) { |
156 | ++k; |
157 | ++pos; |
158 | } else { |
159 | break; |
160 | } |
161 | } |
162 | if (k == wordLen) { |
163 | // Now we've found the attribute or parameter name. |
164 | // Skip over spaces and NULs looking for '=' |
165 | while (pos < len) { |
166 | b = buf[pos]; |
167 | if (b == '=') { |
168 | // Skip over spaces and NULs looking for alnum or quote. |
169 | while (++pos < len) { |
170 | b = buf[pos]; |
171 | if (b == 0 || isSpace(b)) { continue; } |
172 | int start; |
173 | if (b == '"' || b == '\'') { |
174 | start = pos + 1; |
175 | } else if (isAlnum(b)) { |
176 | start = pos; |
177 | } else { |
178 | break; |
179 | } |
180 | int end = start; |
181 | boolean sawLetter = false; |
182 | // Now, find the end of the charset. |
183 | while (end < len) { |
184 | b = buf[end]; |
185 | if (b == 0 || b == '-' || b == '_') { |
186 | ++end; |
187 | } else if (isAlnum(b)) { |
188 | sawLetter = true; |
189 | ++end; |
190 | } else { |
191 | break; |
192 | } |
193 | } |
194 | if (sawLetter) { |
195 | StringBuilder sb = new StringBuilder(end - start); |
196 | for (int bi = start; bi < end; ++bi) { |
197 | if (buf[bi] != 0) { sb.append((char) buf[bi]); } |
198 | } |
199 | // Only use the charset if it's recognized. |
200 | // Otherwise, we continue looking. |
201 | String charset = supportedCharsetName(sb.toString()); |
202 | if (charset != null) { return charset; } |
203 | } |
204 | } |
205 | break; |
206 | } |
207 | if (b != 0 && !isSpace(b)) { |
208 | break; |
209 | } |
210 | ++pos; |
211 | } |
212 | } |
213 | if (b == '<' || b == '>') { |
214 | i = pos - 1; |
215 | break; |
216 | } |
217 | } else if (b == '<' || b == '>') { |
218 | i = j - 1; |
219 | break; |
220 | } |
221 | lastByte = buf[j]; |
222 | } |
223 | } |
224 | return null; |
225 | } |
226 | |
227 | /** |
228 | * Produces a character stream from an underlying byte stream. |
229 | * @param buffered lookahead bytes read from tail. |
230 | * @param tail the unread portion of the stream |
231 | * @param charset the character set to use to decode the bytes in buffered and |
232 | * tail. |
233 | */ |
234 | private static Reader joinStreamsWithCharset( |
235 | byte[] buffered, InputStream tail, String charset) |
236 | throws IOException { |
237 | |
238 | class JoinedStream extends InputStream { |
239 | byte[] buffered; |
240 | int pos; |
241 | final InputStream tail; |
242 | |
243 | JoinedStream(byte[] buffered, InputStream tail) { |
244 | this.buffered = buffered; |
245 | this.tail = tail; |
246 | } |
247 | |
248 | @Override |
249 | public int read() throws IOException { |
250 | if (buffered != null) { |
251 | if (pos < buffered.length) { return buffered[pos++]; } |
252 | buffered = null; |
253 | } |
254 | return tail.read(); |
255 | } |
256 | |
257 | @Override |
258 | public int read(byte[] out, int off, int len) throws IOException { |
259 | int nRead = 0; |
260 | if (buffered != null) { |
261 | int avail = buffered.length - pos; |
262 | if (avail != 0) { |
263 | int k = Math.min(len, avail); |
264 | int p1 = pos + k; |
265 | int p2 = off + k; |
266 | pos = p1; |
267 | while (--p2 >= off) { out[p2] = buffered[--p1]; } |
268 | off += k; |
269 | len -= k; |
270 | nRead = k; |
271 | } else { |
272 | buffered = null; |
273 | } |
274 | } |
275 | if (len == 0) { return nRead; } |
276 | int nFromTail = tail.read(out, off, len); |
277 | if (nFromTail > 0) { return nFromTail + nRead; } |
278 | return nRead != 0 ? nRead : -1; |
279 | } |
280 | |
281 | @Override |
282 | public void close() throws IOException { |
283 | buffered = null; |
284 | tail.close(); |
285 | } |
286 | } |
287 | |
288 | return new InputStreamReader(new JoinedStream(buffered, tail), charset); |
289 | } |
290 | |
291 | private static boolean isAlnum(byte b) { |
292 | if (b < '0' || b > 'z') { return false; } |
293 | if (b < 'A') { return b <= '9'; } |
294 | return b >= 'a' || b <= 'Z'; |
295 | } |
296 | |
297 | private static boolean isSpace(byte b) { |
298 | return b <= ' ' |
299 | && (b == ' ' || b == '\r' || b == '\n' || b == '\t' || b == '\f'); |
300 | } |
301 | |
302 | static String supportedCharsetName(String s) { |
303 | try { |
304 | return Charset.forName(s).name(); |
305 | } catch (UnsupportedCharsetException ex) { |
306 | return null; |
307 | } catch (IllegalCharsetNameException ex) { |
308 | return null; |
309 | } |
310 | } |
311 | |
312 | private static final byte |
313 | _00 = (byte) 0, |
314 | _2B = (byte) 0x2b, |
315 | _2F = (byte) 0x2f, |
316 | _38 = (byte) 0x38, |
317 | _39 = (byte) 0x39, |
318 | _4C = (byte) 0x4c, |
319 | _64 = (byte) 0x64, |
320 | _76 = (byte) 0x76, |
321 | _BB = (byte) 0xbb, |
322 | _BF = (byte) 0xbf, |
323 | _EF = (byte) 0xef, |
324 | _F7 = (byte) 0xf7, |
325 | _FE = (byte) 0xfe, |
326 | _FF = (byte) 0xff; |
327 | |
328 | // See http://en.wikipedia.org/wiki/Byte_order_mark for a table of byte |
329 | // sequences. |
330 | private static boolean hasUtf8BOM(byte[] b, int len) { |
331 | return len >= 3 && b[0] == _EF && b[1] == _BB && b[2] == _BF; |
332 | } |
333 | |
334 | private static boolean hasUtf16BEBOM(byte[] b, int len) { |
335 | return len >= 2 && b[0] == _FE && b[1] == _FF; |
336 | } |
337 | |
338 | private static boolean hasUtf16LEBOM(byte[] b, int len) { |
339 | return len >= 2 && b[0] == _FF && b[1] == _FE; |
340 | } |
341 | |
342 | private static boolean hasUtf32BEBOM(byte[] b, int len) { |
343 | return len >= 4 && b[0] == _00 && b[1] == _00 |
344 | && b[2] == _FE && b[3] == _FF; |
345 | } |
346 | |
347 | private static boolean hasUtf32LEBOM(byte[] b, int len) { |
348 | return len >= 4 && b[0] == _FF && b[1] == _FE |
349 | && b[2] == _00 && b[3] == _00; |
350 | } |
351 | |
352 | private static boolean hasUtf7BOM(byte[] b, int len) { |
353 | if (len < 4 || b[0] != _2B || b[1] != _2F || b[2] != _76) { |
354 | return false; |
355 | } |
356 | byte b3 = b[3]; |
357 | return b3 == _38 || b3 == _39 || b3 == _2B || b3 == _2F; |
358 | } |
359 | |
360 | private static boolean hasUtf1BOM(byte[] b, int len) { |
361 | return len >= 3 && b[0] == _F7 && b[1] == _64 && b[2] == _4C; |
362 | } |
363 | } |