001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.FileInputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import java.io.InputStreamReader; 026import java.io.Reader; 027import java.io.StringReader; 028import java.net.HttpURLConnection; 029import java.net.URL; 030import java.net.URLConnection; 031import java.text.MessageFormat; 032import java.util.Locale; 033import java.util.regex.Matcher; 034import java.util.regex.Pattern; 035 036import org.apache.commons.io.ByteOrderMark; 037 038/** 039 * Character stream that handles all the necessary Voodoo to figure out the 040 * charset encoding of the XML document within the stream. 041 * <p> 042 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. 043 * This one IS a character stream. 044 * <p> 045 * All this has to be done without consuming characters from the stream, if not 046 * the XML parser will not recognized the document as a valid XML. This is not 047 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers 048 * right now, XmlStreamReader handles it and things work in all parsers). 049 * <p> 050 * The XmlStreamReader class handles the charset encoding of XML documents in 051 * Files, raw streams and HTTP streams by offering a wide set of constructors. 052 * <p> 053 * By default the charset encoding detection is lenient, the constructor with 054 * the lenient flag can be used for a script (following HTTP MIME and XML 055 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a 056 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> 057 * Determining the character encoding of a feed</a>. 058 * <p> 059 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under 060 * Apache License 2.0. 061 * 062 * @see org.apache.commons.io.output.XmlStreamWriter 063 * @since 2.0 064 */ 065public class XmlStreamReader extends Reader { 066 private static final int BUFFER_SIZE = 4096; 067 068 private static final String UTF_8 = "UTF-8"; 069 070 private static final String US_ASCII = "US-ASCII"; 071 072 private static final String UTF_16BE = "UTF-16BE"; 073 074 private static final String UTF_16LE = "UTF-16LE"; 075 076 private static final String UTF_32BE = "UTF-32BE"; 077 078 private static final String UTF_32LE = "UTF-32LE"; 079 080 private static final String UTF_16 = "UTF-16"; 081 082 private static final String UTF_32 = "UTF-32"; 083 084 private static final String EBCDIC = "CP1047"; 085 086 private static final ByteOrderMark[] BOMS = new ByteOrderMark[] { 087 ByteOrderMark.UTF_8, 088 ByteOrderMark.UTF_16BE, 089 ByteOrderMark.UTF_16LE, 090 ByteOrderMark.UTF_32BE, 091 ByteOrderMark.UTF_32LE 092 }; 093 094 // UTF_16LE and UTF_32LE have the same two starting BOM bytes. 095 private static final ByteOrderMark[] XML_GUESS_BYTES = new ByteOrderMark[] { 096 new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 097 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), 098 new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 099 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 100 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D), 101 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 102 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00), 103 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) 104 }; 105 106 private final Reader reader; 107 108 private final String encoding; 109 110 private final String defaultEncoding; 111 112 /** 113 * Returns the default encoding to use if none is set in HTTP content-type, 114 * XML prolog and the rules based on content-type are not adequate. 115 * <p> 116 * If it is NULL the content-type based rules are used. 117 * 118 * @return the default encoding to use. 119 */ 120 public String getDefaultEncoding() { 121 return defaultEncoding; 122 } 123 124 /** 125 * Creates a Reader for a File. 126 * <p> 127 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, 128 * if this is also missing defaults to UTF-8. 129 * <p> 130 * It does a lenient charset encoding detection, check the constructor with 131 * the lenient parameter for details. 132 * 133 * @param file File to create a Reader from. 134 * @throws IOException thrown if there is a problem reading the file. 135 */ 136 public XmlStreamReader(final File file) throws IOException { 137 this(new FileInputStream(file)); 138 } 139 140 /** 141 * Creates a Reader for a raw InputStream. 142 * <p> 143 * It follows the same logic used for files. 144 * <p> 145 * It does a lenient charset encoding detection, check the constructor with 146 * the lenient parameter for details. 147 * 148 * @param is InputStream to create a Reader from. 149 * @throws IOException thrown if there is a problem reading the stream. 150 */ 151 public XmlStreamReader(final InputStream is) throws IOException { 152 this(is, true); 153 } 154 155 /** 156 * Creates a Reader for a raw InputStream. 157 * <p> 158 * It follows the same logic used for files. 159 * <p> 160 * If lenient detection is indicated and the detection above fails as per 161 * specifications it then attempts the following: 162 * <p> 163 * If the content type was 'text/html' it replaces it with 'text/xml' and 164 * tries the detection again. 165 * <p> 166 * Else if the XML prolog had a charset encoding that encoding is used. 167 * <p> 168 * Else if the content type had a charset encoding that encoding is used. 169 * <p> 170 * Else 'UTF-8' is used. 171 * <p> 172 * If lenient detection is indicated an XmlStreamReaderException is never 173 * thrown. 174 * 175 * @param is InputStream to create a Reader from. 176 * @param lenient indicates if the charset encoding detection should be 177 * relaxed. 178 * @throws IOException thrown if there is a problem reading the stream. 179 * @throws XmlStreamReaderException thrown if the charset encoding could not 180 * be determined according to the specs. 181 */ 182 public XmlStreamReader(final InputStream is, final boolean lenient) throws IOException { 183 this(is, lenient, null); 184 } 185 186 /** 187 * Creates a Reader for a raw InputStream. 188 * <p> 189 * It follows the same logic used for files. 190 * <p> 191 * If lenient detection is indicated and the detection above fails as per 192 * specifications it then attempts the following: 193 * <p> 194 * If the content type was 'text/html' it replaces it with 'text/xml' and 195 * tries the detection again. 196 * <p> 197 * Else if the XML prolog had a charset encoding that encoding is used. 198 * <p> 199 * Else if the content type had a charset encoding that encoding is used. 200 * <p> 201 * Else 'UTF-8' is used. 202 * <p> 203 * If lenient detection is indicated an XmlStreamReaderException is never 204 * thrown. 205 * 206 * @param is InputStream to create a Reader from. 207 * @param lenient indicates if the charset encoding detection should be 208 * relaxed. 209 * @param defaultEncoding The default encoding 210 * @throws IOException thrown if there is a problem reading the stream. 211 * @throws XmlStreamReaderException thrown if the charset encoding could not 212 * be determined according to the specs. 213 */ 214 public XmlStreamReader(final InputStream is, final boolean lenient, final String defaultEncoding) 215 throws IOException { 216 this.defaultEncoding = defaultEncoding; 217 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 218 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 219 this.encoding = doRawStream(bom, pis, lenient); 220 this.reader = new InputStreamReader(pis, encoding); 221 } 222 223 /** 224 * Creates a Reader using the InputStream of a URL. 225 * <p> 226 * If the URL is not of type HTTP and there is not 'content-type' header in 227 * the fetched data it uses the same logic used for Files. 228 * <p> 229 * If the URL is a HTTP Url or there is a 'content-type' header in the 230 * fetched data it uses the same logic used for an InputStream with 231 * content-type. 232 * <p> 233 * It does a lenient charset encoding detection, check the constructor with 234 * the lenient parameter for details. 235 * 236 * @param url URL to create a Reader from. 237 * @throws IOException thrown if there is a problem reading the stream of 238 * the URL. 239 */ 240 public XmlStreamReader(final URL url) throws IOException { 241 this(url.openConnection(), null); 242 } 243 244 /** 245 * Creates a Reader using the InputStream of a URLConnection. 246 * <p> 247 * If the URLConnection is not of type HttpURLConnection and there is not 248 * 'content-type' header in the fetched data it uses the same logic used for 249 * files. 250 * <p> 251 * If the URLConnection is a HTTP Url or there is a 'content-type' header in 252 * the fetched data it uses the same logic used for an InputStream with 253 * content-type. 254 * <p> 255 * It does a lenient charset encoding detection, check the constructor with 256 * the lenient parameter for details. 257 * 258 * @param conn URLConnection to create a Reader from. 259 * @param defaultEncoding The default encoding 260 * @throws IOException thrown if there is a problem reading the stream of 261 * the URLConnection. 262 */ 263 public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException { 264 this.defaultEncoding = defaultEncoding; 265 final boolean lenient = true; 266 final String contentType = conn.getContentType(); 267 final InputStream is = conn.getInputStream(); 268 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 269 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 270 if (conn instanceof HttpURLConnection || contentType != null) { 271 this.encoding = doHttpStream(bom, pis, contentType, lenient); 272 } else { 273 this.encoding = doRawStream(bom, pis, lenient); 274 } 275 this.reader = new InputStreamReader(pis, encoding); 276 } 277 278 /** 279 * Creates a Reader using an InputStream and the associated content-type 280 * header. 281 * <p> 282 * First it checks if the stream has BOM. If there is not BOM checks the 283 * content-type encoding. If there is not content-type encoding checks the 284 * XML prolog encoding. If there is not XML prolog encoding uses the default 285 * encoding mandated by the content-type MIME type. 286 * <p> 287 * It does a lenient charset encoding detection, check the constructor with 288 * the lenient parameter for details. 289 * 290 * @param is InputStream to create the reader from. 291 * @param httpContentType content-type header to use for the resolution of 292 * the charset encoding. 293 * @throws IOException thrown if there is a problem reading the file. 294 */ 295 public XmlStreamReader(final InputStream is, final String httpContentType) 296 throws IOException { 297 this(is, httpContentType, true); 298 } 299 300 /** 301 * Creates a Reader using an InputStream and the associated content-type 302 * header. This constructor is lenient regarding the encoding detection. 303 * <p> 304 * First it checks if the stream has BOM. If there is not BOM checks the 305 * content-type encoding. If there is not content-type encoding checks the 306 * XML prolog encoding. If there is not XML prolog encoding uses the default 307 * encoding mandated by the content-type MIME type. 308 * <p> 309 * If lenient detection is indicated and the detection above fails as per 310 * specifications it then attempts the following: 311 * <p> 312 * If the content type was 'text/html' it replaces it with 'text/xml' and 313 * tries the detection again. 314 * <p> 315 * Else if the XML prolog had a charset encoding that encoding is used. 316 * <p> 317 * Else if the content type had a charset encoding that encoding is used. 318 * <p> 319 * Else 'UTF-8' is used. 320 * <p> 321 * If lenient detection is indicated an XmlStreamReaderException is never 322 * thrown. 323 * 324 * @param is InputStream to create the reader from. 325 * @param httpContentType content-type header to use for the resolution of 326 * the charset encoding. 327 * @param lenient indicates if the charset encoding detection should be 328 * relaxed. 329 * @param defaultEncoding The default encoding 330 * @throws IOException thrown if there is a problem reading the file. 331 * @throws XmlStreamReaderException thrown if the charset encoding could not 332 * be determined according to the specs. 333 */ 334 public XmlStreamReader(final InputStream is, final String httpContentType, 335 final boolean lenient, final String defaultEncoding) throws IOException { 336 this.defaultEncoding = defaultEncoding; 337 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(is, BUFFER_SIZE), false, BOMS); 338 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 339 this.encoding = doHttpStream(bom, pis, httpContentType, lenient); 340 this.reader = new InputStreamReader(pis, encoding); 341 } 342 343 /** 344 * Creates a Reader using an InputStream and the associated content-type 345 * header. This constructor is lenient regarding the encoding detection. 346 * <p> 347 * First it checks if the stream has BOM. If there is not BOM checks the 348 * content-type encoding. If there is not content-type encoding checks the 349 * XML prolog encoding. If there is not XML prolog encoding uses the default 350 * encoding mandated by the content-type MIME type. 351 * <p> 352 * If lenient detection is indicated and the detection above fails as per 353 * specifications it then attempts the following: 354 * <p> 355 * If the content type was 'text/html' it replaces it with 'text/xml' and 356 * tries the detection again. 357 * <p> 358 * Else if the XML prolog had a charset encoding that encoding is used. 359 * <p> 360 * Else if the content type had a charset encoding that encoding is used. 361 * <p> 362 * Else 'UTF-8' is used. 363 * <p> 364 * If lenient detection is indicated an XmlStreamReaderException is never 365 * thrown. 366 * 367 * @param is InputStream to create the reader from. 368 * @param httpContentType content-type header to use for the resolution of 369 * the charset encoding. 370 * @param lenient indicates if the charset encoding detection should be 371 * relaxed. 372 * @throws IOException thrown if there is a problem reading the file. 373 * @throws XmlStreamReaderException thrown if the charset encoding could not 374 * be determined according to the specs. 375 */ 376 public XmlStreamReader(final InputStream is, final String httpContentType, 377 final boolean lenient) throws IOException { 378 this(is, httpContentType, lenient, null); 379 } 380 381 /** 382 * Returns the charset encoding of the XmlStreamReader. 383 * 384 * @return charset encoding. 385 */ 386 public String getEncoding() { 387 return encoding; 388 } 389 390 /** 391 * Invokes the underlying reader's <code>read(char[], int, int)</code> method. 392 * @param buf the buffer to read the characters into 393 * @param offset The start offset 394 * @param len The number of bytes to read 395 * @return the number of characters read or -1 if the end of stream 396 * @throws IOException if an I/O error occurs 397 */ 398 @Override 399 public int read(final char[] buf, final int offset, final int len) throws IOException { 400 return reader.read(buf, offset, len); 401 } 402 403 /** 404 * Closes the XmlStreamReader stream. 405 * 406 * @throws IOException thrown if there was a problem closing the stream. 407 */ 408 @Override 409 public void close() throws IOException { 410 reader.close(); 411 } 412 413 /** 414 * Process the raw stream. 415 * 416 * @param bom BOMInputStream to detect byte order marks 417 * @param pis BOMInputStream to guess XML encoding 418 * @param lenient indicates if the charset encoding detection should be 419 * relaxed. 420 * @return the encoding to be used 421 * @throws IOException thrown if there is a problem reading the stream. 422 */ 423 private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient) 424 throws IOException { 425 final String bomEnc = bom.getBOMCharsetName(); 426 final String xmlGuessEnc = pis.getBOMCharsetName(); 427 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 428 try { 429 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 430 } catch (final XmlStreamReaderException ex) { 431 if (lenient) { 432 return doLenientDetection(null, ex); 433 } else { 434 throw ex; 435 } 436 } 437 } 438 439 /** 440 * Process a HTTP stream. 441 * 442 * @param bom BOMInputStream to detect byte order marks 443 * @param pis BOMInputStream to guess XML encoding 444 * @param httpContentType The HTTP content type 445 * @param lenient indicates if the charset encoding detection should be 446 * relaxed. 447 * @return the encoding to be used 448 * @throws IOException thrown if there is a problem reading the stream. 449 */ 450 private String doHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType, 451 final boolean lenient) throws IOException { 452 final String bomEnc = bom.getBOMCharsetName(); 453 final String xmlGuessEnc = pis.getBOMCharsetName(); 454 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 455 try { 456 return calculateHttpEncoding(httpContentType, bomEnc, 457 xmlGuessEnc, xmlEnc, lenient); 458 } catch (final XmlStreamReaderException ex) { 459 if (lenient) { 460 return doLenientDetection(httpContentType, ex); 461 } else { 462 throw ex; 463 } 464 } 465 } 466 467 /** 468 * Do lenient detection. 469 * 470 * @param httpContentType content-type header to use for the resolution of 471 * the charset encoding. 472 * @param ex The thrown exception 473 * @return the encoding 474 * @throws IOException thrown if there is a problem reading the stream. 475 */ 476 private String doLenientDetection(String httpContentType, 477 XmlStreamReaderException ex) throws IOException { 478 if (httpContentType != null && httpContentType.startsWith("text/html")) { 479 httpContentType = httpContentType.substring("text/html".length()); 480 httpContentType = "text/xml" + httpContentType; 481 try { 482 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), 483 ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); 484 } catch (final XmlStreamReaderException ex2) { 485 ex = ex2; 486 } 487 } 488 String encoding = ex.getXmlEncoding(); 489 if (encoding == null) { 490 encoding = ex.getContentTypeEncoding(); 491 } 492 if (encoding == null) { 493 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 494 } 495 return encoding; 496 } 497 498 /** 499 * Calculate the raw encoding. 500 * 501 * @param bomEnc BOM encoding 502 * @param xmlGuessEnc XML Guess encoding 503 * @param xmlEnc XML encoding 504 * @return the raw encoding 505 * @throws IOException thrown if there is a problem reading the stream. 506 */ 507 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, 508 final String xmlEnc) throws IOException { 509 510 // BOM is Null 511 if (bomEnc == null) { 512 if (xmlGuessEnc == null || xmlEnc == null) { 513 return defaultEncoding == null ? UTF_8 : defaultEncoding; 514 } 515 if (xmlEnc.equals(UTF_16) && 516 (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 517 return xmlGuessEnc; 518 } 519 return xmlEnc; 520 } 521 522 // BOM is UTF-8 523 if (bomEnc.equals(UTF_8)) { 524 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 525 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 526 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 527 } 528 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 529 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 530 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 531 } 532 return bomEnc; 533 } 534 535 // BOM is UTF-16BE or UTF-16LE 536 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 537 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 538 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 539 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 540 } 541 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 542 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 543 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 544 } 545 return bomEnc; 546 } 547 548 // BOM is UTF-32BE or UTF-32LE 549 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) { 550 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 551 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 552 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 553 } 554 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) { 555 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 556 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 557 } 558 return bomEnc; 559 } 560 561 // BOM is something else 562 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc); 563 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 564 } 565 566 567 /** 568 * Calculate the HTTP encoding. 569 * 570 * @param httpContentType The HTTP content type 571 * @param bomEnc BOM encoding 572 * @param xmlGuessEnc XML Guess encoding 573 * @param xmlEnc XML encoding 574 * @param lenient indicates if the charset encoding detection should be 575 * relaxed. 576 * @return the HTTP encoding 577 * @throws IOException thrown if there is a problem reading the stream. 578 */ 579 String calculateHttpEncoding(final String httpContentType, 580 final String bomEnc, final String xmlGuessEnc, final String xmlEnc, 581 final boolean lenient) throws IOException { 582 583 // Lenient and has XML encoding 584 if (lenient && xmlEnc != null) { 585 return xmlEnc; 586 } 587 588 // Determine mime/encoding content types from HTTP Content Type 589 final String cTMime = getContentTypeMime(httpContentType); 590 final String cTEnc = getContentTypeEncoding(httpContentType); 591 final boolean appXml = isAppXml(cTMime); 592 final boolean textXml = isTextXml(cTMime); 593 594 // Mime type NOT "application/xml" or "text/xml" 595 if (!appXml && !textXml) { 596 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 597 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 598 } 599 600 // No content type encoding 601 if (cTEnc == null) { 602 if (appXml) { 603 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 604 } else { 605 return defaultEncoding == null ? US_ASCII : defaultEncoding; 606 } 607 } 608 609 // UTF-16BE or UTF-16LE content type encoding 610 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 611 if (bomEnc != null) { 612 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 613 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 614 } 615 return cTEnc; 616 } 617 618 // UTF-16 content type encoding 619 if (cTEnc.equals(UTF_16)) { 620 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 621 return bomEnc; 622 } 623 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 624 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 625 } 626 627 // UTF-32BE or UTF-132E content type encoding 628 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { 629 if (bomEnc != null) { 630 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 631 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 632 } 633 return cTEnc; 634 } 635 636 // UTF-32 content type encoding 637 if (cTEnc.equals(UTF_32)) { 638 if (bomEnc != null && bomEnc.startsWith(UTF_32)) { 639 return bomEnc; 640 } 641 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 642 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 643 } 644 645 return cTEnc; 646 } 647 648 /** 649 * Returns MIME type or NULL if httpContentType is NULL. 650 * 651 * @param httpContentType the HTTP content type 652 * @return The mime content type 653 */ 654 static String getContentTypeMime(final String httpContentType) { 655 String mime = null; 656 if (httpContentType != null) { 657 final int i = httpContentType.indexOf(";"); 658 if (i >= 0) { 659 mime = httpContentType.substring(0, i); 660 } else { 661 mime = httpContentType; 662 } 663 mime = mime.trim(); 664 } 665 return mime; 666 } 667 668 private static final Pattern CHARSET_PATTERN = Pattern 669 .compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 670 671 /** 672 * Returns charset parameter value, NULL if not present, NULL if 673 * httpContentType is NULL. 674 * 675 * @param httpContentType the HTTP content type 676 * @return The content type encoding (upcased) 677 */ 678 static String getContentTypeEncoding(final String httpContentType) { 679 String encoding = null; 680 if (httpContentType != null) { 681 final int i = httpContentType.indexOf(";"); 682 if (i > -1) { 683 final String postMime = httpContentType.substring(i + 1); 684 final Matcher m = CHARSET_PATTERN.matcher(postMime); 685 encoding = m.find() ? m.group(1) : null; 686 encoding = encoding != null ? encoding.toUpperCase(Locale.US) : null; 687 } 688 } 689 return encoding; 690 } 691 692 public static final Pattern ENCODING_PATTERN = Pattern.compile( 693 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", 694 Pattern.MULTILINE); 695 696 /** 697 * Returns the encoding declared in the <?xml encoding=...?>, NULL if none. 698 * 699 * @param is InputStream to create the reader from. 700 * @param guessedEnc guessed encoding 701 * @return the encoding declared in the <?xml encoding=...?> 702 * @throws IOException thrown if there is a problem reading the stream. 703 */ 704 private static String getXmlProlog(final InputStream is, final String guessedEnc) 705 throws IOException { 706 String encoding = null; 707 if (guessedEnc != null) { 708 final byte[] bytes = new byte[BUFFER_SIZE]; 709 is.mark(BUFFER_SIZE); 710 int offset = 0; 711 int max = BUFFER_SIZE; 712 int c = is.read(bytes, offset, max); 713 int firstGT = -1; 714 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) 715 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) { 716 offset += c; 717 max -= c; 718 c = is.read(bytes, offset, max); 719 xmlProlog = new String(bytes, 0, offset, guessedEnc); 720 firstGT = xmlProlog.indexOf('>'); 721 } 722 if (firstGT == -1) { 723 if (c == -1) { 724 throw new IOException("Unexpected end of XML stream"); 725 } else { 726 throw new IOException( 727 "XML prolog or ROOT element not found on first " 728 + offset + " bytes"); 729 } 730 } 731 final int bytesRead = offset; 732 if (bytesRead > 0) { 733 is.reset(); 734 final BufferedReader bReader = new BufferedReader(new StringReader( 735 xmlProlog.substring(0, firstGT + 1))); 736 final StringBuffer prolog = new StringBuffer(); 737 String line = bReader.readLine(); 738 while (line != null) { 739 prolog.append(line); 740 line = bReader.readLine(); 741 } 742 final Matcher m = ENCODING_PATTERN.matcher(prolog); 743 if (m.find()) { 744 encoding = m.group(1).toUpperCase(); 745 encoding = encoding.substring(1, encoding.length() - 1); 746 } 747 } 748 } 749 return encoding; 750 } 751 752 /** 753 * Indicates if the MIME type belongs to the APPLICATION XML family. 754 * 755 * @param mime The mime type 756 * @return true if the mime type belongs to the APPLICATION XML family, 757 * otherwise false 758 */ 759 static boolean isAppXml(final String mime) { 760 return mime != null && 761 (mime.equals("application/xml") || 762 mime.equals("application/xml-dtd") || 763 mime.equals("application/xml-external-parsed-entity") || 764 mime.startsWith("application/") && mime.endsWith("+xml")); 765 } 766 767 /** 768 * Indicates if the MIME type belongs to the TEXT XML family. 769 * 770 * @param mime The mime type 771 * @return true if the mime type belongs to the TEXT XML family, 772 * otherwise false 773 */ 774 static boolean isTextXml(final String mime) { 775 return mime != null && 776 (mime.equals("text/xml") || 777 mime.equals("text/xml-external-parsed-entity") || 778 mime.startsWith("text/") && mime.endsWith("+xml")); 779 } 780 781 private static final String RAW_EX_1 = 782 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 783 784 private static final String RAW_EX_2 = 785 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 786 787 private static final String HTTP_EX_1 = 788 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; 789 790 private static final String HTTP_EX_2 = 791 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 792 793 private static final String HTTP_EX_3 = 794 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"; 795 796}