下面列出了 org.jsoup.nodes.Comment#org.jsoup.nodes.XmlDeclaration 的实例代码，您可以点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Tests whether {@code element} contains nothing but comments, XML declarations and doctypes.
 *
 * @param root    not consulted by this check
 * @param element the element whose child nodes are inspected
 * @return {@code true} when every child node is a {@code Comment}, {@code XmlDeclaration} or
 *         {@code DocumentType} (this includes the case of no child nodes at all);
 *         {@code false} as soon as any other kind of node is found
 */
@Override
public boolean matches(Element root, Element element) {
    for (Node child : element.childNodes()) {
        boolean ignorable = child instanceof Comment
                || child instanceof XmlDeclaration
                || child instanceof DocumentType;
        if (!ignorable) {
            return false;
        }
    }
    return true;
}
/**
 * Reports whether all child nodes of {@code element} are comments, XML declarations or doctypes.
 *
 * @param root    unused here
 * @param element the element to examine
 * @return {@code false} if any child node is of another type; {@code true} otherwise,
 *         including when the element has no child nodes
 */
@Override
public boolean matches(Element root, Element element) {
    final List<Node> children = element.childNodes();
    for (int idx = 0; idx < children.size(); idx++) {
        final Node child = children.get(idx);
        // Equivalent (De Morgan) to "not one of the three permitted node types".
        if (!(child instanceof Comment)
                && !(child instanceof XmlDeclaration)
                && !(child instanceof DocumentType)) {
            return false;
        }
    }
    return true;
}
/**
 * Parses an XML prolog carrying three pseudo-attributes with the XML parser and verifies that
 * each one can be read back via {@code attr}, and that both the whole declaration and the
 * outer HTML are emitted with double-quoted values.
 */
@Test
public void testParseDeclarationAttributes() {
    final String source = "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>";
    final Document parsed = Jsoup.parse(source, "", Parser.xmlParser());

    // The prolog is expected to be the very first child node of the document.
    final XmlDeclaration declaration = (XmlDeclaration) parsed.childNode(0);

    assertEquals("1", declaration.attr("version"));
    assertEquals("UTF-8", declaration.attr("encoding"));
    assertEquals("else", declaration.attr("something"));
    assertEquals("version=\"1\" encoding=\"UTF-8\" something=\"else\"", declaration.getWholeDeclaration());
    assertEquals("<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", declaration.outerHtml());
}
/**
 * Checks that {@code element} has no child nodes other than comments, XML declarations
 * and doctypes.
 *
 * @param root    ignored by this implementation
 * @param element the element whose children are scanned
 * @return {@code true} if no other kind of child node exists (also when there are no
 *         children); {@code false} otherwise
 */
@Override
public boolean matches(Element root, Element element) {
    final List<Node> children = element.childNodes();
    for (Node node : children) {
        if (node instanceof Comment) {
            continue;
        }
        if (node instanceof XmlDeclaration) {
            continue;
        }
        if (node instanceof DocumentType) {
            continue;
        }
        // Any other node type disqualifies the element.
        return false;
    }
    return true;
}
/**
 * Verifies attribute access and serialization of an XML declaration: the single-quoted
 * pseudo-attributes in the input must be readable by name and must be written back out
 * with double quotes.
 */
@Test
public void testParseDeclarationAttributes() {
    String expectedWhole = "version=\"1\" encoding=\"UTF-8\" something=\"else\"";
    String expectedOuter = "<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>";

    Document document = Jsoup.parse(
            "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>",
            "",
            Parser.xmlParser());
    XmlDeclaration prolog = (XmlDeclaration) document.childNode(0);

    assertEquals("1", prolog.attr("version"));
    assertEquals("UTF-8", prolog.attr("encoding"));
    assertEquals("else", prolog.attr("something"));
    assertEquals(expectedWhole, prolog.getWholeDeclaration());
    assertEquals(expectedOuter, prolog.outerHtml());
}
/**
 * Determines whether every child node of {@code element} is a comment, an XML declaration
 * or a doctype.
 *
 * @param root    not used
 * @param element the element under test
 * @return {@code true} if all child nodes are of those three types, or if there are none;
 *         {@code false} otherwise
 */
@Override
public boolean matches(Element root, Element element) {
    final List<Node> children = element.childNodes();
    for (Node child : children) {
        final boolean permitted =
                child instanceof Comment || child instanceof XmlDeclaration || child instanceof DocumentType;
        if (permitted) {
            continue;
        }
        return false;
    }
    return true;
}
/**
 * Parses an input stream into a {@link Document}, working out the character set when the
 * caller does not supply one.
 * <p>
 * The first {@code firstReadBufferSize} bytes are read and the stream is reset. If
 * {@code detectCharsetFromBom} reports a byte-order mark, its charset replaces
 * {@code charsetName}. If no charset is known at that point, the first bytes are decoded with
 * {@code defaultCharset} and parsed, and the result is searched for a
 * {@code <meta http-equiv=content-type>} / {@code <meta charset>} element or, failing that, an
 * {@code <?xml encoding=...?>} prolog as the first child node. The stream is then re-parsed
 * in full when a different charset was found or when the first read did not consume the
 * whole stream.
 * <p>
 * The stream is closed before this method returns, whether it completes normally or throws.
 *
 * @param input       the stream to parse; {@code null} yields an empty document for {@code baseUri}
 * @param charsetName the charset to decode with, or {@code null} to detect it from the content;
 *                    when non-null it is checked with {@code Validate.notEmpty}
 * @param baseUri     the base URI passed to the parser and the resulting document
 * @param parser      the parser used for both the preliminary and the full parse
 * @return the parsed document
 * @throws IOException if reading from or closing the stream fails
 */
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException {
    if (input == null) // empty body
        return new Document(baseUri);
    if (!(input instanceof ConstrainableInputStream))
        input = new ConstrainableInputStream(input, bufferSize, 0);

    Document doc = null;
    // try/finally so the stream is released even when reading, validation or parsing throws
    try {
        // read the start of the stream and look for a BOM or meta charset
        input.mark(firstReadBufferSize);
        ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed
        boolean fullyRead = input.read() == -1;
        input.reset();

        // look for BOM - overrides any other header or input
        BomCharset bomCharset = detectCharsetFromBom(firstBytes, charsetName);
        if (bomCharset != null) {
            charsetName = bomCharset.charset;
            input.skip(bomCharset.offset);
        }

        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
            String docData = Charset.forName(defaultCharset).decode(firstBytes).toString();
            doc = parser.parseInput(docData, baseUri);

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
                XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
                if (prolog.name().equals("xml"))
                    foundCharset = prolog.attr("encoding");
            }
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharset)) { // need to re-decode. (case insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (!fullyRead) {
                // the preliminary parse only saw the first chunk, so it cannot be returned as-is
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        if (doc == null) {
            if (charsetName == null)
                charsetName = defaultCharset;
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize);
            doc = parser.parseInput(reader, baseUri);
            doc.outputSettings().charset(charsetName);
        }
    } finally {
        input.close();
    }
    return doc;
}
/**
 * Reads and parses {@code input} into a {@link Document}, detecting the charset if
 * {@code charsetName} is {@code null}.
 * <p>
 * Steps, in order:
 * <ol>
 *   <li>Read up to {@code firstReadBufferSize} bytes, note whether that exhausted the stream,
 *       and reset the stream to its start.</li>
 *   <li>If {@code detectCharsetFromBom} returns a result, use its charset in place of
 *       {@code charsetName} and skip its offset.</li>
 *   <li>If there is still no charset, decode the first bytes with {@code defaultCharset}, parse
 *       them, and look for a charset in a {@code <meta>} element or a leading
 *       {@code <?xml ...?>} declaration's {@code encoding} attribute.</li>
 *   <li>Re-parse from the stream if a different charset was found, if the first read was
 *       partial, or if the charset was supplied up front.</li>
 * </ol>
 * The stream is always closed on exit, including when an exception is thrown.
 *
 * @param input       the stream to parse; when {@code null} an empty document is returned
 * @param charsetName the charset name, or {@code null} to auto-detect; a non-null value is
 *                    validated with {@code Validate.notEmpty}
 * @param baseUri     the base URI for the document
 * @param parser      the parser to use
 * @return the parsed document
 * @throws IOException on a failure to read or close the stream
 */
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException {
    if (input == null) // empty body
        return new Document(baseUri);
    if (!(input instanceof ConstrainableInputStream))
        input = new ConstrainableInputStream(input, bufferSize, 0);

    Document doc = null;
    try {
        // read the start of the stream and look for a BOM or meta charset
        input.mark(firstReadBufferSize);
        ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed
        boolean fullyRead = input.read() == -1;
        input.reset();

        // look for BOM - overrides any other header or input
        BomCharset bomCharset = detectCharsetFromBom(firstBytes, charsetName);
        if (bomCharset != null) {
            charsetName = bomCharset.charset;
            input.skip(bomCharset.offset);
        }

        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
            String docData = Charset.forName(defaultCharset).decode(firstBytes).toString();
            doc = parser.parseInput(docData, baseUri);

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
                XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
                if (prolog.name().equals("xml"))
                    foundCharset = prolog.attr("encoding");
            }
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharset)) { // need to re-decode. (case insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (!fullyRead) {
                // only part of the stream was parsed above; discard and parse the whole stream below
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        if (doc == null) {
            if (charsetName == null)
                charsetName = defaultCharset;
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize);
            doc = parser.parseInput(reader, baseUri);
            doc.outputSettings().charset(charsetName);
        }
    } finally {
        // previously only reached on success; an exception above would leak the stream
        input.close();
    }
    return doc;
}