下面列出了 org.jsoup.nodes.Document#outputSettings() 的实例代码，您可以点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Parses an HTML input stream into a jsoup {@link Document} with pretty-printing disabled.
 *
 * <p>Delegates to {@code Jsoup.parse(in, charsetName, baseUri)}; the charset name and base URI
 * are looked up through {@code Docx4jProperties}.
 *
 * @param input stream containing the HTML to parse
 * @return the parsed document, carrying a fresh {@code OutputSettings} with pretty-print off
 * @throws IOException declared by this method; raised by the parse call on read failure
 */
@Override
public Document handle(InputStream input) throws IOException {
    // Read the jsoup parse parameters from configuration.
    String charsetName = Docx4jProperties.getProperty(
            Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME,
            Docx4jConstants.DEFAULT_CHARSETNAME);
    String baseUri = Docx4jProperties.getProperty(
            Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,
            "");

    // Let jsoup turn the HTML into a Document object.
    Document doc = Jsoup.parse(input, charsetName, baseUri);

    // Only pretty-printing is changed; syntax and charset are left at whatever a
    // newly constructed OutputSettings provides.
    doc.outputSettings(new OutputSettings().prettyPrint(false));

    return doc;
}
/**
 * Builds an {@code AdditionalInfo} from an XML string.
 *
 * <p>The text is taken from the first {@code item description} element (CRLF turned into
 * {@code <br>}, trimmed, one trailing {@code <br>} removed). The title is {@code TITLE}
 * followed by the text of the first {@code pubDate} element.
 *
 * <p>Both element lookups are dereferenced without a null check, so input lacking either
 * element is not handled here.
 *
 * @param xml the XML document as a string
 * @return the populated info object
 */
@NotNull static AdditionalInfo handleXML(String xml) {
    final String noInfoMessage =
            "Aktuell gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.";

    AdditionalInfo result = new AdditionalInfo();
    result.setTitle(TITLE);

    Document feed = Jsoup.parse(xml, "", Parser.xmlParser());
    // Pretty-printing off so html() returns the markup without re-indentation.
    feed.outputSettings(new Document.OutputSettings().prettyPrint(false));

    String rawDescription = feed.select("item description").first().html();
    String text = rawDescription.replace("\r\n", "<br>").trim();

    boolean nothingReported =
            text.startsWith("Zurzeit gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.")
            || text.startsWith(noInfoMessage);
    if (nothingReported) {
        result.setHasInformation(false);
        // NOTE(review): this text is overwritten by the setText(text) call further down.
        result.setText(noInfoMessage);
    }

    // Drop a single trailing line break marker.
    if (text.endsWith("<br>")) {
        text = text.substring(0, text.length() - "<br>".length());
    }

    String published = feed.select("pubDate").first().text();
    result.setTitle(TITLE + " (Stand: " + published + ")");
    result.setText(text);
    return result;
}
/**
 * Re-serialises {@code code} through jsoup using the configured {@code formatter} settings.
 *
 * <p>The parser (HTML or XML) is chosen from {@code formatter.syntax()}.
 *
 * @param code the markup to format
 * @param ending line ending requested by the caller; not used by this implementation
 * @return the formatted markup, or {@code null} when formatting leaves {@code code} unchanged
 * @throws IllegalArgumentException if the formatter's syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    final Parser parser;
    switch (formatter.syntax()) {
        case html:
            parser = Parser.htmlParser();
            break;
        case xml:
            parser = Parser.xmlParser();
            break;
        default:
            throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    Document parsed = Jsoup.parse(code, "", parser);
    parsed.outputSettings(formatter);

    String result = parsed.outerHtml();
    // null signals "nothing changed" to the caller.
    return code.equals(result) ? null : result;
}
/**
 * Fetches {@code url} with jsoup and dumps document metadata, output settings, the head's
 * child elements and (via {@code printElements}) the body's children to standard output.
 *
 * @param url address of the page to fetch
 * @throws Exception anything raised while fetching or printing
 */
private static void testHtmlParser(String url) throws Exception {
    Document page = Jsoup.connect(url)
            .userAgent(USER_AGENT)
            .cookie("auth", "token")
            .timeout(30000)
            .get();

    // Document-level properties.
    System.out.println("charset = " + page.charset());
    System.out.println("location = " + page.location());
    System.out.println("nodeName = " + page.nodeName());

    // Serialisation settings attached to the document.
    Document.OutputSettings settings = page.outputSettings();
    System.out.println("charset = " + settings.charset());
    System.out.println("indentAmount = " + settings.indentAmount());
    System.out.println("syntax = " + settings.syntax());
    System.out.println("escapeMode = " + settings.escapeMode());
    System.out.println("prettyPrint = " + settings.prettyPrint());
    System.out.println("outline = " + settings.outline());

    System.out.println("title = " + page.title());
    System.out.println("baseUri = " + page.baseUri());

    // One line per direct child of <head>: "tagName : element".
    for (Element headChild : page.head().children()) {
        System.out.println(headChild.tag().getName() + " : " + headChild);
    }

    printElements(page.body().children());
}
/**
 * Fetches {@code page} and extracts its description as plain text.
 *
 * <p>The description is taken from the first {@code td[class=alt1][width="70%"]} cell;
 * {@code <br>} and {@code <p>} boundaries are turned into line separators before all markup
 * is stripped. The page's {@code og:title} is placed on the first line of the result.
 *
 * @param page URL of the page to fetch
 * @return the title line followed by the description text, or {@code null} if the page could
 *         not be fetched or contains no description cell
 */
public String getDescription(String page) {
    try {
        // Fetch the image page
        Response resp = Http.url(page)
                .referrer(this.url)
                .response();
        cookies.putAll(resp.cookies());

        // Parse the body exactly once and reuse the document. Assuming Response is jsoup's
        // Connection.Response, parsing the same response a second time is only supported
        // after bufferUp(), and is redundant work in any case.
        Document documentz = resp.parse();

        // Try to find the description
        Elements els = documentz.select("td[class=alt1][width=\"70%\"]");
        if (els.isEmpty()) {
            LOGGER.debug("No description at " + page);
            throw new IOException("No description found");
        }
        LOGGER.debug("Description found!");

        // This is where the description is.
        // Would break completely if FurAffinity changed site layout.
        Element ele = els.get(0);

        // Pretty-printing off so html() does not introduce its own whitespace.
        documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));

        // Insert literal "\n" markers (backslash + n) that survive html(); they are swapped
        // for the platform line separator below.
        ele.select("br").append("\\n");
        ele.select("p").prepend("\\n\\n");
        LOGGER.debug("Returning description at " + page);

        String withLineBreaks = ele.html().replaceAll("\\\\n", System.getProperty("line.separator"));
        String tempPage = Jsoup.clean(withLineBreaks, "", Whitelist.none(),
                new Document.OutputSettings().prettyPrint(false));

        // Overridden saveText takes first line and makes it the file name.
        return documentz.select("meta[property=og:title]").attr("content") + "\n" + tempPage;
    } catch (IOException ioe) {
        LOGGER.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
        return null;
    }
}
/**
 * Converts HTML to plain text while keeping the line breaks implied by {@code <br>} and
 * {@code <p>} elements.
 *
 * @param html The HTML to convert to text with new lines preserved, may be {@code null}.
 * @return The provided HTML converted to text with new lines preserved; a {@code null} or
 *         blank input is returned unchanged.
 */
@Nullable
public static String cleanWithLinebreaks(@Nullable String html) {
    if (html == null || html.isBlank()) {
        return html;
    }
    final Document document = Jsoup.parse(html);
    // Makes html() preserve linebreak and spacing
    document.outputSettings(new Document.OutputSettings().prettyPrint(false));

    // Insert a literal marker (backslash followed by 'n') at each break position.
    document.select("br").append("\\n");
    document.select("p").prepend("\\n\\n");

    // String.replace is literal (not regex), so the search string must be exactly the
    // two-character marker inserted above. The previous "\\\\n" looked for two backslashes
    // and therefore never matched, leaving the markers in the output.
    final String str = document.html().replace("\\n", "\n");

    return Jsoup.clean(str, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
}
/**
 * Cleans {@code input} against the given whitelist, parses the result and collects text
 * from the body's child nodes via {@code readNodes}.
 *
 * @param input raw HTML
 * @param wl whitelist handed to {@code Jsoup.clean}
 * @return the list filled by {@code readNodes}
 */
private static List<String> parse(String input, Whitelist wl) {
    final String sanitized = Jsoup.clean(input, "", wl, outputSettings);

    final Document parsed = Jsoup.parse(sanitized);
    parsed.outputSettings(outputSettings);

    final List<String> collected = new ArrayList<String>();
    readNodes(parsed.body().childNodes(), collected);
    return collected;
}
/**
 * Cleans {@code input} against the given whitelist, parses the result and collects entries
 * from the body's child nodes via {@code readNodesWithTags}, starting with the tag name
 * {@code "body"}.
 *
 * @param input raw HTML
 * @param wl whitelist handed to {@code Jsoup.clean}
 * @return the list of entries filled by {@code readNodesWithTags}
 */
private static List<Map.Entry<String,String>> parseWithTags(String input, Whitelist wl) {
    final String sanitized = Jsoup.clean(input, "", wl, outputSettings);

    final Document parsed = Jsoup.parse(sanitized);
    parsed.outputSettings(outputSettings);

    final List<Map.Entry<String,String>> entries = new ArrayList<Map.Entry<String,String>>();
    readNodesWithTags(parsed.body().childNodes(), entries, "body");
    return entries;
}
/**
 * Converts HTML to plain text, turning {@code <br>} and {@code <p>} boundaries into newline
 * characters and unescaping the {@code &gt;} and {@code &lt;} entities.
 *
 * <p>Because angle brackets are unescaped, the result is plain text and must be escaped
 * again before being embedded in HTML.
 *
 * @param html
 *            the html, may be {@code null}
 * @return the plain-text string, or {@code null} if {@code html} is {@code null}
 */
public static String replaceHtmlLineBreaks(String html) {
    if (html == null)
        return html;
    Document document = Jsoup.parse(html);
    // makes html() preserve linebreaks and spacing
    document.outputSettings(new Document.OutputSettings().prettyPrint(false));

    // Insert a literal marker (backslash followed by 'n') at each break position, then swap
    // the markers for real newlines once the markup has been serialised.
    document.select("br").append("\\n");
    document.select("p").prepend("\\n\\n");
    String s = document.html().replace("\\n", "\n");

    String cleanedString = Jsoup.clean(s, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));

    // The previous code replaced ">" with ">" and "<" with "<", which did nothing; the
    // entity names are restored here so the entities are actually unescaped.
    cleanedString = cleanedString.replace("&gt;", ">");
    cleanedString = cleanedString.replace("&lt;", "<");
    return cleanedString;
}
/**
 * Strips HTML from a string while keeping line breaks.
 *
 * <p>{@code <br>} and {@code <p>} boundaries become newline characters; all remaining markup
 * is removed with {@code Whitelist.none()}.
 *
 * @param text the string to filter, may be {@code null}
 * @return the filtered string, or {@code null} if {@code text} is {@code null}
 */
public static String filter(String text) {
    if (text == null) {
        return null;
    }

    Document parsed = Jsoup.parse(text);
    // Pretty-printing off so html() keeps the original spacing.
    parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

    // Mark break positions with a literal backslash-n sequence.
    parsed.select("br").append("\\n");
    parsed.select("p").prepend("\\n\\n");

    // Swap each marker for a real newline.
    String withNewlines = parsed.html().replace("\\n", "\n");

    Document.OutputSettings plainOutput = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(withNewlines, "", Whitelist.none(), plainOutput);
}
/**
 * Writes the document held by {@code jCas} to an HTML file.
 *
 * <p>An HTML skeleton is created with the document language on the root element; the head
 * receives an optional stylesheet link, a {@code utf-8} charset declaration and one meta
 * entry per document property and per {@code Metadata} annotation. The body is produced by
 * {@code writeBody}. The file is written as UTF-8 so that its bytes agree with the charset
 * declared in the head.
 *
 * @param jCas the CAS to export
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final File f = getFileName(jCas);
    final DocumentAnnotation da = getDocumentAnnotation(jCas);

    final Document doc =
            Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
    doc.outputSettings(new Document.OutputSettings().prettyPrint(false));

    final Element head = doc.head();

    // Stylesheet link only when a CSS location has been configured.
    if (!Strings.isNullOrEmpty(css)) {
        final Element cssLink = head.appendElement("link");
        cssLink.attr("rel", "stylesheet");
        cssLink.attr("href", css);
    }

    final Element charsetMeta = head.appendElement("meta");
    charsetMeta.attr("charset", "utf-8");

    // Fixed document-level properties.
    appendMeta(head, "document.type", da.getDocType());
    appendMeta(head, "document.sourceUri", da.getSourceUri());
    appendMeta(head, "externalId", da.getHash());
    appendMeta(head, "document.classification", da.getDocumentClassification());
    appendMeta(
            head,
            "document.caveats",
            String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
    appendMeta(
            head,
            "document.releasability",
            String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

    // Every Metadata annotation becomes a meta entry; the last "documentTitle" value seen
    // (case-insensitive key match) is also used as the page title.
    String title = null;
    for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
        appendMeta(head, md.getKey(), md.getValue());
        if ("documentTitle".equalsIgnoreCase(md.getKey())) {
            title = md.getValue();
        }
    }

    if (!Strings.isNullOrEmpty(title)) {
        doc.title(title);
    }

    final Element body = doc.body();
    writeBody(jCas, body);

    try {
        // Encode as UTF-8 rather than the platform default: the head declares
        // charset="utf-8", so any other encoding would produce a mis-declared file.
        FileUtils.writeStringToFile(f, doc.html(), Charset.forName("UTF-8"));
    } catch (final IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}