下面列出了 org.jsoup.nodes.Document#outerHtml() 的实例代码，您可以点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Injects a {@code getPostData()} JavaScript helper into the {@code <head>} of a page.
 *
 * <p>The injected function reads the {@code g-recaptcha-response} element (its {@code value},
 * falling back to its {@code innerHTML}), the {@code action} attribute of {@code challenge-form},
 * the {@code value} of the first element named {@code r} and the {@code value} of the element
 * with id {@code id}, and returns them as {@code action,r,id,recaptcha}.
 *
 * @param oldHtml the page markup to augment
 * @return the serialized document including the script, or an empty string when
 *         {@code TextUtils.isEmpty(oldHtml)} is true
 */
private String injectJs(String oldHtml) {
    if (TextUtils.isEmpty(oldHtml)) {
        return "";
    }

    // Script source is kept as a single constant expression so it reads top to bottom.
    final String postDataScript = "<script type=\"text/javascript\">\n" +
            " function getPostData() {\n" +
            " let recaptcha = document.getElementById(\"g-recaptcha-response\").value;\n" +
            " if (!recaptcha || recaptcha === '') {\n" +
            " recaptcha = document.getElementById(\"g-recaptcha-response\").innerHTML;\n" +
            " }\n" +
            " const action = document.getElementById('challenge-form').getAttribute(\"action\");\n" +
            " const r = document.getElementsByName(\"r\")[0].getAttribute(\"value\");\n" +
            " const id = document.getElementById('id').getAttribute(\"value\");\n" +
            " return action + \",\" + r + \",\" + id + \",\" + recaptcha;\n" +
            " }\n" +
            " </script>";

    final Document page = Jsoup.parse(oldHtml);
    page.head().append(postDataScript);

    // Serialize before logging so the log line only appears once the result exists.
    final String injected = page.outerHtml();
    Log.d(TAG, "JS注入完成");
    return injected;
}
/**
 * Adds a {@code getPostData()} script to the head of the given HTML and returns the result.
 *
 * <p>{@code getPostData()} collects, from the rendered page, the form action of
 * {@code challenge-form}, the {@code value} attributes of the first element named {@code r}
 * and of the element with id {@code id}, and the reCAPTCHA response (the {@code value} of
 * {@code g-recaptcha-response}, or its {@code innerHTML} when the value is empty), joined
 * with commas in the order {@code action,r,id,recaptcha}.
 *
 * @param oldHtml original page markup
 * @return serialized document with the script appended to {@code <head>}; {@code ""} if
 *         {@code TextUtils.isEmpty(oldHtml)} reports the input as empty
 */
private String injectJs(String oldHtml) {
    if (TextUtils.isEmpty(oldHtml)) {
        return "";
    }

    final String helperScript = "<script type=\"text/javascript\">\n" +
            " function getPostData() {\n" +
            " let recaptcha = document.getElementById(\"g-recaptcha-response\").value;\n" +
            " if (!recaptcha || recaptcha === '') {\n" +
            " recaptcha = document.getElementById(\"g-recaptcha-response\").innerHTML;\n" +
            " }\n" +
            " const action = document.getElementById('challenge-form').getAttribute(\"action\");\n" +
            " const r = document.getElementsByName(\"r\")[0].getAttribute(\"value\");\n" +
            " const id = document.getElementById('id').getAttribute(\"value\");\n" +
            " return action + \",\" + r + \",\" + id + \",\" + recaptcha;\n" +
            " }\n" +
            " </script>";

    final Document parsed = Jsoup.parse(oldHtml);
    parsed.head().append(helperScript);

    final String result = parsed.outerHtml();
    Log.d(TAG, "JS注入完成");
    return result;
}
/**
 * Downloads the page at {@code url} and extracts the highest-quality video URL embedded in it.
 *
 * <p>The serialized page is searched for the markers {@code url1080\":\"}, {@code url720\":\"},
 * {@code url480\":\"} and {@code url240\":\"}, in that order. The text between the first marker
 * found and the next double quote is taken as the URL, and every backslash is stripped from it.
 * A marker whose value has no terminating double quote is skipped and the next quality is
 * tried, instead of failing with a {@link StringIndexOutOfBoundsException}.
 *
 * @param url address of the page to download
 * @return the extracted video URL with backslashes removed
 * @throws IOException if no marker with a terminated value is present in the page; may also
 *         propagate from the page fetch
 */
public static String getVideoURLAtPage(String url) throws IOException {
    Document doc = Http.url(url)
            .userAgent(USER_AGENT)
            .get();
    String html = doc.outerHtml();

    for (String quality : new String[] {"1080", "720", "480", "240"}) {
        // Literal text searched for is: url<quality>\":\"
        String marker = "url" + quality + "\\\":\\\"";
        int markerStart = html.indexOf(marker);
        if (markerStart < 0) {
            continue;
        }
        int valueStart = markerStart + marker.length();
        int valueEnd = html.indexOf('"', valueStart);
        if (valueEnd < 0) {
            // Truncated or malformed entry: fall through to the next lower quality.
            continue;
        }
        // The value is escaped (e.g. "\/" and a trailing "\"), so drop all backslashes.
        return html.substring(valueStart, valueEnd).replace("\\", "");
    }

    throw new IOException("Could not find video URL at " + url);
}
/**
 * Re-serializes {@code code} with jsoup using the configured {@code formatter} settings.
 *
 * <p>The parser (HTML or XML) is chosen from {@code formatter.syntax()}. The {@code ending}
 * argument is not used by this implementation.
 *
 * @param code the source text to format
 * @param ending line ending requested by the caller (ignored here)
 * @return the formatted text, or {@code null} when formatting produced exactly the input
 * @throws IllegalArgumentException if the configured syntax is neither {@code html} nor {@code xml}
 */
@Override
public String doFormat(String code, LineEnding ending) {
    final Parser parser;
    switch (formatter.syntax()) {
        case html:
            parser = Parser.htmlParser();
            break;
        case xml:
            parser = Parser.xmlParser();
            break;
        default:
            throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    final Document parsed = Jsoup.parse(code, "", parser);
    parsed.outputSettings(formatter);

    final String formattedCode = parsed.outerHtml();
    // null signals "nothing changed" to the caller.
    return code.equals(formattedCode) ? null : formattedCode;
}
/**
 * Rewrites the {@code <img>} elements that reference the provided images so they point to a
 * generated content id, and produces one inline attachment per image that is actually used.
 *
 * @param htmlContent the HTML to transform
 * @param images the candidate images
 * @return the serialized, updated HTML together with the attachments that were created
 */
@Override
public ContentWithImages inline(String htmlContent, List<ImageResource> images) {
    final Document document = Jsoup.parse(htmlContent);
    final List<Attachment> attachments = new ArrayList<>(images.size());

    for (ImageResource image : images) {
        // All <img> elements matching this image's path or URL that are not skipped.
        final Elements matches = getImagesToAttach(document, image);
        if (matches.isEmpty()) {
            // Image is not referenced by the HTML: no attachment is generated for it.
            continue;
        }

        final String contentId = idGenerator.generate(image.getName());
        final Attachment attachment = new Attachment(
                new ByteResource(image.getName(), image.getContent()),
                null,
                INLINE,
                format(CONTENT_ID, contentId));

        // Point every matching element at the content id and flag it as already inlined.
        for (Element img : matches) {
            img.attr(SRC_ATTR, format(SRC_VALUE, contentId))
                    .attr(INLINED_ATTR, true);
        }
        attachments.add(attachment);
    }

    return new ContentWithImages(document.outerHtml(), attachments);
}
/**
 * Removes link, code, image, preformatted and block-quote elements from a post's body.
 *
 * <p>The body is wrapped in {@code <body>} tags, parsed with jsoup, and every {@code a},
 * {@code code}, {@code img}, {@code pre} and {@code blockquote} element is removed.
 *
 * @param post the post whose body is stripped
 * @return the serialized document that remains after the removals
 */
public static String stripBody(Post post) {
    final String wrapped = "<body>" + post.getBody() + "</body>";
    final Document parsed = Jsoup.parse(wrapped);

    final String[] tagsToDrop = {"a", "code", "img", "pre", "blockquote"};
    for (String tag : tagsToDrop) {
        parsed.getElementsByTag(tag).remove();
    }

    return parsed.outerHtml();
}
/**
 * Rewrites the {@code .test-detail} element of every HTML test case found under
 * {@code RGAA3_TESTCASE_PATH}.
 *
 * <p>Each sub-directory name (after removing {@code Rgaa30Rule} and {@code .java}) is read as
 * three two-character fields: theme, criterion and test. These form the lookup key
 * {@code t-c-x} (numeric, no leading zeros) and the zero-padded "wrong" key {@code tt.cc.xx}.
 * For each file in the directory whose name contains {@code html}:
 * <ul>
 * <li>if there is no {@code .test-detail} element, the document is printed to standard output;</li>
 * <li>otherwise the element becomes an empty {@code div} (with {@code lang="fr"} if it had no
 * {@code lang}), the rule's raw HTML is appended, the document is serialized, occurrences of
 * the wrong key are replaced by {@code getRuleDot()}, and the file is overwritten.</li>
 * </ul>
 *
 * <p>Entries of the root directory that are not readable directories are skipped.
 *
 * @throws IOException if the root path is not a readable directory; may also propagate from
 *         reading or writing a test case file
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() yields null (not an empty array) for a missing or unreadable directory.
        throw new IOException("Not a readable directory: " + srcDir);
    }

    for (File file : ruleDirs) {
        File[] testcases = file.listFiles();
        if (testcases == null) {
            // Plain file (or unreadable directory) next to the rule directories: nothing to do.
            continue;
        }

        String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // Parsing as int drops the leading zeros: "01","02","03" -> "1-2-3".
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        String wrongKey = theme + "." + crit + "." + test;

        for (File testcase : testcases) {
            if (!testcase.isFile() || !testcase.getName().contains("html")) {
                continue;
            }

            Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
            Element detail = doc.select(".test-detail").first();
            if (detail == null) {
                System.out.println(doc.outerHtml());
                continue;
            }

            detail.tagName("div");
            detail.text("");
            for (Element el : detail.children()) {
                el.remove();
            }
            if (!detail.hasAttr("lang")) {
                detail.attr("lang", "fr");
            }
            detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n");

            doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
            doc.outputSettings().outline(false);
            doc.outputSettings().indentAmount(4);

            String outputHtml = doc.outerHtml();
            if (outputHtml.contains(wrongKey)) {
                // Literal replacement: wrongKey contains '.', which replaceAll would treat as a
                // regex wildcard, and the replacement text must not be parsed for '$' or '\'.
                outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
            }
            FileUtils.writeStringToFile(testcase, outputHtml);
        }
    }
}
/**
 * Cleans {@code dirtyHTML} and stores the serialized outcome in {@code result}.
 *
 * <p>Steps: strip bad namespace definitions from the raw text (the cleaned text is written back
 * to {@code dirtyHTML}), parse it with jsoup, configure XHTML escaping with outline mode and an
 * indent of 2, remove comments and malformed attributes, then serialize.
 */
@Override
public void run() {
    dirtyHTML = removeBadNamespaceDefinition(dirtyHTML);

    final Document parsed = Jsoup.parse(dirtyHTML);
    // The output settings setters are fluent, so the configuration is applied as one chain.
    parsed.outputSettings()
            .escapeMode(Entities.EscapeMode.xhtml)
            .outline(true)
            .indentAmount(2);

    removeComments(parsed);
    removeMalformedAttributes(parsed);

    result = parsed.outerHtml();
}
/**
 * Runs the CSS inlining pipeline on the given HTML.
 *
 * <p>The document is parsed, then passed in order through {@code internStyles} (with the
 * external CSS contents), {@code fetchStyles}, {@code extractStyles} (with the fetched
 * stylesheet) and {@code applyStyles}, and finally serialized.
 *
 * @param htmlContent the HTML to process
 * @param cssContents the external CSS contents handed to {@code internStyles}
 * @return the serialized document after all steps have run
 */
@Override
public String inline(String htmlContent, List<ExternalCss> cssContents) {
    final Document document = Jsoup.parse(htmlContent);

    internStyles(document, cssContents);
    extractStyles(document, fetchStyles(document));
    applyStyles(document);

    return document.outerHtml();
}
/**
 * Embeds the provided images directly in the HTML as base64 URIs.
 *
 * <p>For each image, every element returned by {@code getImagesToInline} gets its source
 * attribute replaced by a URI built from {@code BASE64_URI}, the image mimetype and its
 * base64-encoded content, and is flagged with the inlined attribute.
 *
 * @param htmlContent the HTML to transform
 * @param images the images to embed
 * @return the serialized, updated HTML with an empty attachment list
 */
@Override
public ContentWithImages inline(String htmlContent, List<ImageResource> images) {
    final Document document = Jsoup.parse(htmlContent);

    for (ImageResource image : images) {
        for (Element img : getImagesToInline(document, image)) {
            final String encoded = Base64Utils.encodeToString(image.getContent());
            final String dataUri = MessageFormat.format(BASE64_URI, image.getMimetype(), encoded);
            img.attr(SRC_ATTR, dataUri);
            img.attr(INLINED_ATTR, true);
        }
    }

    // Images live inside the markup itself, so no attachment accompanies the content.
    return new ContentWithImages(document.outerHtml(), new ArrayList<Attachment>(0));
}
/**
 * Adds a {@code <script>} tag loading {@code /assets/javascript/browserWatch.js} to a page.
 *
 * <p>The tag is inserted as a sibling directly after the element returned by
 * {@code body().lastElementSibling()} of the parsed document.
 *
 * @param content the page markup
 * @return the serialized document including the script tag
 */
private String addJavascript(String content) {
    final Document page = Jsoup.parse(content);
    final String scriptTag =
            String.format("<script src=\"%s\"></script>", "/assets/javascript/browserWatch.js");

    page.body().lastElementSibling().after(scriptTag);

    return page.outerHtml();
}
/**
 * Serializes the document stored in {@code f} and deletes a set of strings from it.
 *
 * <p>The shared {@code stringsToRemove} list is reset, filled by {@code getStringsToRemove}
 * from all elements of the parsed document, and each collected string is then removed from the
 * serialized HTML by literal replacement, in list order.
 *
 * @param f the file to parse (charset left to jsoup's detection)
 * @return the serialized document with every collected string removed
 * @throws IOException propagated from parsing the file
 */
public static String cleanDomFromText(File f) throws IOException {
    final Document parsed = Jsoup.parse(f, null);

    // Start from a fresh list; getStringsToRemove populates this shared field.
    stringsToRemove = new LinkedList<String>();
    getStringsToRemove(parsed.getAllElements());

    String cleaned = parsed.outerHtml();
    for (String unwanted : stringsToRemove) {
        cleaned = cleaned.replace(unwanted, "");
    }
    return cleaned;
}
/**
 * Checks that a legacy {@code <isindex>} element is parsed into the equivalent form markup.
 */
@Test
public void testNormalisesIsIndex() {
    final Document parsed = Jsoup.parse("<body><isindex action='/submit'></body>");

    // The full serialization is produced but not inspected; only the body is asserted on.
    parsed.outerHtml();

    final String expectedBody = "<form action=\"/submit\"> <hr> <label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label> <hr> </form>";
    final String actualBody = StringUtil.normaliseWhitespace(parsed.body().html());

    assertEquals(expectedBody, actualBody);
}
/**
 * Verifies the markup jsoup generates in place of an {@code <isindex>} element: a form with the
 * original action, a labelled {@code isindex} input and two horizontal rules.
 */
@Test
public void testNormalisesIsIndex() {
    final Document document = Jsoup.parse("<body><isindex action='/submit'></body>");

    // Whole-document serialization is invoked without its result being checked.
    document.outerHtml();

    final String bodyHtml = StringUtil.normaliseWhitespace(document.body().html());
    assertEquals(
            "<form action=\"/submit\"> <hr> <label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label> <hr> </form>",
            bodyHtml);
}
/**
 * Strips the Ogham-only marker attributes from every element that carries one of them:
 * <ul>
 * <li>{@link CssInlinerConstants#INLINE_MODE_ATTR}</li>
 * <li>{@link CssInlinerConstants#INLINED_ATTR}</li>
 * </ul>
 *
 * @param html
 *            the html to clean
 * @return the serialized html without those attributes
 */
public static String removeOghamAttributes(String html) {
    final Document document = Jsoup.parse(html);
    final String markedElements = "[" + INLINE_MODE_ATTR + "], [" + INLINED_ATTR + "]";

    // Bulk removal on the selection: each call strips the attribute from every matched element.
    document.select(markedElements)
            .removeAttr(INLINE_MODE_ATTR)
            .removeAttr(INLINED_ATTR);

    return document.outerHtml();
}
/**
 * Strips the Ogham-only marker attributes from every {@code <img>} element:
 * <ul>
 * <li>{@link ImageInlinerConstants#INLINE_MODE_ATTR}</li>
 * <li>{@link ImageInlinerConstants#INLINED_ATTR}</li>
 * </ul>
 *
 * @param html
 *            the html to clean
 * @return the serialized html without those attributes on its images
 */
public static String removeOghamAttributes(String html) {
    final Document document = Jsoup.parse(html);

    // Bulk removal on the selection: each call strips the attribute from every image.
    document.select("img")
            .removeAttr(INLINE_MODE_ATTR)
            .removeAttr(INLINED_ATTR);

    return document.outerHtml();
}