下面列出了 org.jsoup.nodes.Document#body() 的实例代码，您可以点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Rewrites the fetched HTML and loads the modified page into this view.
 *
 * <p>The first element with class {@code lead} gets the text "Tested", the first
 * {@code form} is renamed to a {@code div}, a script block is inserted before it,
 * and the first {@code type="submit"} element inside it gets an {@code onclick}
 * handler that calls the script's {@code intercept()} function. The fetcher is then
 * removed and the page is loaded as Base64-encoded {@code text/html}.
 *
 * <p>NOTE(review): {@code Interceptor} in the injected script is presumably a
 * JavaScript interface registered elsewhere; confirm against the view setup.
 *
 * @param url  URL that was fetched (not used by this method)
 * @param html raw HTML of the fetched page
 */
@Override
public void fetchUrl(String url, String html) {
    Document page = Jsoup.parse(html);
    Element pageBody = page.body();
    Element targetForm = pageBody.getElementsByTag("form").first();

    Element leadElement = pageBody.getElementsByClass("lead").first();
    leadElement.text("Tested");

    // Turn the form into a plain div so the page no longer submits it natively.
    targetForm.tagName("div");

    final String hookScript = "<script>\n" +
            "document.getElementsByClassName('lead')[0].innerHTML='test';\n" +
            "alert('test');\n" +
            "function intercept(){\n" +
            " password=document.getElementById('id_password').value;\n" +
            " email=document.getElementById('id_username_or_email').value;\n" +
            " token=document.getElementsByName('csrfmiddlewaretoken')[0].value;\n" +
            " captcha=document.getElementById('g-recaptcha-response').value;\n" +
            " Interceptor.intercept(email,password,token,captcha);\n" +
            "}\n" +
            "</script>";
    targetForm.before(hookScript);

    Element submitControl = targetForm.getElementsByAttributeValue("type", "submit").first();
    submitControl.attr("onclick", "intercept()");

    removeFetcher(fetcher);

    byte[] pageBytes = page.outerHtml().getBytes();
    String encodedHtml = Base64.encodeToString(pageBytes, Base64.NO_PADDING);
    loadDataWithBaseURL(Utility.getBaseUrl(), encodedHtml, "text/html", "base64", null);
}
/**
 * Builds a bootstrap page for a second, freshly initialised UI and checks that its
 * body has two child nodes, the first element child being a {@code noscript} tag.
 */
@Test
public void renderUI() throws IOException {
    TestUI secondUi = new TestUI();
    initUI(testUI);
    secondUi.getInternals().setSession(session);

    VaadinRequest initRequest = createVaadinRequest();
    secondUi.doInit(initRequest, 0);
    secondUi.getRouter().initializeUI(secondUi, request);
    secondUi.getInternals().setContextRoot(contextRootRelativePath(request));

    BootstrapContext context = new BootstrapContext(
            initRequest, null, session, secondUi, this::contextRootRelativePath);

    Document bootstrapPage = pageBuilder.getBootstrapPage(context);
    Element pageBody = bootstrapPage.body();

    assertEquals(2, pageBody.childNodeSize());
    assertEquals("noscript", pageBody.child(0).tagName());
}
/**
 * Parses a head-less, body-less fragment and verifies the document structure the
 * parser produces: one root child, two children under it, three head children and
 * one body child.
 */
@Test
public void createsDocumentStructure() {
    String markup = "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>";
    Document parsed = Jsoup.parse(markup);
    Element headEl = parsed.head();
    Element bodyEl = parsed.body();

    // Overall shape of the tree.
    assertEquals(1, parsed.children().size()); // root node: contains html node
    assertEquals(2, parsed.child(0).children().size()); // html node: head and body
    assertEquals(3, headEl.children().size());
    assertEquals(1, bodyEl.children().size());

    // The meta element is found under head and not under body.
    String metaName = headEl.getElementsByTag("meta").get(0).attr("name");
    assertEquals("keywords", metaName);
    assertEquals(0, bodyEl.getElementsByTag("meta").size());

    // Title and paragraph text are preserved.
    assertEquals("jsoup", parsed.title());
    assertEquals("Hello world", bodyEl.text());
    assertEquals("Hello world", bodyEl.children().get(0).text());
}
/**
 * Exercises the {@code :containsData(...)} selector against script, style and
 * comment content, including queries whose letter case differs from the data.
 */
@Test
@MultiLocaleTest
public void containsData() {
    String markup = "<p>function</p><script>FUNCTION</script><style>item</style><span><!-- comments --></span>";
    Document parsed = Jsoup.parse(markup);
    Element root = parsed.body();

    Elements anyWithFunction = root.select(":containsData(function)");
    Elements scriptWithFunction = root.select("script:containsData(function)");
    Elements spanWithComments = root.select("span:containsData(comments)");
    Elements anyWithLetterO = root.select(":containsData(o)");
    Elements styleWithItem = root.select("style:containsData(ITEM)");

    assertEquals(2, anyWithFunction.size()); // body and script
    assertEquals(1, scriptWithFunction.size());
    assertEquals(anyWithFunction.last(), scriptWithFunction.first());
    assertEquals("<script>FUNCTION</script>", scriptWithFunction.outerHtml());

    assertEquals(1, spanWithComments.size());
    assertEquals("span", spanWithComments.first().tagName());

    assertEquals(3, anyWithLetterO.size());
    assertEquals("body", anyWithLetterO.first().tagName());
    assertEquals("script", anyWithLetterO.get(1).tagName());
    assertEquals("span", anyWithLetterO.get(2).tagName());

    assertEquals(1, styleWithItem.size());
}
/**
 * Parses a head-less, body-less fragment and verifies the document structure the
 * parser produces: one root child, two children under it, three head children and
 * one body child.
 */
@Test
public void createsDocumentStructure() {
    String markup = "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>";
    Document parsed = Jsoup.parse(markup);
    Element headEl = parsed.head();
    Element bodyEl = parsed.body();

    // Overall shape of the tree.
    assertEquals(1, parsed.children().size()); // root node: contains html node
    assertEquals(2, parsed.child(0).children().size()); // html node: head and body
    assertEquals(3, headEl.children().size());
    assertEquals(1, bodyEl.children().size());

    // The meta element is found under head and not under body.
    String metaName = headEl.getElementsByTag("meta").get(0).attr("name");
    assertEquals("keywords", metaName);
    assertEquals(0, bodyEl.getElementsByTag("meta").size());

    // Title and paragraph text are preserved.
    assertEquals("jsoup", parsed.title());
    assertEquals("Hello world", bodyEl.text());
    assertEquals("Hello world", bodyEl.children().get(0).text());
}
/**
 * Exercises the {@code :containsData(...)} selector against script, style and
 * comment content, including queries whose letter case differs from the data.
 */
@Test
@MultiLocaleTest
public void containsData() {
    String markup = "<p>function</p><script>FUNCTION</script><style>item</style><span><!-- comments --></span>";
    Document parsed = Jsoup.parse(markup);
    Element root = parsed.body();

    Elements anyWithFunction = root.select(":containsData(function)");
    Elements scriptWithFunction = root.select("script:containsData(function)");
    Elements spanWithComments = root.select("span:containsData(comments)");
    Elements anyWithLetterO = root.select(":containsData(o)");
    Elements styleWithItem = root.select("style:containsData(ITEM)");

    assertEquals(2, anyWithFunction.size()); // body and script
    assertEquals(1, scriptWithFunction.size());
    assertEquals(anyWithFunction.last(), scriptWithFunction.first());
    assertEquals("<script>FUNCTION</script>", scriptWithFunction.outerHtml());

    assertEquals(1, spanWithComments.size());
    assertEquals("span", spanWithComments.first().tagName());

    assertEquals(3, anyWithLetterO.size());
    assertEquals("body", anyWithLetterO.first().tagName());
    assertEquals("script", anyWithLetterO.get(1).tagName());
    assertEquals("span", anyWithLetterO.get(2).tagName());

    assertEquals(1, styleWithItem.size());
}
/**
 * Fetches a proxy listing page and extracts IP/port pairs from it.
 *
 * <p>The page is requested with a fixed set of browser-like headers, parsed, and the
 * row container is located through a fixed chain of child-node indexes. Every second
 * child from index 2 up to index 30 is read as a row, giving at most 15 entries. In
 * each row, child 3 is read as the IP and child 5 as the port.
 *
 * <p>Rows or cells missing from the page are skipped instead of raising an
 * {@link IndexOutOfBoundsException}, so the result may hold fewer than 15 entries.
 *
 * @param url address of the proxy listing page
 * @return the extracted proxies, in page order; never {@code null}
 * @throws Exception if the request fails, the fixed node path to the row container
 *                   does not exist, or a port cell is not a valid integer
 */
public static List<IpEntity> getProxyIp(String url) throws Exception{
    // Row layout on the page: rows sit at even child indexes 2..30.
    final int firstRowIndex = 2;
    final int lastRowIndex = 30;
    final int rowStep = 2;
    // Cell layout inside a row.
    final int ipCellIndex = 3;
    final int portCellIndex = 5;

    List<IpEntity> ipList = new ArrayList<>();
    // NOTE(review): the Cookie header below embeds what looks like a real session
    // (MUSIC_U, __csrf, JSESSIONID-WYYY). It should come from configuration, not source.
    // NOTE(review): the timeout is roughly 24 days in milliseconds, so it is
    // effectively unbounded; kept unchanged here.
    Response execute = Jsoup.connect(url)
            .header("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
            .header("Cache-Control", "max-age=60").header("Accept", "*/*")
            .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6").header("Connection", "keep-alive")
            .header("Referer", "http://music.163.com/song?id=186016")
            .header("Origin", "http://music.163.com").header("Host", "music.163.com")
            .header("Content-Type", "application/x-www-form-urlencoded")
            .header("Cookie",
                    "UM_distinctid=15e9863cf14335-0a09f939cd2af9-6d1b137c-100200-15e9863cf157f1; vjuids=414b87eb3.15e9863cfc1.0.ec99d6f660d09; _ntes_nnid=4543481cc76ab2fd3110ecaafd5f1288,1505795231854; _ntes_nuid=4543481cc76ab2fd3110ecaafd5f1288; __s_=1; __gads=ID=6cbc4ab41878c6b9:T=1505795247:S=ALNI_MbCe-bAY4kZyMbVKlS4T2BSuY75kw; usertrack=c+xxC1nMphjBCzKpBPJjAg==; NTES_CMT_USER_INFO=100899097%7Cm187****4250%7C%7Cfalse%7CbTE4NzAzNDE0MjUwQDE2My5jb20%3D; [email protected]|1507178162|2|mail163|00&99|CA&1506163335&mail163#hun&430800#10#0#0|187250&1|163|[email protected]; vinfo_n_f_l_n3=8ba0369be425c0d2.1.7.1505795231863.1507950353704.1508150387844; vjlast=1505795232.1508150167.11; Province=0450; City=0454; _ga=GA1.2.1044198758.1506584097; _gid=GA1.2.763458995.1508907342; JSESSIONID-WYYY=Zm%2FnBG6%2B1vb%2BfJp%5CJP8nIyBZQfABmnAiIqMM8fgXABoqI0PdVq%2FpCsSPDROY1APPaZnFgh14pR2pV9E0Vdv2DaO%2BKkifMncYvxRVlOKMEGzq9dTcC%2F0PI07KWacWqGpwO88GviAmX%2BVuDkIVNBEquDrJ4QKhTZ2dzyGD%2Bd2T%2BbiztinJ%3A1508946396692; _iuqxldmzr_=32; playerid=20572717; MUSIC_U=39d0b2b5e15675f10fd5d9c05e8a5d593c61fcb81368d4431bab029c28eff977d4a57de2f409f533b482feaf99a1b61e80836282123441c67df96e4bf32a71bc38be3a5b629323e7bf122d59fa1ed6a2; __remember_me=true; __csrf=2032a8f34f1f92412a49ba3d6f68b2db; __utma=94650624.1044198758.1506584097.1508939111.1508942690.40; __utmb=94650624.20.10.1508942690; __utmc=94650624; __utmz=94650624.1508394258.18.4.utmcsr=xujin.org|utmccn=(referral)|utmcmd=referral|utmcct=/")
            .method(Method.GET).ignoreContentType(true)
            .timeout(2099999999).execute();
    Document pageJson = execute.parse();
    Element body = pageJson.body();
    // Fixed path to the row container; this breaks if the page layout changes.
    List<Node> childNodes = body.childNode(11).childNode(3).childNode(5).childNode(1).childNodes();

    // Collect the proxies from the leading rows (at most 15) into the list.
    for (int i = firstRowIndex; i <= lastRowIndex && i < childNodes.size(); i += rowStep) {
        List<Node> cells = childNodes.get(i).childNodes();
        if (cells.size() <= portCellIndex
                || cells.get(ipCellIndex).childNodeSize() == 0
                || cells.get(portCellIndex).childNodeSize() == 0) {
            // Incomplete row: skip it rather than abort the whole extraction.
            continue;
        }
        // Trim so that whitespace around the cell text cannot break parseInt.
        String ip = cells.get(ipCellIndex).childNode(0).toString().trim();
        int port = Integer.parseInt(cells.get(portCellIndex).childNode(0).toString().trim());

        IpEntity ipEntity = new IpEntity();
        ipEntity.setIp(ip);
        ipEntity.setPort(port);
        ipList.add(ipEntity);
    }
    return ipList;
}
/**
 * Parses the stored HTML and hands the document, its head and its body to
 * {@code parse(...)} together with the shared value map.
 *
 * @param arg arguments stored in {@code mArguments} before parsing
 * @return the value map, created on first use and reused afterwards
 */
public Map<String, Object> doParse(Object... arg) {
    mArguments = arg;
    if (null == mValues) {
        // Created once; later calls keep using the same map.
        mValues = new HashMap<>();
    }

    final Document parsed = Jsoup.parse(mHtml);
    mHeader = parsed.head();
    mBody = parsed.body();

    parse(parsed, mHeader, mBody, mValues);
    return mValues;
}
/**
 * Returns the inner HTML of the {@code body} of the parsed input.
 *
 * @param html HTML text to parse
 * @return the body's inner HTML, or {@code html} unchanged when it is blank or no
 *         document/body is available
 */
public static String getBodyHtml(String html) {
    if (!StringUtils.isNotBlank(html)) {
        return html;
    }

    Document parsed = Jsoup.parse(html);
    if (parsed == null || parsed.body() == null) {
        return html;
    }
    return parsed.body().html();
}
/**
 * Calls {@code removeEmpty} on the document body repeatedly until it returns
 * {@code true}.
 *
 * @param document document whose body is processed
 */
@Override
public void manipulate(Document document) {
    final Element root = document.body();
    boolean finished;
    do {
        // All the work happens inside removeEmpty; the loop only repeats it.
        finished = removeEmpty(root);
    } while (!finished);
}
/**
 * Chooses a display type for the given HTML based on its body content.
 *
 * <ul>
 *   <li>No {@code p} element: the body HTML is returned as an HTML display.</li>
 *   <li>None of {@code <img}, {@code <script}, {@code "%html "}, {@code "%table "},
 *       {@code "%img "} present: text display.</li>
 *   <li>Contains {@code %table}: table display.</li>
 *   <li>Contains {@code %img}: image display.</li>
 *   <li>Otherwise: HTML display using {@code imageWidth}.</li>
 * </ul>
 *
 * @param html       HTML to render
 * @param imageWidth passed through to the HTML display
 * @return the display chosen by the rules above
 */
public static RDisplay render( String html, String imageWidth) {
    Document parsed = Jsoup.parse(html);
    parsed.outputSettings().prettyPrint(false);
    Element root = parsed.body();

    boolean hasParagraph = !root.getElementsByTag("p").isEmpty();
    if (!hasParagraph) {
        return new RDisplay(root.html(), Type.HTML, Code.SUCCESS);
    }

    String content = root.html();
    // These markers are matched with a trailing space, unlike the checks further down.
    boolean hasRichMarker = content.contains("<img")
            || content.contains("<script")
            || content.contains("%html ")
            || content.contains("%table ")
            || content.contains("%img ");
    if (!hasRichMarker) {
        return textDisplay(root);
    }

    if (content.contains("%table")) {
        return tableDisplay(root);
    } else if (content.contains("%img")) {
        return imgDisplay(root);
    } else {
        return htmlDisplay(root, imageWidth);
    }
}
/**
 * Parses an HTML fragment and places the resulting nodes in the {@code body} of a
 * new shell Document.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri  base URI of the document (i.e. original fetch location), used for
 *                 resolving relative URLs
 *
 * @return the new Document, with the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element shellBody = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, shellBody, baseUri);

    // Work on a snapshot: the node list gets modified when nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[0]);

    // Detach from the back; index 0 is left for appendChild below to move.
    int idx = snapshot.length - 1;
    while (idx > 0) {
        snapshot[idx].remove();
        idx--;
    }

    for (int k = 0; k < snapshot.length; k++) {
        shellBody.appendChild(snapshot[k]);
    }
    return shell;
}
/**
 * Parses an HTML fragment and places the resulting nodes in the {@code body} of a
 * new shell Document.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri  base URI of the document (i.e. original fetch location), used for
 *                 resolving relative URLs
 *
 * @return the new Document, with the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element shellBody = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, shellBody, baseUri);

    // Work on a snapshot: the node list gets modified when nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[0]);

    // Detach from the back; index 0 is left for appendChild below to move.
    int idx = snapshot.length - 1;
    while (idx > 0) {
        snapshot[idx].remove();
        idx--;
    }

    for (int k = 0; k < snapshot.length; k++) {
        shellBody.appendChild(snapshot[k]);
    }
    return shell;
}
/**
 * Verifies that a comment following an {@code img} is parsed as a sibling node in
 * the body with its data intact, and that the following paragraph keeps its text.
 */
@Test
public void parsesComments() {
    String markup = "<html><head></head><body><img src=foo><!-- <table><tr><td></table> --><p>Hello</p></body></html>";
    Document parsed = Jsoup.parse(markup);
    Element root = parsed.body();

    // comment should not be sub of img, as it's an empty tag
    Comment commentNode = (Comment) root.childNode(1);
    assertEquals(" <table><tr><td></table> ", commentNode.getData());

    Element paragraph = root.child(1);
    TextNode paragraphText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", paragraphText.getWholeText());
}
/**
 * Parses an HTML fragment and places the resulting nodes in the {@code body} of a
 * new shell Document.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri  base URI of the document (i.e. original fetch location), used for
 *                 resolving relative URLs
 *
 * @return the new Document, with the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element shellBody = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, shellBody, baseUri);

    // Work on a snapshot: the node list gets modified when nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[0]);
    for (int k = 0; k < snapshot.length; k++) {
        shellBody.appendChild(snapshot[k]);
    }
    return shell;
}
/**
 * Parses the {@code q} parameter as an HTML body fragment and passes the resulting
 * elements to {@code handleRequest}, along with an allowed tag and attribute taken
 * from the request's path info.
 *
 * <p>Responds with a 400 error when {@code q} is missing or yields no elements. The
 * synthetic {@code body} element is dropped from the element list unless {@code q}
 * itself contains the text "body".
 *
 * <p>Path info is read as {@code /<allowedTag>[/<allowedAttribute>[/...]]}. Both
 * values default to the empty string. A trailing slash (e.g. {@code /a/}) gives an
 * empty allowed attribute; it no longer throws
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param request  incoming request carrying the {@code q} parameter
 * @param response response the result or error is written to
 * @throws IOException if writing the response fails
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
    String q = request.getParameter("q");
    if (q == null) {
        Responses.sendError(response, "Missing q parameter", 400);
        return;
    }

    Document doc = Jsoup.parseBodyFragment(q);
    Element body = doc.body();
    Elements elements = body.getAllElements();
    if (!(q.contains("body"))){
        elements.remove(body);
    }
    if (elements.isEmpty()) {
        Responses.sendError(response, "Invalid input, no tags", 400);
        return;
    }

    String allowedTag = "";
    String allowedAttribute = "";
    String rawPathInfo = request.getPathInfo();
    if (rawPathInfo != null && !rawPathInfo.isEmpty()) {
        // Drop the leading '/'.
        String pathInfo = rawPathInfo.substring(1);
        // Limit -1 keeps trailing empty segments, so "tag/" gives ["tag", ""]
        // where the default split would give just ["tag"].
        String[] segments = pathInfo.split("/", -1);
        allowedTag = segments[0];
        if (segments.length > 1) {
            allowedAttribute = segments[1];
        }
    }
    handleRequest(elements, response, allowedTag, allowedAttribute);
}
/**
 * Loads the mail template resource {@code locales/email_zh-CN.html}, parses it as
 * UTF-8 and returns the body of the parsed document.
 *
 * @return the {@code body} element of the parsed template
 * @throws IOException if the template file cannot be read
 */
protected static Element getMailTemplate() throws IOException {
    final File templateFile = SysConfiguration.getFileOfRes("locales/email_zh-CN.html");
    return Jsoup.parse(templateFile, "utf-8").body();
}
/**
 * Writes the current CAS out as an HTML file.
 *
 * <p>The document starts as an empty HTML shell whose {@code lang} attribute comes
 * from the document annotation. The head receives an optional stylesheet link, a
 * {@code charset} meta tag, a fixed set of document-annotation meta entries and one
 * meta entry per {@code Metadata} annotation. A metadata key equal to
 * "documentTitle" (ignoring case) also sets the title. The body is filled in by
 * {@code writeBody}.
 *
 * <p>The file is written as UTF-8 so its bytes match the {@code charset="utf-8"}
 * declared in the head, whatever the platform default charset is.
 *
 * @param jCas the CAS to export
 * @throws AnalysisEngineProcessException if the output file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final File f = getFileName(jCas);
    final DocumentAnnotation da = getDocumentAnnotation(jCas);

    final Document doc =
            Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
    doc.outputSettings(new Document.OutputSettings().prettyPrint(false));

    final Element head = doc.head();
    if (!Strings.isNullOrEmpty(css)) {
        final Element cssLink = head.appendElement("link");
        cssLink.attr("rel", "stylesheet");
        cssLink.attr("href", css);
    }
    final Element charset = head.appendElement("meta");
    charset.attr("charset", "utf-8");

    appendMeta(head, "document.type", da.getDocType());
    appendMeta(head, "document.sourceUri", da.getSourceUri());
    appendMeta(head, "externalId", da.getHash());
    appendMeta(head, "document.classification", da.getDocumentClassification());
    appendMeta(
            head,
            "document.caveats",
            String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
    appendMeta(
            head,
            "document.releasability",
            String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

    // If several metadata entries carry the title key, the last one wins.
    String title = null;
    for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
        appendMeta(head, md.getKey(), md.getValue());
        if ("documentTitle".equalsIgnoreCase(md.getKey())) {
            title = md.getValue();
        }
    }
    if (!Strings.isNullOrEmpty(title)) {
        doc.title(title);
    }

    final Element body = doc.body();
    writeBody(jCas, body);

    try {
        // Must agree with the <meta charset="utf-8"> written above.
        FileUtils.writeStringToFile(f, doc.html(), java.nio.charset.StandardCharsets.UTF_8);
    } catch (final IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
/**
 * Converts a parsed thread page into a {@code ThreadPage}: rendered HTML plus paging
 * and thread metadata.
 *
 * <p>Paging comes from the first element with class "pages", defaulting to page 1 of
 * 1 when that element is absent. The forum and thread ids come from the body's
 * {@code data-forum} and {@code data-thread} attributes. Posts are extracted by
 * {@code parsePosts} and rendered through the header, post and footer templates.
 *
 * @param document                 parsed thread page
 * @param showImages               passed through to {@code parsePosts}
 * @param showAvatars              passed through to {@code parsePosts}
 * @param hidePreviouslyReadImages passed through to {@code parsePosts}
 * @param jumpToPost               post id to jump to; values &lt;= 0 mean none
 * @param redirectedUrl            URL the request was redirected to, may be empty.
 *                                 Its fragment is passed to {@code parsePosts},
 *                                 except that a fragment of exactly "lastpost"
 *                                 sets the jump target to {@code #lastpost}
 * @return the assembled thread page
 * @throws NumberFormatException if the body's forum or thread id attribute is not
 *                               an integer
 */
public static ThreadPage processThreadPage(Document document, boolean showImages, boolean showAvatars, boolean hidePreviouslyReadImages, long jumpToPost, String redirectedUrl){
    ArrayList<HashMap<String, String>> posts = new ArrayList<HashMap<String, String>>();
    int currentPage = 1, maxPage = 1, threadId, forumId, unread;
    String jumpToId = jumpToPost > 0 ? "#post"+jumpToPost : null;
    String ptiFragment = null;
    if(!TextUtils.isEmpty(redirectedUrl)){
        Uri url = Uri.parse(redirectedUrl);
        ptiFragment = url.getFragment();
        // equals, not matches: the fragment may be null when the URL has none, and
        // it must be compared literally instead of being compiled as a regex.
        if("lastpost".equals(ptiFragment)){
            ptiFragment = null;
            jumpToId = "#lastpost";
        }
    }

    Element pages = document.getElementsByClass("pages").first();
    if(pages != null){
        currentPage = FastUtils.safeParseInt(pages.getElementsByAttribute("selected").attr("value"), 1);
        Element lastPage = pages.getElementsByTag("option").last();
        if(lastPage != null){
            maxPage = FastUtils.safeParseInt(lastPage.attr("value"), 1);
        }
    }

    boolean bookmarked = document.getElementsByClass("unbookmark").size() > 0;
    String threadTitle = TextUtils.htmlEncode(document.getElementsByClass("bclast").first().text());
    Element body = document.body();
    forumId = Integer.parseInt(body.attr("data-forum"));
    threadId = Integer.parseInt(body.attr("data-thread"));
    Elements threadbars = document.getElementsByClass("threadbar");
    // Replies are allowed unless the forum is an archive or the closed icon is shown.
    boolean canReply = !Constants.isArchiveForum(forumId) && threadbars.first().getElementsByAttributeValueContaining("src", "images/forum-closed.gif").size() == 0;
    unread = parsePosts(document, posts, showImages, showAvatars, hidePreviouslyReadImages, ptiFragment, canReply, currentPage == maxPage, forumId);

    StringBuilder builder = new StringBuilder(2048);
    int previouslyRead = posts.size()-unread;
    HashMap<String, String> headerArgs = new HashMap<String, String>();
    headerArgs.put("jumpToPostId", jumpToId);
    headerArgs.put("fontSize", SomePreferences.fontSize);
    headerArgs.put("theme", getTheme(forumId));
    // Only set when the page mixes read and unread posts.
    headerArgs.put("previouslyRead", previouslyRead > 0 && unread > 0 ? previouslyRead+" Previous Post"+(previouslyRead > 1 ? "s":"") : null);
    MustCache.applyHeaderTemplate(builder, headerArgs);
    for(HashMap<String, String> post : posts){
        MustCache.applyPostTemplate(builder, post);
    }
    MustCache.applyFooterTemplate(builder, null);

    ThreadItem cachedThread = ThreadManager.getThread(threadId);
    if(cachedThread != null){
        cachedThread.updateUnreadCount(currentPage, maxPage, SomePreferences.threadPostPerPage);
    }
    return new ThreadPage(builder.toString(), currentPage, maxPage, threadId, forumId, threadTitle, -unread, bookmarked, canReply);
}
/**
 * Parses the {@code q} parameter as an HTML body fragment and sends back the
 * elements that pass an attribute check.
 *
 * <p>An element is dropped when any of its attributes has a key that starts with
 * "on", equals "href" or equals "src" (after lower-casing), or is a "style"
 * attribute whose lower-cased value contains "expression". The remaining elements
 * are concatenated in order and sent via {@code Responses.sendXssed}.
 *
 * <p>Responds with a 400 error when {@code q} is missing or yields no elements.
 *
 * @param request  incoming request carrying the {@code q} parameter
 * @param response response the result or error is written to
 * @throws IOException if writing the response fails
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
    String q = request.getParameter("q");
    if (q == null) {
        Responses.sendError(response, "Missing q parameter", 400);
        return;
    }

    Document parsed = Jsoup.parseBodyFragment(q);
    Element root = parsed.body();
    Elements candidates = root.getAllElements();
    candidates.remove(root);
    if (candidates.isEmpty()) {
        Responses.sendError(response, "Invalid input, no tags", 400);
        return;
    }

    StringBuilder output = new StringBuilder();
    for (Element candidate : candidates) {
        boolean accepted = true;
        for (Attribute attr : candidate.attributes()) {
            String key = attr.getKey().toLowerCase();
            boolean blockedKey = key.startsWith("on") || key.equals("href") || key.equals("src");
            boolean styleExpression = key.equals("style")
                    && attr.getValue().toLowerCase().contains("expression");
            if (blockedKey || styleExpression) {
                accepted = false;
            }
        }
        if (accepted) {
            output.append(candidate.toString());
        }
    }
    Responses.sendXssed(response, output.toString());
}