下面列出了org.jsoup.nodes.Comment#org.jsoup.nodes.Node 实例代码,或者点击链接到github查看源代码,也可以在右侧发表评论。
/**
 * Invoked when the traversal first reaches a node; emits the plain-text
 * prefix that belongs in front of that node's content.
 *
 * @param node  the node being entered
 * @param depth depth of the node in the tree (not used by this method)
 */
@Override
public void head(Node node, int depth) {
    if (node instanceof TextNode) {
        // Text nodes are where the user-readable text of the DOM lives.
        append(((TextNode) node).text());
        return;
    }

    switch (node.nodeName()) {
        case "ul":
            // Entering a list: remember how deep we are.
            listNesting++;
            break;
        case "li":
            append("\n ");
            // One extra pad per nesting level beyond the first.
            for (int level = listNesting; level > 1; level--) {
                append(" ");
            }
            // Top-level items get "* ", nested ones "- ".
            append(listNesting == 1 ? "* " : "- ");
            break;
        case "dt":
            append(" ");
            break;
        case "p":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "tr":
            append("\n");
            break;
        default:
            // Nothing is emitted for any other node.
            break;
    }
}
/**
 * Recursively walks {@code nodes} and appends their textual rendering to {@code sb}.
 * <ul>
 * <li>text nodes contribute their {@code getWholeText()} value;</li>
 * <li>{@code a} elements are rendered through {@code getTextLink};</li>
 * <li>{@code br} elements become a single newline;</li>
 * <li>every other element is descended into.</li>
 * </ul>
 * Nodes that are neither elements nor text nodes are ignored.
 *
 * @param sb    buffer receiving the text
 * @param nodes nodes to render, in order
 */
private void generateTextContent(StringBuilder sb, List<Node> nodes) {
    for (Node current : nodes) {
        if (current instanceof TextNode) {
            sb.append(((TextNode) current).getWholeText());
            continue;
        }
        if (!(current instanceof Element)) {
            continue;
        }

        Element tagNode = (Element) current;
        String tagName = tagNode.nodeName();
        if (tagName.equals("a")) {
            sb.append(getTextLink(tagNode));
        } else if (tagName.equals("br")) {
            sb.append('\n');
        } else {
            generateTextContent(sb, tagNode.childNodes());
        }
    }
}
/**
 * Start-of-node callback: writes whatever plain-text lead-in the node needs.
 * Text nodes are written out directly; {@code ul} bumps the list nesting counter;
 * {@code li} starts a new line with a bullet; {@code dt} adds a space; paragraph-like
 * tags and table rows start a new line.
 *
 * @param node  the node being entered
 * @param depth depth of the node in the tree (not used by this method)
 */
@Override
public void head(Node node, int depth) {
    final String tag = node.nodeName();

    if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;
        append(textNode.text()); // all user-readable text sits in text nodes
        return;
    }
    if (tag.equals("ul")) {
        listNesting++;
        return;
    }
    if (tag.equals("li")) {
        append("\n ");
        int pads = 1;
        while (pads < listNesting) {
            append(" ");
            pads++;
        }
        // First-level items use "* ", deeper levels use "- ".
        String bullet = (listNesting == 1) ? "* " : "- ";
        append(bullet);
        return;
    }
    if (tag.equals("dt")) {
        append(" ");
        return;
    }
    if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}
/**
 * Extracts text from the direct text-node children of {@code element}.
 * <p>
 * When {@code group} is 0 the text of all direct text nodes is concatenated.
 * Otherwise the text of the {@code group}-th direct text node (counting from 1)
 * is returned, or an empty string if there is no such node.
 *
 * @param element element whose direct children are inspected
 * @return the selected text
 */
@Override
public String operate(Element element) {
    StringBuilder joined = new StringBuilder();
    int textNodesSeen = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        TextNode text = (TextNode) child;
        if (group == 0) {
            joined.append(text.text());
            continue;
        }
        textNodesSeen++;
        if (textNodesSeen == group) {
            return text.text();
        }
    }
    return joined.toString();
}
/**
 * Fetches a song page from music.163.com and prints the singer and album
 * read from the elements carrying the class {@code s-fc7}.
 *
 * @throws Exception if fetching or parsing the page fails
 */
@Test
public void test4() throws Exception {
    Document page = Jsoup.connect("http://music.163.com/song?id=63650")
            .ignoreContentType(true)
            .execute()
            .parse();
    Elements candidates = page.getElementsByClass("s-fc7");

    // Singer: first child node of the second matching element.
    String singer = candidates.get(1).childNode(0).toString();

    // Album: first child node of the third matching element.
    String album = candidates.get(2).childNode(0).toString();

    System.out.println(singer + "--------" + album);
}
/**
 * Looks up a song's name on the music.163.com mobile song page.
 *
 * @param songId id of the song; appended to the page URL
 * @return the string form of the first child node of the first element with class
 *         {@code f-ff2}, or a fixed "not found" message when the page has no such element
 * @throws Exception if fetching or parsing the page fails
 */
public static String getSongNameById(long songId) throws Exception {
    Elements titleElements = Jsoup.connect("http://music.163.com/m/song?id=" + songId)
            .header("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
            .header("Cache-Control", "no-cache")
            .timeout(2000000000)
            .execute()
            .parse()
            .getElementsByClass("f-ff2");

    if (titleElements.isEmpty()) {
        // Message says the song from ES could not be found on NetEase Cloud Music.
        return "ES中歌曲在网易云音乐中找不到";
    }
    return titleElements.get(0).childNode(0).toString();
}
/**
 * Builds an HTML digest of {@code str}.
 * <p>
 * The markup is parsed as a body fragment and handed to {@code getAllTextNode}
 * (presumably collecting its text nodes; verify against that helper). For each
 * collected text node the outer HTML of its parent is appended, until the running
 * total of text length exceeds {@code size}, at which point {@code " ..."} is added
 * and collection stops. If the markup contains a {@code video} element, the first one
 * is put in front of the digest followed by {@code <br/>}.
 *
 * @param str  HTML to summarise
 * @param size text-length threshold after which the digest is cut off
 * @return the trimmed digest HTML
 */
public static String autoDigest(String str, int size) {
    Document fragment = Jsoup.parseBodyFragment(str);
    List<Node> textNodes = new ArrayList<>();
    getAllTextNode(fragment.childNodes(), textNodes);

    StringBuilder digestHtml = new StringBuilder();
    int textLength = 0;
    for (Node candidate : textNodes) {
        if (!(candidate instanceof TextNode)) {
            continue;
        }
        digestHtml.append(candidate.parent().outerHtml());
        textLength += ((TextNode) candidate).text().length();
        if (textLength > size) {
            digestHtml.append(" ...");
            break;
        }
    }

    String digest = digestHtml.toString();
    Elements videos = Jsoup.parse(str).body().select("video");
    if (videos != null && !videos.isEmpty()) {
        // Lead with the first video so it survives the truncation above.
        digest = videos.get(0).toString() + "<br/>" + digest;
    }
    return digest.trim();
}
/**
 * Start a depth-first traverse of the root and all of its descendants.
 * <p>
 * The walk is iterative: it moves through the tree using each node's child, sibling and
 * parent links. {@code visitor.head} is called when a node is first reached, and
 * {@code visitor.tail} is called once the walk is finished with that node (immediately for
 * a childless node, or while climbing back up for a parent).
 * @param root the root node point to traverse.
 */
public void traverse(Node root) {
Node node = root;
int depth = 0;
while (node != null) {
visitor.head(node, depth);
if (node.childNodeSize() > 0) {
// Descend: the first child gets its head() call on the next iteration.
node = node.childNode(0);
depth++;
} else {
// Childless node: climb while there is no next sibling, closing each node on the way.
// The depth > 0 guard keeps the climb from going above the level of the root.
while (node.nextSibling() == null && depth > 0) {
visitor.tail(node, depth);
node = node.parent();
depth--;
}
// Close the node we stopped on (either it has a next sibling, or we are back at depth 0).
visitor.tail(node, depth);
// Back at the starting node: the traversal is complete.
if (node == root)
break;
node = node.nextSibling();
}
}
}
/**
 * Recursively collects the text of childless {@code #text} nodes.
 * <p>
 * Nodes with children are descended into. For a childless node named {@code #text},
 * its whole text has every match of the {@code newline} pattern removed, and the result
 * is added to {@code textList} unless it is empty.
 *
 * @param nodeList nodes to scan, in order
 * @param textList receives the collected text fragments
 */
private static void readNodes(List<Node> nodeList, List<String> textList) {
    for (Node node : nodeList) {
        if (node.childNodeSize() > 0) {
            readNodes(node.childNodes(), textList);
            continue;
        }
        if (!node.nodeName().equals("#text")) {
            continue;
        }
        String wholeText = ((TextNode) node).getWholeText();
        String cleaned = newline.matcher(wholeText).replaceAll("");
        if (!cleaned.isEmpty()) {
            textList.add(cleaned);
        }
    }
}
/**
 * Recursively collects the whole text of childless {@code #text} nodes, paired with a tag name.
 * <p>
 * Nodes with children are descended into, passing that node's own name as the tag for
 * its children. A text leaf is recorded under {@code tag}, except that a tag equal to
 * {@code body} (ignoring case) is recorded as {@code p}.
 *
 * @param nodeList    nodes to scan, in order
 * @param textListMap receives one (tag, text) entry per text leaf
 * @param tag         tag name to associate with text leaves found directly in {@code nodeList}
 */
private static void readNodesWithTags(List<Node> nodeList, List<Map.Entry<String,String>> textListMap, String tag) {
    for (Node node : nodeList) {
        if (node.childNodeSize() > 0) {
            readNodesWithTags(node.childNodes(), textListMap, node.nodeName());
            continue;
        }
        if (!node.nodeName().equals("#text")) {
            continue;
        }
        // Text sitting directly under body is labelled as a paragraph.
        String label = tag.equalsIgnoreCase("body") ? "p" : tag;
        String wholeText = ((TextNode) node).getWholeText();
        textListMap.add(new AbstractMap.SimpleEntry<String, String>(label, wholeText));
    }
}
/**
 * Appends the text beneath {@code e} to {@code accum}, skipping every child
 * for which {@code unlikely} returns true.
 * <p>
 * A single space is inserted before a child element when it is a block element and the
 * buffer is non-empty and does not already end in whitespace, or otherwise when the child
 * is a {@code br}.
 *
 * @param e      element whose children are processed
 * @param accum  buffer receiving the text
 * @param indent recursion level; only passed along (incremented) to nested calls
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }

        Element nested = (Element) child;
        boolean blockNeedsSeparator = accum.length() > 0
                && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        if (blockNeedsSeparator || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
/**
 * Rebuilds {@code rawText} from the text nodes this object iterates over.
 * <p>
 * For each text node: the tag name is set to the node's path, the node's text is
 * passed through {@code Utils.normalizeBreaks}, trimmed and appended, and, when
 * {@code NodeHelper.isLink} reports the node as a link, the untrimmed text length is
 * added to {@code charsCountInLinks}.
 */
public void initRawInfo() {
    StringBuilder collected = new StringBuilder();
    for (Node current : this) {
        if (!(current instanceof TextNode)) {
            continue;
        }
        this.setTagName(getPath(current));
        String nodeText = ((TextNode) current).text();
        String normalized = Utils.normalizeBreaks(nodeText).trim();
        collected.append(normalized);
        if (NodeHelper.isLink(current)) {
            charsCountInLinks += nodeText.length();
        }
    }
    rawText = collected.toString();
}
/**
 * Returns text taken from the element's direct text-node children.
 * <p>
 * With {@code group == 0} all direct text nodes are joined together; with any other
 * value the text of the text node at that 1-based position is returned, and an empty
 * string results when no text node sits at that position.
 *
 * @param element element to read from
 * @return the joined or selected text
 */
@Override
public String operate(Element element) {
    StringBuilder everything = new StringBuilder();
    int position = 0;
    for (Node candidate : element.childNodes()) {
        if (candidate instanceof TextNode) {
            TextNode piece = (TextNode) candidate;
            if (group != 0) {
                position += 1;
                if (position == group) {
                    return piece.text();
                }
            } else {
                everything.append(piece.text());
            }
        }
    }
    return everything.toString();
}
/**
 * Builds the text of an element from its alt attribute and its children.
 * <p>
 * If the element has the {@code ALT_ATTR} attribute, the text produced by
 * {@code altAttrTextBuilder} comes first. Then each non-blank text child contributes
 * its trimmed text and each element child contributes its own recursively built text.
 * Every contribution is preceded by {@code SPACER}; the overall result is trimmed.
 *
 * @param element element to extract text from
 * @return the trimmed, spacer-separated text
 */
@Override
public String buildTextFromElement(Element element) {
    StringBuilder text = new StringBuilder();

    if (element.hasAttr(ALT_ATTR)) {
        text.append(SPACER).append(altAttrTextBuilder.buildTextFromElement(element));
    }

    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            text.append(SPACER).append(buildTextFromElement((Element) child));
        } else if (child instanceof TextNode) {
            TextNode textChild = (TextNode) child;
            if (!textChild.isBlank()) {
                text.append(SPACER).append(StringUtils.trim(textChild.text()));
            }
        }
    }

    return StringUtils.trim(text.toString());
}
/**
 * Converts the given element, together with its children, into a jsoup node tree.
 * <p>
 * A text-node element becomes a jsoup {@link TextNode}. Any other element becomes a
 * jsoup element with the same tag; an {@code innerHTML} property, if present, is set as
 * its HTML; attributes are copied (an empty-string value is written as a boolean
 * attribute); and children are converted recursively and appended.
 *
 * @param document
 *            the jsoup document used to create elements and to supply the base URI
 * @param element
 *            the element to convert
 * @return a jsoup node holding the converted element
 */
public static Node toJsoup(Document document, Element element) {
    if (element.isTextNode()) {
        return new TextNode(element.getText(), document.baseUri());
    }

    org.jsoup.nodes.Element converted = document.createElement(element.getTag());

    if (element.hasProperty("innerHTML")) {
        String innerHtml = (String) element.getPropertyRaw("innerHTML");
        converted.html(innerHtml);
    }

    element.getAttributeNames().forEach(attributeName -> {
        String value = element.getAttribute(attributeName);
        if (!"".equals(value)) {
            converted.attr(attributeName, value);
        } else {
            // Empty value: emit as a boolean attribute.
            converted.attr(attributeName, true);
        }
    });

    element.getChildren().forEach(child -> {
        Node convertedChild = toJsoup(document, child);
        converted.appendChild(convertedChild);
    });

    return converted;
}
/**
 * Downloads the top-level GRIB2 documentation page and dispatches each table link.
 * <p>
 * For every {@code a[href]} link, the title is taken from the string form of the link's
 * next sibling with {@code "-"} removed and whitespace trimmed (or {@code null} when there
 * is no sibling). The link labelled exactly {@code "Table 4.2"} goes to
 * {@code parseTable42}; any other link whose text starts with {@code "Table 4"} goes to
 * {@code parseCodeTable}.
 *
 * @throws IOException if the page cannot be fetched
 */
void parseTopDoc() throws IOException {
    String source = "https://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_doc/";
    Document doc = Jsoup.parse(new URL(source), 5 * 1000); // 5 second timeout

    for (Element link : doc.select("a[href]")) {
        Node sibling = link.nextSibling();
        String title = (sibling == null)
                ? null
                : StringUtil2.remove(sibling.toString(), "-").trim();

        String linkText = link.text();
        if (linkText.equals("Table 4.2")) {
            parseTable42(link.attr("abs:href"), linkText, title);
        } else if (linkText.startsWith("Table 4")) {
            parseCodeTable(link.attr("abs:href"), linkText, title);
        }
    }
}
/**
 * Tests whether {@code element} has no child nodes other than comments,
 * XML declarations and document types.
 *
 * @param root    root of the evaluation (not used by this check)
 * @param element element under test
 * @return {@code true} if every child node is a comment, XML declaration or doctype
 *         (including when there are no children at all)
 */
@Override
public boolean matches(Element root, Element element) {
    for (Node child : element.childNodes()) {
        boolean ignorable = child instanceof Comment
                || child instanceof XmlDeclaration
                || child instanceof DocumentType;
        if (!ignorable) {
            return false;
        }
    }
    return true;
}
/**
 * Parses an XML fragment and checks the node count, the absolute URL and name of the
 * first node, and the text of the second node.
 */
@Test public void xmlFragment() {
    String baseUri = "http://example.com/";
    List<Node> parsed = Parser.parseXmlFragment("<one src='/foo/' />Two<three><four /></three>", baseUri);

    assertEquals(3, parsed.size());

    Node first = parsed.get(0);
    assertEquals("http://example.com/foo/", first.absUrl("src"));
    assertEquals("one", first.nodeName());

    TextNode second = (TextNode) parsed.get(1);
    assertEquals("Two", second.text());
}
/**
 * Verifies XML fragment parsing: three top-level nodes are produced, the first is the
 * element {@code one} whose {@code src} resolves against the base URI, and the second is
 * a text node reading {@code Two}.
 */
@Test public void xmlFragment() {
    final String fragment = "<one src='/foo/' />Two<three><four /></three>";
    List<Node> result = Parser.parseXmlFragment(fragment, "http://example.com/");
    assertEquals(3, result.size());

    Node leading = result.get(0);
    String resolvedSrc = leading.absUrl("src");
    assertEquals("http://example.com/foo/", resolvedSrc);
    assertEquals("one", leading.nodeName());

    String middleText = ((TextNode) result.get(1)).text();
    assertEquals("Two", middleText);
}
/**
 * Limits the amount of text in a document, editing the document in place.
 * <p>
 * Walks every element in the document and sums the length of its direct text nodes.
 * The text node in which the running total reaches the limit is cut to the remaining
 * budget and suffixed with {@code " ..."}; later text nodes of that same element are
 * blanked; elements visited after the limit has been reached are removed.
 * Also counts {@code img} elements, for the log message only.
 *
 * @param d        the document to truncate (modified)
 * @param reformat selects the limit: {@code MAX_FORMAT_TEXT_SIZE} when true,
 *                 {@code MAX_FULL_TEXT_SIZE} otherwise
 * @return true if the accumulated text length reached the limit
 */
static boolean truncate(Document d, boolean reformat) {
int max = (reformat ? MAX_FORMAT_TEXT_SIZE : MAX_FULL_TEXT_SIZE);
// Running total of text length seen so far, across all elements.
int length = 0;
int images = 0;
for (Element elm : d.select("*")) {
if ("img".equals(elm.tagName()))
images++;
// Set once this element's own text node has been cut at the limit.
boolean skip = false;
for (Node child : elm.childNodes()) {
if (child instanceof TextNode) {
TextNode tnode = ((TextNode) child);
String text = tnode.getWholeText();
if (length < max) {
if (length + text.length() >= max) {
// This node crosses the limit: keep only what fits and mark the cut.
text = text.substring(0, max - length) + " ...";
tnode.text(text);
skip = true;
}
} else {
// Already at/over the limit: blank later text nodes of the element that was cut.
if (skip)
tnode.text("");
}
// Note: adds the (possibly shortened) text length; for blanked nodes this is
// still the original text's length.
length += text.length();
}
}
// Limit reached and this element did not perform the cut itself: drop the element.
if (length >= max && !skip)
elm.remove();
}
Log.i("Message size=" + length + " images=" + images);
return (length >= max);
}
/**
 * Parse a fragment of HTML into the {@code body} of a new Document.
 * <p>
 * A shell document is created for {@code baseUri}, the fragment is parsed with the
 * shell's body as context, and the resulting nodes are appended to that body.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return Document, with empty head, and HTML parsed into body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element body = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, body, baseUri);

    // Work on a snapshot: the parsed list changes as nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[parsed.size()]);

    // Detach from the end towards the front; index 0 is not detached by this loop.
    int idx = snapshot.length - 1;
    while (idx > 0) {
        snapshot[idx].remove();
        idx--;
    }

    for (Node parsedNode : snapshot) {
        body.appendChild(parsedNode);
    }
    return shell;
}
/**
 * End-of-node callback: records the node in {@code matchedNodes} when it is a text
 * node that {@code isApplicable} accepts.
 *
 * @param node  the node being left
 * @param depth depth of the node in the tree (not used by this method)
 */
@Override
public void tail(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    TextNode text = (TextNode) node;
    if (isApplicable(text)) {
        matchedNodes.add(text);
    }
}
/**
 * Scrapes the flower category list from the aihuhua.com encyclopedia page.
 * <p>
 * Each child of the first element with class {@code catelist} that itself has children
 * yields one {@link FlowerCategory}: the URL and image path come from its {@code a} child,
 * the name from the {@code title} attribute of its {@code h2} child.
 * <p>
 * This is best-effort: an I/O failure is printed and whatever was collected so far
 * (possibly nothing) is returned; a page without a {@code catelist} element yields an
 * empty list.
 *
 * @return the categories found; never {@code null}
 */
private static List<FlowerCategory> getCategoryList() {
    List<FlowerCategory> categories = new ArrayList<FlowerCategory>();
    try {
        Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
        Element cates = doc.getElementsByClass("catelist").first();
        if (cates == null) {
            // first() returns null when nothing matched; without this guard the
            // childNodes() call below would throw a NullPointerException.
            return categories;
        }
        for (Node node : cates.childNodes()) {
            List<Node> childs = node.childNodes();
            if (childs.isEmpty()) {
                // Childless nodes carry no category data.
                continue;
            }
            FlowerCategory category = new FlowerCategory();
            for (Node child : childs) {
                String childName = child.nodeName();
                if ("a".equals(childName)) {
                    category.setUrl(child.attr("href"));
                    // The image is read from the anchor's second child node; skip it when
                    // the anchor is shorter than that instead of throwing.
                    if (child.childNodeSize() > 1) {
                        category.setImgPath(child.childNode(1).attr("src"));
                    }
                } else if ("h2".equals(childName)) {
                    category.setName(child.attr("title"));
                }
            }
            categories.add(category);
        }
    } catch (IOException e) {
        // Best-effort scrape: report the failure and fall through to return what we have.
        e.printStackTrace();
    }
    return categories;
}
/**
 * Fetches {@code url} with a fixed set of browser-like headers and reads proxy
 * IP/port pairs out of the returned page by fixed child positions.
 * <p>
 * NOTE(review): the request carries a hard-coded session cookie; consider moving it out
 * of source. The original inline comment spoke of the "first 10" proxies, but the loop
 * visits the even-indexed rows 2 through 30, i.e. 15 rows.
 *
 * @param url address of the page to fetch
 * @return the proxy entries read from the page
 * @throws Exception if the request fails or the page does not have the expected layout
 */
public static List<IpEntity> getProxyIp(String url) throws Exception{
    final String userAgent =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36";
    final String cookie =
            "UM_distinctid=15e9863cf14335-0a09f939cd2af9-6d1b137c-100200-15e9863cf157f1; vjuids=414b87eb3.15e9863cfc1.0.ec99d6f660d09; _ntes_nnid=4543481cc76ab2fd3110ecaafd5f1288,1505795231854; _ntes_nuid=4543481cc76ab2fd3110ecaafd5f1288; __s_=1; __gads=ID=6cbc4ab41878c6b9:T=1505795247:S=ALNI_MbCe-bAY4kZyMbVKlS4T2BSuY75kw; usertrack=c+xxC1nMphjBCzKpBPJjAg==; NTES_CMT_USER_INFO=100899097%7Cm187****4250%7C%7Cfalse%7CbTE4NzAzNDE0MjUwQDE2My5jb20%3D; [email protected]|1507178162|2|mail163|00&99|CA&1506163335&mail163#hun&430800#10#0#0|187250&1|163|[email protected]; vinfo_n_f_l_n3=8ba0369be425c0d2.1.7.1505795231863.1507950353704.1508150387844; vjlast=1505795232.1508150167.11; Province=0450; City=0454; _ga=GA1.2.1044198758.1506584097; _gid=GA1.2.763458995.1508907342; JSESSIONID-WYYY=Zm%2FnBG6%2B1vb%2BfJp%5CJP8nIyBZQfABmnAiIqMM8fgXABoqI0PdVq%2FpCsSPDROY1APPaZnFgh14pR2pV9E0Vdv2DaO%2BKkifMncYvxRVlOKMEGzq9dTcC%2F0PI07KWacWqGpwO88GviAmX%2BVuDkIVNBEquDrJ4QKhTZ2dzyGD%2Bd2T%2BbiztinJ%3A1508946396692; _iuqxldmzr_=32; playerid=20572717; MUSIC_U=39d0b2b5e15675f10fd5d9c05e8a5d593c61fcb81368d4431bab029c28eff977d4a57de2f409f533b482feaf99a1b61e80836282123441c67df96e4bf32a71bc38be3a5b629323e7bf122d59fa1ed6a2; __remember_me=true; __csrf=2032a8f34f1f92412a49ba3d6f68b2db; __utma=94650624.1044198758.1506584097.1508939111.1508942690.40; __utmb=94650624.20.10.1508942690; __utmc=94650624; __utmz=94650624.1508394258.18.4.utmcsr=xujin.org|utmccn=(referral)|utmcmd=referral|utmcct=/";

    ArrayList<IpEntity> ipList = new ArrayList<>();
    Document page = Jsoup.connect(url)
            .header("User-Agent", userAgent)
            .header("Cache-Control", "max-age=60")
            .header("Accept", "*/*")
            .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
            .header("Connection", "keep-alive")
            .header("Referer", "http://music.163.com/song?id=186016")
            .header("Origin", "http://music.163.com")
            .header("Host", "music.163.com")
            .header("Content-Type", "application/x-www-form-urlencoded")
            .header("Cookie", cookie)
            .method(Method.GET)
            .ignoreContentType(true)
            .timeout(2099999999)
            .execute()
            .parse();

    // The rows are located purely by child position inside the body.
    List<Node> rows = page.body().childNode(11).childNode(3).childNode(5).childNode(1).childNodes();

    // Read the IP and port from every second row, rows 2 through 30 inclusive.
    for (int row = 2; row <= 30; row += 2) {
        IpEntity entry = new IpEntity();
        List<Node> cells = rows.get(row).childNodes();
        String ip = cells.get(3).childNode(0).toString();
        int port = Integer.parseInt(cells.get(5).childNode(0).toString());
        entry.setIp(ip);
        entry.setPort(port);
        ipList.add(entry);
    }
    return ipList;
}
/**
 * Fetches the mobile song page for song 91445 from music.163.com and prints the
 * song name read from the first element with class {@code f-ff2}.
 *
 * @throws Exception if fetching or parsing the page fails
 */
@Test
public void test1() throws Exception {
    Document page = Jsoup.connect("http://music.163.com/m/song?id=" + 91445)
            .header("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
            .header("Cache-Control", "no-cache")
            .timeout(2000000000)
            .execute()
            .parse();

    // The song name is the first child node of the first "f-ff2" element.
    Element titleElement = page.getElementsByClass("f-ff2").get(0);
    String songName = titleElement.childNode(0).toString();
    System.out.println(songName);
}
/**
 * Concatenates the text of the element's direct text-node children; text inside
 * child elements is not included.
 *
 * @param element element to read from
 * @return the joined text of the direct text nodes, empty if there are none
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
/**
 * End-of-node callback: writes the plain-text suffix for the node.
 * {@code br} ends the line, paragraphs and headings h1-h5 are followed by a blank line,
 * and links are followed by their absolute URL in angle brackets.
 *
 * @param node  the node being left
 * @param depth depth of the node in the tree (not used by this method)
 */
public void tail(Node node, int depth) {
    switch (node.nodeName()) {
        case "br":
            append("\n");
            break;
        case "p":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
            append("\n\n");
            break;
        case "a":
            append(String.format(" <%s>", node.absUrl("href")));
            break;
        default:
            // No suffix for other nodes.
            break;
    }
}
/**
 * Decides whether a node should be treated as unlikely content.
 * <p>
 * A node qualifies when its {@code class} attribute contains "caption" (case-insensitive),
 * or when {@code unlikelyPattern} finds a match in its {@code style} or {@code class}
 * attribute.
 *
 * @param e node to inspect
 * @return {@code true} if the node looks like unlikely content
 */
private boolean unlikely(Node e) {
    String clazz = e.attr("class");
    if (clazz == null) {
        // Defensive: keeps the pattern match below from failing on a missing value.
        clazz = "";
    }
    // Lower-case with a fixed locale: the default-locale variant maps 'I' to a dotless
    // 'ı' under e.g. a Turkish locale, so "CAPTION" would never match "caption".
    if (clazz.toLowerCase(java.util.Locale.ROOT).contains("caption")) {
        return true;
    }

    String style = e.attr("style");
    if (style == null) {
        style = "";
    }
    return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
}
/**
 * Parsing a fragment with a {@code null} context must return a single {@code html}
 * node wrapping the parsed content.
 */
@Test public void handleNullContextInParseFragment() {
    List<Node> nodes = Parser.parseFragment("<ol><li>One</li></ol><p>Two</p>", null, "http://example.com/");

    // Without a context a document gets created, so the <html> node (not the document) comes back.
    assertEquals(1, nodes.size());

    Node htmlNode = nodes.get(0);
    assertEquals("html", htmlNode.nodeName());

    String normalised = StringUtil.normaliseWhitespace(htmlNode.outerHtml());
    assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", normalised);
}
/**
 * Collects code snippets found beneath {@code node}.
 * <p>
 * Children named {@code p} or {@code li} are skipped entirely. For a {@code pre} child,
 * each direct {@code code} child yields a {@link CodeInfo} built from its HTML-unescaped
 * text. Any other child is searched recursively.
 *
 * @param node node whose descendants are searched
 * @return the snippets found, in document order
 */
private static List<CodeInfo> parseHTMLNodeToParagraphs(Node node) {
    List<CodeInfo> collected = new ArrayList<>();
    for (Node child : node.childNodes()) {
        String tag = child.nodeName();
        if (tag.equals("p") || tag.equals("li")) {
            continue;
        }
        if (!tag.equals("pre")) {
            collected.addAll(parseHTMLNodeToParagraphs(child));
            continue;
        }
        for (Node inner : child.childNodes()) {
            if (inner.nodeName().equals("code")) {
                String code = StringEscapeUtils.unescapeHtml4(((Element) inner).text());
                collected.add(new CodeInfo(code));
            }
        }
    }
    return collected;
}