下面列出了 org.jsoup.nodes.Document#select() 的实例代码,您可以点击链接到 GitHub 查看源代码,也可以在右侧发表评论。
/**
 * Checks that {@code :has()} works when nested, and when it wraps the
 * {@code :matches} and {@code :contains} pseudo selectors.
 */
@Test
public void testNestedHas() {
    final Document parsed = Jsoup.parse("<div><p><span>One</span></p></div> <div><p>Two</p></div>");

    // :has nested inside :has
    final Elements nestedHas = parsed.select("div:has(p:has(span))");
    assertEquals(1, nestedHas.size());
    assertEquals("One", nestedHas.first().text());

    // :matches nested inside :has
    final Elements matchesInHas = parsed.select("div:has(p:matches((?i)two))");
    assertEquals(1, matchesInHas.size());
    final Element matchedDiv = matchesInHas.first();
    assertEquals("div", matchedDiv.tagName());
    assertEquals("Two", matchedDiv.text());

    // :contains nested inside :has
    final Elements containsInHas = parsed.select("div:has(p:contains(two))");
    assertEquals(1, containsInHas.size());
    final Element containingDiv = containsInHas.first();
    assertEquals("div", containingDiv.tagName());
    assertEquals("Two", containingDiv.text());
}
/**
 * Delegates to the superclass and then, unless the task was cancelled,
 * fetches the telephone verification page and posts the outcome to the
 * global {@code telephoneVerified} value.
 *
 * <p>The telephone is treated as verified when the fetched page contains no
 * {@code button#getSmsCode} element. If the page cannot be fetched, the
 * stack trace is printed and nothing is posted.
 *
 * @param data the payload handed to the superclass implementation
 */
@Override
protected void successOnUI(String data) {
    super.successOnUI(data);
    if (mIsCanceled) {
        return;
    }

    final Document verifyPage;
    try {
        verifyPage = get(ConstantUtil.VERIFY_TELEPHONE_URL);
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }

    // No "get SMS code" button on the page means the phone is already verified.
    final boolean telephoneVerified = verifyPage.select("button#getSmsCode").isEmpty();
    mHandler.post(() -> App.getInstance().mGlobal.telephoneVerified.setValue(telephoneVerified));
}
/**
 * Visits every reader page of the given content and collects the
 * {@code src} of the first image matching {@code section a img} on each.
 *
 * <p>The URL of page N is derived from the reader URL by replacing every
 * occurrence of {@code "001"} with N formatted through
 * {@code Helper.formatIntAsStr(N, 3)}. Pages that cannot be loaded, or that
 * contain no matching image, contribute nothing to the result.
 *
 * @param content the content whose pages are scanned
 * @return the collected image URLs, in page order
 * @throws IOException declared by the signature
 */
@Override
protected List<String> parseImages(@NonNull Content content) throws IOException {
    final List<String> imageUrls = new ArrayList<>();

    progressStart(content.getQtyPages());
    for (int pageNumber = 1; pageNumber <= content.getQtyPages(); pageNumber++) {
        final String pageUrl = content.getReaderUrl().replace("001", Helper.formatIntAsStr(pageNumber, 3));
        final Document pageDoc = getOnlineDocument(pageUrl);
        final Elements images = (pageDoc == null) ? null : pageDoc.select("section a img");
        if (images != null && !images.isEmpty()) {
            imageUrls.add(images.first().attr("src"));
        }
        // Progress advances for every page, whether or not an image was found.
        progressPlus();
    }
    progressComplete();

    return imageUrls;
}
/**
 * Looks for the first anchor in the deeply nested table layout whose
 * {@code href} contains both {@code "ldparam="} and {@code "url="}, fetches
 * the URL that follows the last {@code "url="}, and copies name, phone and
 * mail from the first three {@code div.login_content p a} anchors of the
 * fetched page into the resume.
 *
 * <p>Only the first matching anchor is tried. Any exception raised while
 * fetching or parsing is printed to {@code System.err} and otherwise
 * ignored.
 *
 * @param resume the resume to populate
 * @param doc    the resume page to search for the contact link
 */
protected void tryFetchContact(ZhilianResume resume, Document doc) {
    final String urlMarker = "url=";
    final String paramMarker = "ldparam=";

    for (Element anchor : doc.select("table table table table tr td a")) {
        final String href = anchor.attr("href");
        if (!href.contains(paramMarker) || !href.contains(urlMarker)) {
            continue;
        }

        final String contactUrl = href.substring(href.lastIndexOf(urlMarker) + urlMarker.length());
        try {
            final String body = Request.Get(contactUrl).execute().returnContent().asString();
            final Elements infos = Jsoup.parse(body).select("div.login_content p a");
            resume.setName(infos.get(0).text());
            resume.setPhone(infos.get(1).text());
            resume.setMail(infos.get(2).text());
        } catch (Exception e) {
            e.printStackTrace(System.err);
        }
        // Stop after the first candidate link, successful or not.
        return;
    }
}
/**
 * Verifies that the generated HTML report contains class file format
 * version cells, and that every such cell reads {@code MODIFIED},
 * {@code 50.0} or {@code 52.0}. The test returns silently when the report
 * file does not exist.
 */
@Test
public void testClassFileFormatVersionIsPresent() throws IOException {
    final Path report = Paths.get(System.getProperty("user.dir"), "target", "japicmp", "class-file-format-version.html");
    if (!Files.exists(report)) {
        return; // in JDK 1.7 case
    }

    final String charsetName = Charset.forName("UTF-8").toString();
    final Document html = Jsoup.parse(report.toFile(), charsetName);

    final Elements versionBlocks = html.select(".class_fileFormatVersion");
    assertThat(versionBlocks.isEmpty(), is(false));

    final Elements cells = versionBlocks.select("table > tbody > tr > td");
    assertThat(cells.isEmpty(), is(false));

    for (Element cell : cells) {
        final String text = cell.text();
        final boolean expected = "MODIFIED".equals(text) || "50.0".equals(text) || "52.0".equals(text);
        if (!expected) {
            Assert.fail("text of HTML element does not equal 'MODIFIED' or 50.0 or 52.0: " + text);
        }
    }
}
/**
 * Collects followable links from an HTML page into {@code allPages}.
 *
 * <p>Every {@code a} element is resolved against {@code referer}. A resolved
 * URL is recorded (mapped to the referer) only if it is non-empty, is not a
 * {@code mailto:} link, is not rejected by
 * {@code SinglePageCheckService.ignoreUrl}, starts with the check's own URL,
 * and differs from the referer. Processing stops early, with an "aborted"
 * message, as soon as the abort flag is seen.
 *
 * @param referer  URL of the page being scanned; used as the base URI
 * @param htmlPage the HTML of that page
 * @param allPages receives each accepted URL mapped to the referer
 */
protected void findUrls(String referer, String htmlPage, Map<String, String> allPages) {
    log.debug("find urls on this web page: " + referer);
    if (abort) {
        appendMessage("aborted");
        return;
    }

    final Elements anchors = Jsoup.parse(htmlPage).select("a");
    for (Element anchor : anchors) {
        if (abort) {
            appendMessage("aborted");
            break;
        }

        anchor.setBaseUri(referer);
        final String url = anchor.absUrl("href").trim();
        log.debug("spider check found url: " + url);

        if (url.isEmpty() || url.startsWith("mailto:")) {
            continue;
        }
        if (SinglePageCheckService.ignoreUrl(url, check.getDoNotFollowUrls())) {
            continue;
        }
        if (!url.startsWith(check.getUrl()) || url.equals(referer)) {
            continue;
        }

        log.debug("spider check put to all pages url: " + url);
        allPages.put(url, referer);
    }
}
/**
 * Renders the document as plain text, wrapping the content of every
 * {@code blockquote} in square brackets.
 *
 * <p>The document is modified in place: {@code truncate(d, !full)} is
 * applied first (its exact effect is defined elsewhere), then bracket text
 * nodes are added around each blockquote.
 *
 * <p>When {@code full} is {@code false} the text is cut to at most
 * {@code PREVIEW_SIZE} UTF-16 code units and an ellipsis is appended if
 * anything was cut. The cut never falls between the two halves of a
 * surrogate pair; in that case the preview is one code unit shorter so that
 * no lone high surrogate precedes the ellipsis.
 *
 * @param d    the document to render; mutated by this call
 * @param full {@code true} for the whole text, {@code false} for a preview
 * @return the full text or the shortened preview
 */
private static String _getText(Document d, boolean full) {
    truncate(d, !full);
    for (Element bq : d.select("blockquote")) {
        bq.prependChild(new TextNode("["));
        bq.appendChild(new TextNode("]"));
    }

    String text = d.text();
    if (full) {
        return text;
    }

    int end = Math.min(text.length(), PREVIEW_SIZE);
    // Do not split a supplementary character (e.g. an emoji) in half.
    if (end > 0 && end < text.length()
            && Character.isHighSurrogate(text.charAt(end - 1))
            && Character.isLowSurrogate(text.charAt(end))) {
        end--;
    }

    String preview = text.substring(0, end);
    if (preview.length() < text.length()) {
        preview += "…";
    }
    return preview;
}
/**
 * Copies the declarations of every stylesheet rule onto the elements it
 * selects, storing them in the temporary {@code TEMP_STYLE_ATTR} attribute
 * (presumably <code>data-cssstyle</code>) rather than in <code>style</code>.
 * This is because the styles need to be applied sequentially, but before the
 * <code>style</code> defined for the element inline.
 *
 * <p>The stylesheet is first passed through {@code ignoreAtRules}, then new
 * lines and comments are removed and whitespace runs matched by
 * {@code SPACES} are replaced with a single space. The result is split on
 * <code>{</code> and <code>}</code> into alternating selector / declaration
 * tokens. A trailing selector with no declaration block is ignored.
 *
 * @param doc
 *            the html document
 * @param stylesheet
 *            the CSS source whose rules are applied to the document
 */
private static void extractStyles(Document doc, String stylesheet) {
    String cleanedStylesheet = ignoreAtRules(stylesheet);
    cleanedStylesheet = NEW_LINES.matcher(cleanedStylesheet).replaceAll("");
    cleanedStylesheet = COMMENTS.matcher(cleanedStylesheet).replaceAll("");
    cleanedStylesheet = SPACES.matcher(cleanedStylesheet).replaceAll(" ");
    String styleRules = cleanedStylesheet.trim();

    StringTokenizer st = new StringTokenizer(styleRules, "{}");
    // hasMoreTokens() is used instead of countTokens(): the latter rescans
    // the remaining input on every call, which made this loop quadratic.
    while (st.hasMoreTokens()) {
        String selector = st.nextToken();
        if (!st.hasMoreTokens()) {
            // Dangling selector without a declaration block: nothing to apply.
            break;
        }
        String properties = st.nextToken();

        Elements selectedElements = doc.select(selector.trim());
        for (Element selElem : selectedElements) {
            String oldProperties = selElem.attr(TEMP_STYLE_ATTR);
            // Append to declarations stored by earlier rules so order is kept.
            selElem.attr(TEMP_STYLE_ATTR,
                    oldProperties.isEmpty() ? properties : concatenateProperties(oldProperties, properties));
        }
    }
}
/**
 * Parses an album listing page and stores the extracted albums in the
 * result map.
 *
 * <p>One {@code AlbumInfo} is created per {@code div.album} element. Its
 * title comes from the first {@code span.name}, its album URL from the
 * {@code href} of the {@code .pic_box a} anchors, and its picture URL from
 * the first {@code img} under those anchors. Title and picture are left
 * unset when the corresponding element is missing.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the map under {@code CURRENT_URL}
 * @param result     the page bytes, decoded as UTF-8
 * @param resultMap  the map to fill; also the return value
 * @return {@code resultMap}, with {@code CURRENT_URL} and {@code RESULT} set
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    final String html = new String(result, "utf-8");
    final List<AlbumInfo> albums = new ArrayList<>();

    for (Element albumNode : Jsoup.parse(html).select("div.album")) {
        final AlbumInfo info = new AlbumInfo();

        final Elements names = albumNode.select("span.name");
        if (!names.isEmpty()) {
            info.setTitle(names.first().text());
        }

        final Elements links = albumNode.select(".pic_box a");
        info.setAlbumUrl(links.attr("href"));

        final Elements pictures = links.select("img");
        if (!pictures.isEmpty()) {
            info.setPicUrl(pictures.first().attr("src"));
        }

        albums.add(info);
    }

    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, albums);
    return resultMap;
}
/**
 * Swaps each stylesheet link element for a style element holding the
 * content of the linked file, in place, so that the original inclusion
 * order of the styles is kept.
 *
 * <p>A link is left untouched when inline mode {@code STYLE_ATTR} is not
 * allowed for it, or when no entry for its {@code href} is found in
 * {@code cssContents}.
 *
 * @param doc
 *            the html document
 * @param cssContents
 *            the list of external css files with their content
 */
private static void internStyles(Document doc, List<ExternalCss> cssContents) {
    for (Element link : doc.select(CSS_LINKS_SELECTOR)) {
        if (!isInlineModeAllowed(link, InlineModes.STYLE_ATTR)) {
            continue;
        }

        final ExternalCss css = getCss(cssContents, link.attr(HREF_ATTR));
        if (css == null) {
            continue;
        }

        final Element inlineStyle = new Element(Tag.valueOf(STYLE_TAG), "");
        inlineStyle.appendChild(new DataNode(getCssContent(css)));
        link.replaceWith(inlineStyle);
    }
}
/**
 * Queries the server for the number of books in the current category,
 * including the books of all of its sub-categories.
 *
 * <p>The count is read from the {@code value} attribute of the first
 * {@code input[name=totalnumber]} element of the response page; when the
 * page has no such element, {@code 0} is returned.
 *
 * @return the number of books in the current category
 * @throws IOException if the query fails
 */
public int queryBooksSize() throws IOException {
    checkCookie();

    final String requestUrl = NJULib.baseUrl + "/markbook/classifybookview.jsp";
    final String postData = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1";
    final String html = MyHttpRequest.postWithCookie(postData, requestUrl, null, cookie, "UTF-8", "GBK", 1000);

    final Elements totalInputs = Jsoup.parse(html).select("input[name=totalnumber]");
    if (totalInputs.isEmpty()) {
        return 0;
    }
    return Integer.parseInt(totalInputs.first().attr("value"));
}
/**
 * Extracts the JSON stored in the {@code data-page-data} attribute of the
 * document's single {@code main} element and returns it pretty-printed.
 *
 * @param document the document to read from
 * @return the pretty-printed, trimmed JSON array
 * @throws IllegalStateException if no {@code main[data-page-data]} element exists
 */
private static String extractPageData(Document document) {
    final Elements mains = document.select("main[data-page-data]");
    checkState(!mains.isEmpty(), "Main element not found in %s", document);

    // getOnlyElement also rejects documents with more than one match.
    final Element main = Iterables.getOnlyElement(mains);
    final String rawJson = main.attributes().dataset().get("page-data");

    final Gson prettyGson = new GsonBuilder().setPrettyPrinting().create();
    final JsonArray parsed = prettyGson.fromJson(rawJson, JsonArray.class);
    return prettyGson.toJson(parsed).trim();
}
/**
 * Verifies that the manipulator produces no {@code h2} elements for a
 * fragment whose paragraphs are expected not to become headings.
 */
@Test
public void testNoneHeading() {
    final String fragment =
        "<p><b>This is a group heading:</b></p><p>This is not a group heading</p><p>This is not a group heading.</p>";
    final Document document = Jsoup.parseBodyFragment(fragment);

    manipulator.manipulate(document);

    assertEquals(0, document.select("h2").size());
}
/**
 * Verifies the {@code :eq(n)} pseudo selector, both on the final element of
 * a descendant selector and on an ancestor within it.
 */
@Test
public void testPseudoEquals() {
    final Document parsed = Jsoup.parse("<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>");

    // First paragraph of every div.
    final Elements firstOfEachDiv = parsed.select("div p:eq(0)");
    assertEquals(2, firstOfEachDiv.size());
    assertEquals("One", firstOfEachDiv.get(0).text());
    assertEquals("Four", firstOfEachDiv.get(1).text());

    // First paragraph of the first div only.
    final Elements firstOfFirstDiv = parsed.select("div:eq(0) p:eq(0)");
    assertEquals(1, firstOfFirstDiv.size());
    final Element onlyMatch = firstOfFirstDiv.get(0);
    assertEquals("One", onlyMatch.text());
    assertEquals("p", onlyMatch.tagName());
}
/**
 * Derives full-size image URLs from the gallery thumbnails on a page.
 *
 * <p>For every {@code #galleryImages .inner-block img} element, the
 * {@code src} has each {@code "thumbs/"} removed and a leading
 * {@code https://} downgraded to {@code http://}.
 *
 * @param page the gallery page
 * @return the image URLs in document order
 */
@Override
protected List<String> getURLsFromPage(Document page) {
    final Elements thumbnails = page.select("#galleryImages .inner-block img");
    final List<String> imageUrls = new ArrayList<>();
    for (Element thumbnail : thumbnails) {
        final String fullSizeUrl = thumbnail.attr("src").replace("thumbs/", "");
        // use HTTP instead of HTTPS (less headaches)
        final String httpUrl = fullSizeUrl.replaceFirst("https://", "http://");
        imageUrls.add(httpUrl);
    }
    return imageUrls;
}
/**
 * Resolves the page that follows {@code page}, starting from the
 * superclass result and overriding it depending on the URL type.
 *
 * <ul>
 * <li>{@code LIST}: if the page has a {@code .loadmoreitems} element, the
 * remaining items are requested by POSTing the list id and the last
 * element's {@code data-offset} to {@code postUrl}.</li>
 * <li>{@code FOLDER}: if the last {@code .pages a} link's text starts with
 * "Next", that link's absolute URL is fetched.</li>
 * <li>Any other type: the superclass result is returned unchanged.</li>
 * </ul>
 *
 * @param page the current page
 * @return the next page
 * @throws IOException if the superclass call or a follow-up request fails
 */
@Override
public Document getNextPage(Document page) throws IOException {
    Document nextPage = super.getNextPage(page);

    switch (urlType) {
        case LIST: {
            final Elements loadMore = page.select(".loadmoreitems");
            if (loadMore.isEmpty()) {
                break;
            }
            // Not every item is on the page yet; fetch the rest through postUrl.
            final String offSet = loadMore.last().attr("data-offset");
            final Map<String, String> postParams = new HashMap<>();
            postParams.put("listid", listId);
            postParams.put("offset", offSet);
            try {
                nextPage = Http.url(postUrl).data(postParams).retries(3).post();
            } catch (IOException e1) {
                LOGGER.error("Failed to load more images after " + offSet, e1);
                throw e1;
            }
            break;
        }
        case FOLDER: {
            final Elements pageLinks = page.select(".pages a");
            final boolean hasNextLink = !pageLinks.isEmpty() && pageLinks.last().text().startsWith("Next");
            if (hasNextLink) {
                final String nextUrl = pageLinks.last().attr("abs:href");
                nextPage = Http.url(nextUrl).retries(3).get();
            }
            break;
        }
        case UNKNOWN:
        default:
            break;
    }

    return nextPage;
}
/**
 * Looks up {@code key} on the dictionary site and builds one
 * {@code Definition} per {@code ul.slides > li} element of the result page.
 *
 * <p>For every slide the audio and image sources, the channel, the English
 * and Chinese sentences and the detail link are extracted; {@code <em>}
 * tags in the sentences are rewritten to {@code <b>}. Audio and image file
 * names are derived from the key plus a random hex suffix. A slide with no
 * audio, image or detail link yields an empty value for that part instead
 * of aborting the whole lookup.
 *
 * @param key the word to look up; appended to {@code wordUrl} as-is
 * @return the definitions found, or an empty list if the request fails
 *         with an {@link IOException}
 */
public List<Definition> wordLookup(String key) {
    try {
        // Fetched through the shared OkHttp client with the app's User-Agent.
        Request request = new Request.Builder().url(wordUrl + key)
                .addHeader("User-Agent", Constant.UA)
                .build();
        String rawhtml = MyApplication.getOkHttpClient().newCall(request).execute().body().string();
        Document doc = Jsoup.parse(rawhtml);

        List<Definition> definitionList = new ArrayList<>();
        for (Element audioEle : doc.select("ul.slides > li")) {
            HashMap<String, String> eleMap = new HashMap<>();

            String audioUrl = "";
            Elements audioElements = audioEle.select("audio");
            if (audioElements.size() > 0) {
                audioUrl = audioElements.get(0).attr("src");
            }
            String audioName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".mp3";

            String imageUrl = "";
            Elements imageElements = audioEle.select("img");
            if (imageElements.size() > 0) {
                imageUrl = imageElements.get(0).attr("src");
            }
            String imageName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".png";

            // NOTE(review): the boolean passed to getSingleQueryResult presumably
            // selects HTML vs. plain text output; confirm against the helper.
            String channel = getSingleQueryResult(audioEle, "div.mTop", false).trim();
            String en = getSingleQueryResult(audioEle, "div.mBottom", true)
                    .replaceAll("<em>", "<b>")
                    .replaceAll("</em>", "</b>");
            String cn = getSingleQueryResult(audioEle, "div.mFoot", true)
                    .replaceAll("<em>", "<b>")
                    .replaceAll("</em>", "</b>");
            String context = getSingleQueryResult(audioEle, "div.mTextend", true);

            // Guarded like audio/image above: a slide without a detail link must
            // not throw IndexOutOfBoundsException, which the catch below would miss.
            String detailPath = "";
            Elements detailElements = audioEle.select("a.viewdetail");
            if (detailElements.size() > 0) {
                detailPath = detailElements.get(0).attr("href");
            }
            String detailUrl = "http://www.91dict.com" + detailPath;

            String audioTag = String.format("[sound:%s]", Constant.AUDIO_SUB_DIRECTORY + File.separator + audioName);
            String html = String.format(tplt_card,
                    en,
                    audioTag,
                    cn,
                    "<font color=grey>" + channel + "</font>",
                    Constant.IMAGE_SUB_DIRECTORY + File.separator + imageName,
                    detailUrl
            );
            String html_ui = String.format(tplt_ui,
                    en,
                    cn,
                    "<font color=grey>" + channel + "</font>"
            );

            eleMap.put(EXP_ELE[0], key);
            eleMap.put(EXP_ELE[1], html);
            definitionList.add(new Definition(eleMap, html_ui, imageUrl, imageName, audioUrl, audioName));
        }
        return definitionList;
    } catch (IOException ioe) {
        // Network failures are reported to the caller as "no results".
        return new ArrayList<>();
    }
}
/**
 * Parses a private-message detail page into a list of message items.
 *
 * <p>The current user's uid and name are read from the
 * {@code #umenu cite a.noborder} link. Each {@code li.s_clear} element then
 * yields one item carrying author, uid and avatar URL (when an
 * {@code a.avatar} is present), time, HTML summary and a "new" flag.
 * Entries lacking a {@code p.cite}, an author {@code cite} or a
 * {@code div.summary} are skipped.
 *
 * @param doc the page to parse; may be {@code null}
 * @return the parsed list, or {@code null} when {@code doc} is {@code null},
 *         the user link is missing, or the page has no message entries
 */
private static SimpleListBean parseSmsDetail(Document doc) {
    if (doc == null) {
        return null;
    }

    // Current user's uid and username.
    final Elements userLinks = doc.select("#umenu cite a.noborder");
    if (userLinks.isEmpty()) {
        return null;
    }
    final Element userLink = userLinks.first();
    final String mySpaceUrl = Utils.nullToText(userLink.attr("href"));
    final String myUid = Utils.getMiddleString(mySpaceUrl, "uid=", "&");
    final String myUsername = userLink.text();

    final Elements messages = doc.select("li.s_clear");
    if (messages.isEmpty()) {
        return null;
    }

    final SimpleListBean result = new SimpleListBean();
    for (Element message : messages) {
        final SimpleListItemBean item = new SimpleListItemBean();

        // Author
        final Elements citeParagraphs = message.select("p.cite");
        if (citeParagraphs.isEmpty()) {
            continue;
        }
        final Element citeParagraph = citeParagraphs.first();
        final Elements authorCites = citeParagraph.select("cite");
        if (authorCites.isEmpty()) {
            continue;
        }
        item.setAuthor(authorCites.first().text());

        // Uid and avatar: own messages reuse the uid taken from the user menu.
        final Elements avatars = message.select("a.avatar");
        if (!avatars.isEmpty()) {
            if (item.getAuthor().equals(myUsername)) {
                item.setUid(myUid);
            } else {
                final String spaceUrl = Utils.nullToText(avatars.first().attr("href"));
                item.setUid(Utils.getMiddleString(spaceUrl, "uid=", "&"));
            }
            item.setAvatarUrl(HiUtils.getAvatarUrlByUid(item.getUid()));
        }

        // Time
        item.setTime(citeParagraph.ownText());

        // Message body
        final Elements summaries = message.select("div.summary");
        if (summaries.isEmpty()) {
            continue;
        }
        item.setInfo(summaries.first().html());

        // "New message" marker image
        final Elements markers = citeParagraph.select("img");
        if (!markers.isEmpty() && markers.first().attr("src").contains(HiUtils.NewPMImage)) {
            item.setNew(true);
        }

        result.add(item);
    }
    return result;
}
/**
 * Verifies that an attribute regex selector ({@code [class~=x|y]}) works
 * as the last part of a descendant selector.
 */
@Test
public void testByAttributeRegexCombined() {
    final Document parsed = Jsoup.parse("<div><table class=x><td>Hello</td></table></div>");

    final Elements tables = parsed.select("div table[class~=x|y]");

    assertEquals(1, tables.size());
    assertEquals("Hello", tables.text());
}
/**
 * Verifies that {@code :gt(n)} can be combined with a class-qualified
 * ancestor selector.
 */
@Test
public void testPseudoCombined() {
    final Document parsed = Jsoup.parse("<div class='foo'><p>One</p><p>Two</p></div><div><p>Three</p><p>Four</p></div>");

    final Elements laterParagraphs = parsed.select("div.foo p:gt(0)");

    assertEquals(1, laterParagraphs.size());
    assertEquals("Two", laterParagraphs.get(0).text());
}