下面列出了 org.jsoup.nodes.Document#getElementsByClass() 的实例代码，您可以点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Downloads the page at {@code url}, turns every element carrying the CSS class
 * {@code villagetr} into an {@code Area} and stores the resulting list as the
 * children of {@code countyArea}.
 *
 * @param url        page to fetch and parse; also recorded as the source of each area
 * @param countyArea parent area; its full name prefixes each parsed name and its
 *                   children are replaced by the parsed list
 */
public static void parseVillagetr(String url, Area countyArea) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));
    List<Area> villages = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("villagetr")) {
        Elements cells = row.getElementsByTag("td");
        // Only rows with exactly three cells are used: cell 0 supplies the code, cell 2 the name.
        boolean usable = cells != null && cells.size() == 3;
        if (usable) {
            String code = cells.get(0).text();
            String name = cells.get(2).text();
            Area village = Area.builder()
                    .code(code)
                    .label(name)
                    .source(url)
                    .sortValue(order++)
                    .level(new RemoteData<>("VILLAGETR"))
                    .fullName(countyArea.getFullName() + name)
                    .build();
            StaticLog.info(" 村级数据: {} ", village);
            villages.add(village);
        }
    }
    countyArea.setChildren(villages);
}
/**
 * Downloads the page at {@code url} and builds one {@code Area} per element with the
 * CSS class {@code citytr}. For every area the children are filled by calling
 * {@code parseCounty} with the URL taken from the row's first link.
 *
 * @param provinceName name prepended to each city name to form its full name
 * @param url          page to fetch and parse; also recorded as the source of each area
 * @return the parsed areas in document order (possibly empty)
 */
private List<Area> parseCity(String provinceName, String url) {
    String htmlStr = HttpUtil.get(url, CHARSET);
    Document document = Jsoup.parse(htmlStr);
    Elements trs = document.getElementsByClass("citytr");
    List<Area> cities = new LinkedList<Area>();
    int sort = 1;
    for (Element tr : trs) {
        Elements links = tr.getElementsByTag("a");
        // A row needs two anchors (code link and name link); skip anything else
        // instead of failing with IndexOutOfBoundsException on links.get(...).
        if (links == null || links.size() < 2) {
            continue;
        }
        String href = links.get(0).attr("href");
        String cityCode = links.get(0).text();
        String cityName = links.get(1).text();
        Area cityArea = Area.builder()
                .label(cityName).code(cityCode).source(url).sortValue(sort++)
                .level(new RemoteData<>("CITY"))
                .fullName(provinceName + cityName)
                .build();
        cityArea.setChildren(parseCounty(provinceName + cityName, COMMON_URL + href));
        StaticLog.info(" 市级数据: {} ", cityArea);
        cities.add(cityArea);
    }
    return cities;
}
/**
 * Fetches {@code devicesTypeUrl + "all"} through {@code client} and converts every
 * {@code li} found under elements with the CSS class {@code word-list} into a
 * {@code DevCatalog} of type {@code DevCatalog.DEVICE_BRAND}.
 *
 * @param client         HTTP client used to perform the GET request
 * @param devicesTypeUrl URL prefix; the literal {@code "all"} is appended to it
 * @return one catalog entry per list item, in document order
 * @throws Throwable whatever the HTTP client or the parsing raises
 */
public static ArrayList<DevCatalog> parseBrands(IHttpClient client, String devicesTypeUrl) throws Throwable {
    String html = client.performGet(devicesTypeUrl + "all").getResponseBody();
    Document page = Jsoup.parse(html);
    ArrayList<DevCatalog> brands = new ArrayList<>();
    Elements items = page.getElementsByClass("word-list").select("li");
    for (Element item : items) {
        // The link comes from the item's anchor(s); the name is the item's full text.
        DevCatalog brand = new DevCatalog(item.getElementsByTag("a").attr("href"), item.text());
        brand.setType(DevCatalog.DEVICE_BRAND);
        brands.add(brand);
    }
    return brands;
}
/**
 * Downloads the acg17.com news category page, converts every element with the CSS
 * class {@code item-list} into an {@code ACGNew} and hands the list to
 * {@code process(list, "class", "entry")}.
 *
 * @throws IOException          if sending the HTTP request fails
 * @throws InterruptedException if the HTTP call is interrupted
 */
private void pullACG17News() throws IOException, InterruptedException {
    HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("http://acg17.com/category/news/")).GET().build();
    String body = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body();
    Document doc = Jsoup.parse(body);
    Elements elements = doc.getElementsByClass("item-list");
    List<ACGNew> acgNewList = elements.stream().map(e -> {
        // NOTE(review): the class string contains spaces; confirm it matches the intended elements.
        String style = e.getElementsByClass("attachment-tie-medium size-tie-medium wp-post-image").get(0).attr("style");
        // Extract the text between "url(" and the first ")" that FOLLOWS it. Searching for ")"
        // from the start of the string could pick up an earlier parenthesis, and a missing
        // "url(" used to yield a garbage substring or StringIndexOutOfBoundsException.
        String cover = "";
        int urlStart = style.indexOf("url(");
        if (urlStart >= 0) {
            urlStart += 4;
            int urlEnd = style.indexOf(")", urlStart);
            if (urlEnd >= 0) {
                cover = style.substring(urlStart, urlEnd);
            }
        }
        Element t = e.getElementsByClass("post-box-title").get(0).child(0);
        // The date text uses 年/月/日 markers; rewrite it into ISO form (yyyy-MM-dd) for LocalDate.parse.
        LocalDate createDate = LocalDate.parse(e.getElementsByClass("tie-date").get(0).text().replaceAll("[年月]", "-").replace("日", ""));
        String intro = e.getElementsByClass("entry").get(0).child(0).text();
        String title = t.text();
        String refererUrl = t.attr("href");
        return new ACGNew(title, intro, NewsCrawlerConstant.ACG17, cover, refererUrl, createDate, NewsCrawlerConstant.ACG17);
    }).collect(Collectors.toList());
    process(acgNewList, "class", "entry");
}
/**
 * Walks the bangumi.tv anime browser pages {@code 0 .. pageNum - 1} and collects the
 * numeric id found in the {@code href} of every matching cover element.
 *
 * @param pageNum number of pages to request
 * @return the ids in the order they were encountered
 * @throws IOException          if sending an HTTP request fails
 * @throws InterruptedException if an HTTP call is interrupted
 */
private List<Integer> querySubjectId(Integer pageNum) throws IOException, InterruptedException {
    List<Integer> idList = new ArrayList<>(24);
    for (int currentIndex = 0; currentIndex < pageNum; currentIndex++) {
        System.out.println("开始爬取第" + currentIndex + "页");
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("https://bangumi.tv/anime/browser/?sort=date&page=" + currentIndex)).GET().build();
        String body = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body();
        // Extract the cover links with jsoup.
        Document doc = Jsoup.parse(body);
        // NOTE(review): the class string contains spaces; confirm it matches the intended elements.
        Elements elements = doc.getElementsByClass("subjectCover cover ll");
        elements.forEach(e -> {
            // Keep only the digits of the href. The previous code appended "\n" before
            // Integer.parseInt, which always throws NumberFormatException.
            String digits = e.attr("href").replaceAll("\\D", "");
            if (!digits.isEmpty()) {
                idList.add(Integer.parseInt(digits));
            }
        });
    }
    return idList;
}
/**
 * Returns the first element of {@code doc} that carries the given CSS class.
 *
 * @param doc       document to search
 * @param className class name passed to {@code getElementsByClass}
 * @return the first match, or {@code null} when there is none
 */
@Nullable
public static Element getElementByClass(Document doc, String className) {
    Elements matches = doc.getElementsByClass(className);
    if (matches == null || matches.size() <= 0) {
        return null;
    }
    return matches.get(0);
}
/**
 * Downloads the page at {@code url}, builds an {@code Area} for every element with an
 * {@code href} attribute found under elements of CSS class {@code provincetr}, calls
 * {@code parseCity} for each of them and finally logs the whole list as pretty JSON.
 *
 * @param url page to fetch and parse; also recorded as the source of each area
 */
public static void parseProvince(String url) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));
    List<Area> provinces = new LinkedList<Area>();
    int order = 1;
    // Rows are the elements whose class is 'provincetr'.
    for (Element row : page.getElementsByClass("provincetr")) {
        // Every element below the row that has an href attribute is one entry.
        for (Element anchor : row.getElementsByAttribute("href")) {
            String name = anchor.text();
            String target = anchor.attr("href");
            // The code is the first two characters of the link target.
            String code = target.substring(0, 2);
            StaticLog.info("provinceName: {} , provinceCode: {} .", name, code);
            Area province = Area.builder()
                    .code(code)
                    .label(name)
                    .source(url)
                    .sortValue(order++)
                    .fullName(name)
                    .level(new RemoteData<>("PROVINCE"))
                    .build();
            StaticLog.info("省级数据: {} ", province);
            parseCity(COMMON_URL + target, province);
            provinces.add(province);
        }
    }
    StaticLog.info(JSONUtil.toJsonPrettyStr(provinces));
}
/**
 * Downloads the page at {@code url}, builds an {@code Area} for every element with the
 * CSS class {@code towntr}, descends into each one via {@code parseVillagetr} and stores
 * the resulting list as the children of {@code countyArea}.
 *
 * @param url        page to fetch and parse; also recorded as the source of each area
 * @param countyArea parent area; its full name prefixes each parsed name and its
 *                   children are replaced by the parsed list
 */
public static void parseTowntr(String url, Area countyArea) {
    String htmlStr = HttpUtil.get(url, CHARSET);
    Document document = Jsoup.parse(htmlStr);
    Elements trs = document.getElementsByClass("towntr");
    List<Area> towns = new LinkedList<Area>();
    int sort = 1;
    for (Element tr : trs) {
        Elements links = tr.getElementsByTag("a");
        // Only rows with exactly two anchors (code link and name link) are used.
        if (links == null || links.size() != 2) {
            continue;
        }
        String href = links.get(0).attr("href");
        // The code is the first nine characters of the first link's text.
        String towntrCode = links.get(0).text().substring(0, 9);
        String towntrName = links.get(1).text();
        Area towntrArea = Area.builder().label(towntrName).code(towntrCode).source(url)
                .sortValue(sort++).level(new RemoteData<>("TOWNTR")).fullName(countyArea.getFullName() + towntrName)
                .build();
        StaticLog.info(" 乡镇级数据: {} ", towntrArea);
        // The child page URL is assembled from slices of the href.
        // Pass the town itself as the parent: handing over countyArea attached the parsed
        // rows to the county, where the setChildren call below overwrote them again.
        parseVillagetr(COMMON_URL + href.subSequence(2, 5).toString() + "/" + href.substring(5, 7) + "/" + href,
                towntrArea);
        towns.add(towntrArea);
    }
    countyArea.setChildren(towns);
}
/**
 * Looks up the first element with CSS class {@code "interwiki-" + language} and returns
 * the {@code href} attribute of that element's first child.
 *
 * @param document document to search
 * @param language suffix appended to {@code "interwiki-"} to form the class name
 * @return the {@code href} value, or {@code null} when no element has that class
 */
private String retrieveLinkInLanguage(Document document, String language) {
    Elements matches = document.getElementsByClass("interwiki-" + language);
    boolean found = matches != null && !matches.isEmpty();
    return found ? matches.first().child(0).attr("href") : null;
}
/**
 * Collects vacancies for {@code searchString} page by page, starting at page 0, until a
 * page contains no element with the CSS class {@code job} or a page cannot be loaded.
 *
 * @param searchString search text forwarded to {@code getDocument}
 * @return the vacancies gathered so far (possibly empty), never {@code null}
 */
@Override
public List<Vacancy> getVacancies(String searchString)
{
    List<Vacancy> result = new ArrayList<>();
    for (int pageNum = 0; ; pageNum++)
    {
        Document doc;
        try {
            doc = getDocument(searchString, pageNum);
        } catch (IOException e) {
            e.printStackTrace();
            // Stop here: continuing would either dereference a null document or
            // re-process the previous page forever.
            break;
        }
        if (doc == null) break;
        Elements jobs = doc.getElementsByClass("job");
        if (jobs.isEmpty()) break;
        for (Element job : jobs)
        {
            Vacancy vac = new Vacancy();
            vac.setTitle(job.getElementsByAttributeValue("class", "title").text());
            vac.setCompanyName(job.getElementsByAttributeValue("class", "company_name").text());
            vac.setSiteName(URL_FORMAT);
            vac.setUrl("https://moikrug.ru" + job.select("a[class=job_icon]").attr("href"));
            // text() already yields "" when nothing matches, so no extra emptiness check is needed.
            vac.setSalary(job.getElementsByAttributeValue("class", "salary").text());
            vac.setCity(job.getElementsByAttributeValue("class", "location").text());
            result.add(vac);
        }
    }
    return result;
}
/**
 * Parses {@code content} as HTML and reads the {@code tt-videoid} and {@code tt-poster}
 * attributes of the first element with the CSS class {@code tt-video-box}.
 *
 * @param content HTML text to parse
 * @return a map holding {@code "id"} and/or {@code "imageUrl"} for the attributes that
 *         are non-empty; empty when no such element exists
 */
private Map<String, String> parseJson(String content) {
    Map<String, String> map = new HashMap<>();
    Document doc = Jsoup.parse(content);
    Elements elements = doc.getElementsByClass("tt-video-box");
    // Without a video box there is nothing to extract; return the empty map rather than
    // failing with IndexOutOfBoundsException on elements.get(0).
    if (elements.isEmpty()) {
        return map;
    }
    String id = elements.get(0).attr("tt-videoid");
    String imageUrl = elements.get(0).attr("tt-poster");
    if (!TextUtils.isEmpty(id)) {
        map.put("id", id);
    }
    if (!TextUtils.isEmpty(imageUrl)) {
        map.put("imageUrl", imageUrl);
    }
    return map;
}
/**
 * Finds the first element in {@code doc} with the given CSS class.
 *
 * @param doc       document to search
 * @param className class name handed to {@code getElementsByClass}
 * @return the first matching element, or {@code null} if nothing matches
 */
@Nullable
public static Element getElementByClass(Document doc, String className) {
    Elements found = doc.getElementsByClass(className);
    boolean hasMatch = found != null && found.size() > 0;
    return hasMatch ? found.get(0) : null;
}
/**
 * Fetches the search page obtained by substituting {@code keyword} into {@code baseURL},
 * then requests the link of every {@code result} element whose {@code .c-showurl} text
 * contains {@code www.wenzhihuai.com} and logs that link.
 *
 * @param keyword text substituted for the literal {@code keyword} placeholder in {@code baseURL}
 * @throws Exception whatever the HTTP helper raises
 */
public static void baidu(String keyword) throws Exception {
    // Use the literal replace(): replaceAll() treats '$' and '\' in the replacement as
    // group references/escapes and would throw or mangle keywords containing them.
    String content = HttpHelper.getInstance().get(baseURL.replace("keyword", keyword));
    Document jsoup = Jsoup.parse(content);
    Elements elements = jsoup.getElementsByClass("result");
    for (Element element : elements) {
        String str = element.select(".c-showurl").text();
        if (str.contains("www.wenzhihuai.com")) {
            String wenzhihuai = element.select(".t").select("a").attr("href");
            HttpHelper.getInstance().get(wenzhihuai);
            logger.info("百度->温志怀URL:" + wenzhihuai);
        }
    }
}
/**
 * Interactive loop: prints a prompt, then repeatedly reads a line via {@code getInput()}
 * until it equals {@code "exit"}. Each line has one leading {@code '/'} stripped, is
 * prefixed with {@code file:///}, formatted into {@code readFilePayload} and passed to
 * {@code attack}. When the returned document has text in its {@code wiki-content}
 * elements, their HTML is printed HTML-escaped between two separator lines.
 *
 * @return always the empty string
 */
@Override
public Object start() {
    println("请输入要读取的文件 如/etc/passwd,输入exit退出");
    for (String input = getInput(); !input.equals("exit"); input = getInput()) {
        String path = input.startsWith("/") ? input.substring(1) : input;
        Document response = attack(String.format(readFilePayload, "file:///" + path));
        if (response == null) {
            continue;
        }
        Elements content = response.getElementsByClass("wiki-content");
        if (content == null || !content.hasText()) {
            continue;
        }
        String html = content.html();
        println("=========================");
        sendColorMsg(Message.RED(HtmlUtils.htmlEscape(html)));
        println("=========================");
    }
    return "";
}
/**
 * Town-level data: downloads the page at {@code url} and builds one {@code Area} per
 * element with the CSS class {@code towntr}. Child pages are not fetched by this method.
 *
 * @param fullName prefix prepended to each parsed name to form its full name
 * @param url      page to fetch and parse; also recorded as the source of each area
 * @return the parsed areas in document order (possibly empty)
 */
public List<Area> parseTowntr(String fullName, String url) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));
    List<Area> towns = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("towntr")) {
        Elements anchors = row.getElementsByTag("a");
        // Only rows with exactly two anchors are used: the first holds the code, the second the name.
        boolean usable = anchors != null && anchors.size() == 2;
        if (usable) {
            String code = anchors.get(0).text();
            String name = anchors.get(1).text();
            Area town = Area.builder()
                    .label(name)
                    .code(code)
                    .source(url)
                    .fullName(fullName + name)
                    .level(new RemoteData<>("TOWNTR"))
                    .sortValue(order++)
                    .build();
            StaticLog.info(" 乡镇级数据: {} ", town);
            towns.add(town);
        }
    }
    return towns;
}
/**
 * Reads pages {@code 1 .. getInt(MAX_PAGE)} of the deck search for the configured format
 * and converts every element with the CSS class {@code gradeA} into a
 * {@code RetrievableDeck}.
 *
 * @return the decks in the order they were encountered
 * @throws IOException declared by the overridden method; may come from reading a page
 */
@Override
public List<RetrievableDeck> getDeckList() throws IOException {
    String url = getString(URL) + "/magic/deck/search?format=" + getString(FORMAT);
    logger.debug("get List deck at " + url);

    // Marker text looked for in the first cell's HTML, paired with the symbol it adds.
    final String[][] manaMarkers = {
            {"white-mana", "{W}"},
            {"blue-mana", "{U}"},
            {"black-mana", "{B}"},
            {"red-mana", "{R}"},
            {"green-mana", "{G}"},
    };

    List<RetrievableDeck> list = new ArrayList<>();
    int maxPage = getInt(MAX_PAGE);
    int page = 1;
    while (page <= maxPage) {
        url = getString(URL) + "/magic/deck/search?format=" + getString(FORMAT) + "&page=" + page;
        Document d = Jsoup.parse(IncapsulaParser.readUrl(url));
        for (Element row : d.getElementsByClass("gradeA")) {
            RetrievableDeck deck = new RetrievableDeck();
            List<Element> cells = row.getElementsByTag(MTGConstants.HTML_TAG_TD);

            // Cell 0: colours, detected from the cell's HTML.
            String manaHtml = cells.get(0).toString();
            StringBuilder mana = new StringBuilder();
            for (String[] marker : manaMarkers) {
                if (manaHtml.contains(marker[0])) {
                    mana.append(marker[1]);
                }
            }

            // Cell 1: name and link; cell 2: author; cell 3: description.
            Element nameCell = cells.get(1);
            String deckName = nameCell.text();
            String link = getString(URL) + nameCell.getElementsByTag("a").attr("href");
            String deckPlayer = cells.get(2).text();
            String deckDesc = cells.get(3).text();

            deck.setColor(mana.toString());
            deck.setAuthor(deckPlayer);
            deck.setName(deckName);
            deck.setDescription(deckDesc);
            try {
                deck.setUrl(new URI(link));
            } catch (URISyntaxException e) {
                // An unparsable link leaves the deck without a URL.
                deck.setUrl(null);
            }
            list.add(deck);
        }
        page++;
    }
    return list;
}
/**
 * Parses {@code html} and returns all elements carrying the given CSS class.
 *
 * @param html     HTML text to parse
 * @param cssClass class name passed to {@code Document.getElementsByClass}
 * @return the matching elements
 */
protected Elements getElementsByClass(String html, String cssClass) {
    return Jsoup.parse(html).getElementsByClass(cssClass);
}
/**
 * Parses {@code html} and returns the combined text of all elements with the CSS
 * class {@code msg}.
 *
 * @param html HTML text to parse
 * @return the combined text of the matching elements
 */
public static String getErrorMessage(String html) {
    Elements messages = Jsoup.parse(html).getElementsByClass("msg");
    return messages.text();
}
/**
 * Reads {@code index.html} from the working directory, walks its {@code comment_item}
 * elements from last to first, prints one tab-separated line per comment and saves each
 * one through {@code News_pinglunDaoImpl}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    File in = new File("index.html");
    News_pinglunDaoImpl ndi = new News_pinglunDaoImpl();
    try {
        Elements e1 = Jsoup.parse(in, "UTF-8", "").getElementsByClass("comment_item");
        for (int i = e1.size(); i-- > 0; ) {
            // NOTE(review): the pattern below is mojibake (replacement characters); the
            // intended text was lost to an encoding error and must be restored from the original file.
            String ptime = e1.get(i).getElementsByClass("ptime").text().replaceAll("����", "");
            String username = e1.get(i).getElementsByClass("username").text();
            String avatar = e1.get(i).getElementsByTag("img").attr("src");
            String pcontent = e1.get(i).getElementsByClass("comment_body").text();

            System.out.println(ptime + "\t" + username + "\t" + avatar + "\t" + pcontent);

            int id = 30; // hard-coded id (the original comment was garbled by an encoding error)
            String user = username + ";" + avatar;
            String plocation = "";
            String zan = "0";
            News_pinglun news = new News_pinglun(id, user, plocation, ptime, pcontent, zan);
            if (ndi.save(news)) {
                // nothing further to do when the save succeeds
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Loads the page at {@code url} and assembles a {@code Character} from the text and
 * attributes of elements selected by their {@code ak-*} CSS classes.
 *
 * @param url page to load; also passed through to the resulting object
 * @param lg  language used to look up the labels stripped from the level and
 *            kolizeum texts
 * @return the assembled character; guild and alliance fields are {@code null} when the
 *         corresponding elements are absent
 * @throws IOException declared for loading the page via {@code JSoupManager.getDocument}
 */
public static Character getCharacter(String url, Language lg) throws IOException {
    Document page = JSoupManager.getDocument(url);

    // Both skin URLs are cut out between "https://" and the first ")": the big one from
    // the first look element's style attribute, the small one from the last look element's HTML.
    Elements looks = page.getElementsByClass("ak-entitylook");
    String bigStyle = looks.first().attr("style");
    String bigSkinURL = bigStyle.substring(bigStyle.indexOf("https://"), bigStyle.indexOf(")"));
    String lastLookHtml = looks.last().toString();
    String littleSkinURL = lastLookHtml.substring(lastLookHtml.indexOf("https://"), lastLookHtml.indexOf(")"));

    String pseudo = page.getElementsByClass("ak-return-link").first().text();
    String rawLevel = page.getElementsByClass("ak-directories-level").first().text();
    String level = rawLevel.replace(Translator.getLabel(lg, "whois.extract.level"), "").trim();
    String classe = page.getElementsByClass("ak-directories-breed").first().text();
    String server = page.getElementsByClass("ak-directories-server-name").first().text();
    String scoreText = page.getElementsByClass("ak-score-text").first().text();
    String progressText = page.getElementsByClass("ak-progress-bar-text").first().text();
    String score = scoreText + " (" + progressText + ")";

    // Optional: the alliance is only looked up when a guild element exists.
    String guildName = null;
    String guildUrl = null;
    String alliName = null;
    String alliUrl = null;
    Elements guildElems = page.getElementsByClass("ak-infos-guildname");
    if (!guildElems.isEmpty()) {
        Element guild = guildElems.first();
        guildName = guild.text();
        guildUrl = guild.select("a").attr("abs:href");
        Elements allianceElems = page.getElementsByClass("ak-infos-alliancename");
        if (!allianceElems.isEmpty()) {
            Element alliance = allianceElems.first();
            alliName = alliance.text();
            alliUrl = alliance.select("a").attr("abs:href");
        }
    }

    StringBuilder ladderXP = new StringBuilder();
    StringBuilder ladderKoli = new StringBuilder();
    StringBuilder ladderSuccess = new StringBuilder();
    Elements ladderTables = page.getElementsByClass("ak-container ak-table ak-responsivetable");
    if (!ladderTables.isEmpty()) {
        ladderXP.append(page.getElementsByClass("ak-total-xp").first().text()).append("\n");
        for (Element rating : page.getElementsByClass("ak-total-kolizeum")) {
            String ratingText = rating.text();
            // Entries whose text ends with "-1" are left out.
            if (!ratingText.endsWith("-1")) {
                ladderKoli.append(ratingText.replace(Translator.getLabel(lg, "whois.extract.koli"), "").trim())
                        .append("\n");
            }
        }
        Elements rows = ladderTables.first().getElementsByTag("tbody").first().getElementsByTag("tr");
        for (Element row : rows) {
            // The first cell is the row label; it is removed from the row so that the
            // remaining cells can be addressed as first / second / last.
            Element labelCell = row.getElementsByTag("td").first();
            String prefix = labelCell.text() + " : ";
            labelCell.remove();

            Elements cells = row.getElementsByTag("td");
            String xpRank = cells.first().text();
            if (!"-".equals(xpRank)) {
                ladderXP.append(prefix).append(EmojiManager.getEmojiForLadder(xpRank)).append("\n");
            }
            String koliRank = cells.get(1).text();
            if (!"-".equals(koliRank)) {
                ladderKoli.append(prefix).append(EmojiManager.getEmojiForLadder(koliRank)).append("\n");
            }
            String successRank = cells.last().text();
            if (!"-".equals(successRank)) {
                ladderSuccess.append(prefix).append(EmojiManager.getEmojiForLadder(successRank)).append("\n");
            }
        }
    }

    return new Character(pseudo, level, classe, server, score,
            guildName, guildUrl, alliName, alliUrl, littleSkinURL, bigSkinURL, url,
            ladderXP.toString(), ladderKoli.toString(), ladderSuccess.toString());
}