下面列出了 org.jsoup.nodes.Document#title() 的实例代码,或者点击链接到 GitHub 查看源代码,也可以在右侧发表评论。
/**
 * Parses an HTML page with jsoup and collects its title plus selected link texts.
 *
 * <p>The first entry of the result is always the document title. It is followed by the own
 * text of every {@code <a>} element whose own text is longer than 20 characters and that sits
 * inside a {@code <ul>} carrying both the {@code panel_body} and {@code itemlist} classes.
 *
 * @param html raw HTML markup to parse
 * @return the title followed by the matching link texts, in document order
 */
private List<String> parseData(String html) {
    Document document = Jsoup.parse(html);
    List<String> strings = new ArrayList<>();
    strings.add(document.title());
    for (Element list : document.getElementsByTag("ul")) {
        // Test the list being visited, not the whole collection: asking the collection would
        // let a single matching <ul> make every <ul> on the page qualify. Each class name is
        // checked separately because hasClass expects one class name, not a space-separated list.
        if (!list.hasClass("panel_body") || !list.hasClass("itemlist")) {
            continue;
        }
        for (Element anchor : list.getElementsByTag("a")) {
            String text = anchor.ownText();
            if (text.length() > 20) {
                strings.add(text);
            }
        }
    }
    return strings;
}
/**
 * Fetches the page at {@code url}, searches the data of every {@code <script>} element for a
 * {@code "source":"..."} entry and hands each URL found to {@code addURLToDownload}, then calls
 * {@code waitForThreads()}.
 *
 * @throws IOException if the page has no script elements, or if fetching or URL creation fails
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();
    Elements scripts = page.select("script");
    if (scripts.isEmpty()) {
        throw new IOException("Could not find script code at " + url);
    }
    // The page title doubles as a user friendly file name
    String pageTitle = page.title();
    // Only the first match in each script is used: the regex assumes the highest quality
    // source is listed first
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");
    for (Element scriptTag : scripts) {
        Matcher sourceMatcher = sourcePattern.matcher(scriptTag.data());
        if (!sourceMatcher.find()) {
            continue;
        }
        addURLToDownload(new URL(sourceMatcher.group(1)), HOST + "_" + pageTitle);
    }
    waitForThreads();
}
/**
 * Template showing how to pull common pieces of data out of a page with jsoup: the title, the
 * body text, and for every {@code a[href]} element its href, text, outer HTML and inner HTML.
 * The extracted values are only held in locals; nothing is returned or stored.
 *
 * @param href address of the page to fetch
 */
public void extractDataWithJsoup(String href) {
    Document page = null;
    try {
        page = Jsoup.connect(href)
                .timeout(10 * 1000)
                .userAgent("Mozilla")
                .ignoreHttpErrors(true)
                .get();
    } catch (IOException e) {
        // Your exception handling here
    }
    // Nothing to extract when the fetch failed
    if (page == null) {
        return;
    }
    String title = page.title();
    String text = page.body().text();
    for (Element anchor : page.select("a[href]")) {
        String linkHref = anchor.attr("href");
        String linkText = anchor.text();
        String linkOuterHtml = anchor.outerHtml();
        String linkInnerHtml = anchor.html();
    }
}
/**
 * Probes connectivity by running a Baidu search and checking that the title of the result page
 * contains the search term.
 *
 * @return {@code true} only when the page was fetched and its title contains the search term;
 *         {@code false} otherwise, including on any exception
 */
public static boolean isConnected() {
    try {
        // The current timestamp is appended as an extra query parameter on every call
        String probeUrl = "http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis();
        Document page = Jsoup.connect(probeUrl)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", "https://www.baidu.com")
                .header("Host", "www.baidu.com")
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true)
                .timeout(30000)
                .get();
        String pageTitle = page.title();
        LOGGER.info("搜索结果页面标题:" + pageTitle);
        return pageTitle != null && pageTitle.contains("杨尚川");
    } catch (Exception e) {
        // An unreachable network is an expected outcome and is not logged as an error
        if (!"Network is unreachable".equals(e.getMessage())) {
            LOGGER.error("状态检查失败:" + e.getMessage());
        }
        return false;
    }
}
/**
 * Fetches an HTML page and returns its title.
 *
 * @param url http URI String
 * @return the page title as reported by jsoup's {@code Document.title()}; the empty string
 *         when the page cannot be fetched or the URL is rejected
 */
private String scrapeURLHTMLTitle( String url )
{
    String title = "";
    try
    {
        Document doc = Jsoup.connect( url ).get();
        title = doc.title();
    }
    // IllegalArgumentException covers URLs that jsoup rejects before any I/O happens, so the
    // "empty on error" contract also holds for malformed or empty input
    catch ( IOException | IllegalArgumentException x )
    {
        System.err.format( "scrapeURLHTMLTitle error fetching %s: %s%n", url, x );
    }
    return title;
}
/**
 * Checks connectivity by fetching a Baidu search result page and looking for the search term
 * in the page title.
 *
 * @return {@code true} if the title of the fetched page contains the search term, otherwise
 *         {@code false} (also when the request throws)
 */
public static boolean isConnected() {
    boolean connected = false;
    try {
        Document resultPage = Jsoup.connect("http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis())
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", "https://www.baidu.com")
                .header("Host", "www.baidu.com")
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true)
                .timeout(30000)
                .get();
        String resultTitle = resultPage.title();
        LOGGER.info("搜索结果页面标题:" + resultTitle);
        if (resultTitle != null && resultTitle.contains("杨尚川")) {
            connected = true;
        }
    } catch (Exception e) {
        // Only failures other than an unreachable network are reported as errors
        boolean unreachable = "Network is unreachable".equals(e.getMessage());
        if (!unreachable) {
            LOGGER.error("状态检查失败:" + e.getMessage());
        }
    }
    return connected;
}
/**
 * Resolves the article title from the first source that provides one, in this order: the
 * backup-title attribute on the document head, the text of the first element matching the
 * title selector, and finally the document's own title.
 *
 * @param sourceDoc parsed article document
 * @return the resolved title
 */
private String getArticleTitle(Document sourceDoc) {
    final String backupTitleKey = WxCrawlerConstant.BackupArticle.ARTICLE_TITLE;
    if (sourceDoc.head() != null) {
        final String backupTitle = sourceDoc.head().attr(backupTitleKey);
        if (StringUtils.isNotEmpty(backupTitle)) {
            return backupTitle;
        }
    }
    if (sourceDoc.select(WxCrawlerConstant.HTMLElementSelector.TITLE).first() == null) {
        return sourceDoc.title();
    }
    return sourceDoc.select(WxCrawlerConstant.HTMLElementSelector.TITLE).first().text();
}
/**
 * Crawls the ss accounts from the target page.
 *
 * @return an entity built from the target URL and page title, flagged {@code true} and filled
 *         with the parsed set; on {@code IOException} an entity with an empty title flagged
 *         {@code false}
 */
public ShadowSocksEntity getShadowSocks() {
    ShadowSocksEntity result;
    try {
        Document page = getDocument();
        result = new ShadowSocksEntity(getTargetURL(), page.title(), true, new Date());
        result.setShadowSocksSet(parse(page));
    } catch (IOException e) {
        log.error(e.getMessage());
        // Fall back to an entity flagged false with an empty title
        result = new ShadowSocksEntity(getTargetURL(), "", false, new Date());
    }
    return result;
}
/**
 * Requests the invalid container endpoint expecting a 404 and verifies the returned HTML page:
 * its title, the text of the {@code h1:first-child} selection and the text of the links.
 */
@Test
public void whenMisSpeltContainersEndpoint_thenNotFoundResponseWithAPIMessage() throws Exception {
    HttpResponse response = getResponse(INVALID_CONTAINER_ENDPOINT, HttpStatus.SC_NOT_FOUND);
    Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()));

    String pageTitle = page.title();
    String headingText = page.select("h1:first-child").text();
    String linkText = page.select("a").text();

    assertNotNull(pageTitle);
    assertEquals("404 Not Found", pageTitle);
    assertEquals("404 Page Not Found", headingText);
    assertEquals("Go to Dashboard", linkText);
}
/**
 * Creates (or looks up) a topic named after the document title, gives it the
 * {@code vcalendar} type and then parses the document body into that topic.
 *
 * @param document parsed HTML document
 * @throws TopicMapException propagated from the topic helpers called here
 */
private void parseCalendar(Document document) throws TopicMapException {
    final String calendarTitle = document.title();
    final Topic calendarType = getType("vcalendar");
    final Topic calendarTopic = getOrCreateTopic(tm, null, calendarTitle);
    calendarTopic.addType(calendarType);
    // Delegates to the (topic, element) overload for the actual content
    parseCalendar(calendarTopic, document.body());
}
/**
 * Collects every {@code a[href]} link of a page as an {@code Entry}.
 *
 * <p>Each entry is built from the link's absolute URL and a label. The label is the link text,
 * or the raw {@code href} attribute when the text is blank.
 *
 * @param webpage parsed page to scan
 * @return one entry per link, in document order; empty when the page has no links
 * @throws Exception kept for signature compatibility with existing callers
 */
public static List<Entry> getAllLinks(Document webpage) throws Exception {
    Elements links = webpage.select("a[href]");
    List<Entry> urls = new ArrayList<>(links.size());
    for (Element link : links) {
        final String label = defaultIfBlank(link.text(), link.attr("href"));
        urls.add(new Entry(link.absUrl("href"), label));
    }
    return urls;
}
/**
 * Prints the document title followed by the text of the whole {@code body} selection.
 *
 * @param document parsed document to display
 */
public void displayBodyText(Document document) {
    out.println("Title: " + document.title());
    out.println("---Body---");
    String bodyText = document.select("body").text();
    out.println("Text: " + bodyText);
}
/**
 * Parses the fetched HTML and logs the thread name, page title and URL.
 * Any exception is caught and its stack trace printed.
 *
 * @param result     fetch result whose HTML is parsed
 * @param url        URL written to the log line
 * @param threadName thread name written to the log line
 * @param isUpdate   not read by this implementation
 */
@Override
public void parse(HttpFetchResult result, String url, String threadName, boolean isUpdate) {
    try {
        Document page = Jsoup.parse(result.getHtml());
        String pageTitle = page.title();
        logger.info(threadName + " " + pageTitle + " " + url + " ");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Posts a WAN connection command to the router's status handler and verifies the outcome.
 *
 * <p>The call succeeds only when the response page title is {@code "LAN | LAN Settings"} and
 * the connectivity probe agrees with the action: action {@code "3"} requires
 * {@code isConnected()} to be true, action {@code "4"} requires it to be false.
 * NOTE(review): "3" and "4" look like connect and disconnect. Confirm against the router
 * firmware.
 *
 * @param cookies session cookies sent with the request
 * @param action  value of the {@code action} form field
 * @return {@code true} when the page title and the connectivity probe both match the action
 */
public static boolean execute(Map<String, String> cookies, String action) {
    String url = "http://192.168.0.1/goform/SysStatusHandle";
    Map<String, String> map = new HashMap<>();
    map.put("action", action);
    map.put("CMD", "WAN_CON");
    map.put("GO", "system_status.asp");
    Connection conn = Jsoup.connect(url)
            .header("Accept", ACCEPT)
            .header("Accept-Encoding", ENCODING)
            .header("Accept-Language", LANGUAGE)
            .header("Connection", CONNECTION)
            .header("Host", HOST)
            .header("Referer", REFERER)
            .header("User-Agent", USER_AGENT)
            .ignoreContentType(true)
            .timeout(30000);
    for (Map.Entry<String, String> cookie : cookies.entrySet()) {
        conn.cookie(cookie.getKey(), cookie.getValue());
    }
    String title = null;
    try {
        Connection.Response response = conn.method(Connection.Method.POST).data(map).execute();
        title = Jsoup.parse(response.body()).title();
        LOGGER.info("操作连接页面标题:" + title);
        // Pause before the connectivity check; presumably gives the router time to apply
        // the command (TODO confirm)
        Thread.sleep(10000);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption
        Thread.currentThread().interrupt();
        LOGGER.error(e.getMessage());
    } catch (Exception e) {
        LOGGER.error(e.getMessage());
    }
    if (!"LAN | LAN Settings".equals(title)) {
        return false;
    }
    return ("3".equals(action) && isConnected())
            || ("4".equals(action) && !isConnected());
}
/**
 * Reads the {@code "url"} extra from the intent, fetches that page, stores its title in the
 * {@code title} field and broadcasts the field locally under the {@code TITLE_FILTER} action
 * as the {@code "title"} extra.
 *
 * @param intent intent carrying the {@code "url"} string extra
 */
public void onHandleIntent(Intent intent) {
    this.url = intent.getStringExtra("url");
    try {
        // Fetch the page and keep only its HTML title
        title = Jsoup.connect(url).get().title();
    } catch (IOException e) {
        e.printStackTrace();
    }
    // The broadcast is sent even when the fetch failed; title then keeps its previous value
    Intent titleBroadcast = new Intent(TITLE_FILTER);
    titleBroadcast.putExtra("title", title);
    LocalBroadcastManager.getInstance(this).sendBroadcast(titleBroadcast);
}
/**
 * Fetches the page at {@code url} and stores its HTML title in the {@code title} field.
 * An {@code IOException} is caught and its stack trace printed; the field is left untouched
 * in that case.
 *
 * @param params not read by this implementation
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        // Connect to the web site and read the title of the returned document
        title = Jsoup.connect(url).get().title();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Sends a POST request to {@code http://www.oschina.net/} with a fixed query parameter,
 * user agent, cookie and a 3000 ms timeout, and returns the title of the response document.
 *
 * <p>NOTE(review): despite the method name the title is returned, not the body, and neither
 * {@code url} nor {@code key} is read by the body. Confirm whether that is intended.
 *
 * @param url not used by this implementation
 * @param key not used by this implementation
 * @return the title of the returned document
 * @throws IOException if the request fails
 * @author Tenghui.Wang
 */
public static String getBody(String url, String key) throws IOException {
    final String userAgent =
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2";
    final int timeoutMillis = 3000;
    return Jsoup.connect("http://www.oschina.net/")
            .data("query", "Java") // request parameter
            .userAgent(userAgent) // User-Agent header
            .cookie("auth", "token") // cookie
            .timeout(timeoutMillis) // connection timeout
            .post() // access the URL with the POST method
            .title();
}
/**
 * Renders the document held in the CAS as an HTML file.
 *
 * <p>The head receives an optional stylesheet link, a {@code utf-8} charset declaration, meta
 * entries taken from the document annotation, and one meta entry per {@code Metadata}
 * annotation. A {@code documentTitle} metadata value, when not empty, becomes the page title.
 * The body is produced by {@code writeBody}.
 *
 * @param jCas CAS holding the document to export
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final File f = getFileName(jCas);
    final DocumentAnnotation da = getDocumentAnnotation(jCas);

    final Document doc =
            Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
    doc.outputSettings(new Document.OutputSettings().prettyPrint(false));

    final Element head = doc.head();
    if (!Strings.isNullOrEmpty(css)) {
        final Element cssLink = head.appendElement("link");
        cssLink.attr("rel", "stylesheet");
        cssLink.attr("href", css);
    }
    final Element charset = head.appendElement("meta");
    charset.attr("charset", "utf-8");

    appendMeta(head, "document.type", da.getDocType());
    appendMeta(head, "document.sourceUri", da.getSourceUri());
    appendMeta(head, "externalId", da.getHash());
    appendMeta(head, "document.classification", da.getDocumentClassification());
    appendMeta(
            head,
            "document.caveats",
            String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
    appendMeta(
            head,
            "document.releasability",
            String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

    String title = null;
    for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
        appendMeta(head, md.getKey(), md.getValue());
        // When several documentTitle entries exist, the last one visited wins
        if ("documentTitle".equalsIgnoreCase(md.getKey())) {
            title = md.getValue();
        }
    }
    if (!Strings.isNullOrEmpty(title)) {
        doc.title(title);
    }

    final Element body = doc.body();
    writeBody(jCas, body);

    try {
        // The page declares <meta charset="utf-8">, so the bytes on disk must be UTF-8 as
        // well; the platform default charset would corrupt non-ASCII text on other systems
        FileUtils.writeStringToFile(f, doc.html(), Charset.forName("UTF-8"));
    } catch (final IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
/**
 * Returns the document title without a trailing page marker.
 *
 * <p>On /giveaways/id/name/search?page=X the title has the form "Game Title - Page X"; the
 * " - Page X" suffix is removed so that only "Game Title" remains.
 *
 * @param document parsed page
 * @return the title with any trailing " - Page X" suffix stripped
 */
public static String getPageTitle(Document document) {
    return document.title().replaceAll(" - Page ([\\d,]+)$", "");
}
/**
 * Extracts the title of an HTML document.
 *
 * @param htmlContent
 *            the HTML content that may contain a title
 * @return the document title, or {@code null} when the parsed document has no
 *         {@code head > title} element
 */
public static String getTitle(String htmlContent) {
    Document parsed = Jsoup.parse(htmlContent);
    // Without a title element there is nothing to report
    if (parsed.select("head > title").isEmpty()) {
        return null;
    }
    return parsed.title();
}