jsoup教程_3 Jsoup 讲解
项目源代码 https://gitee.com/fakerlove/jsoup
文章目录
- Jsoup 讲解
- 3.1 解析Url
- 引入依赖
- 测试
- 3.2 解析字符串
- 3.3 解析文件
- 3.4 使用dom 方式解析
3. Jsoup 讲解
3.1 解析Url
引入依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the tutorial project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>httpclient-demo</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Source and target level for the compiler are both set to 11. -->
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<dependencies>
<!-- Apache HttpClient: provides the org.apache.http classes imported by HttpUtils. -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- jsoup: provides Jsoup, Document, Element and Elements used by the test methods. -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<!-- Commons IO: not referenced by the code shown in this tutorial section. -->
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.8.0</version>
</dependency>
<!-- JUnit 4: provides the @Test annotation; restricted to the test scope. -->
<!-- https://mvnrepository.com/artifact/junit/junit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
</dependency>
<!-- Commons Lang 3: not referenced by the code shown in this tutorial section. -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.11</version>
</dependency>
</dependencies>
</project>
测试
package com.ak.mytest;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;
import java.io.IOException;
import java.net.URL;
/**
 * Demonstrates parsing a remote page directly from its URL with jsoup.
 */
public class MyTest {

    /**
     * Downloads the Douban movie chart page and prints its title.
     *
     * @throws IOException if the page cannot be fetched or read; propagated so the test
     *                     fails visibly instead of printing a stack trace and passing
     */
    @Test
    public void Url() throws IOException {
        // Jsoup.parse(URL, int): first argument is the page URL, second is the
        // timeout in milliseconds.
        Document document = Jsoup.parse(new URL("https://movie.douban.com/chart"), 5000);
        // Document.title() yields "" when the page has no <title> element, whereas
        // getElementsByTag("title").first() would be null and text() would throw.
        String title = document.title();
        System.out.println(title);
    }
}
结果
3.2 解析字符串
package com.ak.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
/**
 * HTTP helper that downloads page source through a pooled Apache HttpClient connection
 * manager, sending a randomly chosen User-Agent header with every request.
 */
public class HttpUtils {
    /** Connection pool handed to every client built in {@link #doGetHtml(String)}. */
    public static PoolingHttpClientConnectionManager cm;
    /** Candidate User-Agent header values; one is picked at random per request. */
    public static ArrayList<String> agents;

    static {
        // Create the connection pool manager.
        cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool.
        cm.setMaxTotal(100);
        // Maximum number of connections per route (i.e. per target host).
        cm.setDefaultMaxPerRoute(10);
        // Register the User-Agent strings to rotate through.
        agents = new ArrayList<String>();
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36");
        agents.add("Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
        agents.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
        agents.add("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER");
        agents.add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36");
        System.out.println("<--------- HttpUtils initialization success --------->");
    }

    /**
     * Fetches the source of a page with an HTTP GET request.
     *
     * <p>Any failure (null, blank or malformed URL, I/O error including a failure while
     * closing the response, non-200 status, missing body) is reported on standard output
     * and results in an empty string; the method never returns {@code null}.
     *
     * @param url address of the page to download
     * @return the response body decoded as text (UTF-8 unless the response declares
     *         another charset), or {@code ""} when the page could not be retrieved
     */
    public static String doGetHtml(String url) {
        String content = fetch(url);
        if (content == null) {
            System.out.println("<--------- doGetHtml() ERROR --------->");
            return "";
        }
        return content;
    }

    /** Performs the GET request; returns the body text, or null on any failure. */
    private static String fetch(String url) {
        if (url == null || url.trim().isEmpty()) {
            return null;
        }
        final HttpGet httpGet;
        try {
            httpGet = new HttpGet(url);
        } catch (IllegalArgumentException e) {
            // HttpGet rejects strings that are not valid URIs.
            e.printStackTrace();
            return null;
        }
        // Send a randomly chosen User-Agent; skipped when no candidates are
        // registered, because nextInt(0) would throw.
        if (agents != null && !agents.isEmpty()) {
            int agentNum = ThreadLocalRandom.current().nextInt(agents.size());
            httpGet.addHeader("User-Agent", agents.get(agentNum));
        }
        // Apply the timeout settings.
        httpGet.setConfig(getConfig());
        // The client is deliberately left open: its connections belong to the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response carrying a body counts as success.
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // UTF-8 is the fallback charset when the response does not declare one.
                return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds the timeout configuration applied to every request.
     *
     * @return a config with a 1 s connect timeout, a 1 s timeout for obtaining a
     *         connection from the pool, and a 10 s socket (data transfer) timeout
     */
    public static RequestConfig getConfig() {
        return RequestConfig.custom()
                // Maximum time to establish the connection.
                .setConnectTimeout(1000)
                // Maximum time to wait for a connection from the pool.
                .setConnectionRequestTimeout(1000)
                // Maximum time to wait for data.
                .setSocketTimeout(10 * 1000)
                .build();
    }
}
测试
/**
 * Downloads the Baidu home page as a string through HttpUtils, parses it with jsoup
 * and prints the page title.
 */
@Test
public void testString() {
    // Fetch the page source as a string; doGetHtml returns "" when the request fails.
    String content = HttpUtils.doGetHtml("https://www.baidu.com");
    // Convert the string into a Document.
    Document parse = Jsoup.parse(content);
    // Document.title() yields "" when there is no <title> element (as with empty
    // input), whereas getElementsByTag("title").first() would be null and throw.
    String title = parse.title();
    System.out.println(title);
}
结果
3.3 解析文件
/**
 * Parses a locally saved HTML file with jsoup and prints its title.
 *
 * @throws IOException if the file cannot be found or read
 */
@Test
public void testFile() throws IOException {
    // The second argument is the charset used to decode the file.
    Document parse = Jsoup.parse(new File("C:\\Users\\bn\\Desktop\\豆瓣.html"), "utf-8");
    // Document.title() yields "" when the file has no <title> element, whereas
    // getElementsByTag("title").first() would be null and text() would throw.
    String title = parse.title();
    System.out.println(title);
}
结果
3.4 使用dom 方式解析
/**
 * Demonstrates the DOM-style lookups offered by jsoup (by tag, by class, by id)
 * on the Douban movie chart page.
 *
 * @throws IOException if the page cannot be fetched or read; propagated so the test
 *                     fails visibly instead of printing a stack trace and passing
 */
@Test
public void testDom() throws IOException {
    // Jsoup.parse(URL, int): first argument is the page URL, second is the
    // timeout in milliseconds.
    Document document = Jsoup.parse(new URL("https://movie.douban.com/chart"), 5000);

    // 1) Lookup by tag name. first() is null when nothing matches, so guard it.
    Element titleElement = document.getElementsByTag("title").first();
    String title = titleElement != null ? titleElement.text() : "";

    // 2) Lookup by CSS class name.
    Elements nbg = document.getElementsByClass("pl");
    for (Element element : nbg) {
        System.out.println(element.text());
    }

    // 3) Lookup by id. getElementById returns null when no element has that id.
    Element content = document.getElementById("content");
    if (content != null) {
        System.out.println(content.text().length());
    }

    System.out.println(title);
}
结果
还没有评论,来说两句吧...