jsoup教程_3 Jsoup 讲解

£神魔★判官ぃ 2023-01-08 03:25 303阅读 0赞

项目源代码 https://gitee.com/fakerlove/jsoup

文章目录

    3. Jsoup 讲解
    • 3.1 解析Url
      • 引入依赖
      • 测试
    • 3.2 解析字符串
    • 3.3 解析文件
    • 3.4 使用 DOM 方式解析

3. Jsoup 讲解

3.1 解析Url

引入依赖

  <?xml version="1.0" encoding="UTF-8"?>
  <!-- Maven build descriptor for the demo project used throughout this section. -->
  <project xmlns="http://maven.apache.org/POM/4.0.0"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>

      <!-- Project coordinates -->
      <groupId>org.example</groupId>
      <artifactId>httpclient-demo</artifactId>
      <version>1.0-SNAPSHOT</version>

      <!-- Compile for Java 11 (source and target) -->
      <properties>
          <maven.compiler.source>11</maven.compiler.source>
          <maven.compiler.target>11</maven.compiler.target>
      </properties>

      <dependencies>
          <!-- Apache HttpClient (imported by HttpUtils): https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
          <dependency>
              <groupId>org.apache.httpcomponents</groupId>
              <artifactId>httpclient</artifactId>
              <version>4.5.13</version>
          </dependency>

          <!-- jsoup HTML parser: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
          <dependency>
              <groupId>org.jsoup</groupId>
              <artifactId>jsoup</artifactId>
              <version>1.13.1</version>
          </dependency>

          <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
          <dependency>
              <groupId>commons-io</groupId>
              <artifactId>commons-io</artifactId>
              <version>2.8.0</version>
          </dependency>

          <!-- JUnit 4, test scope only: https://mvnrepository.com/artifact/junit/junit -->
          <dependency>
              <groupId>junit</groupId>
              <artifactId>junit</artifactId>
              <version>4.13.1</version>
              <scope>test</scope>
          </dependency>

          <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
          <dependency>
              <groupId>org.apache.commons</groupId>
              <artifactId>commons-lang3</artifactId>
              <version>3.11</version>
          </dependency>
      </dependencies>
  </project>

测试

  1. package com.ak.mytest;
  2. import org.jsoup.Jsoup;
  3. import org.jsoup.nodes.Document;
  4. import org.junit.Test;
  5. import java.io.IOException;
  6. import java.net.URL;
  7. public class MyTest {
  8. @Test
  9. public void Url(){
  10. // 解析url 地址,第一个是url,第二个是 连接超时时间
  11. try {
  12. Document document=Jsoup.parse(new URL("https://movie.douban.com/chart"),5000);
  13. String title = document.getElementsByTag("title").first().text();
  14. System.out.println(title);
  15. } catch (IOException e) {
  16. e.printStackTrace();
  17. }
  18. }
  19. }

结果

image-20210116213657069

3.2 解析字符串

  1. package com.ak.utils;
  import java.io.IOException;
  import java.nio.charset.StandardCharsets;
  import java.util.ArrayList;
  import java.util.Random;
  import java.util.concurrent.ThreadLocalRandom;
  import org.apache.http.client.config.RequestConfig;
  import org.apache.http.client.methods.CloseableHttpResponse;
  import org.apache.http.client.methods.HttpGet;
  import org.apache.http.impl.client.CloseableHttpClient;
  import org.apache.http.impl.client.HttpClients;
  import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
  import org.apache.http.util.EntityUtils;
  12. public class HttpUtils {
  13. public static PoolingHttpClientConnectionManager cm;
  14. public static ArrayList<String> agents;
  15. static {
  16. // 创建连接池管理器
  17. cm = new PoolingHttpClientConnectionManager();
  18. // 设置连接数
  19. cm.setMaxTotal(100);
  20. // 设置每个主机(理解为网站,如:百度10个、网易10个)的最大连接数
  21. cm.setDefaultMaxPerRoute(10);
  22. //初始化 User-Agent 信息
  23. agents = new ArrayList<String>();
  24. // 添加 User-Agent 信息
  25. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36");
  26. agents.add("Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50");
  27. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0");
  28. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2");
  29. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
  30. agents.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
  31. agents.add("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16");
  32. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");
  33. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER");
  34. agents.add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0");
  35. agents.add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0");
  36. agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36");
  37. System.out.println("<--------- HttpUtils initialization success --------->");
  38. }
  39. /** * 获取页面源代码 * * @param url 网页链接 * @return 页面源代码 */
  40. public static String doGetHtml(String url) {
  41. // 通过连接池获取 httpClient
  42. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
  43. HttpGet httpGet = new HttpGet(url);
  44. // 伪造 User-Agent(反反爬虫)
  45. // 生成一个范围在 0-x(不包含x)内的任意正整数
  46. int agentNum = new Random().nextInt(agents.size());
  47. httpGet.addHeader("User-Agent", agents.get(agentNum));
  48. // 设置请求信息
  49. httpGet.setConfig(getConfig());
  50. // 定义 response,方便 finally 中关闭
  51. CloseableHttpResponse response = null;
  52. try {
  53. response = httpClient.execute(httpGet);
  54. // 获取并判断,状态码是否正常(正常值:200)
  55. if (response.getStatusLine().getStatusCode() == 200) {
  56. // 判断响应体是否为空,不为空则获取内容
  57. if (response.getEntity() != null) {
  58. // 获取响应体,并指定 UTF-8 编码
  59. String content = EntityUtils.toString(response.getEntity(), "utf8");
  60. return content;
  61. }
  62. }
  63. } catch (IOException e) {
  64. e.printStackTrace();
  65. } finally {
  66. // 判断并关闭 response
  67. if (response != null) {
  68. try {
  69. response.close();
  70. } catch (IOException e) {
  71. e.printStackTrace();
  72. }
  73. }
  74. // 不关闭 httpClient,交给连接池管理
  75. }
  76. System.out.println("<--------- doGetHtml() ERROR --------->");
  77. return "";
  78. }
  79. public static RequestConfig getConfig() {
  80. RequestConfig config = RequestConfig.custom()
  81. // 创建连接的最长时间
  82. .setConnectTimeout(1000)
  83. // 获取连接最长时间
  84. .setConnectionRequestTimeout(1000)
  85. // 数据传输最长时间
  86. .setSocketTimeout(10 * 1000)
  87. .build();
  88. return config;
  89. }
  90. }

测试

  1. @Test
  2. public void testString(){
  3. // 获取字符串
  4. String content= HttpUtils.doGetHtml("https://www.baidu.com");
  5. // 转移成decument 文件
  6. Document parse = Jsoup.parse(content);
  7. String title = parse.getElementsByTag("title").first().text();
  8. System.out.println(title);
  9. }

结果

image-20210116214447584

3.3 解析文件

  1. @Test
  2. public void testFile() throws IOException {
  3. Document parse = Jsoup.parse(new File("C:\\Users\\bn\\Desktop\\豆瓣.html"), "utf-8");
  4. String title = parse.getElementsByTag("title").first().text();
  5. System.out.println(title);
  6. }

结果

image-20210116214921614

3.4 使用 DOM 方式解析

  1. @Test
  2. public void testDom(){
  3. // 解析url 地址,第一个是url,第二个是 连接超时时间
  4. try {
  5. Document document=Jsoup.parse(new URL("https://movie.douban.com/chart"),5000);
  6. // 第一个是 tag
  7. String title = document.getElementsByTag("title").first().text();
  8. // 第二个是class
  9. Elements nbg = document.getElementsByClass("pl");
  10. for (Element element : nbg){
  11. System.out.println(element.text());
  12. }
  13. // 第三个是
  14. Element content = document.getElementById("content");
  15. System.out.println(content.text().length());
  16. System.out.println(title);
  17. } catch (IOException e) {
  18. e.printStackTrace();
  19. }
  20. }

结果

image-20210116215703517

发表评论

表情:
评论列表 (有 0 条评论,303人围观)

还没有评论,来说两句吧...

相关阅读

    相关 jsoup教程_1 简介

    1.1 jsoup 概念 > jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API, > > 可通过D

    相关 jsoup

    jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址、HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM,CSS 以及类似于 jQu

    相关 jsoup

    一、Jsoup概述 1.1、简介     jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,

    相关 爬虫Jsoup

    爬虫Jsoup 简介 导入jar 简单示例 简介 Jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提