HtmlSucker 是一个从网页 URL 中提取文章信息的小工具包，例如从网页中提取文章标题、作者、发布时间、封面图以及文章正文内容。它基于 jsoup 库进行 HTML 解析。

桃扇骨 2023-10-10 14:51 16阅读 0赞

HtmlSucker 提供两种正文提取算法：

1.  最大文本块：分析整个 HTML 文档的所有节点，提取其中包含最多文字的连续节点。
2.  文本密度算法：参考 [WebCollector][] 项目的代码。

目前还处于非常简单的阶段，但是可用。

导入依赖

<!-- Maven coordinates of the HtmlSucker library. -->
<dependency>
    <groupId>net.oschina.htmlsucker</groupId>
    <artifactId>HtmlSucker</artifactId>
    <version>0.0.2</version>
</dependency>

工具类：

/**
 * Helper around HtmlSucker that extracts an article from a web page and
 * post-processes the extracted HTML so it can be displayed outside the
 * original site, plus a PDF export helper.
 */
public class HtmlUtil {

    private static final Logger LOG = LoggerFactory.getLogger(HtmlUtil.class);

    /** Scheme prefix of an inline data URI (RFC 2397), e.g. {@code data:image/png;base64,...}. */
    private static final String DATA_URI_PREFIX = "data:";

    /** Upload service for inline images; {@code null} disables the upload step. */
    private final RemoteFileService fileService;

    /**
     * Creates a utility without a file service. Inline data-URI images are
     * left untouched because there is nowhere to upload them.
     */
    public HtmlUtil() {
        this(null);
    }

    /**
     * Creates a utility that uploads inline data-URI images through the given service.
     *
     * @param fileService upload service, or {@code null} to skip uploads
     */
    public HtmlUtil(RemoteFileService fileService) {
        this.fileService = fileService;
    }

    /**
     * Extracts title, publish date, author and cleaned-up content from the page at {@code url}.
     *
     * @param url address of the page to parse
     * @return a JSON object with the keys {@code title}, {@code publishDate},
     *         {@code author} and {@code content}; empty when the page could not be fetched
     */
    public JSONObject htmlSucker(String url) {
        JSONObject jsonObject = new JSONObject();
        try {
            Article article = HtmlSucker.select(HtmlSucker.TEXT_DENSITY_EXTRACTOR).parse(url, 30000);
            String content = article.getContent();
            if (content != null) {
                content = dealContentHtml(content, url);
            }
            jsonObject.set("title", article.getTitle());
            jsonObject.set("publishDate", article.getDate());
            jsonObject.set("author", article.getAuthor());
            jsonObject.set("content", content);
        } catch (IOException e) {
            // Best effort: callers receive an empty object, but the failure is logged with its cause.
            LOG.error("Failed to extract article from {}", url, e);
        }
        return jsonObject;
    }

    /**
     * Normalizes the extracted HTML: emits well-formed (closed) tags, promotes
     * lazy-load {@code data-src} values to {@code src}, and rewrites {@code src}
     * and {@code href} values relative to the source page.
     */
    private String dealContentHtml(String content, String url) {
        Document document = Jsoup.parse(content);
        // Have jsoup emit XML syntax so that every tag is closed.
        document.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
        document.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        URI uri = null;
        try {
            uri = new URI(url);
        } catch (URISyntaxException e) {
            // Without a base URI relative links cannot be resolved; they are left as they are.
            LOG.warn("Source URL {} is not a valid URI; relative links are left unresolved", url, e);
        }

        // Copy data-src to src. jsoup returns "" (never null) for a missing value,
        // so test for emptiness to avoid wiping an existing src.
        Elements dataSrcElems = document.getElementsByAttribute("data-src");
        for (Element dataSrcElem : dataSrcElems) {
            String dataSrc = dataSrcElem.attr("data-src");
            if (!dataSrc.isEmpty()) {
                dataSrcElem.attr("src", dataSrc);
            }
        }

        // Rewrite src addresses.
        Elements srcElems = document.getElementsByAttribute("src");
        dealElementUrl(srcElems, "src", uri);

        // Rewrite href addresses.
        Elements links = document.getElementsByAttribute("href");
        dealElementUrl(links, "href", uri);

        return document.body().html();
    }

    /**
     * Rewrites the given attribute on every element: protocol-relative and
     * root-relative values are made absolute against {@code uri}, and inline
     * data URIs are uploaded and replaced by the uploaded file's URL.
     *
     * @param uri URI of the source page; may be {@code null}, in which case
     *            relative values are left unchanged
     */
    private void dealElementUrl(Elements elements, String attr, URI uri) {
        String scheme = uri == null ? null : uri.getScheme();
        String origin = buildOrigin(uri);

        for (Element element : elements) {
            if ("src".equals(attr)) {
                // Lets the front end embed images from sites that reject foreign referrers.
                element.attr("referrerpolicy", "no-referrer");
            }

            String val = element.attr(attr);
            if (val.startsWith("//")) {
                if (scheme != null) {
                    element.attr(attr, scheme + ":" + val);
                }
            } else if (val.startsWith("/")) {
                if (origin != null) {
                    element.attr(attr, origin + val);
                }
            } else if (val.startsWith(":")) {
                if (scheme != null) {
                    element.attr(attr, scheme + val);
                }
            } else if (val.startsWith(DATA_URI_PREFIX)) {
                uploadDataUri(element, attr, val);
            }
        }
    }

    /** Returns {@code scheme://host[:port]} of the URI, or {@code null} if it has no scheme or host. */
    private static String buildOrigin(URI uri) {
        if (uri == null || uri.getScheme() == null || uri.getHost() == null) {
            return null;
        }
        StringBuilder origin = new StringBuilder(uri.getScheme()).append("://").append(uri.getHost());
        if (uri.getPort() != -1) {
            // Keep a non-default port, otherwise root-relative links point at the wrong server.
            origin.append(':').append(uri.getPort());
        }
        return origin.toString();
    }

    /**
     * Decodes an inline data-URI image, re-encodes it as JPEG, uploads it and
     * points the attribute at the uploaded file. Any failure leaves the
     * attribute unchanged so one bad image cannot abort the whole article.
     */
    private void uploadDataUri(Element element, String attr, String dataUri) {
        if (fileService == null) {
            LOG.debug("No file service configured; inline data URI left unchanged");
            return;
        }
        try {
            // The payload follows the first comma; the header before it is not image data.
            int comma = dataUri.indexOf(',');
            String payload = comma >= 0 ? dataUri.substring(comma + 1) : dataUri;

            BufferedImage bufferedImage = ImgUtil.toImage(payload);
            if (bufferedImage == null) {
                LOG.warn("Inline data URI could not be decoded as an image; left unchanged");
                return;
            }

            ByteArrayOutputStream os = new ByteArrayOutputStream();
            // ImageIO.write returns false when no writer accepts the image
            // (for example an image type the JPEG writer does not support).
            if (!ImageIO.write(bufferedImage, "jpg", os)) {
                LOG.warn("Inline image could not be encoded as JPEG; left unchanged");
                return;
            }

            MultipartFile multipartFile =
                    new MockMultipartFile("file", "file.jpg", "image/jpeg", os.toByteArray());
            R<SysFile> uploadResult = fileService.upload(multipartFile);
            if (uploadResult != null && uploadResult.getCode() == HttpStatus.SUCCESS) {
                SysFile sysFile = uploadResult.getData();
                if (sysFile != null) {
                    element.attr(attr, sysFile.getUrl());
                }
            }
        } catch (IOException | RuntimeException e) {
            // Decoding malformed data may also fail with unchecked exceptions.
            LOG.error("Failed to upload inline image", e);
        }
    }

    /**
     * Renders the article and an optional list of attachments to PDF and
     * writes it to the response output stream.
     *
     * <p>NOTE(review): {@code title}, {@code content}, file names and file URLs
     * are inserted into the HTML as-is; callers should pass trusted or
     * already-escaped values.
     *
     * @param response   servlet response whose output stream receives the PDF
     * @param title      heading placed above the content; {@code null} renders an empty heading
     * @param content    article HTML; when empty, no article section is added
     * @param appendices attachments listed as links; may be {@code null} or empty
     * @throws IOException if the response output stream cannot be obtained
     */
    public static void exportPdf(HttpServletResponse response,
                                 String title, String content,
                                 List<KsAppendix> appendices) throws IOException {
        com.spire.doc.Document document = new com.spire.doc.Document();
        try {
            if (!StringUtils.isEmpty(content)) {
                Section section = document.addSection();
                StringBuilder contentBuilder = new StringBuilder();
                contentBuilder.append("<h2>");
                // Avoid rendering the literal text "null" as the heading.
                contentBuilder.append(title == null ? "" : title);
                contentBuilder.append("</h2>");
                contentBuilder.append(content);
                section.addParagraph().appendHTML(contentBuilder.toString());
            }

            if (appendices != null && !appendices.isEmpty()) {
                Section section = document.addSection();
                StringBuilder appendicesBuilder = new StringBuilder();
                appendicesBuilder.append("<div>");
                appendicesBuilder.append("<p style='font-size: 22px;'>文件列表</p>");
                for (KsAppendix ksAppendix : appendices) {
                    String fileName = ksAppendix.getFileName();
                    String fileUrl = ksAppendix.getFileUrl();
                    appendicesBuilder.append("<p style='color:red; text-decoration:underline;'><a href='");
                    appendicesBuilder.append(fileUrl);
                    appendicesBuilder.append("'>");
                    appendicesBuilder.append(fileName);
                    appendicesBuilder.append("</a></p>");
                }
                appendicesBuilder.append("</div>");
                section.addParagraph().appendHTML(appendicesBuilder.toString());
            }

            document.saveToFile(response.getOutputStream(), FileFormat.PDF);
        } finally {
            // Release the document even when rendering or writing fails.
            document.dispose();
        }
    }
}

使用：

·public ResponseResult getArticleFromUrl(@RequestParam(name = "url") String url) {
            HtmlUtil htmlUtil = new HtmlUtil(fileService);
            JSONObject jsonObject = htmlUtil.htmlSucker(url);
            return ResponseResult.success(jsonObject);
        }

其中 fileService 用于上传正文中内嵌的 data 图片；如果不需要该功能，可以去掉 fileService（此时 HtmlUtil 需提供无参构造方法，且不再执行上传分支），改为

// Requires HtmlUtil to expose a no-argument constructor; without a file
// service, inline data-URI images cannot be uploaded.
HtmlUtil htmlUtil = new HtmlUtil();

[WebCollector]: https://gitee.com/webcollector/WebCollector