Jsoup学习总结
有两个推荐网址:
http://www.open-open.com/jsoup/
http://www.iteye.com/topic/1010581
这两个网址对 jsoup 的简单使用做了很好的指导
我只提出比较实用的例子:
当我们读取某些网址被屏蔽、返回 505 时,可以尝试用以下代码
// 读取URL
/**
 * Fetches {@code url} with an HTTP POST and parses the response into a {@link Document}.
 *
 * <p>The request uses a 60 second timeout, an MSIE user agent, follows redirects and
 * ignores HTTP error statuses. Transient failures ({@link UnknownHostException},
 * {@link SocketTimeoutException}) are retried, but only a bounded number of times: the
 * previous implementation retried through unbounded recursion, which ends in a
 * {@code StackOverflowError} when the host never becomes reachable.
 *
 * @param url the address to fetch
 * @return the parsed document, or {@code null} if every attempt failed or a
 *         non-retryable {@link IOException} occurred
 */
public static Document readUrlFistT(String url) {
    // Upper bound on attempts; chosen arbitrarily, raise it if the target is flaky.
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return Jsoup.connect(url)
                    .timeout(60 * 1000)
                    .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)")
                    .followRedirects(true)
                    .ignoreHttpErrors(true)
                    .post();
        } catch (IOException e) {
            e.printStackTrace();
            boolean retryable = (e instanceof UnknownHostException)
                    || (e instanceof SocketTimeoutException);
            if (!retryable) {
                // Any other I/O failure is treated as final, as before.
                break;
            }
        }
    }
    return null;
}
也可以用下面的代码
/**
 * Fetches {@code url} with an HTTP GET and parses the response into a {@link Document}.
 *
 * <p>The request sends a Firefox/Googlebot style {@code User-Agent} header and uses a
 * 200 second timeout. Transient failures ({@link UnknownHostException},
 * {@link SocketTimeoutException}) are retried a bounded number of times; the previous
 * implementation recursed without limit and could overflow the stack when the host
 * stayed unreachable.
 *
 * @param url the address to fetch
 * @return the parsed document, or {@code null} if every attempt failed or a
 *         non-retryable {@link IOException} occurred
 */
public static Document readUrlFist(String url) {
    // Upper bound on attempts; chosen arbitrarily, raise it if the target is flaky.
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        // A fresh connection per attempt, matching what each recursive call used to build.
        Connection conn = Jsoup.connect(url);
        conn.header(
                "User-Agent",
                "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2 Googlebot/2.1");
        try {
            return conn.timeout(200 * 1000).get();
        } catch (IOException e) {
            e.printStackTrace();
            boolean retryable = (e instanceof UnknownHostException)
                    || (e instanceof SocketTimeoutException);
            if (!retryable) {
                // Any other I/O failure is treated as final, as before.
                break;
            }
        }
    }
    return null;
}
个人推荐第二个,遇到解析不出来的两个换着用
还有一种情况:想知道某个节点在其兄弟节点中的位置时,可以用 siblingElements,
这比 elementSiblingIndex() 准确一些,不过有时也不是很准确
下面贴一段我曾经做过的练习:如果你对某个网站做过详细的抓取后,jsoup对于你来说就是小菜了。
从不熟到应用灵活,个人喜好用select 语句,语句简单实用
package com.xinsearch.test;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.xinsearch.entity.Url;
import com.xinsearch.util.TaoUtil;
public class WedSeven extends TaoUtil {
// Third "/"-separated piece of the start URL (set in main); passed to addTitle(...) when hrefs are printed.
public static String title = "";
// Node returned by getFather(url); assigned in takeUrlFist.
public static Element bestFather = null;
// NOTE(review): not read or written by any method visible in this class - confirm whether it is still needed.
public static List<Url> urls = new ArrayList<Url>();
/**
 * Entry point: records the host of the hard-coded start page in {@code title}
 * and runs the extraction on that page.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String startPage = "http://www.qianzhihe.com.cn/allsort.html";
    // "http://host/path".split("/") yields ["http:", "", "host", ...]; index 2 is the host.
    String[] pieces = startPage.split("/");
    title = pieces[2];
    takeUrlFist(startPage);
}
// 得到最优节点
/**
 * Loads {@code url} and picks, among the children of the first {@code body}
 * element, the node selected by {@code readChildByMaxNum}.
 *
 * @param url page to load
 * @return the node chosen by {@code readChildByMaxNum}
 */
public static Element getFather(String url) {
    Element firstBody = readBody(url).get(0);
    return readChildByMaxNum(firstBody);
}
// 测试程序
/**
 * Runs the whole extraction pipeline for one page and prints the links found.
 *
 * <p>Steps: choose the best container ({@code bestFather}), collect the parents of
 * its anchors, narrow them to the most frequent tag name, then to the dominant class
 * name, and finally hand the result to {@code takeMessage}.
 *
 * @param url page to process
 * @return a fixed status message
 */
public static String takeUrlFist(String url) {
    bestFather = getFather(url);
    // All anchors below the chosen container.
    Elements anchors = bestFather.select("a");
    // Direct parents of those anchors.
    List<Element> anchorParents = gerFaterPonit(anchors);
    // Nodes under the container that carry the most frequent parent tag name.
    List<Element> byTag = takeAparentByTagName(anchorParents, bestFather);
    // Of those, the ones sharing the dominant class name.
    List<Element> byClass = getElementByMaxClassName(byTag);
    // Print the wanted information.
    takeMessage(byClass);
    return "已经成功提取一个URL";
}
// 得到相同className的 节点的父节点
/**
 * Maps every element to its direct parent, keeping order and duplicates.
 *
 * @param elements elements whose parents are wanted
 * @return one parent per input element, in input order
 */
public static List<Element> gerFaterPonit(Elements elements) {
    List<Element> parents = new ArrayList<Element>();
    for (int i = 0; i < elements.size(); i++) {
        parents.add(elements.get(i).parent());
    }
    return parents;
}
//
/**
 * Keeps only the elements whose class name is the dominant one, as determined
 * by {@code takeIndexByClassName}.
 *
 * @param bestElements candidate elements
 * @return the candidates whose class name equals the dominant class name
 */
public static List<Element> getElementByMaxClassName(
        List<Element> bestElements) {
    return getElementByClassName(bestElements,
            takeIndexByClassName(bestElements));
}
// 读取url得到一级节点,该节点含有url
/**
 * Downloads {@code url} via {@code readUrlFist} and returns its {@code body} elements.
 *
 * @param url page to load
 * @return the elements matching the selector {@code "body"}
 */
public static Elements readBody(String url) {
    Document page = readUrlFist(url);
    return page.select("body");
}
// 得到该节点下的孩子最多的那个节点(这个节点就是含有Url的节点)
/**
 * Returns the direct child of {@code body} that contains the most {@code <a>}
 * descendants; on a tie the earliest child wins.
 *
 * <p>Fixes two problems of the earlier version: the baseline was the first child's
 * number of direct children while the other candidates were measured by anchor count,
 * so a link-free first child with many children could win; and a body without child
 * elements raised {@code IndexOutOfBoundsException}.
 *
 * @param body the element whose children are compared
 * @return the child with the highest anchor count, or {@code body} itself when it has
 *         no child elements
 */
public static Element readChildByMaxNum(Element body) {
    Elements candidates = body.children();
    if (candidates.isEmpty()) {
        // Nothing to compare; fall back to the container itself.
        return body;
    }
    Element bestElement = candidates.get(0);
    // Baseline uses the same metric as the loop: number of anchors below the node.
    int best = bestElement.select("a").size();
    for (int i = 1; i < candidates.size(); i++) {
        Element candidate = candidates.get(i);
        int anchors = candidate.select("a").size();
        if (anchors > best) {
            best = anchors;
            bestElement = candidate;
        }
    }
    return bestElement;
}
// 得到tagName最多的节点
/**
 * Finds the tag name that occurs most often in {@code as} and returns every element
 * matched by that tag name as a selector on {@code bestElement}.
 *
 * <p>When {@code as} is empty there is no tag to select on; an empty list is returned
 * instead of passing an empty query to {@code select}, which jsoup rejects.
 *
 * @param as          elements whose tag names are counted
 * @param bestElement root on which the winning tag name is selected
 * @return elements matching the most frequent tag name, or an empty list when
 *         {@code as} is empty
 */
public static List<Element> takeAparentByTagName(List<Element> as,
        Element bestElement) {
    // Occurrences per tag name.
    Map<String, Integer> tagCounts = new HashMap<String, Integer>();
    for (Element element : as) {
        String tag = element.tagName();
        Integer seen = tagCounts.get(tag);
        tagCounts.put(tag, seen == null ? 1 : seen + 1);
    }
    // Pick the tag with the strictly highest count (first one met wins a tie).
    int max = 0;
    String best = "";
    for (Map.Entry<String, Integer> entry : tagCounts.entrySet()) {
        if (max < entry.getValue()) {
            max = entry.getValue();
            best = entry.getKey();
        }
    }
    if (best.length() == 0) {
        return new ArrayList<Element>();
    }
    return bestElement.select(best);
}
// 得到className的名字和他的数量
/**
 * Builds a weight per class name: every element contributes its number of children
 * (at least 1) to the entry of its class name. Elements without a class name are
 * grouped under the placeholder key {@code "iiiuuuzzz"}.
 *
 * @param bestElements elements to tally
 * @return map from class name (or placeholder) to accumulated weight
 */
public static Map<String, Integer> takeMapClass(List<Element> bestElements) {
    Map<String, Integer> weights = new HashMap<String, Integer>();
    for (Element node : bestElements) {
        String key = node.className();
        // A childless element still counts once.
        int weight = Math.max(node.children().size(), 1);
        if (key == null || key.length() == 0) {
            key = "iiiuuuzzz";
        }
        Integer soFar = weights.get(key);
        weights.put(key, soFar == null ? weight : soFar + weight);
    }
    return weights;
}
// 得到className数量最多的节点的索引
/**
 * Returns the class name with the largest weight in {@code takeMapClass(bestElements)}.
 *
 * @param bestElements elements to tally
 * @return the heaviest class name, or {@code ""} when the tally is empty
 */
public static String takeIndexByClassName(List<Element> bestElements) {
    String winner = "";
    int winnerWeight = 0;
    for (Map.Entry<String, Integer> entry : takeMapClass(bestElements).entrySet()) {
        int weight = entry.getValue();
        // Strict comparison: the first key reaching the maximum is kept.
        if (weight > winnerWeight) {
            winnerWeight = weight;
            winner = entry.getKey();
        }
    }
    return winner;
}
// 和className数量次多的节点的索引
/**
 * Returns the class name with the second-largest weight: the heaviest entry of
 * {@code takeMapClass(bestElements)} once the overall winner from
 * {@code takeIndexByClassName} is excluded.
 *
 * @param bestElements elements to tally
 * @return the runner-up class name, or {@code ""} when there is none
 */
public static String takeBetterIndexByClassName(List<Element> bestElements) {
    Map<String, Integer> weights = takeMapClass(bestElements);
    String winner = takeIndexByClassName(bestElements);
    String runnerUp = "";
    int runnerUpWeight = 0;
    for (Map.Entry<String, Integer> entry : weights.entrySet()) {
        if (entry.getKey().equals(winner)) {
            continue;
        }
        if (entry.getValue() > runnerUpWeight) {
            runnerUpWeight = entry.getValue();
            runnerUp = entry.getKey();
        }
    }
    return runnerUp;
}
// 根据索引得出所要的节点
/**
 * Filters {@code bestElements} down to those whose class name equals {@code index}.
 * Elements without a class name are compared as {@code "iiiuuuzzz"}, the same
 * placeholder used by {@code takeMapClass}.
 *
 * @param bestElements elements to filter
 * @param index        class name (or placeholder) to keep
 * @return the matching elements, in input order
 */
public static List<Element> getElementByClassName(
        List<Element> bestElements, String index) {
    List<Element> matches = new ArrayList<Element>();
    for (Element candidate : bestElements) {
        String cls = candidate.className();
        boolean unnamed = cls == null || cls.equals("");
        String key = unnamed ? "iiiuuuzzz" : cls;
        if (index.equals(key)) {
            matches.add(candidate);
        }
    }
    return matches;
}
// 得到孩子节点
/**
 * Flattens the direct children of all given elements into one list.
 *
 * @param bestElements parent elements
 * @return every child of every parent, in parent order then child order
 */
public static List<Element> takeChildren(List<Element> bestElements) {
    List<Element> flattened = new ArrayList<Element>();
    for (int i = 0; i < bestElements.size(); i++) {
        flattened.addAll(bestElements.get(i).children());
    }
    return flattened;
}
// 得到和自己内容不同的父亲节点
/**
 * Returns an ancestor of {@code element} whose text differs from the element's own.
 *
 * <p>If the element has no sibling elements its direct parent is returned unchanged.
 * Otherwise the method climbs while the ancestor's text equals the element's text.
 * The climb now stops at the topmost ancestor; previously it dereferenced a
 * {@code null} parent and threw {@code NullPointerException} when every ancestor
 * up to the root had the same text.
 *
 * @param element the starting element
 * @return the chosen ancestor; the topmost ancestor if all ancestors share the
 *         element's text; {@code null} if {@code element} has no parent
 */
public static Element getParent(Element element) {
    Element parent = element.parent();
    if (parent == null || element.siblingElements().size() == 0) {
        return parent;
    }
    String ownText = element.text();
    // Stop at the root instead of stepping onto a null parent.
    while (parent.text().equals(ownText) && parent.parent() != null) {
        parent = parent.parent();
    }
    return parent;
}
// 得到最优父亲节点
/**
 * Exploratory/debug routine that, for each element, prints diagnostics about its
 * ancestors and then a guess at the category texts followed by the link text and href.
 *
 * For every element it prints the tag name and class name of: the element,
 * getParent(element), getParent of that node, and that node's direct parent; then the
 * sibling-element counts of those four nodes. Depending on those counts it prints the
 * text of the first child of one of the ancestors, and finally the text of the
 * element's anchors and addHttp(addTitle(href, title)).
 *
 * NOTE(review): no method visible in this class calls bestParent - confirm it is still used.
 * NOTE(review): results of getParent(...) and parent() are dereferenced without null
 * checks - confirm callers never pass nodes close to the document root.
 * NOTE(review): addHttp/addTitle are not defined in this class; presumably inherited
 * from TaoUtil - verify.
 *
 * @param elements elements to inspect
 */
public static void bestParent(List<Element> elements) {
for (int i = 0; i < elements.size(); i++) {
// Diagnostics: the element itself.
System.out.println(elements.get(i).tagName());
System.out.println(elements.get(i).className());
// Diagnostics: first ancestor chosen by getParent.
Element parent = getParent(elements.get(i));
System.out.println(parent.tagName());
System.out.println(parent.className());
//
//
// Diagnostics: ancestor chosen by getParent starting from parent.
Element parentTwo = getParent(parent);
System.out.println(parentTwo.tagName());
System.out.println(parentTwo.className());
//
// Diagnostics: direct parent of parentTwo.
Element aparentTwo = parentTwo.parent();
System.out.println(aparentTwo.tagName());
System.out.println(aparentTwo.className());
//
// Diagnostics: sibling-element counts of the four nodes above.
System.out.println(elements.get(i).siblingElements().size());
System.out.println(parent.siblingElements().size());
System.out.println(parentTwo.siblingElements().size());
System.out.println(aparentTwo.siblingElements().size());
// Why choose parent?
// This node is the parent of the <a> tag; parent may contain the second-level category.
// How do we pick the first-level category? That is the hard part, mainly because every site uses a different layout
// (what jsoup fetches differs from what the browser's "inspect element" shows).
//
// If the parent node has two siblings, one may be the title and the other the wanted content;
// in that case print the second-level category directly.
// Another possibility: this region contains just two wanted items (set aside for now).
//
// Otherwise:
// the first possible case is that its previous sibling is the second-level category;
// (2) the first of its siblings is the second-level category (hard to handle because it is uncertain).
//
//
//
// How do we find the first-level category starting from the second-level one?
//
//
if (parent.siblingElements().size() == 1) {
if (parentTwo.siblingElements().size() == 1) {
} else {
if (parentTwo.siblingElements().size() + 1 == aparentTwo
.children().size()) {
Element apElementTwo = getParent(aparentTwo);
System.out.println(apElementTwo.className());
if (aparentTwo.siblingElements().size() == 1) {
if (apElementTwo.siblingElements().size() == 1) {
System.out.println("fdsfdsfdsf");
System.out.print(apElementTwo.parent()
.children().first().text());
} else {
System.out.println("ffffffffffffff");
System.out.print(apElementTwo.children()
.first().text());
}
} else {
// System.out.println("dddddddddddddddddddddddd");
if (apElementTwo.siblingElements().size() >= 1) {
// System.out.println("fdsfdsfdsf");
System.out.print(apElementTwo.parent()
.children().first().text());
} else {
// System.out.println("ffffffffffffff");
// System.out.print(apElementTwo.children()
// .first().text());
// System.out.println("dsffffffffff");
}
}
} else {
}
}
// Text of the first child of parent's own parent.
System.out.print(" "
+ parent.parent().children().first().text());
} else {
// If the second-level category is in this element's first node,
// how do we find the first-level category?
// Suning (site):
// if its sibling count matches its parent's child count, its parent is its direct parent,
// so the first-level category may be its first sibling.
// Check the parent's siblings: if there is exactly one sibling, it may be its title,
//
// or there may simply be two wanted items.
//
if (parentTwo.children().size() == parent.siblingElements()
.size() + 1) {
System.out.println("dddddddddddddddddd");
System.out.println(parentTwo.children().first().text());
} else {
// Use the sibling just before parentTwo, when there is one.
if (parentTwo.elementSiblingIndex() - 1 >= 0) {
System.out.print(aparentTwo.children().get(
parentTwo.elementSiblingIndex() - 1).children()
.first().text());
}
}
// Text of parent's first child.
System.out.print(" "
+ parent.children().first().text());
}
// /
// Finally the anchor text and the href passed through addTitle and addHttp.
System.out.print(" "
+ elements.get(i).select("a").text());
System.out.print(" "
+ addHttp(addTitle(
elements.get(i).select("a").attr("href"), title)));
System.out.println();
}
}
// 提取需要的信息
/**
 * Prints the wanted information for the selected elements by delegating to
 * {@code threadClass}.
 *
 * @param bestElements elements whose links are printed
 */
public static void takeMessage(List<Element> bestElements) {
    final List<Element> selected = bestElements;
    threadClass(selected);
}
//
/**
 * Prints one line per anchor found under the given elements: the anchor text, a
 * space, and the href after it has been passed through {@code addTitle} (with
 * {@code title}) and {@code addHttp}.
 *
 * @param elements containers whose anchors are printed
 */
public static void threadClass(List<Element> elements) {
    for (Element container : elements) {
        Elements links = container.select("a");
        for (int k = 0; k < links.size(); k++) {
            Element link = links.get(k);
            // Text is written before the href is resolved, as in the original order.
            System.out.print(link.text());
            String resolved = addHttp(addTitle(link.attr("href"), title));
            System.out.println(" " + resolved);
        }
    }
}
}
转载于:http://www.cnblogs.com/tomcattd/archive/2013/01/02/2842137.html
还没有评论,来说两句吧...