html解析 jsoup使用介绍 jsoup解析html
- 2016-03-31 22:54:00
- admin
- 原创 2362
一、jsoup做什么
jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.
二、jsoup的maven配置
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
三、index.html的内容
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>jsoup use</title>
</head>
<body>
first demo:
show
<div>
<p>how to use jsoup?</p>
</div>
</body>
</html>
四、解析html代码示例
代码下载(浏览器打开可能乱码,请直接下载然后打开):HtmlParser.java
工程文件:html.rar
import java.io.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.jsoup.parser.*;
import org.jsoup.helper.*;
public class HtmlParser {
public static class TextVisitor implements NodeVisitor {
//单线程一般用StringBuilder,多线程一般用StringBuffer
private StringBuilder text = new StringBuilder();
private boolean showBlock;
public TextVisitor(boolean showBlock) {
this.showBlock = showBlock;
}
public String getText() {
return text.toString().trim();
}
public static boolean preserveWhitespace(Node node) {
if (node != null && node instanceof Element) {
Element element = (Element)node;
return element.tag().preserveWhitespace() ||
element.parent() != null && element.parent().tag().preserveWhitespace();
}
return false;
}
public static boolean lastCharIsSpace(StringBuilder sb) {
return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
}
public static boolean lastCharIsLineBreak(StringBuilder sb) {
return sb.length() != 0 && sb.charAt(sb.length() - 1) == '\n';
}
private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
String text = textNode.getWholeText();
text = text.trim();
if (preserveWhitespace(textNode.parent()))
accum.append(text);
else
StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsSpace(accum));
}
public void head(Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode)node;
appendNormalisedText(text, textNode);
} else if (node instanceof Element) {
Element element = (Element)node;
Tag tag = element.tag();
String tagName = tag.getName();
boolean isBlock = tag.isBlock();
if (showBlock)
System.out.println(tagName + " block state is " + isBlock);
if (text.length() > 0
&& (isBlock || tagName.equals("br"))
&& !lastCharIsLineBreak(text))
text.append("\n");
}
}
public void tail(Node node, int depth) {
}
}
public static void parseSimpleHtml() throws Exception {
InputStream input =
HtmlParser.class.getResourceAsStream("/index.html");
Document doc = Jsoup.parse(input, "utf-8", "http://www.3scard.com");
TextVisitor visitor = new TextVisitor(true);
NodeTraversor traversor = new NodeTraversor(visitor);
traversor.traverse(doc);
System.out.println(visitor.getText());
}
public static void parseComplexHtml() throws Exception {
String url = "http://www.3scard.com/index.php?m=blog&f=index";
Document doc = Jsoup.connect(url).get();
System.out.println(doc.text());
TextVisitor visitor = new TextVisitor(false);
NodeTraversor traversor = new NodeTraversor(visitor);
traversor.traverse(doc);
System.out.println(visitor.getText());
}
public static void main(String[] args) throws Exception {
parseSimpleHtml();
parseComplexHtml();
}
}