Filtering crawled pages with [HTMLParser] and [regular expressions]
The crawler can already fetch every page, but what it saves are complete HTML files, tags and all, so HtmlParser is used to filter out the tags we don't need. On top of that, a regular expression removes cc98's [url]-style BBCode tags. It looks like just one regex, but it took quite a while to get right: to match a literal [ you have to write \[, and likewise \? for a literal ? — don't be misled by some advice floating around online, or the match simply fails; checking the API docs yourself is more reliable.
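To make the escaping concrete, here is a tiny sketch; the sample string and the [url]-only pattern are invented for illustration and are not the pattern used in the full program below:

[cc lang="java"]
public class RegexEscapeDemo {
    public static void main(String[] args) {
        String s = "see [url=http://example.com/?f=1]here[/url]?";
        // \[ and \] match literal brackets; inside a Java string literal each
        // backslash has to be doubled, so the pattern source reads "\\[" etc.
        System.out.println(s.replaceAll("\\[/?url[^\\]]*\\]", "")); // see here?
        // Likewise a literal '?' must be written \? ("\\?" in source); left
        // unescaped it is the "optional" quantifier and the pattern fails to compile.
        System.out.println(s.replaceAll("\\?", ""));
    }
}
[/cc]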
The core of HTMLParser is the org.htmlparser.Parser class, which does the actual work of parsing an HTML page. It provides the following constructors (a short creation sketch follows the list):
public Parser ();
public Parser (Lexer lexer, ParserFeedback fb);
public Parser (URLConnection connection, ParserFeedback fb) throws ParserException;
public Parser (String resource, ParserFeedback feedback) throws ParserException;
public Parser (String resource) throws ParserException;
public Parser (Lexer lexer);
public Parser (URLConnection connection) throws ParserException;
plus a static factory method: public static Parser createParser (String html, String charset);
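A rough sketch of the two most common ways to obtain a Parser; the HTML snippet and the URL are placeholders, not from the crawl:

[cc lang="java"]
import java.net.URL;
import org.htmlparser.Parser;

public class ParserCreationDemo {
    public static void main(String[] args) throws Exception {
        // From an in-memory HTML string plus a charset, via the static factory
        Parser fromString = Parser.createParser(
                "<html><body><span id=\"demo\">hello</span></body></html>", "UTF-8");
        System.out.println(fromString.elements().nextNode().toHtml());

        // From a live URLConnection (placeholder address, for illustration only)
        Parser fromConnection = new Parser(new URL("http://example.com/").openConnection());
        System.out.println(fromConnection.elements().nextNode().toHtml());
    }
}
[/cc]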
HtmlParser's filter support is also quite powerful, and filters can be combined freely. They fall into three groups, listed below; a short combination sketch follows the list, and the commented-out code in the full program further down uses a few of them as well.
Predicate filters:
TagNameFilter
HasAttributeFilter
HasChildFilter
HasParentFilter
HasSiblingFilter
IsEqualFilter
Logical filters:
AndFilter
NotFilter
OrFilter
XorFilter
Other filters:
NodeClassFilter
StringFilter
LinkStringFilter
LinkRegexFilter
RegexFilter
CssSelectorNodeFilter
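As a small sketch of combining a few of these filters; the tag names, attribute values, and HTML here are made up for illustration:

[cc lang="java"]
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;

public class FilterComboDemo {
    public static void main(String[] args) throws Exception {
        String html = "<span id=\"a\">one</span><div class=\"post\">two</div><div>three</div>";
        Parser parser = Parser.createParser(html, "UTF-8");

        // <span> tags that carry an id attribute ...
        NodeFilter spansWithId = new AndFilter(
                new TagNameFilter("span"), new HasAttributeFilter("id"));
        // ... OR <div> tags whose class attribute equals "post"
        NodeFilter postDivs = new AndFilter(
                new TagNameFilter("div"), new HasAttributeFilter("class", "post"));
        NodeList nodes = parser.extractAllNodesThatMatch(new OrFilter(spansWithId, postDivs));

        for (int i = 0; i < nodes.size(); i++) {
            System.out.println(nodes.elementAt(i).toPlainTextString()); // prints "one", then "two"
        }
    }
}
[/cc]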
Here is the complete processing code:
[cc lang="java"]
package com.htmlparser;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.File;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.Parser;
public class TestHtmlParser {
    private static String ENCODE = "UTF-8";

    // Print a string to the console in the platform encoding (used for debugging output)
    private void message(String szMsg) {
        try {
            System.out.println(new String(szMsg.getBytes(ENCODE), System
                    .getProperty("file.encoding")));
        } catch (Exception e) {
        }
    }

    // Open an html file and read its entire content into a string
    public String openFile(String filename) {
        try {
            BufferedReader bis = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(filename)), ENCODE));
            String content = "";
            String temp;
            while ((temp = bis.readLine()) != null) {
                content += temp + "\n";
            }
            bis.close();
            return content;
        } catch (Exception e) {
            return "";
        }
    }

    // Write the parsed text to a .txt file
    public boolean writeFile(String filename, String content) {
        boolean result = false;
        try {
            OutputStreamWriter osw = new OutputStreamWriter(
                    new FileOutputStream(new File("E:\\MyCrawl\\cc98-txt\\"
                            + filename + ".txt")), ENCODE);
            osw.write(content);
            osw.close();
            result = true;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }
    // Parse a single html file
    public void parseFile(File file) {
        System.out.println("parsing:" + file.getName());
        String content = openFile(file.getPath());
        try {
            Parser parser = Parser.createParser(content, ENCODE);
            // TextExtractingVisitor visitor = new TextExtractingVisitor();
            // parser.visitAllNodesWith(visitor);
            // String textInPage = visitor.getExtractedText();
            //
            // message(textInPage);
            // NodeFilter Tagfilter = new TagNameFilter("span");
            // NodeFilter Urlfilter = new StringFilter("http://");
            // NodeFilter noUrlfilter = new NotFilter(Urlfilter);
            // NodeFilter filter = new AndFilter(Tagfilter, noUrlfilter);

            // Keep only <span> tags that carry an id attribute
            NodeFilter tagfilter = new TagNameFilter("span");
            NodeFilter filterID = new HasAttributeFilter("id");
            NodeFilter filter = new AndFilter(tagfilter, filterID);
            NodeList nodes = parser.extractAllNodesThatMatch(filter);
            String txt = "";
            if (nodes != null) {
                for (int i = 0; i < nodes.size(); i++) {
                    Node textnode = (Node) nodes.elementAt(i);
                    // message("getText:" + textnode.getText());
                    txt += textnode.toPlainTextString() + "\n\r";
                }
            }
            // Strip BBCode-style tags such as [url=...] and [/url]
            // (note the escaped \[ \] and \? inside the pattern)
            txt = txt.replaceAll("\\[/?[a-zA-Z0-9\\?. /&~=:,&&[^\\]]]+\\]", "");
            txt = txt.replaceAll(" ", "").replaceAll("[\\t]+", "");// .replaceAll("[\\n]+", "\n");
            writeFile(file.getName(), txt);
            // for (NodeIterator i = parser.elements(); i.hasMoreNodes();) {
            // Node node = i.nextNode();
            // message("getText:" + node.getText());
            // message("getPlainText:" + node.toPlainTextString());
            // message("toHtml:" + node.toHtml());
            // message("toHtml(true):" + node.toHtml(true));
            // message("toHtml(false):" + node.toHtml(false));
            // message("toString:" + node.toString());
            // message("=================================================");
            // if(node.getText().length()>=3){
            // if("html".equals(node.getText().substring(0, 4))){
            // message("getPlainText:" + node.toPlainTextString());
            // }
            // }
            // }
        } catch (Exception e) {
            System.out.println("Exception:" + e);
        }
    }
    // Main entry point: walk the cc98 directory tree and parse every file in it
    public static void main(String[] args) {
        TestHtmlParser thp = new TestHtmlParser();
        LinkedList<String> folderList = new LinkedList<String>();
        folderList.add("E:\\MyCrawl\\cc98");
        while (folderList.size() > 0) {
            File file = new File(folderList.poll());
            File[] files = file.listFiles();
            List<File> fileList = new ArrayList<File>();
            for (int i = 0; i < files.length; i++) {
                if (files[i].isDirectory()) {
                    folderList.add(files[i].getPath());
                } else {
                    fileList.add(files[i]);
                }
            }
            for (File f : fileList) {
                thp.parseFile(f);
            }
        }
    }
}
[/cc]