The crawler can already fetch all the pages, but what gets saved are HTML files full of markup, so I use HtmlParser to strip out the tags I don't need. I then turned to a plain regular expression to remove CC98's BB-style tags such as [url]. It looks like just one regex, but it took quite a while to get right: to match a literal [ bracket the regex needs \[, and likewise a literal ? needs \?. Don't be misled by some of the claims floating around online, or the match will simply never succeed; checking the API docs yourself is far more reliable. A small sketch of the escaping rules follows.

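To make those escaping rules concrete, here is a tiny standalone sketch; the class name, sample string, and variables are invented purely for illustration.

[cc lang="java"]
public class RegexEscapeDemo {
 public static void main(String[] args) {
  // Hypothetical BB-style input, used only to show the escaping
  String sample = "[url]http://example.com/page?id=1[/url]";

  // The regex \[/?url\] is written "\\[/?url\\]" in Java source; it strips [url] and [/url]
  String noTags = sample.replaceAll("\\[/?url\\]", "");
  System.out.println(noTags); // http://example.com/page?id=1

  // The regex \? is written "\\?" in Java source; it removes the literal question mark
  System.out.println(noTags.replaceAll("\\?", "")); // http://example.com/pageid=1
 }
}
[/cc]
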
The core module of HTMLParser is the org.htmlparser.Parser class, which does the actual work of analyzing an HTML page. The class has the following constructors:
    public Parser ();
    public Parser (Lexer lexer, ParserFeedback fb);
    public Parser (URLConnection connection, ParserFeedback fb) throws ParserException;
    public Parser (String resource, ParserFeedback feedback) throws ParserException;
    public Parser (String resource) throws ParserException;
    public Parser (Lexer lexer);
    public Parser (URLConnection connection) throws ParserException;
plus a static factory method: public static Parser createParser (String html, String charset); a short usage sketch follows.

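Just to illustrate the API (the HTML string below is invented, not taken from the crawler), the static createParser factory together with elements() and a NodeIterator is enough to walk a page's top-level nodes:

[cc lang="java"]
import org.htmlparser.Parser;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;

public class ParserDemo {
 public static void main(String[] args) throws ParserException {
  // Build a Parser from an in-memory HTML string (invented content)
  Parser parser = Parser.createParser(
    "<html><body><p>hello</p><p>world</p></body></html>", "UTF-8");

  // Walk the top-level nodes and print their plain-text content
  for (NodeIterator it = parser.elements(); it.hasMoreNodes();) {
   System.out.println(it.nextNode().toPlainTextString());
  }
 }
}
[/cc]
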
HtmlParser's filter mechanism is also quite powerful, and filters can be combined freely. The built-in filters are listed below; the commented-out code in the full program further down uses several of them, and a small combined-filter sketch follows this list.

Predicate filters:
TagNameFilter
HasAttributeFilter
HasChildFilter
HasParentFilter
HasSiblingFilter
IsEqualFilter
Logical (combining) filters:
AndFilter
NotFilter
OrFilter
XorFilter
Other filters:
NodeClassFilter
StringFilter
LinkStringFilter
LinkRegexFilter
RegexFilter
CssSelectorNodeFilter

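Here is the combined-filter sketch mentioned above; the inline HTML is invented, but the filter classes and the extractAllNodesThatMatch call are exactly the ones the crawler code below relies on:

[cc lang="java"]
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class FilterDemo {
 public static void main(String[] args) throws ParserException {
  // Invented fragment: one <span> with an id attribute, one without
  Parser parser = Parser.createParser(
    "<div><span id=\"post\">kept</span><span>dropped</span></div>", "UTF-8");

  // AndFilter: tag name is "span" AND the tag carries an id attribute
  NodeFilter filter = new AndFilter(new TagNameFilter("span"),
    new HasAttributeFilter("id"));

  NodeList nodes = parser.extractAllNodesThatMatch(filter);
  for (int i = 0; i < nodes.size(); i++) {
   System.out.println(nodes.elementAt(i).toPlainTextString()); // should print: kept
  }
 }
}
[/cc]
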
 

Here is the complete processing code:

[cc lang="java"]

package com.htmlparser;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.File;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.Parser;

public class TestHtmlParser {
 private static String ENCODE = "UTF-8";

 // Helper for printing debug output in the platform's console encoding
 private void message(String szMsg) {
  try {
   System.out.println(new String(szMsg.getBytes(ENCODE), System
     .getProperty("file.encoding")));
  } catch (Exception e) {
  }
 }

 // Open an HTML file and read its entire contents into one string
 public String openFile(String filename) {
  try {
   BufferedReader bis = new BufferedReader(new InputStreamReader(
     new FileInputStream(new File(filename)), ENCODE));
   String content = "";
   String temp;

   while ((temp = bis.readLine()) != null) {
    content += temp + "\n";
   }
   bis.close();
   return content;
  } catch (Exception e) {
   return "";
  }
 }

 // Write the cleaned text to a .txt file in the output directory
 public boolean writeFile(String filename, String content) {
  boolean result = false;
  try {
   OutputStreamWriter osw = new OutputStreamWriter(
     new FileOutputStream(new File("E:\\MyCrawl\\cc98-txt\\"
       + filename + ".txt")), ENCODE);
   osw.write(content);
   osw.close();
   result = true;

  } catch (FileNotFoundException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }
  return result;
 }

 // Parse one HTML file, extract the target text, and write it out
 public void parseFile(File file) {
  System.out.println("parsing:" + file.getName());
  String content = openFile(file.getPath());
  try {
   Parser parser = Parser.createParser(content, ENCODE);
   
   // TextExtractingVisitor visitor = new TextExtractingVisitor();
   // parser.visitAllNodesWith(visitor);
   // String textInPage = visitor.getExtractedText();
   //
   // message(textInPage);

   // NodeFilter Tagfilter = new TagNameFilter("span");
   // NodeFilter Urlfilter = new StringFilter("http://");
   // NodeFilter noUrlfilter = new NotFilter(Urlfilter);
   // NodeFilter filter = new AndFilter(Tagfilter, noUrlfilter);

   NodeFilter tagfilter = new TagNameFilter("span");
   NodeFilter filterID = new HasAttributeFilter("id");
   NodeFilter filter = new AndFilter(tagfilter, filterID);

   NodeList nodes = parser.extractAllNodesThatMatch(filter);

   String txt = "";
   if (nodes != null) {
    for (int i = 0; i < nodes.size(); i++) {
     Node textnode = (Node) nodes.elementAt(i);

     // message("getText:" + textnode.getText());
     txt += textnode.toPlainTextString() + "\n\r";

    }
   }
   
   // Strip BB-style tags such as [url] ... [/url]; note the escaped \[ \? \] in the pattern
   txt = txt.replaceAll("\\[[/?[a-zA-Z0-9\\?. /&~=:,&&[^\\]]]+\\]", "");
   txt = txt.replaceAll(" ", "").replaceAll("[\t]+", "");// .replaceAll("[\n]+", "\n");
   writeFile(file.getName(), txt);

   // for (NodeIterator i = parser.elements(); i.hasMoreNodes();) {
   // Node node = i.nextNode();
   // message("getText:" + node.getText());
   // message("getPlainText:" + node.toPlainTextString());
   // message("toHtml:" + node.toHtml());
   // message("toHtml(true):" + node.toHtml(true));
   // message("toHtml(false):" + node.toHtml(false));
   // message("toString:" + node.toString());
   // message("=================================================");
   // if(node.getText().length()>=3){
   // if("html".equals(node.getText().substring(0, 4))){
   // message("getPlainText:" + node.toPlainTextString());
   // }
   // }
   // }
  } catch (Exception e) {
   System.out.println("Exception:" + e);
  }
 }

 // Main method: walk the cc98 directory tree and parse every file found
 public static void main(String[] args) {
  TestHtmlParser thp = new TestHtmlParser();
  LinkedList<String> folderList = new LinkedList<String>();
  folderList.add("E:\\MyCrawl\\cc98");
  while (folderList.size() > 0) {
   File file = new File(folderList.poll());
   File[] files = file.listFiles();
   List<File> fileList = new ArrayList<File>();
   for (int i = 0; i < files.length; i++) {
    if (files[i].isDirectory()) {
     folderList.add(files[i].getPath());
    } else {
     fileList.add(files[i]);
    }
   }

   for (File f : fileList) {
    thp.parseFile(f);
   }
  }

 }

}

[/cc]