// Search-engine web crawler written in Java
// Source code listing: ReplaceUrl.java
/*
* File name : ReplaceUrl.java
* Create Time : 2006-5-23 10:36:23
* Author : shiwei
* Description : String replacement
* Version :
*/
package com.snoics.reptile.parse;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import com.snoics.base.util.StringClass;
import com.snoics.base.util.regex.Regex;
import com.snoics.reptile.regex.url.IUrlRegex;
import com.snoics.reptile.system.common.Common;
/**
 * Replaces occurrences of one URL inside an HTML string with a replacement URL.
 *
 * <p>The caller supplies the HTML text, the URL to look for, the replacement
 * URL and a list of {@link IUrlRegex} objects. Each {@code IUrlRegex} provides
 * a regex template in which {@code Common.REPLACEREGEXURL_FLAG} is substituted
 * with the (regex-escaped) URL. {@link #replace()} rewrites the stored HTML
 * string and returns it.
 */
public class ReplaceUrl implements IReplaceUrl {

    /**
     * Regex metacharacters that are backslash-escaped in the URL before it is
     * embedded in a pattern. The backslash itself MUST stay first: escaping it
     * later would double the backslashes inserted for the other characters.
     */
    private static final String[] REGEX_META_CHARS = {
        "\\", ".", "?", "*", "^", "+", "$", "(", ")", "[", "]", "{", "}", "|"
    };

    /** HTML text to operate on; overwritten by {@link #replace()}. */
    private String htmlString = "";

    /** List of {@link IUrlRegex} instances; {@code null} disables replacing. */
    private List urlRegexList = null;

    /** The URL to search for, exactly as supplied by the caller (unescaped). */
    private String url = "";

    /** The URL that is written in place of {@link #url}. */
    private String replaceUrl = "";

    /** Project regex helper used for matching and replacing. */
    private Regex regex = new Regex();

    /** Sets the HTML text that {@link #replace()} works on. */
    public void setHtmlString(String htmlString) {
        this.htmlString = htmlString;
    }

    /** Returns the current HTML text (the rewritten text after {@link #replace()}). */
    public String getHtmlString() {
        return htmlString;
    }

    /** Returns the list of {@link IUrlRegex} objects, or {@code null} if none was set. */
    public List getUrlRegexList() {
        return urlRegexList;
    }

    /** Sets the list of {@link IUrlRegex} objects used to locate the URL. */
    public void setUrlRegexList(List urlRegexList) {
        this.urlRegexList = urlRegexList;
    }

    /** Sets the URL to search for. */
    public void setUrl(String url) {
        this.url = url;
    }

    /** Returns the URL to search for, unchanged by {@link #replace()}. */
    public String getUrl() {
        return url;
    }

    /** Sets the URL that replaces the searched one. */
    public void setReplaceUrl(String replaceUrl) {
        this.replaceUrl = replaceUrl;
    }

    /** Returns the replacement URL. */
    public String getReplaceUrl() {
        return replaceUrl;
    }

    /**
     * Replaces the configured URL in the HTML string and returns the result.
     *
     * <p>For every {@link IUrlRegex} in the list, the template returned by
     * {@code getReplaceRegexUrl()} is specialised with the escaped URL and
     * passed to {@code regex.group(...)} (case-insensitive). Inside each
     * string that call returns, the URL (optionally surrounded by one
     * whitespace or quote character on each side) is replaced by
     * {@code 'replaceUrl' } and the fragment is swapped into the HTML string.
     *
     * <p>The {@code url} field is not modified, so the method can be called
     * repeatedly and {@link #getUrl()} keeps returning the caller's value.
     * Exceptions raised by the regex helper are printed and the affected
     * pattern or fragment is skipped.
     *
     * @return the HTML string after replacement; returned unchanged when the
     *         regex list is {@code null} or the URL is {@code null} or empty
     */
    public String replace() {
        if (urlRegexList == null) {
            return htmlString;
        }
        // An empty URL would produce a pattern that matches the empty string,
        // so there is nothing meaningful to replace.
        if (url == null || url.length() == 0) {
            return htmlString;
        }

        // Escape into a local: writing the escaped text back to the field would
        // leak it through getUrl() and double-escape on a second replace().
        String escapedUrl = escapeRegexMetaChars(url);

        String finalReplaceRegex = "[\\s'\"]?" + escapedUrl + "[\\s'\"]?";
        // NOTE(review): if Regex.getReplaceAll delegates to Matcher.replaceAll,
        // a '$' or '\' in replaceUrl would be read as a group reference - confirm.
        String newReplaceString = "'" + replaceUrl + "' ";

        Iterator it = urlRegexList.iterator();
        while (it.hasNext()) {
            IUrlRegex urlRegex = (IUrlRegex) it.next();
            String replaceRegexUrl = urlRegex.getReplaceRegexUrl();
            String oldReplaceRegexUrl = StringClass.getSpecialReplaceString(
                    replaceRegexUrl, Common.REPLACEREGEXURL_FLAG, escapedUrl);

            // Presumably the fragments of htmlString matched by the pattern.
            List oldUrlList = null;
            try {
                oldUrlList = regex.group(htmlString, oldReplaceRegexUrl, Pattern.CASE_INSENSITIVE);
            } catch (Exception e) {
                // Best effort: a failing pattern is reported and skipped.
                e.printStackTrace();
            }
            if (oldUrlList == null || oldUrlList.isEmpty()) {
                continue;
            }

            Iterator iterator = oldUrlList.iterator();
            while (iterator.hasNext()) {
                String oldUrl = (String) iterator.next();
                try {
                    String newUrlString = regex.getReplaceAll(
                            oldUrl, newReplaceString, finalReplaceRegex, Pattern.CASE_INSENSITIVE);
                    htmlString = StringClass.getSpecialReplaceString(htmlString, oldUrl, newUrlString);
                } catch (Exception e) {
                    // Best effort: a failing fragment is reported and skipped.
                    e.printStackTrace();
                }
            }
        }
        return htmlString;
    }

    /**
     * Returns {@code text} with every character listed in
     * {@link #REGEX_META_CHARS} prefixed by a backslash, so the text can be
     * embedded in a regular expression as a literal.
     */
    private static String escapeRegexMetaChars(String text) {
        String escaped = text;
        for (int i = 0; i < REGEX_META_CHARS.length; i++) {
            String meta = REGEX_META_CHARS[i];
            escaped = StringClass.getSpecialReplaceString(escaped, meta, "\\" + meta);
        }
        return escaped;
    }
}