java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫 java写的搜索引擎网络爬虫

源代码在线查看: replaceurl.java

软件大小: 4772 K
上传用户: autojacky
关键词: java 搜索引擎 网络爬虫
下载地址: 免注册下载 普通下载 VIP

相关代码

				/*
				 * File name : ReplaceUrl.java
				 * Create Time : 2006-5-23 10:36:23
				 * Author : shiwei
				 * Description : 替换字符串 (replaces URL strings inside HTML text)
				 * Version : 
				 */
				package com.snoics.reptile.parse;
				
				import java.util.Iterator;
				import java.util.List;
				import java.util.regex.Pattern;
				
				import com.snoics.base.util.StringClass;
				import com.snoics.base.util.regex.Regex;
				import com.snoics.reptile.regex.url.IUrlRegex;
				import com.snoics.reptile.system.common.Common;
				
				/**
				 * Rewrites occurrences of one URL inside an HTML string.
				 *
				 * Usage: set the HTML text, the list of {@link IUrlRegex} rules, the URL to look
				 * for and the URL to substitute, then call {@link #replace()}.
				 *
				 * The instance holds mutable state and {@link #replace()} overwrites the stored
				 * HTML string with its result.
				 */
				public class ReplaceUrl implements IReplaceUrl {
					/**
					 * Characters that have a special meaning in a regular expression and must be
					 * backslash-escaped before the URL is embedded in a pattern.
					 * The backslash itself has to stay first, otherwise the backslashes added for
					 * the other characters would be escaped a second time.
					 */
					private static final String[] REGEX_META_CHARS={"\\",".","?","*","^","+","$","(",")","[","]","{","}","|"};
					
					/** HTML text being processed; overwritten by {@link #replace()}. */
					private String htmlString="";
					/** Rules (elements are cast to {@link IUrlRegex}) supplying the pattern templates; may be null. */
					private List urlRegexList=null;
					/** The URL to search for, stored exactly as the caller supplied it. */
					private String url="";
					/** The URL that is written in place of {@link #url}. */
					private String replaceUrl="";
					/** Project regex helper used for matching and replacing. */
					private Regex regex=new Regex();
					
					/** Sets the HTML text to operate on. */
					public void setHtmlString(String htmlString) {
						this.htmlString=htmlString;
					}
				
					/** Returns the current HTML text (the rewritten text after {@link #replace()} has run). */
					public String getHtmlString() {
						return htmlString;
					}
				
					/** Returns the list of URL rules, or null if none was set. */
					public List getUrlRegexList() {
						return urlRegexList;
					}
				
					/** Sets the list of URL rules; every element must be an {@link IUrlRegex}. */
					public void setUrlRegexList(List urlRegexList) {
						this.urlRegexList=urlRegexList;
					}
				
					/** Sets the URL to search for (plain text, not a regular expression). */
					public void setUrl(String url) {
						this.url=url;
					}
				
					/** Returns the URL to search for, unmodified. */
					public String getUrl() {
						return url;
					}
				
					/** Sets the URL that replaces the searched one. */
					public void setReplaceUrl(String replaceUrl) {
						this.replaceUrl=replaceUrl;
					}
				
					/** Returns the replacement URL. */
					public String getReplaceUrl() {
						return replaceUrl;
					}
				
					/**
					 * Replaces the configured URL with the replacement URL in the HTML text.
					 *
					 * For every rule in the rule list, the rule's pattern template has the
					 * {@code Common.REPLACEREGEXURL_FLAG} placeholder substituted with the
					 * regex-escaped URL; the resulting pattern is matched case-insensitively
					 * against the HTML. In each fragment returned by the match, the URL (together
					 * with one optional surrounding quote or whitespace character on each side)
					 * is replaced by the replacement URL wrapped in single quotes plus a trailing
					 * space, and the fragment is swapped into the HTML.
					 *
					 * Failures while matching or replacing are printed and skipped, so one bad
					 * rule does not prevent the remaining rules from being applied.
					 *
					 * @return the rewritten HTML, which is also stored as the current HTML string;
					 *         the unchanged HTML if no rule list or no URL is set
					 */
					public String replace() {
						if(urlRegexList==null){
							return htmlString;
						}
						// Nothing to search for: an empty URL would leave a pattern that only
						// matches optional quotes/whitespace, corrupting the HTML.
						if((url==null)||(url.length()==0)){
							return htmlString;
						}
						// Escape into a local so the url field keeps the caller's value and
						// repeated calls do not escape the URL twice.
						String escapedUrl=escapeForRegex(url);
						String finalReplaceRegex="[\\s'\"]?"+escapedUrl+"[\\s'\"]?";
						// NOTE(review): if Regex.getReplaceAll delegates to Matcher.replaceAll, a '$'
						// or '\' inside replaceUrl would be read as a group reference - confirm.
						String newReplaceString="'"+replaceUrl+"' ";
						Iterator ruleIterator=urlRegexList.iterator();
						while(ruleIterator.hasNext()){
							IUrlRegex urlRegex=(IUrlRegex)ruleIterator.next();
							String replaceRegexUrl=urlRegex.getReplaceRegexUrl();
							String oldReplaceRegexUrl=StringClass.getSpecialReplaceString(replaceRegexUrl,Common.REPLACEREGEXURL_FLAG,escapedUrl);
							List oldUrlList=null;
							try{
								// presumably returns the matched substrings of htmlString - verify against Regex
								oldUrlList=regex.group(htmlString,oldReplaceRegexUrl,Pattern.CASE_INSENSITIVE);
							}catch(Exception e){
								// Best effort: report and continue with the next rule.
								e.printStackTrace();
							}
							if((oldUrlList==null)||(oldUrlList.size()==0)) {
								continue;
							}
							Iterator matchIterator=oldUrlList.iterator();
							while(matchIterator.hasNext()) {
								String oldUrl=(String)matchIterator.next();
								try{
									String newUrlString=regex.getReplaceAll(oldUrl,newReplaceString,finalReplaceRegex,Pattern.CASE_INSENSITIVE);
									htmlString=StringClass.getSpecialReplaceString(htmlString,oldUrl,newUrlString);
								}catch(Exception e){
									// Best effort: leave this fragment untouched and go on.
									e.printStackTrace();
								}
							}
						}
						return htmlString;
					}
				
					/** Returns text with every regex metacharacter prefixed by a backslash. */
					private String escapeForRegex(String text) {
						String escaped=text;
						for(int i=0;i<REGEX_META_CHARS.length;i++){
							String meta=REGEX_META_CHARS[i];
							escaped=StringClass.getSpecialReplaceString(escaped,meta,"\\"+meta);
						}
						return escaped;
					}
				}
							

相关资源