最近做了很多关于爬虫的项目,写点感想,方便以后查询。

1.请求http连接,并保存内容,catch不同的exception进行反爬处理

int countUrl=0;
	
	public String getOneHtml(String htmlurl,String encoding,String cookie) throws IOException, InterruptedException
	{
//最多重复请求5次,用来反爬的
		if(countUrl==5){
			countUrl=0;
			return "0";
		}
		//System.out.println(cookie);
		
	String temp;
	final StringBuffer sb = new StringBuffer();
	HttpURLConnection httpConn = null;
	try
	{
		URL url = new URL(htmlurl);
		
		httpConn = (HttpURLConnection) url.openConnection();
		//头设置,get方法
HttpURLConnection.setFollowRedirects(true);
		   httpConn.setRequestMethod("GET");
		   httpConn.setRequestProperty("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36");
		   httpConn.setRequestProperty("Connection","keep-alive");
		   httpConn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml");
		   httpConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
		   httpConn.setRequestProperty("cookie",cookie);
		   httpConn.setRequestProperty("Cache-control","no-cache, no-store"); 
		   httpConn.setRequestProperty("Host","www.linkedin.com");
		   httpConn.setConnectTimeout(20000);
		   httpConn.setReadTimeout(20000);
		 // logger.info(httpConn.getResponseMessage());
		   BufferedReader in = new BufferedReader(new InputStreamReader(httpConn.getInputStream(), encoding));打开连接,获取内容
	   if(httpConn.getResponseCode()!=200){
		   //System.out.println(httpConn.getHeaderField("location"));
		  // System.out.println(httpConn.getResponseCode()+htmlurl);
		   httpConn.disconnect();
		   Thread.sleep(30000);

		   cookie=login();
		   return getOneHtml(htmlurl,encoding,cookie);
	   }
		while ((temp = in.readLine()) != null)
		   //替换点一些无用到符号
		{
	       temp=temp.replaceAll("	","");
	       temp=temp.replaceAll("\\u002d","-");
	       temp=temp.replaceAll("\\u0026","&");
	       temp=temp.replaceAll("\\\\u002d","-");
	       temp=temp.replaceAll("\\\\u0026","&");
	       temp=temp.replaceAll("\n","");
	       temp=temp.replaceAll("\t","");
	       temp=temp.replaceAll("\r","");
		   sb.append(temp);
	   }
	   in.close();
	   httpConn.disconnect();
	 
	}
	catch (final MalformedURLException me)
	{
	   System.out.println("url不存在!");
	   me.getMessage();
	   throw me;
	}
	catch (final FileNotFoundException me)
	{
		System.out.println(htmlurl+"反爬启动");
	   return "0";
	}
	catch (final IOException e)
	{
		e.printStackTrace();
		System.out.println("反爬启动:"+htmlurl+"次数:"+countUrl++);
		httpConn.disconnect();
		Thread.sleep(20000);
		return this.getOneHtml(htmlurl, encoding,cookie);
	}
		
		//System.out.println(sb);
		countUrl=0;
		httpConn.disconnect();
		
		return sb.toString();
		
	}
2.模拟登录,获取cookie:
/**
 * Simulates a login by POSTing a form to the LinkedIn login-submit URL and
 * extracts the session cookie from the response headers.
 *
 * <p>Only the cookie named {@code li_at} is returned; the original author
 * found by debugging that this single cookie is sufficient. The form values
 * sent are hard-coded (presumably placeholders to be replaced with real
 * credentials).
 *
 * <p>On an {@code IOException} the failure is logged, {@code countUrl} is
 * incremented, the thread sleeps 20 s and the login is retried. The counter
 * is never checked here, so this retry is unbounded.
 *
 * @return the full {@code li_at=...} Set-Cookie value, an empty string when
 *         the response carries no such cookie, or "0" when a
 *         {@code FileNotFoundException} is raised
 * @throws MalformedURLException if the login URL is malformed
 * @throws InterruptedException  if the back-off sleep is interrupted
 */
public String login() throws MalformedURLException, InterruptedException{
		String htmlurl="https://www.linkedin.com/uas/login-submit";
		HttpURLConnection httpConn = null;
		try
		{
			URL url = new URL(htmlurl);
			httpConn = (HttpURLConnection) url.openConnection();

			HttpURLConnection.setFollowRedirects(true);
			httpConn.setRequestMethod("POST");
			httpConn.setRequestProperty("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36");
			httpConn.setRequestProperty("Connection","keep-alive");
			httpConn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml");
			httpConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
			httpConn.setRequestProperty("Cache-control","no-cache, no-store");
			httpConn.setRequestProperty("Host","www.linkedin.com");
			// Timeouts have to be configured before getOutputStream() opens the
			// connection, otherwise the connect timeout cannot take effect.
			httpConn.setConnectTimeout(20000);
			httpConn.setReadTimeout(20000);
			// POST and redirect settings.
			httpConn.setDoOutput(true);
			httpConn.setDoInput(true);
			httpConn.setUseCaches(false);
			httpConn.setInstanceFollowRedirects(true);

			// A POST body has to be written through the output stream.
			StringBuilder form = new StringBuilder(4096);
			form.append("session_key").append("=").append("email").append("&");
			form.append("session_password").append("=").append("gmail").append("&");
			OutputStream os = httpConn.getOutputStream();
			try
			{
				os.write(form.toString().getBytes("UTF-8"));
				os.flush();
			}
			finally
			{
				os.close();
			}

			// Scan the response headers for the session cookie.
			for (Map.Entry<String,List<String>> header : httpConn.getHeaderFields().entrySet()) {
				String key = header.getKey();
				// The header map can contain a null key; header names are
				// case-insensitive, so do not insist on exact casing.
				if (key == null || !key.equalsIgnoreCase("Set-Cookie")) {
					continue;
				}
				System.out.println("key=" + key+",开始获取cookie");
				for (String str : header.getValue()) {
					// The cookie name is everything before the first '='.
					if (str.split("=")[0].equals("li_at")) {
						return str;
					}
				}
			}
		}
		catch (final MalformedURLException me)
		{
			System.out.println("url不存在!");
			throw me;
		}
		catch (final FileNotFoundException me)
		{
			System.out.println(htmlurl+"反爬启动");
			return "0";
		}
		catch (final IOException e)
		{
			e.printStackTrace();
			System.out.println("反爬启动:"+htmlurl+"次数:"+countUrl++);
			// openConnection() may have failed before httpConn was assigned.
			if (httpConn != null) {
				httpConn.disconnect();
			}
			Thread.sleep(20000);
			return login();
		}
		finally
		{
			// Also covers the early return taken when the cookie is found.
			if (httpConn != null) {
				httpConn.disconnect();
			}
		}

		// No li_at cookie in the response.
		return "";
	}



以上是http处理部分,灵活应用post和get方法,可以获取HTML内容。

但是不同网站反爬策略不同。有的封IP,需要登录的有封帐号的,我这个是最简单的断开链接的,直接让进程休眠。。。需要换IP,代理,cookie的情况,可以自己分析,基本也就是设置httpConn的一些值。

3.数据获取:

我一般采用正则匹配,这比较适用于爬取数据不多,网站只返回HTML内容,非常不规范的情况。。。比如linkedin,所有数据都在一个注释的json里,各种链接和奇怪的符号,用工具很难解析。。。

// Education info: every match spans from "fosList": up to the next schoolLogo,
// and one EduInfor is built per match and appended to listEdu.
	    String edu="null";
	    ArrayList<EduInfor> listEdu=new ArrayList<EduInfor>();
	    String regex1 = "\"fosList\":.*?schoolLogo";
	    Pattern pa1 = Pattern.compile(regex1, Pattern.DOTALL);
	    // The per-field patterns are loop-invariant, so compile them once.
	    // Each one captures from the JSON key up to the first following comma.
	    final Pattern eduSchoolPa = Pattern.compile("\"schoolName\":.*?,", Pattern.DOTALL);
	    final Pattern eduDegreePa = Pattern.compile("\"fmt__degree_highlight\":.*?,", Pattern.DOTALL);
	    final Pattern eduMajorPa = Pattern.compile("\"fmt__fos_highlight\":.*?,", Pattern.DOTALL);
	    final Pattern eduGradePa = Pattern.compile("\"grade\":.*?,", Pattern.DOTALL);
	    final Pattern eduStartPa = Pattern.compile("\"startdate_my\":.*?,", Pattern.DOTALL);
	    final Pattern eduEndPa = Pattern.compile("\"enddate_my\":.*?,", Pattern.DOTALL);
	    Matcher ma1 = pa1.matcher(s);
	    while(ma1.find()){
	    	EduInfor ei=new EduInfor(ui.getCv_id());
	    	edu=ma1.group();

	    	// School: drop the key, the quotes and the trailing comma.
	    	Matcher ma = eduSchoolPa.matcher(edu);
	    	if(ma.find()){
	    		String school=ma.group()
	    				.replace("\"schoolName\":", "")
	    				.replace("\"", "")
	    				.replace(",", "");
	    		if(!school.equals("")){
	    			ei.setCollege(school);
	    		}
	    	}

	    	// Degree. The possessive suffix is removed in both of its forms: the
	    	// literal escaped text (backslash + "u0027" + "s") that the page
	    	// presumably contains, and a real apostrophe followed by "s", which
	    	// is what the original regex matched.
	    	ma = eduDegreePa.matcher(edu);
	    	if(ma.find()){
	    		String degree=ma.group()
	    				.replace("\"fmt__degree_highlight\":", "")
	    				.replace("\"", "")
	    				.replace(",", "")
	    				.replace("\\u0027s", "")
	    				.replace("'s", "");
	    		if(!degree.equals("")){
	    			ei.setDegree_name(degree);
	    		}
	    	}

	    	// Major (field of study).
	    	ma = eduMajorPa.matcher(edu);
	    	if(ma.find()){
	    		String major=ma.group()
	    				.replace("\"fmt__fos_highlight\":", "")
	    				.replace("\"", "")
	    				.replace(",", "");
	    		if(!major.equals("")){
	    			ei.setMajor(major);
	    		}
	    	}

	    	// Academic level, e.g. "grade":"1st".
	    	ma = eduGradePa.matcher(edu);
	    	if(ma.find()){
	    		String academic=ma.group()
	    				.replace("\"grade\":", "")
	    				.replace("\"", "")
	    				.replace(",", "");
	    		if(!academic.equals("")){
	    			ei.setAcademic_name(academic);
	    		}
	    	}

	    	// Period, e.g. "enddate_my":"2005","startdate_my":"2002"; spaces are
	    	// stripped from both values as well.
	    	ma = eduStartPa.matcher(edu);
	    	if(ma.find()){
	    		String s_time=ma.group()
	    				.replace("\"startdate_my\":", "")
	    				.replace("\"", "")
	    				.replace(",", "")
	    				.replace(" ", "");
	    		if(!s_time.equals("")){
	    			ei.setStart_time(s_time);
	    		}
	    	}

	    	ma = eduEndPa.matcher(edu);
	    	if(ma.find()){
	    		String e_time=ma.group()
	    				.replace("\"enddate_my\":", "")
	    				.replace("\"", "")
	    				.replace(",", "")
	    				.replace(" ", "");
	    		if(!e_time.equals("")){
	    			ei.setEnd_time(e_time);
	    		}
	    	}else{
	    		// No end date in the record: mark the entry as ongoing.
	    		ei.setEnd_time("目前");
	    	}
	    	listEdu.add(ei);
	    }



很多人都说正则匹配复杂难用,记不住。。。其实我也记不住:(但是只需要用 .*? 的匹配方式,把有用数据取出来,再replace不用的信息。。这样的代码重用率很高,写起来没那么痛苦。

4.数据输出

项目需要,全部转换成json格式,使用google的GSON包,一句话就把java的类转换成json格式的String,非常好用:)

// Build the record, let cl.getInfor populate it from the page content, then
// write it out as a single JSON line.
UserInfor ui=new UserInfor();
ui.setCv_origin("linkedin");
ui.setCv_id(it.getKey());
ui=cl.getInfor(tmp_content, ui);
Gson gson=new Gson();
// Encode explicitly as UTF-8 so the output does not depend on the platform
// default charset (the records can contain non-ASCII text).
out.write((gson.toJson(ui)+"\n").getBytes("UTF-8"));

5.多线程:

爬虫当然要多线程,很容易改就不写了。。。不过要写出优秀的多线程还是需要内功的,还在学。

总结:

爬虫的步骤很简单,请求http(包括处理各种异常),按需求获取数据,转换标准格式输出。。。最难的其实是第一步,包括了很多抓包分析的工作,这个很依靠经验的,多积累,不好说。。。