爬虫java的实现



文章目录

  • 爬虫java的实现
  • 前言
  • 一、selenium-java是什么?
  • 二、使用步骤
  • 爬虫目录结构
  • 引入库
  • 主方法代码
  • 封装数据实体类
  • 封装数据实体类
  • 工具类(Config)
  • 工具类(MyHttpUtil)
  • MySqlStrategy
  • 工具类(序列化与反序列化)
  • 工具类
  • 总结



前言

1 selenium-java+httpclient实现爬取页面,并且通过jdbc批量插入mysql
2 通过开启请求监控自动获取token,可解决ajax数据加密返回、无法直接拿到数据等问题
3 chromedriver的使用请自行百度(如果步骤全对仍然报错,请用管理员权限运行你的开发工具)
4 注意:以下代码为demo,需自己根据实际业务修改


一、selenium-java是什么?

示例:selenium-java 是 Selenium WebDriver 的 Java 客户端库,可以用代码驱动真实浏览器(如 Chrome)完成打开页面、输入、点击等操作。

二、使用步骤

爬虫目录结构

selenium官方Java中文文档 selenium java api_java

引入库

maven依赖:

<!-- Browser automation (Selenium WebDriver, including the DevTools bindings) -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>4.5.3</version>
</dependency>

<!-- File helpers (FileUtils) -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>

<!-- HTTP client used for the detail requests -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>

<!-- Lombok is only needed at compile time -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.22</version>
    <scope>provided</scope>
</dependency>

<!-- Jackson JSON parsing / data binding -->
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-annotations</artifactId>
    <version>2.11.1</version>
</dependency>

<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.23</version>
</dependency>

主方法代码

代码如下(示例):

package test;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v106.network.Network;
import org.openqa.selenium.devtools.v106.network.model.Headers;
import org.openqa.selenium.devtools.v106.network.model.ResourceType;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import entity.Fa;
import util.MyHttpUtil;
import util.MySqlStrategy;
import util.SerializableUtil;
import util.Utils;

/**
 * Demo crawler: logs in through a Selenium-driven Chrome, captures the
 * {@code Authorization} request header via the DevTools network listener,
 * fetches the detail JSON of every result row over HTTP and bulk-inserts the
 * collected rows into MySQL.
 *
 * <p>Search names and row ids that were already handled are persisted with
 * Java serialization ({@code searchName.bat}, {@code id.bat}) so later runs
 * skip them.
 *
 * @author admin
 */
public class CrawlerTest {
	/** Shared JSON mapper; reused instead of creating one per table row. */
	private static final ObjectMapper MAPPER = new ObjectMapper();
	/** Upper bound on result pages so a pager that never disables cannot loop forever. */
	private static final int MAX_PAGES = 1000;
	/**
	 * Token sent with the detail requests. It is written from the DevTools
	 * listener callback (which may run on another thread) and read by
	 * {@code main}, hence volatile.
	 */
	private static volatile String token = "xxxx";
	final static String driverAddr = "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";
	// Login page
	final static String url1 = "https://xxxxx/system/login?";
	// Detail endpoint; #{id} is replaced with the row id
	final static String querySaasUrlTemplate = "https://xxxxxx?id=#{id}";
	final static String url2 = "https://xxxxxx?";
	final static String userName = "uername";
	final static String passWord = "password";
	// Persisted set of row ids that were already crawled
	final static File idCacheFile = new File("id.bat");
	// Persisted set of search names that were already crawled
	final static File searchNameFile = new File("searchName.bat");
	final static Set<String> idSet=getCacheSet(idCacheFile);
	final static Set<String> searchNameSet=getCacheSet(searchNameFile);

	/**
	 * Runs one crawl for a fixed search name and then blocks forever.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String searchName = "搜索名称";
		// Checked before the browser is started so that an already-processed
		// name does not leave a Chrome/chromedriver process behind.
		if (searchNameSet.contains(searchName)) {
			System.out.println(searchName + ":已经处理过");
			return;
		}

		ChromeDriver driver = startBrowser();
		login(driver);
		// Remember the login window so the newly opened one can be told apart.
		String loginHandle = driver.getWindowHandle();
		Utils.sleep(1000);
		System.out.println("登录成功");
		Utils.sleep(3000);
		driver.get(url1);
		Utils.sleep(3000);

		// Open the search page in a new window.
		String js = "window.open(\"" + url2 + "\");";
		((JavascriptExecutor) driver).executeScript(js);
		Utils.sleep(2000);
		String searchHandle = findOtherWindow(driver, loginHandle);
		// Start listening for the Authorization header before switching windows.
		createRequestListener(1, driver);
		driver.switchTo().window(searchHandle);
		Utils.sleep(1000);

		driver.findElement(By.xpath("//*[@id=\"name\"]")).sendKeys(searchName);
		// Submit the query
		driver.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[1]/div/form/div[6]/button"))
				.click();
		Utils.sleep(2000);

		// The "next page" button is looked up once; its absence is treated as
		// "no data". findElements is used so a missing button yields an empty
		// list instead of an exception.
		List<WebElement> nextButtons = driver.findElements(By
				.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/ul/li[5]/button"));
		WebElement nextPageButton = nextButtons.isEmpty() ? null : nextButtons.get(0);
		if (nextPageButton == null) {
			System.out.println("没有数据");
		}

		// Everything found for this search name is inserted in one batch.
		List<Fa> faList = new ArrayList<>();
		for (int page = 0; page < MAX_PAGES; page++) {
			// The first page is already displayed; later pages need a click.
			if (page != 0) {
				if (nextPageButton == null || !nextPageButton.isEnabled()) {
					// No pager, or the pager is disabled: last page reached.
					break;
				}
				nextPageButton.click();
				// Give the page time to load the next batch of rows.
				Utils.sleep(2000);
			}
			List<WebElement> tableBodies = driver.findElements(By.xpath(
					"//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/div/div/div/table/tbody"));
			if (tableBodies.isEmpty()) {
				break;
			}
			List<WebElement> trList = tableBodies.get(0).findElements(By.tagName("tr"));
			System.out.println("");
			System.out.println("当前数据页数:" + (page + 1));
			for (WebElement row : trList) {
				Utils.sleep(500);
				Fa fa = fetchRow(row);
				if (fa != null) {
					faList.add(fa);
				}
			}
		}

		// Insert the collected rows into MySQL
		if(!faList.isEmpty()) {
		   MySqlStrategy.insertValue(faList);
		}
		// Remember what this run processed
		searchNameSet.add(searchName);
		for(Fa factory:faList) {
			idSet.add(factory.getRowId());
		}
		SerializableUtil.serialization(searchNameFile,searchNameSet);
		SerializableUtil.serialization(idCacheFile, idSet);

		// Joining the current thread on itself never returns; presumably this
		// keeps the JVM (and the browser window) alive after the crawl.
		try {
			Thread.currentThread().join();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}
	}

	/** Starts Chrome with the automation banner disabled and {@code navigator.webdriver} hidden. */
	private static ChromeDriver startBrowser() {
		System.setProperty("webdriver.chrome.driver", driverAddr);
		ChromeOptions options = new ChromeOptions();
		// Removes the "Chrome is being controlled by automated software" banner
		options.setExperimentalOption("excludeSwitches", new String[] { "enable-automation" });
		ChromeDriver driver = new ChromeDriver(options);
		Map<String, Object> command = new HashMap<>();
		// Makes window.navigator.webdriver read as undefined on every new document
		command.put("source", "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
		driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", command);
		return driver;
	}

	/** Opens the login page, fills in the credentials, ticks the agreement box and submits. */
	private static void login(ChromeDriver driver) {
		driver.get(url1);
		driver.manage().window().maximize();
		Utils.sleep(5000);
		driver.findElement(By.xpath("//*[@id=\"phone_number\"]")).sendKeys(userName);
		Utils.sleep(1000);
		driver.findElement(By.xpath("//*[@id=\"password\"]")).sendKeys(passWord);
		Utils.sleep(1000);
		driver.findElement(By.xpath("//*[@id=\"agreement\"]")).click();
		Utils.sleep(1000);
		driver.findElement(
				By.xpath("//*[@id=\"root\"]/div/div[2]/div[1]/div[2]/div/div/form/div[4]/div/div/div/button")).click();
	}

	/**
	 * Returns the handle of the first window that is not {@code knownHandle}.
	 *
	 * @throws IllegalStateException if no other window is open
	 */
	private static String findOtherWindow(ChromeDriver driver, String knownHandle) {
		for (String handle : driver.getWindowHandles()) {
			if (!handle.equals(knownHandle)) {
				return handle;
			}
		}
		throw new IllegalStateException("The new browser window was not opened: " + url2);
	}

	/**
	 * Builds a {@link Fa} for one table row by requesting its detail JSON.
	 *
	 * @param row a {@code tr} element of the result table
	 * @return the parsed record, or {@code null} when the row has no
	 *         {@code data-row-key} or its id was already processed
	 */
	private static Fa fetchRow(WebElement row) {
		String detailId = row.getAttribute("data-row-key");
		if (detailId == null || detailId.isEmpty() || idSet.contains(detailId)) {
			return null;
		}
		// "./td[n]" is relative to this row; a leading "//" would search the
		// whole document and always return the first row's cell.
		String unit = row.findElement(By.xpath("./td[5]")).getText().replace(" ", "");
		String countriesName = row.findElement(By.xpath("./td[7]")).getText().replace(" ", "");
		String querySaasUrl = querySaasUrlTemplate.replace("#{id}", detailId);
		String result = MyHttpUtil.getRequest(token, querySaasUrl);
		try {
			JsonNode jsonNode = MAPPER.readTree(result);
			// path(...).asText() yields "200" for both numeric and string codes
			// and an empty string when the field is missing.
			if ("200".equals(jsonNode.path("code").asText())) {
				JsonNode dataNode = jsonNode.get("data");
				System.out.println(dataNode);
				Fa fa = MAPPER.readValue(dataNode.toString(), Fa.class);
				fa.setUnit(unit);
				fa.setCountriesName(countriesName);
				// Keep the complete payload returned by the endpoint
				fa.setRowData(dataNode.toString());
				return fa;
			}
			System.out.println("获取json数据失败");
			System.out.println(jsonNode.toPrettyString());
			// Non-zero status: this is a failure, not a normal exit.
			System.exit(1);
		} catch (Exception e) {
			System.out.print("数据解析异常:");
			e.printStackTrace();
			System.exit(1);
		}
		return null;
	}

	/**
	 * Registers a DevTools network listener that stores the first
	 * {@code Authorization} request header it sees in {@link #token} and then
	 * closes the DevTools session.
	 *
	 * @param i      window index; not used by the listener
	 * @param driver driver whose DevTools session is used
	 */
	private static void createRequestListener(int i, ChromeDriver driver) {
		DevTools devTools = driver.getDevTools();
		devTools.createSession();
		devTools.send(
				Network.enable(java.util.Optional.empty(), java.util.Optional.empty(), java.util.Optional.empty()));
		devTools.addListener(Network.requestWillBeSent(), res -> {
			Utils.sleep(10);
			System.out.println("RequestHeaders:" + res.getRequest().getHeaders());
			System.out.println("RequestUrl:" + res.getRequest().getUrl());
			Headers header = res.getRequest().getHeaders();
			synchronized (CrawlerTest.class) {
				if (header.containsKey("Authorization")) {
					token = (String) header.get("Authorization");
					// The listener is no longer needed once the token is known
					devTools.close();
					System.out.println("获取到了token:" + token);
				}
			}
		});
	}

	/**
	 * Registers a listener that logs XHR responses whose URL contains
	 * {@code /xxxxxx} to {@code log/re.txt} and requests their body.
	 *
	 * @param i        window index, only used as a label in the log line
	 * @param devTools DevTools session to attach the listener to
	 */
	public static void interceptResponseXHRByUrl(int i, DevTools devTools) {
		devTools.addListener(Network.responseReceived(), responseReceived -> {
			try {
				if (ResourceType.XHR != responseReceived.getType()
						|| !responseReceived.getResponse().getUrl().contains("/xxxxxx")) {
					return;
				}
				String data = "监控数据" + i + ":" + responseReceived.getType() + ":"
						+ responseReceived.getResponse().getUrl();
				Utils.sleep(2);
				FileUtils.write(new File("log/re.txt"), data, "UTF-8", true);
				FileUtils.write(new File("log/re.txt"), "\r\n", "UTF-8", true);
				devTools.send(Network.getResponseBody(responseReceived.getRequestId()));
			} catch (Exception e) {
				e.printStackTrace();
			}
		});
	}

	/**
	 * Loads a previously serialized set from {@code file}.
	 *
	 * @param file cache file
	 * @return the cached set, or a new empty set when nothing could be loaded
	 */
	private static Set<String> getCacheSet(File file) {
		Set<String> set=new LinkedHashSet<>();
		Set<String> cacheSet=SerializableUtil.deserialization(file, set);
		return cacheSet != null ? cacheSet : set;
	}
}

封装数据实体类

代码如下(示例):

package entity;

import java.util.List;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;

import lombok.Data;

/**
 * One crawled record. Accessors, {@code equals}/{@code hashCode} and
 * {@code toString} are generated by Lombok; JSON properties without a
 * matching field are ignored during deserialization.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
@Data
public class Fa {

    /** Raw text stored alongside the parsed fields. */
    private String rowData;

    /** Record id; bound to the JSON property {@code id}. */
    @JsonProperty("id")
    private String rowId;

    /** Unit. */
    private String unit;

    /** Country name. */
    private String countriesName;

    /** Detail entries belonging to this record. */
    private List<FaDetail> detailData;
}

封装数据实体类

代码如下(示例):

package entity;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;

import lombok.Data;

/**
 * Detail entry of a {@code Fa} record. Accessors are generated by Lombok;
 * unknown JSON properties are ignored during deserialization.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
@Data
public class FaDetail {

    /** Id of the owning record. */
    private Long faId;

    /** Type of this detail entry. */
    private String type;
}

工具类(Config)

代码如下(示例):

package util;

/**
 * Database connection settings.
 *
 * <p>Each connection value can be overridden without editing the source: a
 * JVM system property is checked first, then an environment variable, and
 * finally the built-in default is used.
 *
 * <p>NOTE(review): the built-in defaults still contain real-looking
 * credentials; prefer supplying them through the overrides.
 */
public class Config {
	// Driver class name; fixed for MySQL Connector/J 8.0
	public static final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";
	// JDBC URL; change the database name as needed
	public static final String DB_URL = setting("crawler.db.url", "CRAWLER_DB_URL",
			"jdbc:mysql://192.168.111.102:3306/crawler?useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai");
	// Database user
	public static final String USER = setting("crawler.db.user", "CRAWLER_DB_USER", "root");
	// Database password
	public static final String PASSWORD = setting("crawler.db.password", "CRAWLER_DB_PASSWORD", "Sailing123`");

	/**
	 * Resolves one setting.
	 *
	 * @param property system property name, checked first
	 * @param envName  environment variable name, checked second
	 * @param fallback value used when neither override is set to a non-empty value
	 * @return the resolved value
	 */
	private static String setting(String property, String envName, String fallback) {
		String value = System.getProperty(property);
		if (value == null || value.isEmpty()) {
			value = System.getenv(envName);
		}
		return (value == null || value.isEmpty()) ? fallback : value;
	}
}

工具类(MyHttpUtil)

代码如下(示例):

package util;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HTTP helper that issues GET requests carrying an
 * {@code authorization} header through one shared client.
 */
public class MyHttpUtil {
	// Sample values used only by main() for a manual smoke test
	private static final String token="a7ee88f8-21d6-4b1d-bfa8-ff478a473304+1000001239406480";
	private static final String url="https://xxxxxx?id=xxxxxx";
	private static final CloseableHttpClient closeableHttpClient = HttpClients.createDefault();

	public static void main(String[] args) {
		getRequest(token,url);
	}

	/**
	 * Sends a GET request and returns the response body as text.
	 *
	 * <p>On an I/O or parse failure the error is printed and the JVM exits
	 * with status 1.
	 *
	 * @param token value of the {@code authorization} request header
	 * @param url   address to request
	 * @return the response body
	 */
	public static String getRequest(String token,String url){
		HttpGet httpGet = new HttpGet(url);
		httpGet.setHeader("authorization", token);
		// try-with-resources closes the response even when reading the body fails
		try (CloseableHttpResponse closeableHttpResponse = closeableHttpClient.execute(httpGet)) {
			// UTF-8 is only the fallback for responses that declare no charset;
			// without it the library default (ISO-8859-1) garbles non-Latin text.
			return EntityUtils.toString(closeableHttpResponse.getEntity(), "UTF-8");
		} catch (ParseException | IOException e) {
			e.printStackTrace();
			System.out.println("请求数据出错,请排查问题");
			System.exit(1);
		} finally {
			// Hand the connection back to the pool for reuse
			httpGet.releaseConnection();
		}
		return null;
	}

}

MySqlStrategy

代码如下(示例):

package util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import entity.Fa;
import entity.FaDetail;

/**
 * JDBC helper that bulk-inserts crawled records and their detail rows into
 * MySQL inside one transaction, using a single shared connection.
 */
public class MySqlStrategy {
	private final static String url = Config.DB_URL;
	private final static String user = Config.USER;
	private final static String password = Config.PASSWORD;
	// Opened once when the class is loaded; null when the connection attempt failed
	private static Connection conn = getConnection();

	// ALTER TABLE factor AUTO_INCREMENT=1;
	public static void main(String[] args) {
		insertValue(null);
	}

	/** Opens the connection, or returns {@code null} (after printing the error) when that fails. */
	private static Connection getConnection() {
		try {
			return DriverManager.getConnection(url, user, password);
		} catch (SQLException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Inserts every record into {@code fa} and every detail entry into
	 * {@code fa_detail}, linking the detail rows to the generated parent keys.
	 * Both batches are committed together; on failure the transaction is
	 * rolled back. A "Duplicate entry" failure is ignored, any other failure
	 * is printed and terminates the JVM with status 1.
	 *
	 * @param datalist records to insert; {@code null} or empty is a no-op
	 * @throws IllegalStateException if the database connection could not be opened
	 */
	public static void insertValue(List<Fa> datalist) {
		if (datalist == null || datalist.isEmpty()) {
			return;
		}
		if (conn == null) {
			throw new IllegalStateException("No database connection available; check the settings in Config");
		}
		String sql = "insert into fa values(?,?,?,?,?)";
		String gasSql = "insert into fa_detail values(?,?,?)";
		try {
			// Inside the try so that a failure here is handled like any other
			conn.setAutoCommit(false);
			try (PreparedStatement statement = conn.prepareStatement(sql, PreparedStatement.RETURN_GENERATED_KEYS);
					PreparedStatement detailStatement = conn.prepareStatement(gasSql,
							PreparedStatement.RETURN_GENERATED_KEYS)) {
				for (Fa fa : datalist) {
					creatFaParam(fa, statement);
					statement.addBatch();
				}
				statement.executeBatch();

				// Generated parent keys, in the order the rows were batched
				List<Long> idList = new ArrayList<>(datalist.size());
				try (ResultSet generatedKeys = statement.getGeneratedKeys()) {
					while (generatedKeys.next()) {
						idList.add(generatedKeys.getLong(1));
					}
				}

				// Attach the parent key to every detail row
				for (int i = 0; i < datalist.size(); i++) {
					List<FaDetail> detailList = datalist.get(i).getDetailData();
					if (detailList == null) {
						continue;
					}
					if (i >= idList.size()) {
						throw new SQLException("Expected " + datalist.size() + " generated keys but got "
								+ idList.size());
					}
					for (FaDetail gas : detailList) {
						gas.setFaId(idList.get(i));
						creatFaDetailParam(detailStatement, gas);
						detailStatement.addBatch();
					}
				}
				detailStatement.executeBatch();
				conn.commit();
			}
		} catch (Exception e1) {
			try {
				conn.rollback();
			} catch (SQLException rollbackFailure) {
				// Keep the rollback failure visible instead of dropping it
				e1.addSuppressed(rollbackFailure);
			}
			// A duplicate key means the data is already stored; nothing to do
			String message = e1.getMessage();
			if (message != null && message.contains("Duplicate entry")) {
				return;
			}
			// Anything else needs investigation: stop the program
			e1.printStackTrace();
			System.exit(1);
		}
	}

	/** Binds one detail row; the first column is bound to null so the database assigns it. */
	private static void creatFaDetailParam(PreparedStatement statement, FaDetail detail) throws SQLException {
		statement.setString(1, null);
		statement.setLong(2, detail.getFaId());
		statement.setString(3, detail.getType());
	}

	/** Binds one parent row; the first column is bound to null so the database assigns it. */
	private static void creatFaParam(Fa fa, PreparedStatement statement) throws SQLException {
		statement.setString(1, null);
		statement.setString(2, fa.getRowData());
		statement.setLong(3, Long.valueOf(fa.getRowId()));
		statement.setString(4, fa.getUnit());
		statement.setString(5, fa.getCountriesName());
	}

	/**
	 * Closes the given resources, ignoring {@code null} arguments. They are
	 * closed innermost first (result set, statement, connection); a failure
	 * to close one is printed and does not prevent closing the others.
	 *
	 * @param connection connection to close, may be {@code null}
	 * @param statement  statement to close, may be {@code null}
	 * @param resultSet  result set to close, may be {@code null}
	 */
	public static void close(Connection connection, Statement statement, ResultSet resultSet) {
		try {
			if (resultSet != null)
				resultSet.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
		try {
			if (statement != null)
				statement.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
		try {
			if (connection != null)
				connection.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

工具类(序列化与反序列化)

代码如下(示例):

package util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Helpers for persisting a single object to a file with Java serialization.
 */
public class SerializableUtil{
	public static void main(String[] args) {
		File file=new File("test.dat");
		Set<String> set=new LinkedHashSet<>();
		set.add("hello");
		SerializableUtil.serialization(file, set);
		Set<String> set1=SerializableUtil.deserialization(file,new LinkedHashSet<String>());
		System.out.println(set1);
	}

	/**
	 * Writes {@code t} to {@code file}, replacing any previous content.
	 * An I/O failure is printed and otherwise ignored.
	 *
	 * @param file target file
	 * @param t    object to write; must be serializable
	 */
	public static <T> void serialization(File file, T t) {
		// Both streams are declared as resources so they are closed even
		// when writing fails part-way.
		try (FileOutputStream fos = new FileOutputStream(file);
				ObjectOutputStream oos = new ObjectOutputStream(fos)) {
			oos.writeObject(t);
			oos.flush();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Reads the object stored in {@code file}.
	 *
	 * @param file source file
	 * @param t    only used to infer the result type; its value is not read
	 * @return the stored object, or {@code null} when the file does not exist
	 *         or cannot be read
	 */
	@SuppressWarnings("unchecked") // the caller decides the expected type
	public static <T> T deserialization(File file, T t) {
		if (!file.exists()) {
			return null;
		}
		try (FileInputStream fis = new FileInputStream(file);
				ObjectInputStream ois = new ObjectInputStream(fis)) {
			return (T) ois.readObject();
		} catch (IOException | ClassNotFoundException | RuntimeException e) {
			// Best effort: an unreadable cache is reported and treated as absent
			e.printStackTrace();
		}
		return null;
	}
}

工具类

代码如下(示例):

package util;

/**
 * Small general-purpose helpers.
 */
public class Utils {

	/**
	 * Pauses the current thread for {@code time} milliseconds.
	 *
	 * <p>If the thread is interrupted the pause ends early and the interrupt
	 * flag is restored, so callers can still detect the interruption.
	 *
	 * @param time pause length in milliseconds
	 */
	public static void sleep(Integer time){
		try {
			Thread.sleep(time);
		} catch (InterruptedException e) {
			// Thread.sleep clears the flag when it throws; set it again
			// instead of silently losing the interruption.
			Thread.currentThread().interrupt();
		}
	}

}

总结

selenium-java结合httpclient可以满足大部分网站的爬虫需求,代码就到这儿了。