进入英雄详情页面可以发现:每个英雄都有唯一的 ID,且不同费用档位的英雄 ID 区间是递增的。下面的脚本据此循环遍历英雄 ID 范围进行抓取。
# Scrape hero details (name, skill, skill description, attributes, synergies,
# cost) from jcc.qq.com hero-detail pages and save them to an Excel file.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Set up the Chrome driver service (webdriver_manager downloads a matching driver).
service = Service(ChromeDriverManager().install())
# Create the Chrome browser instance.
driver = webdriver.Chrome(service=service)

print(pd.__version__)

# Output column order for the Excel file.
COLUMNS = ["英雄名称", "英雄技能", "技能描述", "属性信息", "羁绊信息", "费用"]

# Collect one dict per hero and build the DataFrame once at the end.
# (The original called the private, pandas-internal DataFrame._append inside
# the loop — public DataFrame.append was removed in pandas 2.0, and per-row
# appends are quadratic; a single construction from a list of dicts is the
# supported, linear approach.)
rows = []

# Hero-ID ranges, one per cost tier (uncomment the range you want to scrape):
# for hero_id in range(1190, 1202):
# for hero_id in range(2177, 2193):
# for hero_id in range(3194, 3209):
# for hero_id in range(4185, 4199):
try:
    for hero_id in range(5158, 5171):
        try:
            # Build the hero-detail URL (fragment-routed, JS-rendered page).
            url = f"https://jcc.qq.com/#/heroDetail/10,S11,10.4.5/{hero_id}"
            driver.get(url)
            # NOTE(review): this is a single-page app, so page_source may be
            # captured before the detail view finishes rendering — an explicit
            # WebDriverWait on "hero-detail-wrap" would be more reliable.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Locate the hero-detail container; skip IDs with no page.
            heroDetailWrap = soup.find("div", class_="hero-detail-wrap")
            if heroDetailWrap is None:
                continue
            # Extract hero info.
            hero_name = heroDetailWrap.find("p", class_="name").text
            hero_skill = heroDetailWrap.find("p", class_="title skill-name").text.strip()
            hero_skill_description = heroDetailWrap.find("p", class_="active").text.strip()
            # Cost: absent element -> None (kept as None in the output).
            price_element = heroDetailWrap.find("li", class_="price1")
            price = price_element.text.strip() if price_element else None
            # Extract the attribute table cells.
            attributes = heroDetailWrap.find("div", class_="attri").find_all("td")
            attributes_info = [attr.text for attr in attributes]
            # All "title skill-name" paragraphs: the first one is the hero
            # skill itself, so it is skipped when joining synergy names below.
            synergy_names = heroDetailWrap.find_all("p", class_="title skill-name")
            rows.append({
                "英雄名称": hero_name,
                "英雄技能": hero_skill,
                "技能描述": hero_skill_description,
                "属性信息": "\n".join(attributes_info),
                "羁绊信息": "\n".join(name.text for name in synergy_names[1:]),
                "费用": price,
            })
        except Exception as e:
            # Best-effort scraping: report the failure and move on to the next ID.
            print(e)
            print(f"Hero with ID {hero_id} does not exist.")
finally:
    # Always close the browser, even if the loop itself raises.
    driver.quit()

# Build the DataFrame in one shot and save it to Excel.
df = pd.DataFrame(rows, columns=COLUMNS)
df.to_excel("hero_details_五费.xlsx", index=False)
抓取完各个费用档位的英雄数据后,将所有 xlsx 文件合并为一个 JSON 文件。
# Merge every per-cost hero_details_*.xlsx export into a single JSON file.
import pandas as pd
import glob

# Find all per-cost exports produced by the scraper.
xlsx_files = glob.glob('hero_details_*.xlsx')

# Read each workbook into its own DataFrame.
dfs = [pd.read_excel(file) for file in xlsx_files]

merged_json_filename = 'merged_hero_details.json'
if dfs:
    # ignore_index rebuilds a clean 0..n-1 index instead of repeating each
    # file's own 0-based index in the merged frame.
    merged_df = pd.concat(dfs, ignore_index=True)
    # force_ascii=False keeps the Chinese text readable in the JSON output
    # instead of escaping every character to \uXXXX sequences.
    merged_df.to_json(merged_json_filename, orient='records', force_ascii=False)
    print("合并完成,并已保存为merged_hero_details.json文件。")
else:
    # pd.concat([]) raises ValueError — report the empty match instead of crashing.
    print("未找到任何 hero_details_*.xlsx 文件,未生成 JSON。")
将合并后的 JSON 文件提供给 OpenAI 作为参考资料进行学习,结果很差,效果仍不理想——可能需要进一步清洗和结构化数据。