Files
Quant_Code/3.新闻抓取与通知/jin10_new - 副本.py
2025-04-09 17:18:30 +08:00

76 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
# Scrape the Jin10 home page with headless Chrome and print the text that
# follows a fixed marker string inside the first flash-news items.

JIN10_URL = "https://www.jin10.com/"

# Absolute XPath of the element whose class attribute gets overwritten below.
# NOTE(review): absolute paths like this break as soon as the page layout
# changes - confirm against the live page before relying on it.
HOT_FILTER_XPATH = (
    '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]'
    '/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
)

# Absolute XPath of the element that is scrolled into view before the page
# source is captured (the original author assumed this is where updated
# content appears).
FEED_XPATH = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'

# Re-resolves the XPath inside the page and overwrites the element's class.
ACTIVATE_FILTER_SCRIPT = '''
var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
element.className = "hot-filter_item is-active";
'''

# Marker searched for in each item's text; everything after its first
# occurrence is printed.
# NOTE(review): presumably the concatenated labels of a per-item widget -
# confirm against the rendered page.
HOT_MARKER = '默认火热沸爆'

WAIT_TIMEOUT_S = 20   # max seconds to wait for each element to be present
SETTLE_DELAY_S = 5    # fixed pause before reading the page source
MAX_ITEMS = 40        # only the first MAX_ITEMS parsed items are inspected


def extract_marked_texts(texts, marker=HOT_MARKER):
    """Return the stripped text following *marker* for every matching entry.

    Args:
        texts: Iterable of strings to inspect.
        marker: Substring to look for; only its first occurrence in each
            string is used as the split point.

    Returns:
        A list with one entry per string that contains *marker*, in input
        order. An entry is ``""`` when nothing but whitespace follows the
        marker.
    """
    tails = []
    for text in texts:
        _, found, tail = text.partition(marker)
        if found:
            tails.append(tail.strip())
    return tails


def main():
    """Load the Jin10 page, capture its HTML and print the matching items.

    The WebDriver is always shut down, even when a wait times out or any
    other step raises.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # headless mode: no browser window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(JIN10_URL)

        # Block until the filter element exists, then overwrite its class
        # from JavaScript.
        WebDriverWait(driver, WAIT_TIMEOUT_S).until(
            EC.presence_of_element_located((By.XPATH, HOT_FILTER_XPATH))
        )
        driver.execute_script(ACTIVATE_FILTER_SCRIPT, HOT_FILTER_XPATH)

        # Scroll the feed container into view as a stand-in for "trigger a
        # refresh"; the original author flagged this step as an assumption
        # to be adjusted to the real page behaviour.
        feed_element = WebDriverWait(driver, WAIT_TIMEOUT_S).until(
            EC.presence_of_element_located((By.XPATH, FEED_XPATH))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', feed_element)
        time.sleep(SETTLE_DELAY_S)  # give the page time to update

        # Parse the rendered HTML and keep only the first MAX_ITEMS
        # containers whose class attribute is exactly this string.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')
        texts = [item.get_text(strip=True) for item in items[:MAX_ITEMS]]

        matches = extract_marked_texts(texts)
        for number, body in enumerate(matches, start=1):
            print(f"匹配信息 {number}:")
            print(body)  # an empty body prints as a blank line

        print(f"\n共找到 {len(matches)} 条匹配'{HOT_MARKER}'的信息")
    finally:
        # Always release the browser process.
        driver.quit()


# Run the scrape only when executed as a script, so importing this file
# (e.g. to reuse extract_marked_texts) does not launch a browser.
if __name__ == "__main__":
    main()