Files
2025-04-09 17:18:30 +08:00

184 lines
6.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import hashlib
import os
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Mail configuration.
# Every value can be overridden through an environment variable so that real
# credentials do not have to live in the source file. The literals are only
# fallbacks that keep the script behaving as before when nothing is set.
# SECURITY NOTE(review): the fallback password is a credential committed in
# plain text -- rotate it and supply the new one via JIN10_FROM_PASSWORD.
from_email = os.environ.get("JIN10_FROM_EMAIL", "240884432@qq.com")
from_password = os.environ.get("JIN10_FROM_PASSWORD", "osjyjmbqrzxtbjbf")
to_email = os.environ.get("JIN10_TO_EMAIL", "240884432@qq.com")

# MD5 hex digest of the most recently matched text; None until a match is seen.
# A function that rebinds this name must declare `global last_matched_hash`
# before assigning, otherwise the assignment creates a function-local variable.
last_matched_hash = None
def send_email(content):
    """Send *content* as an HTML notification mail.

    The message uses the module-level ``from_email`` / ``to_email`` settings
    and is delivered over SMTP-over-SSL to ``smtp.qq.com`` on port 465,
    logging in with ``from_email`` and ``from_password``.

    Args:
        content: HTML text used as the mail body.

    Returns:
        None. Delivery is best-effort: a failure is printed, never raised,
        so a mail problem cannot stop the caller.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = '金十数据更新通知'
    msg['From'] = from_email
    msg['To'] = to_email
    msg.attach(MIMEText(content, 'html'))
    try:
        # The context manager closes the connection even when login() or
        # sendmail() raises; a bare quit() call would be skipped in that case.
        with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
        print("邮件发送成功")
    except Exception as e:
        # Deliberately broad: any failure is reported and swallowed.
        print(f"邮件发送失败: {e}")
# Start-up run: scrape the Jin10 page once before the scheduler takes over.
# Initialise the WebDriver.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode: no browser window is opened
driver = webdriver.Chrome(options=options)

# Pre-initialise so the check after the loop cannot hit an undefined name
# when no item contains the marker text.
modified_text = ''
try:
    # Open the Jin10 site.
    driver.get("https://www.jin10.com/")

    # Wait (up to 20 s) for the target element to be present in the DOM.
    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
    target_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, target_xpath))
    )

    # Use JavaScript to overwrite the element's class attribute.
    script = (
        '\n'
        'var element = document.evaluate(arguments[0], document, null, '
        'XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n'
        'element.className = "hot-filter_item is-active";\n'
    )
    driver.execute_script(script, target_xpath)

    # Locate the container that is expected to hold the updated content.
    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
    update_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, update_xpath))
    )

    # Scroll the container into view to trigger a refresh of the page content.
    # NOTE(review): the original author marked this trigger as an assumption;
    # adjust it if the site needs a different interaction.
    driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
    time.sleep(5)  # give the page time to update

    # Parse the rendered HTML with BeautifulSoup.
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')

    # Collect the div tags whose class is 'jin-flash-item-container is-normal'.
    items = soup.find_all('div', class_='jin-flash-item-container is-normal')

    # Only the first 50 items are examined; count those containing the marker.
    matched_count = 0
    for index, item in enumerate(items[:50], start=1):
        text_content = item.get_text(strip=True)
        if '默认火热沸爆' not in text_content:
            continue
        matched_count += 1
        # Keep only what follows the first occurrence of the marker text.
        parts = text_content.split('默认火热沸爆', 1)
        if len(parts) > 1:
            print(f"匹配信息 {matched_count}:")
            remainder = parts[1].strip()
            if remainder:
                # Each match overwrites the previous one, so after the loop
                # modified_text holds the last matching item's text.
                modified_text = remainder
                print(modified_text)
            else:
                print("")
    print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

    if modified_text:
        current_hash = hashlib.md5(modified_text.encode()).hexdigest()
        # A mail is sent only when a previous hash exists and differs; the
        # very first match just seeds last_matched_hash.
        if last_matched_hash and current_hash != last_matched_hash:
            send_email(modified_text)
        last_matched_hash = current_hash
except Exception as e:
    # Same reporting as fetch_news: a failed start-up scrape is printed
    # instead of terminating the script before the scheduler starts.
    print(f"执行出错: {e}")
finally:
    # Always close the WebDriver.
    driver.quit()
def fetch_news():
    """Scheduled job: scrape jin10.com and mail the latest matching item.

    Loads the page in headless Chrome, overwrites the class of the element at
    ``target_xpath``, scrolls the update container into view, waits 5 seconds,
    then parses the page source. Among the first 50
    ``jin-flash-item-container is-normal`` divs, the text following the marker
    '默认火热沸爆' in the last matching item is kept. If its MD5 differs from the
    module-level ``last_matched_hash`` (and that hash is already set), the
    text is sent with ``send_email``; the hash is then updated.

    Returns:
        None. Exceptions raised by the scraping steps are caught and printed.
        Creating the WebDriver happens outside the ``try`` and can still raise.
    """
    # Without this declaration the assignment below would make the name local,
    # and reading it first would raise UnboundLocalError on every match.
    global last_matched_hash

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.jin10.com/")

        # Wait (up to 20 s) until the target element is present.
        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, target_xpath))
        )

        # Overwrite the element's class attribute via JavaScript.
        script = (
            '\n'
            'var element = document.evaluate(arguments[0], document, null, '
            'XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n'
            'element.className = "hot-filter_item is-active";\n'
        )
        driver.execute_script(script, target_xpath)

        # Scroll the update container into view, then give the page time to refresh.
        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
        update_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, update_xpath))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        modified_text = ''
        for item in items[:50]:
            text_content = item.get_text(strip=True)
            if '默认火热沸爆' not in text_content:
                continue
            matched_count += 1
            # Keep only what follows the first occurrence of the marker text.
            remainder = text_content.split('默认火热沸爆', 1)[1].strip()
            if remainder:
                # Later matches overwrite earlier ones: the last one wins.
                modified_text = remainder

        if modified_text:
            current_hash = hashlib.md5(modified_text.encode()).hexdigest()
            # Mail only when a previous hash exists and differs; the first
            # match merely seeds last_matched_hash.
            if last_matched_hash and current_hash != last_matched_hash:
                send_email(modified_text)
            last_matched_hash = current_hash
        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")
    except Exception as e:
        # Best-effort job: report the failure and let the scheduler retry later.
        print(f"执行出错: {e}")
    finally:
        driver.quit()
# Register fetch_news to run once every minute (interval 1, unit minutes).
_monitor_job = schedule.every(1).minutes
_monitor_job.do(fetch_news)
print("开始定时监控...")

# Poll the scheduler forever, checking for due jobs once per second.
while True:
    schedule.run_pending()
    time.sleep(1)