20250408修改
This commit is contained in:
184
3.新闻抓取与通知/2.py
Normal file
184
3.新闻抓取与通知/2.py
Normal file
@@ -0,0 +1,184 @@
|
||||
import hashlib
import os
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
# Mail configuration.
#
# SECURITY NOTE(review): the SMTP authorization code below is committed in
# plain text. It should be rotated and supplied through the environment
# instead. Each value can be overridden with an environment variable; the
# literals are kept only as backward-compatible defaults.
from_email = os.environ.get("JIN10_FROM_EMAIL", "240884432@qq.com")
from_password = os.environ.get("JIN10_SMTP_PASSWORD", "osjyjmbqrzxtbjbf")
to_email = os.environ.get("JIN10_TO_EMAIL", "240884432@qq.com")

# MD5 hex digest of the most recently seen matching text, or None until the
# first match has been observed.
# This is a plain module-level assignment, so no `global` statement is needed
# here. A function that rebinds this name must declare
# `global last_matched_hash` before assigning to it.
last_matched_hash = None
|
||||
|
||||
# 邮件发送函数
|
||||
def send_email(content):
    """Send ``content`` as an HTML notification e-mail.

    The message is sent from ``from_email`` to ``to_email`` over SMTP/SSL
    (``smtp.qq.com``, port 465), authenticating with ``from_password``.

    Args:
        content: HTML (or plain) text used as the message body.

    Returns:
        None. Any failure is caught and reported on stdout rather than raised,
        so a mail outage does not abort the caller.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = '金十数据更新通知'
    msg['From'] = from_email
    msg['To'] = to_email
    msg.attach(MIMEText(content, 'html'))

    try:
        server = smtplib.SMTP_SSL('smtp.qq.com', 465)
        try:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
            # Explicit quit() rather than `with`: SMTP.__exit__ raises when the
            # QUIT reply is not 221, which would report an already-delivered
            # message as a failure.
            server.quit()
        finally:
            # Always release the socket, including when login/sendmail raised.
            # close() is a no-op if quit() already closed the connection.
            server.close()
        print("邮件发送成功")
    except Exception as e:
        print(f"邮件发送失败: {e}")
|
||||
|
||||
# 初始化 WebDriver
|
||||
# One-off scrape executed at start-up, before the scheduler loop begins.
# It records the hash of the current matching item in last_matched_hash.
# No e-mail is sent here while last_matched_hash is still None.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode: no browser window
driver = webdriver.Chrome(options=options)

try:
    # Open the Jin10 site.
    driver.get("https://www.jin10.com/")

    # Wait (up to 20 s) for the filter element to be present in the DOM.
    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
    target_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, target_xpath))
    )

    # Overwrite the element's class attribute via JavaScript.
    # NOTE(review): only the class is changed, no click is dispatched;
    # presumably this is meant to activate a "hot" filter -- confirm that the
    # page actually re-filters in response.
    script = '''
    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    element.className = "hot-filter_item is-active";
    '''
    driver.execute_script(script, target_xpath)

    # Locate the container that is expected to hold the refreshed content.
    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
    update_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, update_xpath))
    )

    # Scroll the container into view, then give the page time to update.
    driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
    time.sleep(5)

    # Parse the rendered page.
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')

    # Collect the flash-news item containers.
    items = soup.find_all('div', class_='jin-flash-item-container is-normal')

    matched_count = 0
    # Bound up front so the check after the loop cannot raise NameError when
    # no item matches.
    modified_text = ''

    # Inspect at most the first 50 items and keep only those containing the
    # marker text.
    for item in items[:50]:
        text_content = item.get_text(strip=True)
        if '默认火热沸爆' not in text_content:
            continue

        matched_count += 1
        parts = text_content.split('默认火热沸爆', 1)
        if len(parts) > 1:
            print(f"匹配信息 {matched_count}:")
            tail = parts[1].strip()
            if tail:
                # Insert ":" after the first character following the marker.
                modified_text = tail[0] + ":" + tail[1:]
                print(modified_text)
            else:
                print("")

    print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

    # modified_text now holds the last matching item seen in the loop.
    if modified_text:
        current_hash = hashlib.md5(modified_text.encode()).hexdigest()
        # Notify only when a previous hash exists and the content changed.
        if last_matched_hash and current_hash != last_matched_hash:
            send_email(modified_text)
        last_matched_hash = current_hash

finally:
    # Always shut the WebDriver down.
    driver.quit()
|
||||
|
||||
# 定时任务
|
||||
def fetch_news():
    """Scrape Jin10 once and e-mail the matching item if it has changed.

    Opens https://www.jin10.com/ in headless Chrome, looks through at most the
    first 50 flash-news containers for text containing the marker
    '默认火热沸爆', and keeps the last such item (with ":" inserted after its
    first character). The MD5 of that text is compared with the module-level
    ``last_matched_hash``: an e-mail is sent via ``send_email`` only when a
    previous hash exists and differs. The stored hash is then updated.

    Returns:
        None. Errors raised while scraping are caught and printed; the
        WebDriver is always shut down.
    """
    # Required because this function rebinds the module-level name. Without
    # it the assignment below makes the name local, the read before it raises
    # UnboundLocalError, and the except clause silently swallows that error.
    global last_matched_hash

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get("https://www.jin10.com/")

        # Wait (up to 20 s) for the filter element to be present in the DOM.
        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, target_xpath))
        )

        # Overwrite the element's class attribute via JavaScript.
        # NOTE(review): only the class is changed, no click is dispatched;
        # confirm the page actually re-filters in response.
        script = '''
        var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        element.className = "hot-filter_item is-active";
        '''
        driver.execute_script(script, target_xpath)

        # Scroll the content container into view and let the page settle.
        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
        update_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, update_xpath))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
        time.sleep(5)

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        modified_text = ''

        for item in items[:50]:
            text_content = item.get_text(strip=True)
            if '默认火热沸爆' not in text_content:
                continue

            matched_count += 1
            parts = text_content.split('默认火热沸爆', 1)
            if len(parts) > 1 and parts[1].strip():
                tail = parts[1].strip()
                # Insert ":" after the first character following the marker.
                modified_text = tail[0] + ":" + tail[1:]

        if modified_text:
            current_hash = hashlib.md5(modified_text.encode()).hexdigest()
            # Notify only when a previous hash exists and the content changed.
            if last_matched_hash and current_hash != last_matched_hash:
                send_email(modified_text)
            last_matched_hash = current_hash

        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

    except Exception as e:
        print(f"执行出错: {e}")
    finally:
        driver.quit()
|
||||
|
||||
# Run fetch_news once every minute.
# NOTE(review): the previous comment said "every 5 minutes", but the interval
# actually scheduled is 1 minute. The code is left unchanged; adjust the
# number below if 5 minutes was the intent.
schedule.every(1).minutes.do(fetch_news)

print("开始定时监控...")
while True:
    # Poll the scheduler once per second; run_pending() executes due jobs.
    schedule.run_pending()
    time.sleep(1)
|
||||
Reference in New Issue
Block a user