"""Monitor the jin10.com flash-news feed and e-mail a notice when it changes.

The script loads the page in headless Chrome, collects the flash items whose
text contains ``KEYWORD``, fingerprints the last matching item and sends an
e-mail whenever that fingerprint differs from the one seen on the previous
check.  One check runs immediately at start-up (with verbose output); further
checks are scheduled every ``CHECK_INTERVAL_MINUTES`` minute(s).
"""
import hashlib
import os
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- Mail configuration -----------------------------------------------------
# Each value can be overridden through the environment.
# SECURITY: the SMTP authorization code below is committed in plain text; it
# is kept only as a backward-compatible default.  Rotate it and supply the new
# one via JIN10_SMTP_PASSWORD instead of editing this file.
from_email = os.environ.get("JIN10_FROM_EMAIL", "240884432@qq.com")
from_password = os.environ.get("JIN10_SMTP_PASSWORD", "osjyjmbqrzxtbjbf")
to_email = os.environ.get("JIN10_TO_EMAIL", "240884432@qq.com")

SMTP_HOST = 'smtp.qq.com'
SMTP_PORT = 465
SMTP_TIMEOUT_SECONDS = 30

# --- Scraping configuration -------------------------------------------------
JIN10_URL = "https://www.jin10.com/"
# Absolute XPaths are brittle: any layout change on the site breaks them.
HOT_FILTER_XPATH = (
    '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]'
    '/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
)
FEED_XPATH = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
# Overwrites the class attribute of the node addressed by HOT_FILTER_XPATH.
# NOTE(review): presumably this marks the filter entry as selected; no click
# is dispatched, so confirm that it really changes what the feed shows.
ACTIVATE_FILTER_JS = '''
    var element = document.evaluate(
        arguments[0], document, null,
        XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    element.className = "hot-filter_item is-active";
'''
FLASH_ITEM_CLASS = 'jin-flash-item-container is-normal'
# Marker text an item must contain to count as a match.
# NOTE(review): presumably the concatenated labels of the site's heat-level
# filter as returned by get_text(strip=True) - verify against the live page.
KEYWORD = '默认火热沸爆'
MAX_ITEMS = 50                # only the first 50 flash items are inspected
WAIT_SECONDS = 20             # explicit-wait timeout for page elements
RENDER_DELAY_SECONDS = 5      # fixed pause to let the feed finish updating
CHECK_INTERVAL_MINUTES = 1    # the scheduled job runs once per minute

# MD5 hex digest of the text seen on the previous check.  It stays None until
# the first check that finds a match, so that check only records a baseline
# and never sends an e-mail.
last_matched_hash = None


def send_email(content):
    """Send *content* as an HTML e-mail from ``from_email`` to ``to_email``.

    Delivery is best-effort: any failure is printed and swallowed so that a
    mail problem never aborts the monitoring loop.

    Args:
        content: HTML body of the notification.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = '金十数据更新通知'
    msg['From'] = from_email
    msg['To'] = to_email
    msg.attach(MIMEText(content, 'html'))

    try:
        # The context manager closes the connection even when login/sendmail
        # raises; the timeout keeps a dead server from hanging the monitor.
        with smtplib.SMTP_SSL(
            SMTP_HOST, SMTP_PORT, timeout=SMTP_TIMEOUT_SECONDS
        ) as server:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
        print("邮件发送成功")
    except Exception as e:  # deliberate best-effort boundary
        print(f"邮件发送失败: {e}")


def _format_flash_text(text_content):
    """Return the reformatted text that follows ``KEYWORD`` in *text_content*.

    Returns ``None`` when the keyword is absent, ``''`` when nothing but
    whitespace follows it, and otherwise the stripped remainder with a ``:``
    inserted after its first character.
    """
    if KEYWORD not in text_content:
        return None
    tail = text_content.partition(KEYWORD)[2].strip()
    if not tail:
        return ''
    return f"{tail[0]}:{tail[1:]}"


def _collect_matches(texts, *, verbose=False):
    """Count keyword matches in *texts* and return ``(count, last_text)``.

    ``last_text`` is the formatted text of the LAST match that had non-empty
    content (later matches overwrite earlier ones), or ``''`` if there was
    none.  With ``verbose=True`` every match is printed as it is found.
    """
    matched_count = 0
    modified_text = ''
    for text_content in texts:
        formatted = _format_flash_text(text_content)
        if formatted is None:
            continue
        matched_count += 1
        if verbose:
            print(f"匹配信息 {matched_count}:")
            print(formatted)
        if formatted:
            modified_text = formatted
    return matched_count, modified_text


def _load_flash_texts(driver):
    """Load the page in *driver* and return the text of the first flash items."""
    driver.get(JIN10_URL)

    # Wait for the filter node to exist, then rewrite its class attribute.
    WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, HOT_FILTER_XPATH))
    )
    driver.execute_script(ACTIVATE_FILTER_JS, HOT_FILTER_XPATH)

    # Scroll the feed container into view and give the page time to update.
    feed = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, FEED_XPATH))
    )
    driver.execute_script('arguments[0].scrollIntoView(true);', feed)
    time.sleep(RENDER_DELAY_SECONDS)

    soup = BeautifulSoup(driver.page_source, 'lxml')
    items = soup.find_all('div', class_=FLASH_ITEM_CLASS)
    return [item.get_text(strip=True) for item in items[:MAX_ITEMS]]


def _notify_if_changed(modified_text):
    """E-mail *modified_text* if its hash differs from the previous check."""
    # Without this declaration the assignment below would make the name local
    # and the read in the `if` would raise UnboundLocalError.
    global last_matched_hash

    if not modified_text:
        return
    current_hash = hashlib.md5(modified_text.encode()).hexdigest()
    if last_matched_hash and current_hash != last_matched_hash:
        send_email(modified_text)
    # The hash is updated even if the e-mail failed (send_email swallows
    # errors), so a failed notification is not retried.
    last_matched_hash = current_hash


def _check_once(*, verbose=False):
    """Run one scrape-and-notify cycle; exceptions propagate to the caller."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        texts = _load_flash_texts(driver)
    finally:
        # Always release the browser, even when loading or waiting fails.
        driver.quit()

    matched_count, modified_text = _collect_matches(texts, verbose=verbose)
    _notify_if_changed(modified_text)
    print(f"\n共找到 {matched_count} 条匹配'{KEYWORD}'的信息")


def fetch_news():
    """Scheduled job: run one check and report (never raise) any failure.

    Errors, including a failure to start the browser, are printed so that a
    single bad run does not terminate the scheduler loop.
    """
    try:
        _check_once(verbose=False)
    except Exception as e:  # job boundary: keep the monitor alive
        print(f"执行出错: {e}")


def main():
    """Run an initial verbose check, then poll on a fixed schedule forever."""
    # The initial check is not guarded: a broken setup (e.g. no Chrome
    # driver) fails fast at start-up instead of looping on errors.
    _check_once(verbose=True)

    schedule.every(CHECK_INTERVAL_MINUTES).minutes.do(fetch_news)
    print("开始定时监控...")
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    main()