20250408修改

2025-04-09 17:18:30 +08:00
parent f925dff46b
commit aaf2224484
146 changed files with 157794 additions and 5718 deletions
--- a/3.新闻抓取与通知/2.py
+++ b/3.新闻抓取与通知/2.py
@@ -0,0 +1,184 @@
+import hashlib
+import os
+import smtplib
+import time
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+
+import schedule
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+# Mail settings. Each value can be overridden through an environment variable;
+# the literals are kept only as backward-compatible defaults.
+# NOTE(review): the SMTP authorization code below is committed in plain text.
+# Revoke it and drop the fallback once JIN10_SMTP_PASSWORD is set wherever this
+# script is deployed.
+from_email = os.environ.get("JIN10_FROM_EMAIL", "240884432@qq.com")
+from_password = os.environ.get("JIN10_SMTP_PASSWORD", "osjyjmbqrzxtbjbf")
+to_email = os.environ.get("JIN10_TO_EMAIL", "240884432@qq.com")
+
+# MD5 hex digest of the most recently seen matching item, or None until a
+# scrape finds one. This is module state: any function that rebinds it must
+# declare `global last_matched_hash` before the assignment, otherwise Python
+# treats the name as a local of that function.
+last_matched_hash = None
+
+# Mail notification helper
+def send_email(content):
+    """Send ``content`` as an HTML e-mail via smtp.qq.com over SSL.
+
+    The message goes from ``from_email`` to ``to_email`` with a fixed subject
+    line. Delivery is best effort: any failure is printed and swallowed so a
+    mail problem never propagates to the caller.
+
+    Args:
+        content: Message body; attached as a ``text/html`` part.
+    """
+    msg = MIMEMultipart('alternative')
+    msg['Subject'] = '金十数据更新通知'
+    msg['From'] = from_email
+    msg['To'] = to_email
+    msg.attach(MIMEText(content, 'html'))
+
+    try:
+        # The context manager issues QUIT and closes the socket even when
+        # login() or sendmail() raises; the timeout keeps an unresponsive
+        # server from blocking the caller indefinitely.
+        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:
+            server.login(from_email, from_password)
+            server.sendmail(from_email, to_email, msg.as_string())
+        print("邮件发送成功")
+    except Exception as e:
+        print(f"邮件发送失败: {e}")
+
+# One-off scrape at start-up. last_matched_hash is still None here, so the
+# comparison below never sends mail on this pass; it only records the hash
+# that later runs compare against.
+options = webdriver.ChromeOptions()
+options.add_argument('--headless')  # headless mode: no visible browser window
+driver = webdriver.Chrome(options=options)
+
+# Bound before the loop so the check after it cannot raise NameError when no
+# item contains the keyword.
+modified_text = ''
+
+try:
+    # Open the jin10.com front page.
+    driver.get("https://www.jin10.com/")
+
+    # Wait (up to 20 s) until the filter entry addressed by this absolute
+    # XPath is present in the DOM.
+    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
+    WebDriverWait(driver, 20).until(
+        EC.presence_of_element_located((By.XPATH, target_xpath))
+    )
+
+    # Overwrite the element's class attribute from JavaScript so that it
+    # carries the "is-active" class.
+    script = '''
+    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+    element.className = "hot-filter_item is-active";
+    '''
+    driver.execute_script(script, target_xpath)
+
+    # Wait for the container that is expected to hold the refreshed content.
+    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
+    update_element = WebDriverWait(driver, 20).until(
+        EC.presence_of_element_located((By.XPATH, update_xpath))
+    )
+
+    # Scroll the container into view and give the page a fixed 5 s to update.
+    # NOTE(review): whether scrolling actually triggers a refresh depends on
+    # the site; adjust this step if the page behaves differently.
+    driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
+    time.sleep(5)
+
+    # Parse the rendered page and collect the flash-item containers.
+    page_source = driver.page_source
+    soup = BeautifulSoup(page_source, 'lxml')
+    items = soup.find_all('div', class_='jin-flash-item-container is-normal')
+
+    matched_count = 0
+
+    # Inspect at most the first 50 containers and keep only those whose text
+    # contains the keyword.
+    for item in items[:50]:
+        text_content = item.get_text(strip=True)
+
+        if '默认火热沸爆' not in text_content:
+            continue
+
+        matched_count += 1
+        # The keyword is known to be present, so the split always yields two
+        # parts; everything after the first occurrence is the payload.
+        tail = text_content.split('默认火热沸爆', 1)[1].strip()
+        print(f"匹配信息 {matched_count}:")
+        if tail:
+            # Insert a full-width colon after the first character.
+            # NOTE(review): presumably that character is a one-character
+            # label preceding the headline - confirm against the page.
+            modified_text = tail[0] + "：" + tail[1:]
+            print(modified_text)
+        else:
+            print("")
+
+    print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")
+
+    # modified_text holds the last non-empty match of the loop (or '' if
+    # there was none); its hash becomes the reference value.
+    if modified_text:
+        current_hash = hashlib.md5(modified_text.encode()).hexdigest()
+        if last_matched_hash and current_hash != last_matched_hash:
+            send_email(modified_text)
+        last_matched_hash = current_hash
+
+finally:
+    # Always shut the browser down, even if the scrape failed.
+    driver.quit()
+
+# Scheduled job
+def fetch_news():
+    """Scrape jin10.com once and send a mail when the matched item changed.
+
+    Loads the front page in headless Chrome, marks the filter entry as
+    active, and scans at most the first 50 flash-item containers for the
+    keyword. The last non-empty match is hashed with MD5 and compared with
+    the module-level ``last_matched_hash``; when a previous hash exists and
+    differs, the text is sent through ``send_email``. The new hash is then
+    stored.
+
+    Every exception is printed and swallowed, so a failed run does not
+    propagate to the caller.
+    """
+    # Without this declaration the assignment further down would make the
+    # name local to the function and the comparison before it would raise
+    # UnboundLocalError on every run that finds a match.
+    global last_matched_hash
+
+    modified_text = ''
+    matched_count = 0
+    driver = None  # stays None if Chrome fails to start
+
+    try:
+        options = webdriver.ChromeOptions()
+        options.add_argument('--headless')
+        driver = webdriver.Chrome(options=options)
+
+        driver.get("https://www.jin10.com/")
+
+        # Wait (up to 20 s) for the filter entry, then overwrite its class
+        # attribute from JavaScript so that it carries "is-active".
+        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, target_xpath))
+        )
+
+        script = '''
+        var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+        element.className = "hot-filter_item is-active";
+        '''
+        driver.execute_script(script, target_xpath)
+
+        # Scroll the content container into view and give the page a fixed
+        # 5 s to update before reading the DOM.
+        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
+        update_element = WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, update_xpath))
+        )
+        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
+        time.sleep(5)
+
+        soup = BeautifulSoup(driver.page_source, 'lxml')
+        items = soup.find_all('div', class_='jin-flash-item-container is-normal')
+
+        for item in items[:50]:
+            text_content = item.get_text(strip=True)
+            if '默认火热沸爆' not in text_content:
+                continue
+
+            matched_count += 1
+            # The keyword is present, so the split always yields two parts.
+            tail = text_content.split('默认火热沸爆', 1)[1].strip()
+            if tail:
+                # Insert a full-width colon after the first character.
+                modified_text = tail[0] + "：" + tail[1:]
+
+        if modified_text:
+            current_hash = hashlib.md5(modified_text.encode()).hexdigest()
+            # Mail only when an earlier hash exists and the content differs;
+            # a run with no earlier hash just records one.
+            if last_matched_hash and current_hash != last_matched_hash:
+                send_email(modified_text)
+            last_matched_hash = current_hash
+
+        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")
+
+    except Exception as e:
+        print(f"执行出错: {e}")
+    finally:
+        if driver is not None:
+            driver.quit()
+
+# Register fetch_news to run once every minute.
+schedule.every().minute.do(fetch_news)
+
+print("开始定时监控...")
+# Poll the scheduler once per second; run_pending() executes any job that is
+# due. The loop never exits on its own.
+while True:
+    schedule.run_pending()
+    time.sleep(1)