184 lines
6.9 KiB
Python
184 lines
6.9 KiB
Python
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
# import smtplib
|
|
# from email.mime.text import MIMEText
|
|
# from email.mime.multipart import MIMEMultipart
|
|
import hashlib
|
|
import schedule
|
|
|
|
import requests
|
|
|
|
import re
|
|
|
|
from selenium.webdriver.chrome.service import Service
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
|
|
|
# from_email = "240884432@qq.com"
|
|
# from_password = "osjyjmbqrzxtbjbf"
|
|
# to_email = "240884432@qq.com"
|
|
|
|
# 邮件发送函数
|
|
# def send_email(content):
|
|
# msg = MIMEMultipart('alternative')
|
|
# msg['Subject'] = '金十数据更新通知'
|
|
# msg['From'] = from_email
|
|
# msg['To'] = to_email
|
|
# msg.attach(MIMEText(content, 'html'))
|
|
|
|
# try:
|
|
# server = smtplib.SMTP_SSL('smtp.qq.com', 465)
|
|
# server.login(from_email, from_password)
|
|
# server.sendmail(from_email, to_email, msg.as_string())
|
|
# server.quit()
|
|
# print("邮件发送成功")
|
|
# except Exception as e:
|
|
# print(f"邮件发送失败: {e}")
|
|
|
|
# Feishu message sender.
# NOTE(review): this webhook URL is effectively a credential embedded in
# source; consider loading it from configuration instead.
_FEISHU_WEBHOOK_URL = "https://open.feishu.cn/open-apis/bot/v2/hook/094b85fb-4fc3-46f3-9673-ddb9702f7885"


def send_feishu_message(text, *, webhook_url=_FEISHU_WEBHOOK_URL, timeout=10):
    """Post *text* to a Feishu bot webhook as a plain-text message.

    Args:
        text: Message content; it is converted to a string before sending.
        webhook_url: Webhook endpoint to post to. Defaults to the URL this
            script has always used.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        True if the request completed and was not reported as failed,
        False otherwise. Failures are printed rather than raised.
    """
    payload = {
        "msg_type": "text",
        "content": {"text": f'{text}'},
    }
    try:
        response = requests.post(
            webhook_url,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported here instead of
        # propagating to the caller.
        print(f"飞书消息发送失败: {exc}")
        return False

    if response.status_code != 200:
        print(f"飞书消息发送失败,状态码: {response.status_code}, 响应内容: {response.text}")
        return False

    # NOTE(review): the Feishu custom-bot endpoint is documented to report
    # rejections through a nonzero "code" field in the JSON body — confirm
    # against the current API documentation.
    try:
        result = response.json()
    except ValueError:
        # A non-JSON 200 response is treated as success, as before.
        return True
    if isinstance(result, dict) and result.get("code", 0) != 0:
        print(f"飞书消息发送失败,状态码: {response.status_code}, 响应内容: {response.text}")
        return False
    return True
|
|
|
|
# MD5 hex digest of the newest formatted message seen by the previous run;
# None until a run has produced at least one message.
last_matched_hash = None

# Absolute XPaths into the jin10.com page layout. They depend on the exact
# DOM structure and will stop matching if the site layout changes.
_HOT_FILTER_XPATH = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
_UPDATE_XPATH = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'

# Re-resolves the XPath inside the page and overwrites the element's class
# list (presumably to mark the filter entry as selected — TODO confirm).
_ACTIVATE_FILTER_SCRIPT = '''
    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    element.className = "hot-filter_item is-active";
'''

_TIME_PATTERN = re.compile(r'\d{2}:\d{2}:\d{2}')


def _build_driver():
    """Create the headless Chrome driver used for one scrape."""
    options = webdriver.ChromeOptions()
    for argument in (
        '--headless',
        '--ignore-certificate-errors',
        '--allow-insecure-localhost',
        '--ssl-protocol=TLSv1.2',
        '--no-proxy-server',
        '--enable-logging',
        '--v=1',  # log verbosity level
    ):
        options.add_argument(argument)
    return webdriver.Chrome(options=options)


def _format_flash_item(text_content):
    """Turn one item's flattened text into the notification text.

    Returns None when the item has no HH:MM:SS timestamp after the
    '分享收藏详情复制' marker, or no content after the '默认火热沸爆' marker.
    The first character after that marker is reported as the message level
    and the remainder as the message body.
    """
    marker_index = text_content.find('分享收藏详情复制')
    if marker_index == -1:
        return None
    time_match = _TIME_PATTERN.search(text_content[marker_index:])
    if time_match is None:
        # Skip rather than reuse a timestamp from a previous item.
        return None

    parts = text_content.split('默认火热沸爆', 1)
    if len(parts) < 2:
        return None
    body = parts[1].strip()
    if not body:
        return None
    return "消息时间:" + time_match.group() + "\n" + "消息等级:" + body[0] + "\n" + body[1:]


def fetch_news():
    """Scrape jin10.com once and notify Feishu when the newest message changes.

    Loads the page in headless Chrome, parses up to the first 40 flash items
    whose text contains '默认火热沸爆爆', and formats them. The newest formatted
    message is hashed and compared with ``last_matched_hash``; when a previous
    hash exists and differs, the message is sent via ``send_feishu_message``.
    The very first successful run only records the hash and sends nothing.

    All errors, including failure to start the browser, are printed and
    swallowed so that one failed run does not stop the scheduler.
    """
    global last_matched_hash

    driver = None
    try:
        # Created inside the try block so that a start-up failure is reported
        # like any other error instead of escaping the scheduled job.
        driver = _build_driver()
        driver.get("https://www.jin10.com/")

        WebDriverWait(driver, 40).until(
            EC.presence_of_element_located((By.XPATH, _HOT_FILTER_XPATH))
        )
        driver.execute_script(_ACTIVATE_FILTER_SCRIPT, _HOT_FILTER_XPATH)

        update_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, _UPDATE_XPATH))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
        # Fixed pause before reading the page source (presumably to let the
        # list finish rendering after the scroll — TODO confirm).
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        text_list = []
        for item in items[:40]:
            text_content = item.get_text(strip=True)
            if '默认火热沸爆爆' not in text_content:
                continue
            matched_count += 1
            formatted = _format_flash_item(text_content)
            if formatted is not None:
                text_list.append(formatted)

        if text_list:
            newest = text_list[0]
            print("最新一条消息:", newest)
            current_hash = hashlib.md5(newest.encode()).hexdigest()
            # Only notify once a baseline exists, so start-up does not
            # re-send a message that was already on the page.
            if last_matched_hash and current_hash != last_matched_hash:
                send_feishu_message(newest)
            last_matched_hash = current_hash

        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆爆'的信息")
    except Exception as e:
        print(f"执行出错: {e}")
    finally:
        if driver is not None:
            driver.quit()
|
|
# sp_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
|
|
# print('网页抓取间隔时间:', sp_time)
|
|
|
|
# Scheduler configuration: register fetch_news as a recurring job.
# NOTE(review): the job interval is 1 second but the loop below only polls
# every 50 seconds, so the job runs about once per poll — confirm the
# intended cadence.
monitor_job = schedule.every(1).seconds
monitor_job.do(fetch_news)

print("开始定时监控...")

# Poll forever; run_pending() executes the job whenever it is due.
while True:
    schedule.run_pending()
    time.sleep(50)
|