Files
Quant_Code/3.新闻抓取与通知/jin10_feishu.py

184 lines
6.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
# import smtplib
# from email.mime.text import MIMEText
# from email.mime.multipart import MIMEMultipart
import hashlib
import schedule
import requests
import re
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# from_email = "240884432@qq.com"
# from_password = "osjyjmbqrzxtbjbf"
# to_email = "240884432@qq.com"
# 邮件发送函数
# def send_email(content):
# msg = MIMEMultipart('alternative')
# msg['Subject'] = '金十数据更新通知'
# msg['From'] = from_email
# msg['To'] = to_email
# msg.attach(MIMEText(content, 'html'))
# try:
# server = smtplib.SMTP_SSL('smtp.qq.com', 465)
# server.login(from_email, from_password)
# server.sendmail(from_email, to_email, msg.as_string())
# server.quit()
# print("邮件发送成功")
# except Exception as e:
# print(f"邮件发送失败: {e}")
# Feishu (Lark) message sender
def send_feishu_message(text, timeout=10):
    """Post *text* as a plain-text message to the Feishu bot webhook.

    Args:
        text: Message content; it is converted with ``str()`` before sending.
        timeout: Seconds to wait for the webhook before giving up. Without a
            timeout a stalled connection would block the monitor loop forever.

    Raises:
        requests.RequestException: On connection errors or timeouts. This is
            deliberately not swallowed here so the caller can skip recording
            the message as delivered and retry on its next run.
    """
    # NOTE(review): the webhook URL embeds the bot's secret token; consider
    # loading it from configuration or an environment variable instead.
    webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/094b85fb-4fc3-46f3-9673-ddb9702f7885"
    headers = {
        "Content-Type": "application/json",
    }
    data = {
        "msg_type": "text",
        "content": {
            "text": str(text),
        },
    }
    response = requests.post(webhook_url, headers=headers, json=data, timeout=timeout)
    if response.status_code != 200:
        print(f"飞书消息发送失败,状态码: {response.status_code}, 响应内容: {response.text}")
        return

    # NOTE(review): the webhook appears to report some rejections with HTTP
    # 200 and a non-zero "code" field in the JSON body — confirm against the
    # Feishu custom bot documentation.
    try:
        body = response.json()
    except ValueError:
        # Body is not JSON; nothing further to inspect.
        return
    if isinstance(body, dict) and body.get("code", 0) != 0:
        print(f"飞书消息发送失败,状态码: {response.status_code}, 响应内容: {response.text}")
# Module-level state: MD5 hex digest of the newest formatted message seen by
# the previous fetch_news() run; None until a run has produced a message.
last_matched_hash = None
# Main scraping routine and its parsing helper

# HH:MM:SS timestamp pattern, compiled once instead of on every item.
_FLASH_TIME_RE = re.compile(r'\d{2}:\d{2}:\d{2}')


def _format_flash_item(text_content):
    """Build the notification text for one flash item's flattened text.

    The timestamp is the first HH:MM:SS found at or after the
    '分享收藏详情复制' marker. The text following the first '默认火热沸爆'
    marker is stripped; its first character is reported as the level and the
    remainder as the message body.

    Args:
        text_content: Whitespace-stripped text of a single flash item.

    Returns:
        The formatted message, or None when the marker, the timestamp or the
        body is missing, so a timestamp from another item is never reused.
    """
    marker_at = text_content.find('分享收藏详情复制')
    if marker_at == -1:
        return None
    time_match = _FLASH_TIME_RE.search(text_content, marker_at)
    if time_match is None:
        return None
    parts = text_content.split('默认火热沸爆', 1)
    if len(parts) < 2:
        return None
    body = parts[1].strip()
    if not body:
        return None
    return "消息时间:" + time_match.group() + "\n" + "消息等级:" + body[0] + "\n" + body[1:]


def fetch_news():
    """Scrape jin10.com once and push the newest matching item to Feishu.

    Starts a headless Chrome session, loads the page, overwrites the class
    name of one filter element, scrolls a lower element into view, then parses
    the first 40 flash items from the page source. Items containing
    '默认火热沸爆爆' are formatted; the first formatted item is hashed and
    compared with ``last_matched_hash``. A Feishu message is sent only when a
    previous hash exists and differs, so the first successful run just records
    a baseline.

    Any exception raised after the browser has started is printed rather than
    propagated, and the browser is always closed.
    """
    global last_matched_hash

    # Headless Chrome with certificate errors ignored.
    options = webdriver.ChromeOptions()
    for flag in (
        '--headless',
        '--ignore-certificate-errors',
        '--allow-insecure-localhost',
        '--ssl-protocol=TLSv1.2',
        '--no-proxy-server',
        '--enable-logging',
        '--v=1',  # log verbosity level
    ):
        options.add_argument(flag)
    driver = webdriver.Chrome(options=options)

    try:
        driver.get("https://www.jin10.com/")

        # Wait for the filter element, then mark it active by overwriting its
        # class name. The XPath is re-evaluated inside the page so the script
        # does not depend on a possibly stale element reference.
        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
        WebDriverWait(driver, 40).until(
            EC.presence_of_element_located((By.XPATH, target_xpath))
        )
        script = '''
var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
element.className = "hot-filter_item is-active";
'''
        driver.execute_script(script, target_xpath)

        # Scroll the lower element into view, then pause before reading the
        # page source (presumably to let the list refresh — TODO confirm).
        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
        update_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, update_xpath))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        text_list = []
        for item in items[:40]:
            text_content = item.get_text(strip=True)
            if '默认火热沸爆爆' not in text_content:
                continue
            matched_count += 1
            formatted = _format_flash_item(text_content)
            if formatted is not None:
                text_list.append(formatted)

        if text_list:
            latest = text_list[0]
            print("最新一条消息:", latest)
            current_hash = hashlib.md5(latest.encode()).hexdigest()
            # No notification on the very first run: there is nothing to
            # compare against yet, so only the baseline hash is stored.
            if last_matched_hash and current_hash != last_matched_hash:
                send_feishu_message(latest)
            # Reached only when sending did not raise, so a failed send is
            # retried on the next run.
            last_matched_hash = current_hash

        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆爆'的信息")
    except Exception as e:
        # Top-level boundary for one scheduled run: report and keep the
        # scheduler alive.
        print(f"执行出错: {e}")
    finally:
        driver.quit()
# sp_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
# print('网页抓取间隔时间:', sp_time)
# Scheduler wiring: register fetch_news as a recurring job, then poll forever.
# NOTE(review): the job is registered with a 1-second interval, but pending
# jobs are only checked once per pause below, so the pause (plus the duration
# of each fetch) appears to set the real cadence — confirm this is intended.
POLL_PAUSE_SECONDS = 50

schedule.every(1).seconds.do(fetch_news)
print("开始定时监控...")
while True:
    # Run whatever is due, then idle until the next poll.
    schedule.run_pending()
    time.sleep(POLL_PAUSE_SECONDS)