20250408修改
This commit is contained in:
184
3.新闻抓取与通知/2.py
Normal file
184
3.新闻抓取与通知/2.py
Normal file
@@ -0,0 +1,184 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import smtplib
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
import schedule
|
||||
import hashlib
|
||||
|
||||
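# Mailbox configuration.
# NOTE(review): the SMTP authorization code below is committed in plain text;
# rotate it and load it from an environment variable or an untracked config file.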
|
||||
from_email = "240884432@qq.com"
|
||||
from_password = "osjyjmbqrzxtbjbf"
|
||||
to_email = "240884432@qq.com"
|
||||
|
||||
# 全局存储上次匹配内容
|
||||
# 在Python中,`global` 关键字用于声明一个变量是全局变量,但不能直接用于赋值语句。
|
||||
# 应该先使用 `global` 声明变量,然后再进行赋值。
|
||||
last_matched_hash = None
|
||||
|
||||
# 邮件发送函数
|
||||
def send_email(content):
    """Send an HTML notification email through the QQ SMTP server.

    Reads the module-level ``from_email``, ``from_password`` and ``to_email``
    settings. Any failure is printed to stdout rather than raised.

    Args:
        content: HTML string used as the message body.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = '金十数据更新通知'
    msg['From'] = from_email
    msg['To'] = to_email
    msg.attach(MIMEText(content, 'html'))

    try:
        # The context manager closes the connection even when login() or
        # sendmail() raises; the timeout keeps a stalled server from
        # blocking the scheduler loop forever.
        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
        print("邮件发送成功")
    except Exception as e:
        print(f"邮件发送失败: {e}")
|
||||
|
||||
# 初始化 WebDriver
|
||||
# One-off scrape executed when the module is loaded: open the jin10 home
# page in headless Chrome, print every matching flash item and remember the
# hash of the tracked item in ``last_matched_hash``.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode: no browser window
driver = webdriver.Chrome(options=options)

try:
    driver.get("https://www.jin10.com/")

    # Wait (up to 20 s) until the filter element addressed by this absolute
    # XPath is present in the DOM.
    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
    target_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, target_xpath))
    )

    # Overwrite the element's class attribute from JavaScript.
    script = '''
    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    element.className = "hot-filter_item is-active";
    '''
    driver.execute_script(script, target_xpath)

    # Wait for the container element, scroll it into view and give the page
    # five seconds to refresh its content.
    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
    update_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, update_xpath))
    )
    driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
    time.sleep(5)

    # Parse the rendered page and collect the flash-item containers.
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    items = soup.find_all('div', class_='jin-flash-item-container is-normal')

    matched_count = 0
    # Must exist even when no item matches; otherwise the check after the
    # loop raises NameError.
    modified_text = ''

    # Only the first 50 items are scanned, and only those containing the
    # marker text are reported.
    for item in items[:50]:
        text_content = item.get_text(strip=True)
        if '默认火热沸爆' not in text_content:
            continue

        matched_count += 1
        parts = text_content.split('默认火热沸爆', 1)
        if len(parts) > 1:
            print(f"匹配信息 {matched_count}:")
            remainder = parts[1].strip()
            if remainder:
                # Insert ':' after the first character that follows the marker.
                modified_text = remainder[0] + ":" + remainder[1:]
                print(modified_text)
            else:
                print("")

    print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

    if modified_text:
        current_hash = hashlib.md5(modified_text.encode()).hexdigest()
        # No email is sent while last_matched_hash is still None (first run);
        # the hash is only recorded as the baseline.
        if last_matched_hash and current_hash != last_matched_hash:
            send_email(modified_text)
        last_matched_hash = current_hash

finally:
    # Always shut the browser down.
    driver.quit()
|
||||
|
||||
# 定时任务
|
||||
def fetch_news():
    """Scrape the jin10 flash list once and email the tracked item if it changed.

    Steps: start headless Chrome, load https://www.jin10.com/, set the class
    of the filter element via JavaScript, scroll the list container into
    view, wait five seconds, then parse the page with BeautifulSoup. The
    first 50 flash items are scanned for the marker text '默认火热沸爆'.

    The MD5 of the tracked text is compared with the module-level
    ``last_matched_hash``; an email is sent only when a previous hash exists
    and differs. Every exception is caught and printed, and the browser is
    always closed.

    NOTE(review): the loop overwrites ``modified_text`` on each match, so the
    last match among the scanned items is the one hashed and mailed. Confirm
    this is intended rather than the first match.
    """
    # Without this declaration the assignment further down makes the name
    # local, and the comparison before it raises UnboundLocalError.
    global last_matched_hash

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get("https://www.jin10.com/")

        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
        # Block (up to 20 s) until the filter element exists.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, target_xpath))
        )

        script = '''
        var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        element.className = "hot-filter_item is-active";
        '''
        driver.execute_script(script, target_xpath)

        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
        update_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, update_xpath))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
        time.sleep(5)  # give the page time to refresh its content

        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        modified_text = ''

        for item in items[:50]:
            text_content = item.get_text(strip=True)
            if '默认火热沸爆' not in text_content:
                continue

            matched_count += 1
            parts = text_content.split('默认火热沸爆', 1)
            if len(parts) > 1 and parts[1].strip():
                remainder = parts[1].strip()
                # Insert ':' after the first character following the marker.
                modified_text = remainder[0] + ":" + remainder[1:]

        if modified_text:
            current_hash = hashlib.md5(modified_text.encode()).hexdigest()
            # The first successful run only records the baseline hash.
            if last_matched_hash and current_hash != last_matched_hash:
                send_email(modified_text)
            last_matched_hash = current_hash

        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

    except Exception as e:
        print(f"执行出错: {e}")
    finally:
        driver.quit()
|
||||
|
||||
# Run fetch_news once every minute
|
||||
schedule.every(1).minutes.do(fetch_news)
|
||||
|
||||
print("开始定时监控...")
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(1)
|
||||
1087
3.新闻抓取与通知/jin10.ipynb
Normal file
1087
3.新闻抓取与通知/jin10.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
145
3.新闻抓取与通知/jin10_event.py
Normal file
145
3.新闻抓取与通知/jin10_event.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Created by: Niuyoucai
|
||||
# 2022.04.09 Ver 1.0
|
||||
# https://rili.jin10.com爬虫
|
||||
|
||||
import requests, re, sys
|
||||
from datetime import *
|
||||
from bs4 import BeautifulSoup
|
||||
import smtplib
|
||||
import email.message
|
||||
|
||||
class Spider(object):
    '''Scraper for the jin10 economic calendar (https://rili.jin10.com).

    Created by: Niuyoucai
    2022.04.09 Ver 1.0

    Call getHtmlContent() once per date; each call appends an HTML fragment
    to self.mailtext. mailqq() then mails everything collected so far.
    '''

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 Safari/537.36'
        }
        self.root = 'https://rili.jin10.com/day/'
        self.session = requests.session()
        self.html = ''      # raw page text of the most recent download
        self.mailtext = ''  # accumulated HTML body of the outgoing mail

    def getHtmlContent(self, dt):
        '''Download the calendar page for dt and extract all three sections.

        dt is appended to self.root to build the URL (main() passes a
        'YYYY-MM-DD' string). When the request fails, self.html is reset to
        '' and nothing is appended to self.mailtext.
        '''
        self.dt = dt
        url = self.root + dt
        try:
            # The timeout keeps a dead connection from hanging the script.
            response = self.session.get(url, headers=self.headers, timeout=30)
            response.encoding = 'utf-8'
            self.html = response.text
        except requests.exceptions.RequestException as e:
            self.html = ''
            print(repr(e))
            return
        self.getDatas()
        self.getEvents()
        self.getHolidays()

    @staticmethod
    def _section(html, start_marker, end_marker=None):
        '''Return html from start_marker up to (not including) end_marker.

        str.find() returns -1 for a missing marker; used as a slice index
        that silently selects the wrong text. A missing start marker
        therefore yields '', and a missing end marker extends the section to
        the end of html.
        '''
        start = html.find(start_marker)
        if start == -1:
            return ''
        if end_marker is None:
            return html[start:]
        end = html.find(end_marker)
        if end == -1:
            return html[start:]
        return html[start:end]

    @staticmethod
    def _rows_to_html(section, row_class, important_only):
        '''Flatten the matching <tr> rows of section into '<br>'-ended lines.'''
        soup = BeautifulSoup(section, 'html.parser')
        lines = []
        # A string passed as attrs is matched against the class attribute.
        for tr in soup.find_all('tr', attrs=row_class):
            if important_only and 'is-important' not in str(tr):
                continue
            # Strip the tags from every cell and separate cells by a space.
            cells = [re.sub('<.*?>', '', str(td)).strip() + ' '
                     for td in tr.find_all('td')]
            lines.append(''.join(cells) + '<br>')
        return ''.join(lines)

    def getDatas(self):
        '''Append the important economic data rows to self.mailtext.'''
        section = self._section(self.html, '经济数据', '事件')
        info = '[' + self.dt + ']' + '<br>' + '<b>重要经济数据</b>' + '<br>'
        info += self._rows_to_html(section, 'jin-table-row', True)
        self.mailtext += info

    def getEvents(self):
        '''Append the important event rows to self.mailtext.'''
        section = self._section(self.html, '事件', '假期')
        info = '<b>重要事件</b>' + '<br>'
        info += self._rows_to_html(section, 'jin-table-row', True)
        self.mailtext += info

    def getHolidays(self):
        '''Append the holiday rows to self.mailtext.'''
        section = self._section(self.html, '假期')
        info = '<b>假期</b>' + '<br>'
        info += self._rows_to_html(section, 'jin-table-row current-countdown', False)
        self.mailtext += info

    def mailqq(self):
        '''Mail self.mailtext as an HTML message via the QQ SMTP server.

        Returns immediately, without connecting, when nothing was scraped.
        SMTP protocol errors are printed instead of raised.
        '''
        # Do not send a mail when nothing was scraped.
        if self.mailtext == '':
            return

        msg = email.message.EmailMessage()
        msg["From"] = "240884432@qq.com"
        msg["To"] = "240884432@qq.com"
        msg["Subject"] = "每日交易提醒"
        text = "<html><body>" + self.mailtext + "</body></html>"
        # A single text/html body; the former set_type() + add_alternative()
        # combination produced an extra empty MIME part.
        msg.set_content(text, subtype='html')

        # The context manager closes the connection even when sending fails.
        with smtplib.SMTP_SSL('smtp.qq.com', 465) as smtpObj:
            try:
                # QQ mailbox third-party authorization code.
                # NOTE(review): this credential is committed in plain text;
                # rotate it and load it from the environment.
                smtpObj.login('240884432@qq.com', 'ifjgwlnzdvrfbjgf')
                smtpObj.send_message(msg)
            except smtplib.SMTPException as e:
                print(repr(e))
|
||||
|
||||
def main():
    '''Command-line entry point.

    With no argument, today's and tomorrow's calendar pages are scraped.
    With one argument, that argument is used as the single date to scrape.
    With more arguments, a usage line is printed and nothing else happens.
    '''
    argc = len(sys.argv)

    # Argument check: at most one optional date is accepted.
    if argc > 2:
        print('用法:', sys.argv[0], '[数据日期YYYY-MM-dd]')
        return

    if argc == 1:
        # Default: scrape today and tomorrow, then send a single mail.
        spider = Spider()
        for day_offset in (0, 1):
            day = datetime.now() + timedelta(days=day_offset)
            spider.getHtmlContent(day.strftime('%Y-%m-%d'))
        spider.mailqq()
    elif argc == 2:
        # Scrape only the date given on the command line.
        spider = Spider()
        spider.getHtmlContent(sys.argv[1])
        spider.mailqq()


if __name__ == '__main__':
    main()
|
||||
133
3.新闻抓取与通知/jin10_feishu.py
Normal file
133
3.新闻抓取与通知/jin10_feishu.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
# import smtplib
|
||||
# from email.mime.text import MIMEText
|
||||
# from email.mime.multipart import MIMEMultipart
|
||||
import hashlib
|
||||
import schedule
|
||||
|
||||
import requests
|
||||
|
||||
from_email = "240884432@qq.com"
|
||||
from_password = "osjyjmbqrzxtbjbf"
|
||||
to_email = "240884432@qq.com"
|
||||
|
||||
# 邮件发送函数
|
||||
# def send_email(content):
|
||||
# msg = MIMEMultipart('alternative')
|
||||
# msg['Subject'] = '金十数据更新通知'
|
||||
# msg['From'] = from_email
|
||||
# msg['To'] = to_email
|
||||
# msg.attach(MIMEText(content, 'html'))
|
||||
|
||||
# try:
|
||||
# server = smtplib.SMTP_SSL('smtp.qq.com', 465)
|
||||
# server.login(from_email, from_password)
|
||||
# server.sendmail(from_email, to_email, msg.as_string())
|
||||
# server.quit()
|
||||
# print("邮件发送成功")
|
||||
# except Exception as e:
|
||||
# print(f"邮件发送失败: {e}")
|
||||
|
||||
# 飞书消息发送函数
|
||||
def send_feishu_message(text):
    """Post text to the Feishu bot webhook as a plain-text message.

    Args:
        text: message body; it is formatted into a string before sending.

    A failed delivery is reported on stdout. Network errors, including the
    timeout, are not caught here, so the caller decides how to handle them.
    """
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "msg_type": "text",
        "content": {
            "text": f'{text}'
        }
    }
    # NOTE(review): the webhook URL embeds the bot token; move it to
    # configuration instead of committing it.
    response = requests.post(
        "https://open.feishu.cn/open-apis/bot/v2/hook/094b85fb-4fc3-46f3-9673-ddb9702f7885",
        headers=headers,
        json=data,
        timeout=10,  # never block the scheduler on a stalled connection
    )

    # Besides the HTTP status, treat a non-zero "code" field in the JSON
    # response body as a failure as well.
    try:
        body = response.json()
    except ValueError:
        body = {}
    rejected = isinstance(body, dict) and body.get("code", 0) != 0

    if response.status_code != 200 or rejected:
        print(f"飞书消息发送失败,状态码: {response.status_code}, 响应内容: {response.text}")
|
||||
|
||||
# 全局存储上次匹配内容
|
||||
last_matched_hash = None
|
||||
|
||||
# 主抓取函数
|
||||
def fetch_news():
    """Scrape the jin10 flash list once and push a changed item to Feishu.

    Steps: start headless Chrome, load https://www.jin10.com/, set the class
    of the filter element via JavaScript, scroll the list container into
    view, wait five seconds, then parse the page with BeautifulSoup. The
    first 40 flash items are scanned for the marker text '默认火热沸爆'.

    The MD5 of the tracked text is compared with the module-level
    ``last_matched_hash``; a Feishu message is sent only when a previous hash
    exists and differs. Every exception is caught and printed, and the
    browser is always closed.

    NOTE(review): the loop overwrites ``modified_text`` on each match, so the
    last match among the scanned items is the one hashed and sent. Confirm
    this is intended rather than the first match.
    """
    global last_matched_hash

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get("https://www.jin10.com/")

        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
        # Block (up to 20 s) until the filter element exists.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, target_xpath))
        )

        script = '''
        var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        element.className = "hot-filter_item is-active";
        '''
        driver.execute_script(script, target_xpath)

        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
        update_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, update_xpath))
        )
        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
        time.sleep(5)  # give the page time to refresh its content

        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        modified_text = ''

        for item in items[:40]:
            text_content = item.get_text(strip=True)
            if '默认火热沸爆' not in text_content:
                continue

            matched_count += 1
            parts = text_content.split('默认火热沸爆', 1)
            if len(parts) > 1 and parts[1].strip():
                remainder = parts[1].strip()
                # Insert ':' after the first character following the marker.
                modified_text = remainder[0] + ":" + remainder[1:]

        # The debug prints that used to sit here (str has no .iloc, and
        # indexing an empty list fails) raised inside this try block, so the
        # notification code below was never reached.
        if modified_text:
            current_hash = hashlib.md5(modified_text.encode()).hexdigest()
            # The first successful run only records the baseline hash.
            if last_matched_hash and current_hash != last_matched_hash:
                send_feishu_message(modified_text)
                print(modified_text)
            last_matched_hash = current_hash
            print("last_matched_hash", last_matched_hash)
            print("current_hash", current_hash)

        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

    except Exception as e:
        print(f"执行出错: {e}")
    finally:
        driver.quit()
|
||||
|
||||
# 定时任务配置
|
||||
schedule.every(1).minutes.do(fetch_news)
|
||||
|
||||
print("开始定时监控...")
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(1)
|
||||
76
3.新闻抓取与通知/jin10_new - 副本.py
Normal file
76
3.新闻抓取与通知/jin10_new - 副本.py
Normal file
@@ -0,0 +1,76 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
|
||||
# 初始化 WebDriver
|
||||
# Start Chrome without a visible window.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

try:
    # Load the jin10 home page.
    driver.get("https://www.jin10.com/")

    # Wait until the filter element addressed by this XPath is present.
    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
    filter_present = EC.presence_of_element_located((By.XPATH, target_xpath))
    target_element = WebDriverWait(driver, 20).until(filter_present)

    # Replace the element's class attribute from JavaScript.
    script = '''
    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    element.className = "hot-filter_item is-active";
    '''
    driver.execute_script(script, target_xpath)

    # Locate the list container, scroll to it and let the page settle.
    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
    container_present = EC.presence_of_element_located((By.XPATH, update_xpath))
    update_element = WebDriverWait(driver, 20).until(container_present)
    driver.execute_script('arguments[0].scrollIntoView(true);', update_element)
    time.sleep(5)

    # Parse the rendered HTML and pick up the flash-item containers.
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    items = soup.find_all('div', class_='jin-flash-item-container is-normal')

    # Scan the first 40 items and report those containing the marker text.
    matched_count = 0
    for item in items[:40]:
        text_content = item.get_text(strip=True)
        if '默认火热沸爆' in text_content:
            matched_count += 1
            print(f"匹配信息 {matched_count}:")
            # Everything after the first occurrence of the marker.
            tail = text_content.partition('默认火热沸爆')[2].strip()
            if not tail:
                print("")
            else:
                # Insert ':' after the first character following the marker.
                modified_text = tail[0] + ":" + tail[1:]
                print(modified_text)

    print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息")

finally:
    # Always shut the browser down.
    driver.quit()
|
||||
107
3.新闻抓取与通知/jin10_new.py
Normal file
107
3.新闻抓取与通知/jin10_new.py
Normal file
@@ -0,0 +1,107 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import smtplib
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
import hashlib
|
||||
import schedule
|
||||
|
||||
from_email = "240884432@qq.com"
|
||||
from_password = "osjyjmbqrzxtbjbf"
|
||||
to_email = "240884432@qq.com"
|
||||
|
||||
# 邮件发送函数
|
||||
def send_email(content):
    """Send an HTML notification email through the QQ SMTP server.

    Reads the module-level ``from_email``, ``from_password`` and ``to_email``
    settings. Any failure is printed to stdout rather than raised.

    Args:
        content: HTML string used as the message body.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = '金十数据更新通知'
    msg['From'] = from_email
    msg['To'] = to_email
    msg.attach(MIMEText(content, 'html'))

    try:
        # The context manager closes the connection even when login() or
        # sendmail() raises; the timeout keeps a stalled server from
        # blocking the scheduler loop forever.
        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
        print("邮件发送成功")
    except Exception as e:
        print(f"邮件发送失败: {e}")
|
||||
|
||||
# 全局存储上次匹配内容
|
||||
last_matched_hash = None
|
||||
|
||||
# 主抓取函数
|
||||
def fetch_news():
    """Scrape the jin10 flash list once and email the tracked item if it changed.

    Only items containing '默认火热沸爆爆' are counted; the text after the
    first '默认火热沸爆' is kept with ':' inserted after its first character.
    The MD5 of that text is compared with the module-level
    ``last_matched_hash``, and an email is sent only when an earlier hash
    exists and differs. All exceptions are printed, and the browser is
    always closed.

    NOTE(review): each match overwrites ``modified_text``, so the last match
    among the 40 scanned items is the one hashed. Confirm this is intended.
    """
    global last_matched_hash

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get("https://www.jin10.com/")

        # Wait for the filter element, then overwrite its class via JS.
        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'
        filter_present = EC.presence_of_element_located((By.XPATH, target_xpath))
        WebDriverWait(driver, 20).until(filter_present)

        script = '''
        var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        element.className = "hot-filter_item is-active";
        '''
        driver.execute_script(script, target_xpath)

        # Scroll the list container into view and let the page settle.
        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'
        container_present = EC.presence_of_element_located((By.XPATH, update_xpath))
        container = WebDriverWait(driver, 20).until(container_present)
        driver.execute_script('arguments[0].scrollIntoView(true);', container)
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')

        matched_count = 0
        modified_text = ''

        for entry in items[:40]:
            entry_text = entry.get_text(strip=True)
            if '默认火热沸爆爆' in entry_text:
                matched_count += 1
                # Everything after the first occurrence of the shorter marker.
                tail = entry_text.partition('默认火热沸爆')[2].strip()
                if tail:
                    modified_text = tail[0] + ":" + tail[1:]

        print(modified_text)
        if modified_text:
            current_hash = hashlib.md5(modified_text.encode()).hexdigest()
            changed = bool(last_matched_hash) and current_hash != last_matched_hash
            if changed:
                send_email(modified_text)
                print(modified_text)
            last_matched_hash = current_hash
            print("last_matched_hash", last_matched_hash)
            print("current_hash", current_hash)

        print(f"\n共找到 {matched_count} 条匹配'默认火热沸爆爆'的信息")

    except Exception as e:
        print(f"执行出错: {e}")
    finally:
        driver.quit()
|
||||
|
||||
# 定时任务配置
|
||||
schedule.every(1).minutes.do(fetch_news)
|
||||
|
||||
print("开始定时监控...")
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(1)
|
||||
134
3.新闻抓取与通知/jin10_send_mail.py
Normal file
134
3.新闻抓取与通知/jin10_send_mail.py
Normal file
@@ -0,0 +1,134 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import smtplib
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
import schedule
|
||||
import time
|
||||
import hashlib
|
||||
import csv # 导入csv模块
|
||||
|
||||
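# Mailbox configuration.
# NOTE(review): the SMTP authorization code below is committed in plain text;
# rotate it and load it from an environment variable or an untracked config file.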
|
||||
from_email = "240884432@qq.com"
|
||||
from_password = "osjyjmbqrzxtbjbf"
|
||||
to_email = "240884432@qq.com"
|
||||
|
||||
# 目标URL
|
||||
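# Changed to the new target URL.
# NOTE(review): the other scripts in this directory use https://www.jin10.com/;
# confirm that "jjin10" below is intentional and not a typo.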
|
||||
url = 'https://www.jjin10.com/'
|
||||
|
||||
# 设置请求头,模拟浏览器访问
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
|
||||
}
|
||||
|
||||
# 消息跟踪记录
|
||||
last_records = {} # 格式:{id: (hash, timestamp)}
|
||||
|
||||
# 发送邮件的函数
|
||||
def send_email(subject, content, to_email):
    """Send an HTML email through the QQ SMTP server.

    Uses the module-level ``from_email`` / ``from_password`` as sender
    credentials. Any failure is printed to stdout rather than raised.

    Args:
        subject: value of the Subject header.
        content: HTML string used as the message body.
        to_email: recipient address.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    msg['From'] = from_email
    msg['To'] = to_email
    msg.attach(MIMEText(content, 'html'))

    try:
        # SSL connection on port 465. The context manager closes it even
        # when login() or sendmail() raises; the timeout keeps a stalled
        # server from blocking the scheduler loop forever.
        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
        print("邮件发送成功")
    except Exception as e:
        print(f"邮件发送失败: {e}")
|
||||
|
||||
# 爬取并发送邮件的函数
|
||||
def fetch_and_notify():
    """Scrape flash items carrying the 'is-bao' tag and email the new ones.

    Steps: open ``url`` in headless Chrome, click the '.flash-hot_text.is-bao'
    filter button, wait three seconds, parse the page and keep the items
    that contain an ``<i class="flash-hot_text is-bao">`` tag.

    An item counts as new when its id is not in the module-level
    ``last_records`` or its content hash differs from the recorded one. New
    items are mailed in one message, recorded with a timestamp and written
    to news.csv. When tagged items exist but none is new, a short "no news"
    mail is sent. Records older than 24 hours are dropped after a send.
    """
    global last_records

    # Start the browser.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    filtered_items = []
    try:
        driver.get(url)

        # Click the filter button once it becomes clickable (10 s limit).
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.flash-hot_text.is-bao'))
        )
        driver.execute_script("arguments[0].click();", button)
        time.sleep(3)  # wait for the content to load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all('div', class_='jin-flash-item-container is-normal')
        filtered_items = [
            item for item in items
            if item.find('i', class_='flash-hot_text is-bao')
        ]
    except Exception as e:
        # A failed scrape must not terminate the scheduler loop; report it
        # and try again on the next run.
        print(f"执行出错: {e}")
        return
    finally:
        driver.quit()

    if not filtered_items:
        return

    # Hash each item's content and keep only the new or changed ones.
    new_items = []
    for item in filtered_items:
        item_id = item.get('id')
        text_node = item.find('div', class_='flash-text')
        if text_node is None:
            # No text container: nothing to hash or report for this item.
            continue
        content = text_node.get_text(strip=True)
        content_hash = hashlib.md5(content.encode()).hexdigest()

        # Two-step check: unknown id, or known id with a different hash.
        if item_id not in last_records or last_records[item_id][0] != content_hash:
            time_node = item.find('div', class_='item-time')
            new_items.append({
                'id': item_id,
                'time': time_node.get_text(strip=True) if time_node is not None else '',
                'content': content,
                'hash': content_hash
            })

    if new_items:
        email_content = ""
        for i, item in enumerate(new_items, 1):
            email_content += f"<p><strong>消息 {i}:</strong><br>时间: {item['time']}<br>内容: {item['content']}</p>"
            email_content += '<hr>'
            # Remember the item together with the time it was recorded.
            last_records[item['id']] = (item['hash'], time.time())

        send_email("金十数据市场快讯", email_content, to_email)

        # Drop records older than 24 hours.
        expire_time = time.time() - 86400
        last_records = {k: v for k, v in last_records.items() if v[1] > expire_time}

        # Write the new items to the CSV file.
        # NOTE(review): mode 'w' replaces the file on every batch; confirm
        # that appending is not what was intended.
        with open('news.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
            fieldnames = ['id', 'time', 'content', 'hash']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for item in new_items:
                writer.writerow(item)
    else:
        # Nothing new: send a short notice instead.
        send_email("金十数据市场快讯", "<p>没有新的市场快讯信息。</p>", to_email)
|
||||
|
||||
# 每5分钟运行一次
|
||||
schedule.every(5).minutes.do(fetch_and_notify)
|
||||
|
||||
print("开始监控市场快讯信息...")
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(1)
|
||||
43
3.新闻抓取与通知/send_mail_test.py
Normal file
43
3.新闻抓取与通知/send_mail_test.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
# 加入邮件通知
|
||||
import smtplib
|
||||
from email.mime.text import MIMEText # 导入 MIMEText 类发送纯文本邮件
|
||||
from email.mime.multipart import MIMEMultipart # 导入 MIMEMultipart 类发送带有附件的邮件
|
||||
from email.mime.application import MIMEApplication # 导入 MIMEApplication 类发送二进制附件
|
||||
|
||||
## 配置邮件信息
|
||||
receivers = ["*****@qq.com"] # 设置邮件接收人地址
|
||||
subject = "订单流策略交易信号" # 设置邮件主题
|
||||
#text = " " # 设置邮件正文
|
||||
# file_path = "test.txt" # 设置邮件附件文件路径
|
||||
|
||||
## 配置邮件服务器信息
|
||||
smtp_server = "smtp.qq.com" # 设置发送邮件的 SMTP 服务器地址
|
||||
smtp_port = 465 # 设置发送邮件的 SMTP 服务器端口号,一般为 25 端口 465
|
||||
sender = "***@qq.com" # 设置发送邮件的邮箱地址
|
||||
username = "***@@qq.com" # 设置发送邮件的邮箱用户名
|
||||
password = "osjyjmbqrzxtbjbf" #zrmpcgttataabhjh,设置发送邮件的邮箱密码或授权码
|
||||
|
||||
def send_mail(text):
    """Send text as a plain-text UTF-8 email to every address in receivers.

    Uses the module-level ``sender``, ``receivers``, ``subject``,
    ``smtp_server``, ``smtp_port``, ``username`` and ``password`` settings.
    Errors raised by smtplib propagate to the caller.

    Args:
        text: body of the message.
    """
    msg = MIMEMultipart()
    msg["From"] = sender
    # Address lists in mail headers are comma-separated, not ';'-separated.
    msg["To"] = ", ".join(receivers)
    msg["Subject"] = subject
    msg.attach(MIMEText(text, "plain", "utf-8"))

    # The context manager closes the connection even when login() or
    # sendmail() raises.
    with smtplib.SMTP_SSL(smtp_server, smtp_port) as smtp:
        smtp.login(username, password)
        smtp.sendmail(sender, receivers, msg.as_string())
|
||||
|
||||
# Time of day (HH:MM:SS) the check below compares against. Defined once so
# the console output and the mail text cannot disagree.
TARGET_TIME = '15:30:00'

# Get the current time.
now = datetime.now()

# Report, on the console and by mail, whether it is exactly TARGET_TIME.
if now.strftime('%H:%M:%S') == TARGET_TIME:
    print(f"当前时间是{TARGET_TIME}。")
    send_mail(f"当前时间是{TARGET_TIME}")
else:
    print(f"当前时间不是{TARGET_TIME}。")
    send_mail(f"当前时间不是{TARGET_TIME}")
|
||||
Reference in New Issue
Block a user