Files
2025-04-09 17:18:30 +08:00

1088 lines
44 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with the request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# Fetch the page; the timeout keeps the cell from hanging on a stalled connection\n",
"response = requests.get(url, headers=headers, timeout=10)\n",
"response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"\n",
"# Parse the returned HTML\n",
"soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
"# CSS selector of the single element to inspect\n",
"selector = '#JinFlashList > div.flash-top > div.flash-top_right > div.flash-top_tool.tw-change-box > div.tool-setting.hide-dot > span:nth-child(2) > div.setting-popup > div:nth-child(3) > div > div.hot-filter_bot > div:nth-child(3)'\n",
"element = soup.select_one(selector)\n",
"\n",
"# Print the element when it exists, otherwise a not-found message\n",
"if element:\n",
"    print(element.prettify())\n",
"else:\n",
"    print(\"未找到指定的元素\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with the request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# Fetch the page; the timeout keeps the cell from hanging on a stalled connection\n",
"response = requests.get(url, headers=headers, timeout=10)\n",
"response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"\n",
"# Parse the returned HTML\n",
"soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
"# All flash-item containers found in the HTML\n",
"items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"# Keep each item that has a '.hot-filter_item_label' whose text is '爆'\n",
"filtered_items = [\n",
"    item for item in items\n",
"    if any(label.get_text(strip=True) == '爆' for label in item.select('.hot-filter_item_label'))\n",
"]\n",
"\n",
"# Print time and text of every kept item.\n",
"# The local is named time_text so it does not rebind the name `time`,\n",
"# which other cells of this notebook use for the stdlib module.\n",
"for item in filtered_items:\n",
"    time_text = item.find('div', class_='item-time').get_text(strip=True)\n",
"    content = item.find('div', class_='flash-text').get_text(strip=True)\n",
"    print(f\"时间: {time_text}\")\n",
"    print(f\"内容: {content}\")\n",
"    print('-' * 40)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with the request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# Fetch the page; the timeout keeps the cell from hanging on a stalled connection\n",
"response = requests.get(url, headers=headers, timeout=10)\n",
"response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"\n",
"# Parse the returned HTML\n",
"soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
"# All flash-item containers found in the HTML\n",
"items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"# Keep up to 20 items that carry a '.hot-filter_item_label' with text '火'.\n",
"# any() adds an item at most once even when several of its labels match.\n",
"filtered_items = []\n",
"for item in items:\n",
"    if any(label.get_text(strip=True) == '火' for label in item.select('.hot-filter_item_label')):\n",
"        filtered_items.append(item)\n",
"        if len(filtered_items) >= 20:\n",
"            break\n",
"\n",
"# Print the kept items (at most 20), numbered from 1.\n",
"# time_text avoids rebinding the name `time` used elsewhere for the stdlib module.\n",
"for i, item in enumerate(filtered_items, 1):\n",
"    time_text = item.find('div', class_='item-time').get_text(strip=True)\n",
"    content = item.find('div', class_='flash-text').get_text(strip=True)\n",
"    print(f\"消息 {i}:\")\n",
"    print(f\"时间: {time_text}\")\n",
"    print(f\"内容: {content}\")\n",
"    print('-' * 40)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with the request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# Fetch the page; the timeout keeps the cell from hanging on a stalled connection\n",
"response = requests.get(url, headers=headers, timeout=10)\n",
"response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"\n",
"# Parse the returned HTML\n",
"soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
"# All flash-item containers found in the HTML\n",
"items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"# Keep up to 20 items that contain an <i class='jin-icon iconfont icon-huo is-huo'>\n",
"filtered_items = []\n",
"for item in items:\n",
"    if item.find('i', class_='jin-icon iconfont icon-huo is-huo'):\n",
"        filtered_items.append(item)\n",
"        if len(filtered_items) >= 20:\n",
"            break\n",
"\n",
"# Print the kept items (at most 20), numbered from 1.\n",
"# time_text avoids rebinding the name `time` used elsewhere for the stdlib module.\n",
"for i, item in enumerate(filtered_items, 1):\n",
"    time_text = item.find('div', class_='item-time').get_text(strip=True)\n",
"    content = item.find('div', class_='flash-text').get_text(strip=True)\n",
"    print(f\"消息 {i}:\")\n",
"    print(f\"时间: {time_text}\")\n",
"    print(f\"内容: {content}\")\n",
"    print('-' * 40)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import smtplib\n",
"from email.mime.multipart import MIMEMultipart\n",
"from email.mime.text import MIMEText\n",
"import schedule\n",
"import time\n",
"\n",
"# Mail account settings. The SMTP authorization code comes from the environment\n",
"# so that the secret is not stored in the notebook.\n",
"# NOTE(review): a code that was ever committed in plain text should be revoked.\n",
"import os\n",
"\n",
"from_email = os.environ.get(\"JIN10_FROM_EMAIL\", \"240884432@qq.com\")\n",
"from_password = os.environ[\"JIN10_SMTP_PASSWORD\"]  # KeyError if the variable is unset\n",
"to_email = os.environ.get(\"JIN10_TO_EMAIL\", \"240884432@qq.com\")\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with every request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# Items matched on the previous run\n",
"last_items = []\n",
"\n",
"# Send an HTML e-mail through the QQ SMTP server\n",
"def send_email(subject, content, to_email):\n",
"    \"\"\"Send `content` as an HTML mail with `subject` to `to_email`.\n",
"\n",
"    Logs in with the module-level from_email / from_password.\n",
"    Any failure is printed instead of raised, so a caller keeps running.\n",
"    \"\"\"\n",
"    msg = MIMEMultipart('alternative')\n",
"    msg['Subject'] = subject\n",
"    msg['From'] = from_email\n",
"    msg['To'] = to_email\n",
"    msg.attach(MIMEText(content, 'html'))\n",
"\n",
"    try:\n",
"        # The context manager closes the connection even when login or sendmail\n",
"        # raises; the timeout bounds a stalled connection. 465 is the SSL port.\n",
"        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:\n",
"            server.login(from_email, from_password)\n",
"            server.sendmail(from_email, to_email, msg.as_string())\n",
"        print(\"邮件发送成功\")\n",
"    except Exception as e:\n",
"        print(f\"邮件发送失败: {e}\")\n",
"\n",
"# Scrape the page and mail newly seen hot items\n",
"def fetch_and_notify():\n",
"    \"\"\"Fetch jin10.com and mail the items not seen on the previous run.\n",
"\n",
"    Takes up to 20 flash-item containers that hold an\n",
"    <i class='jin-icon iconfont icon-huo is-huo'> element, compares them with\n",
"    the module-level last_items and mails the ones not found there. When all\n",
"    of them were already seen, a fixed 'no new items' mail is sent instead.\n",
"    Returns None; returns early when the request fails or nothing matches.\n",
"    \"\"\"\n",
"    global last_items\n",
"    from html import escape  # stdlib; local import keeps the cell self-contained\n",
"\n",
"    try:\n",
"        response = requests.get(url, headers=headers, timeout=10)\n",
"        response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"    except requests.RequestException as e:\n",
"        # schedule does not catch job exceptions, so an uncaught network error\n",
"        # would end the polling loop; report it and wait for the next run.\n",
"        print(f\"请求失败: {e}\")\n",
"        return\n",
"\n",
"    # Parse the HTML and collect the flash-item containers\n",
"    soup = BeautifulSoup(response.content, 'html.parser')\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    filtered_items = []\n",
"    for item in items:\n",
"        if item.find('i', class_='jin-icon iconfont icon-huo is-huo'):\n",
"            filtered_items.append(item)\n",
"            if len(filtered_items) >= 20:\n",
"                break\n",
"\n",
"    if not filtered_items:\n",
"        return\n",
"\n",
"    new_items = [item for item in filtered_items if item not in last_items]\n",
"\n",
"    if new_items:\n",
"        email_content = \"\"\n",
"        for i, item in enumerate(new_items, 1):\n",
"            # Scraped text is inserted into HTML, so escape it first\n",
"            time_text = escape(item.find('div', class_='item-time').get_text(strip=True))\n",
"            content_text = escape(item.find('div', class_='flash-text').get_text(strip=True))\n",
"            email_content += f\"<p><strong>消息 {i}:</strong><br>时间: {time_text}<br>内容: {content_text}</p>\"\n",
"            email_content += '<hr>'\n",
"\n",
"        send_email(\"金十数据市场快讯\", email_content, to_email)\n",
"        last_items = filtered_items\n",
"    else:\n",
"        send_email(\"金十数据市场快讯\", \"<p>没有新的市场快讯信息。</p>\", to_email)\n",
"\n",
"# Drop jobs left on schedule's shared default scheduler by earlier runs of this\n",
"# or other cells, then register the job to run once per minute.\n",
"schedule.clear()\n",
"schedule.every(1).minutes.do(fetch_and_notify)\n",
"\n",
"print(\"开始监控市场快讯信息...\")\n",
"# Poll for due jobs once per second; this loop blocks the kernel until interrupted\n",
"while True:\n",
"    schedule.run_pending()\n",
"    time.sleep(1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import smtplib\n",
"from email.mime.multipart import MIMEMultipart\n",
"from email.mime.text import MIMEText\n",
"import schedule\n",
"import time\n",
"\n",
"# Mail account settings. The SMTP authorization code comes from the environment\n",
"# so that the secret is not stored in the notebook.\n",
"# NOTE(review): a code that was ever committed in plain text should be revoked.\n",
"import os\n",
"\n",
"from_email = os.environ.get(\"JIN10_FROM_EMAIL\", \"240884432@qq.com\")\n",
"from_password = os.environ[\"JIN10_SMTP_PASSWORD\"]  # KeyError if the variable is unset\n",
"to_email = os.environ.get(\"JIN10_TO_EMAIL\", \"240884432@qq.com\")\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with every request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# Items matched on the previous run\n",
"last_items = []\n",
"\n",
"# Send an HTML e-mail through the QQ SMTP server\n",
"def send_email(subject, content, to_email):\n",
"    \"\"\"Send `content` as an HTML mail with `subject` to `to_email`.\n",
"\n",
"    Logs in with the module-level from_email / from_password.\n",
"    Any failure is printed instead of raised, so a caller keeps running.\n",
"    \"\"\"\n",
"    msg = MIMEMultipart('alternative')\n",
"    msg['Subject'] = subject\n",
"    msg['From'] = from_email\n",
"    msg['To'] = to_email\n",
"    msg.attach(MIMEText(content, 'html'))\n",
"\n",
"    try:\n",
"        # The context manager closes the connection even when login or sendmail\n",
"        # raises; the timeout bounds a stalled connection. 465 is the SSL port.\n",
"        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:\n",
"            server.login(from_email, from_password)\n",
"            server.sendmail(from_email, to_email, msg.as_string())\n",
"        print(\"邮件发送成功\")\n",
"    except Exception as e:\n",
"        print(f\"邮件发送失败: {e}\")\n",
"\n",
"# Scrape the page and mail newly seen hot items\n",
"def fetch_and_notify():\n",
"    \"\"\"Fetch jin10.com and mail the items not seen on the previous run.\n",
"\n",
"    Takes up to 10 flash-item containers that hold an\n",
"    <i class='jin-icon iconfont icon-huo is-huo'> element, compares them with\n",
"    the module-level last_items and mails the ones not found there. When all\n",
"    of them were already seen, a fixed 'no new items' mail is sent instead.\n",
"    Returns None; returns early when the request fails or nothing matches.\n",
"    \"\"\"\n",
"    global last_items\n",
"    from html import escape  # stdlib; local import keeps the cell self-contained\n",
"\n",
"    try:\n",
"        response = requests.get(url, headers=headers, timeout=10)\n",
"        response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"    except requests.RequestException as e:\n",
"        # schedule does not catch job exceptions, so an uncaught network error\n",
"        # would end the polling loop; report it and wait for the next run.\n",
"        print(f\"请求失败: {e}\")\n",
"        return\n",
"\n",
"    soup = BeautifulSoup(response.content, 'html.parser')\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    filtered_items = []\n",
"    for item in items:\n",
"        if item.find('i', class_='jin-icon iconfont icon-huo is-huo'):\n",
"            filtered_items.append(item)\n",
"            if len(filtered_items) >= 10:\n",
"                break\n",
"\n",
"    if not filtered_items:\n",
"        return\n",
"\n",
"    new_items = [item for item in filtered_items if item not in last_items]\n",
"\n",
"    if new_items:\n",
"        email_content = \"\"\n",
"        for i, item in enumerate(new_items, 1):\n",
"            # Scraped text is inserted into HTML, so escape it first\n",
"            time_text = escape(item.find('div', class_='item-time').get_text(strip=True))\n",
"            content_text = escape(item.find('div', class_='flash-text').get_text(strip=True))\n",
"            email_content += f\"<p><strong>消息 {i}:</strong><br>时间: {time_text}<br>内容: {content_text}</p>\"\n",
"            email_content += '<hr>'\n",
"\n",
"        send_email(\"金十数据市场快讯\", email_content, to_email)\n",
"        last_items = filtered_items\n",
"    else:\n",
"        send_email(\"金十数据市场快讯\", \"<p>没有新的市场快讯信息。</p>\", to_email)\n",
"\n",
"# Drop jobs left on schedule's shared default scheduler by earlier runs of this\n",
"# or other cells, then register the job to run once per minute.\n",
"schedule.clear()\n",
"schedule.every(1).minutes.do(fetch_and_notify)\n",
"\n",
"print(\"开始监控市场快讯信息...\")\n",
"# Poll for due jobs once per second; this loop blocks the kernel until interrupted\n",
"while True:\n",
"    schedule.run_pending()\n",
"    time.sleep(1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import smtplib\n",
"from email.mime.multipart import MIMEMultipart\n",
"from email.mime.text import MIMEText\n",
"import schedule\n",
"import time\n",
"\n",
"# Mail account settings. The SMTP authorization code comes from the environment\n",
"# so that the secret is not stored in the notebook.\n",
"# NOTE(review): a code that was ever committed in plain text should be revoked.\n",
"import os\n",
"\n",
"from_email = os.environ.get(\"JIN10_FROM_EMAIL\", \"240884432@qq.com\")\n",
"from_password = os.environ[\"JIN10_SMTP_PASSWORD\"]  # KeyError if the variable is unset\n",
"to_email = os.environ.get(\"JIN10_TO_EMAIL\", \"240884432@qq.com\")\n",
"\n",
"# Target URL\n",
"url = 'https://www.jin10.com/'\n",
"\n",
"# Browser-like User-Agent header sent with every request\n",
"headers = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n",
"}\n",
"\n",
"# id attributes of the items matched on the previous run\n",
"last_item_ids = []\n",
"\n",
"# Send an HTML e-mail through the QQ SMTP server\n",
"def send_email(subject, content, to_email):\n",
"    \"\"\"Send `content` as an HTML mail with `subject` to `to_email`.\n",
"\n",
"    Logs in with the module-level from_email / from_password.\n",
"    Any failure is printed instead of raised, so a caller keeps running.\n",
"    \"\"\"\n",
"    msg = MIMEMultipart('alternative')\n",
"    msg['Subject'] = subject\n",
"    msg['From'] = from_email\n",
"    msg['To'] = to_email\n",
"    msg.attach(MIMEText(content, 'html'))\n",
"\n",
"    try:\n",
"        # The context manager closes the connection even when login or sendmail\n",
"        # raises; the timeout bounds a stalled connection. 465 is the SSL port.\n",
"        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:\n",
"            server.login(from_email, from_password)\n",
"            server.sendmail(from_email, to_email, msg.as_string())\n",
"        print(\"邮件发送成功\")\n",
"    except Exception as e:\n",
"        print(f\"邮件发送失败: {e}\")\n",
"\n",
"# Scrape the page and mail newly seen hot items\n",
"def fetch_and_notify():\n",
"    \"\"\"Fetch jin10.com and mail the items whose id was not seen on the previous run.\n",
"\n",
"    Takes up to 5 flash-item containers that hold an\n",
"    <i class='flash-hot_text is-fei'> element and mails those whose id\n",
"    attribute is not in the module-level last_item_ids. When all of them were\n",
"    already seen, a fixed 'no new items' mail is sent instead.\n",
"    Returns None; returns early when the request fails or nothing matches.\n",
"    \"\"\"\n",
"    global last_item_ids\n",
"    from html import escape  # stdlib; local import keeps the cell self-contained\n",
"\n",
"    try:\n",
"        response = requests.get(url, headers=headers, timeout=10)\n",
"        response.raise_for_status()  # raise on HTTP 4xx/5xx\n",
"    except requests.RequestException as e:\n",
"        # schedule does not catch job exceptions, so an uncaught network error\n",
"        # would end the polling loop; report it and wait for the next run.\n",
"        print(f\"请求失败: {e}\")\n",
"        return\n",
"\n",
"    soup = BeautifulSoup(response.content, 'html.parser')\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    filtered_items = []\n",
"    new_item_ids = []\n",
"\n",
"    for item in items:\n",
"        # alternative marker: 'jin-icon iconfont icon-huo is-huo'\n",
"        if item.find('i', class_='flash-hot_text is-fei'):\n",
"            filtered_items.append(item)\n",
"            new_item_ids.append(item.get('id'))\n",
"            if len(filtered_items) >= 5:\n",
"                break\n",
"\n",
"    if not filtered_items:\n",
"        return\n",
"\n",
"    new_items = [item for item in filtered_items if item.get('id') not in last_item_ids]\n",
"\n",
"    if new_items:\n",
"        email_content = \"\"\n",
"        for i, item in enumerate(new_items, 1):\n",
"            # Scraped text is inserted into HTML, so escape it first\n",
"            time_text = escape(item.find('div', class_='item-time').get_text(strip=True))\n",
"            content_text = escape(item.find('div', class_='flash-text').get_text(strip=True))\n",
"            email_content += f\"<p><strong>消息 {i}:</strong><br>时间: {time_text}<br>内容: {content_text}</p>\"\n",
"            email_content += '<hr>'\n",
"\n",
"        send_email(\"金十数据市场快讯\", email_content, to_email)\n",
"        last_item_ids = new_item_ids\n",
"    else:\n",
"        send_email(\"金十数据市场快讯\", \"<p>没有新的市场快讯信息。</p>\", to_email)\n",
"\n",
"# Drop jobs left on schedule's shared default scheduler by earlier runs of this\n",
"# or other cells, then register the job to run once per minute.\n",
"schedule.clear()\n",
"schedule.every(1).minutes.do(fetch_and_notify)\n",
"\n",
"print(\"开始监控市场快讯信息...\")\n",
"# Poll for due jobs once per second; this loop blocks the kernel until interrupted\n",
"while True:\n",
"    schedule.run_pending()\n",
"    time.sleep(1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import time\n",
"\n",
"# Start a headless Chrome WebDriver\n",
"options = webdriver.ChromeOptions()\n",
"options.add_argument('--headless')  # no visible browser window\n",
"driver = webdriver.Chrome(options=options)\n",
"\n",
"try:\n",
"    # Open the jin10 home page\n",
"    driver.get(\"https://www.jin10.com/\")\n",
"\n",
"    # Wait up to 10 s for the target element to be present in the DOM\n",
"    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n",
"    target_element = WebDriverWait(driver, 10).until(\n",
"        EC.presence_of_element_located((By.XPATH, target_xpath))\n",
"    )\n",
"\n",
"    # Overwrite the element's class attribute with \"hot-filter_item is-active\" via JavaScript\n",
"    driver.execute_script('arguments[0].className = \"hot-filter_item is-active\";', target_element)\n",
"\n",
"    # Fixed pause before reading the page (adjust as needed)\n",
"    time.sleep(5)\n",
"\n",
"    # Snapshot the rendered page\n",
"    page_source = driver.page_source\n",
"\n",
"    # Parse the HTML with BeautifulSoup\n",
"    soup = BeautifulSoup(page_source, 'lxml')\n",
"\n",
"    # Collect the <div class='jin-flash-item-container is-normal'> containers\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    # Print only the first 5 items.\n",
"    # time_text is used so the imported `time` module is not rebound to a string.\n",
"    for index, item in enumerate(items[:5], start=1):\n",
"        time_text = item.find('div', class_='item-time').get_text(strip=True)\n",
"        content = item.find('div', class_='flash-text').get_text(strip=True)\n",
"        print(f\"消息 {index}:\")\n",
"        print(f\"时间: {time_text}\")\n",
"        print(f\"内容: {content}\")\n",
"        print('-' * 40)\n",
"\n",
"finally:\n",
"    # Always shut the browser down\n",
"    driver.quit()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"# Headless Chrome session (no browser window is opened)\n",
"options = webdriver.ChromeOptions()\n",
"options.add_argument('--headless')\n",
"driver = webdriver.Chrome(options=options)\n",
"\n",
"try:\n",
"    # Load the jin10 home page\n",
"    driver.get(\"https://www.jin10.com/\")\n",
"\n",
"    # Block (max 20 s) until the node addressed by target_xpath exists\n",
"    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n",
"    presence = EC.presence_of_element_located((By.XPATH, target_xpath))\n",
"    WebDriverWait(driver, 20).until(presence)\n",
"\n",
"    # Resolve the node again inside the browser by XPath and overwrite its class attribute\n",
"    script = '''\n",
"    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n",
"    element.className = \"hot-filter_item is-active\";\n",
"    '''\n",
"    driver.execute_script(script, target_xpath)\n",
"\n",
"    # Fixed pause before the page is read\n",
"    time.sleep(5)\n",
"\n",
"    # Snapshot the rendered page and parse it\n",
"    page_source = driver.page_source\n",
"    soup = BeautifulSoup(page_source, 'lxml')\n",
"\n",
"    # Containers with class 'jin-flash-item-container is-normal'\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    # Show the text of the first 5 containers\n",
"    index = 0\n",
"    for item in items[:5]:\n",
"        index += 1\n",
"        print(f\"Information {index}:\")\n",
"        print(item.get_text(strip=True))\n",
"\n",
"finally:\n",
"    # Always shut the browser down\n",
"    driver.quit()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"# Start a headless Chrome WebDriver\n",
"options = webdriver.ChromeOptions()\n",
"options.add_argument('--headless')  # no visible browser window\n",
"driver = webdriver.Chrome(options=options)\n",
"\n",
"try:\n",
"    # Open the jin10 home page\n",
"    driver.get(\"https://www.jin10.com/\")\n",
"\n",
"    # Wait up to 20 s for the target element to be present in the DOM\n",
"    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n",
"    target_element = WebDriverWait(driver, 20).until(\n",
"        EC.presence_of_element_located((By.XPATH, target_xpath))\n",
"    )\n",
"\n",
"    # Resolve the node inside the browser by XPath and overwrite its class attribute\n",
"    script = '''\n",
"    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n",
"    element.className = \"hot-filter_item is-active\";\n",
"    '''\n",
"    driver.execute_script(script, target_xpath)\n",
"\n",
"    # Wait for the element at update_xpath (its location is assumed to be known)\n",
"    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'\n",
"    update_element = WebDriverWait(driver, 20).until(\n",
"        EC.presence_of_element_located((By.XPATH, update_xpath))\n",
"    )\n",
"\n",
"    # Scroll that element into view as an attempt to trigger a refresh.\n",
"    # NOTE(review): whether scrolling actually refreshes the list is an assumption - adjust to the real page behavior.\n",
"    driver.execute_script('arguments[0].scrollIntoView(true);', update_element)\n",
"    time.sleep(5)  # fixed pause before the page is read\n",
"\n",
"    # Snapshot the rendered page\n",
"    page_source = driver.page_source\n",
"\n",
"    # Parse the HTML with BeautifulSoup\n",
"    soup = BeautifulSoup(page_source, 'lxml')\n",
"\n",
"    # Collect the <div class='jin-flash-item-container is-normal'> containers\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    # Print the text of the first 40 items\n",
"    for index, item in enumerate(items[:40], start=1):\n",
"        print(f\"Information {index}:\")\n",
"        print(item.get_text(strip=True))\n",
"\n",
"finally:\n",
"    # Always shut the browser down\n",
"    driver.quit()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"# Headless Chrome session (no browser window is opened)\n",
"options = webdriver.ChromeOptions()\n",
"options.add_argument('--headless')\n",
"driver = webdriver.Chrome(options=options)\n",
"\n",
"try:\n",
"    # Load the jin10 home page\n",
"    driver.get(\"https://www.jin10.com/\")\n",
"\n",
"    # Block (max 20 s) until the button addressed by button_xpath is clickable\n",
"    button_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n",
"    clickable = EC.element_to_be_clickable((By.XPATH, button_xpath))\n",
"    button_element = WebDriverWait(driver, 20).until(clickable)\n",
"\n",
"    # Click it\n",
"    button_element.click()\n",
"\n",
"    # Fixed 30 s pause before the page is read (tune as needed)\n",
"    time.sleep(30)\n",
"\n",
"    # Snapshot the rendered page and parse it\n",
"    page_source = driver.page_source\n",
"    soup = BeautifulSoup(page_source, 'lxml')\n",
"\n",
"    # Containers with class 'jin-flash-item-container is-normal'\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    # Show the text of the first 5 containers\n",
"    index = 0\n",
"    for item in items[:5]:\n",
"        index += 1\n",
"        print(f\"Information {index}:\")\n",
"        print(item.get_text(strip=True))\n",
"\n",
"finally:\n",
"    # Always shut the browser down\n",
"    driver.quit()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"# Headless Chrome session (no browser window is opened)\n",
"options = webdriver.ChromeOptions()\n",
"options.add_argument('--headless')\n",
"driver = webdriver.Chrome(options=options)\n",
"\n",
"try:\n",
"    # Load the jin10 home page\n",
"    driver.get(\"https://www.jin10.com/\")\n",
"\n",
"    # Block (max 20 s) until the node addressed by target_xpath exists\n",
"    target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n",
"    target_present = EC.presence_of_element_located((By.XPATH, target_xpath))\n",
"    target_element = WebDriverWait(driver, 20).until(target_present)\n",
"\n",
"    # Resolve the node inside the browser by XPath and overwrite its class attribute\n",
"    script_modify_class = '''\n",
"    var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n",
"    element.className = \"hot-filter_item is-active\";\n",
"    '''\n",
"    driver.execute_script(script_modify_class, target_xpath)\n",
"\n",
"    # Block (max 20 s) until the node addressed by update_xpath exists\n",
"    update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'\n",
"    update_present = EC.presence_of_element_located((By.XPATH, update_xpath))\n",
"    update_element = WebDriverWait(driver, 20).until(update_present)\n",
"\n",
"    # Add the 'el-loading-parent--relative' class to that node via JavaScript.\n",
"    # The script itself notes that the real refresh logic may need adapting to the page's own JavaScript.\n",
"    script_update_content = '''\n",
"    var updateElement = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n",
"    updateElement.classList.add(\"el-loading-parent--relative\");\n",
"    // 触发实际的内容更新逻辑这可能需要根据实际页面的JavaScript实现来进行调整\n",
"    '''\n",
"    driver.execute_script(script_update_content, update_xpath)\n",
"\n",
"    # Fixed pause before the page is read (tune as needed)\n",
"    time.sleep(5)\n",
"\n",
"    # Snapshot the rendered page and parse it\n",
"    page_source = driver.page_source\n",
"    soup = BeautifulSoup(page_source, 'lxml')\n",
"\n",
"    # Containers with class 'jin-flash-item-container is-normal'\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"    # Show the text of the first 5 containers\n",
"    index = 0\n",
"    for item in items[:5]:\n",
"        index += 1\n",
"        print(f\"Information {index}:\")\n",
"        print(item.get_text(strip=True))\n",
"\n",
"finally:\n",
"    # Always shut the browser down\n",
"    driver.quit()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.chrome.service import Service\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"# Start a headless Chrome WebDriver\n",
"options = Options()\n",
"options.add_argument('--headless')  # no visible browser window\n",
"options.add_argument('--disable-gpu')\n",
"# To pin a driver binary, build Service('path/to/chromedriver') and pass service=... as well\n",
"driver = webdriver.Chrome(options=options)\n",
"\n",
"try:\n",
"    # Open the jin10 home page\n",
"    driver.get(\"https://www.jin10.com/\")\n",
"\n",
"    # Wait (max 20 s) until the switch is clickable instead of sleeping a fixed\n",
"    # 5 s, so a slow page load does not make the lookup fail.\n",
"    switch_element_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]'\n",
"    switch_element = WebDriverWait(driver, 20).until(\n",
"        EC.element_to_be_clickable((By.XPATH, switch_element_xpath))\n",
"    )\n",
"    switch_element.click()\n",
"\n",
"    # Fixed pause for the new content to load (tune as needed)\n",
"    time.sleep(5)\n",
"\n",
"    # Snapshot the page after the click\n",
"    new_page_source = driver.page_source\n",
"\n",
"    # Parse the new HTML with BeautifulSoup\n",
"    soup = BeautifulSoup(new_page_source, 'lxml')\n",
"\n",
"    # Collect the <div class='jin-flash-item-container is-important'> containers\n",
"    items = soup.find_all('div', class_='jin-flash-item-container is-important')\n",
"\n",
"    # Print the text of the first 5 items, each followed by a separator line\n",
"    for index, item in enumerate(items[:5], start=1):\n",
"        print(f\"Information {index}:\")\n",
"        print(item.get_text(strip=True))\n",
"        print('-' * 50)\n",
"\n",
"finally:\n",
"    # Always shut the browser down\n",
"    driver.quit()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始定时监控...\n",
"火现货黄金日内走低1.00%现报3102.30美元/盎司。\n",
"last_matched_hash 968c113779225800e7593a2e594ad0e0\n",
"current_hash 968c113779225800e7593a2e594ad0e0\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"火现货黄金日内走低1.00%现报3102.30美元/盎司。\n",
"last_matched_hash 968c113779225800e7593a2e594ad0e0\n",
"current_hash 968c113779225800e7593a2e594ad0e0\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"火现货黄金日内走低1.00%现报3102.30美元/盎司。\n",
"last_matched_hash 968c113779225800e7593a2e594ad0e0\n",
"current_hash 968c113779225800e7593a2e594ad0e0\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"火现货黄金日内走低1.00%现报3102.30美元/盎司。\n",
"last_matched_hash 968c113779225800e7593a2e594ad0e0\n",
"current_hash 968c113779225800e7593a2e594ad0e0\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"火现货黄金日内走低1.00%现报3102.30美元/盎司。\n",
"last_matched_hash 968c113779225800e7593a2e594ad0e0\n",
"current_hash 968c113779225800e7593a2e594ad0e0\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"执行出错: Message: \n",
"Stacktrace:\n",
"\tGetHandleVerifier [0x00007FF6B5614C25+3179557]\n",
"\t(No symbol) [0x00007FF6B52788A0]\n",
"\t(No symbol) [0x00007FF6B51091CA]\n",
"\t(No symbol) [0x00007FF6B515FA67]\n",
"\t(No symbol) [0x00007FF6B515FC9C]\n",
"\t(No symbol) [0x00007FF6B51B3627]\n",
"\t(No symbol) [0x00007FF6B5187C6F]\n",
"\t(No symbol) [0x00007FF6B51B02F3]\n",
"\t(No symbol) [0x00007FF6B5187A03]\n",
"\t(No symbol) [0x00007FF6B51506D0]\n",
"\t(No symbol) [0x00007FF6B5151983]\n",
"\tGetHandleVerifier [0x00007FF6B56767CD+3579853]\n",
"\tGetHandleVerifier [0x00007FF6B568D1D2+3672530]\n",
"\tGetHandleVerifier [0x00007FF6B5682153+3627347]\n",
"\tGetHandleVerifier [0x00007FF6B53E092A+868650]\n",
"\t(No symbol) [0x00007FF6B5282FFF]\n",
"\t(No symbol) [0x00007FF6B527F4A4]\n",
"\t(No symbol) [0x00007FF6B527F646]\n",
"\t(No symbol) [0x00007FF6B526EAA9]\n",
"\tBaseThreadInitThunk [0x00007FFF903C259D+29]\n",
"\tRtlUserThreadStart [0x00007FFF90F4AF38+40]\n",
"\n",
"热纽约期银暴跌6%金十数据4月3日讯纽约期银日内暴跌6.00%现报32.57美元/盎司。现货白银跌4.43%报32.40美元/盎司。\n",
"邮件发送成功\n",
"热纽约期银暴跌6%金十数据4月3日讯纽约期银日内暴跌6.00%现报32.57美元/盎司。现货白银跌4.43%报32.40美元/盎司。\n",
"last_matched_hash c2e035a6e60f47a267833722604e1a24\n",
"current_hash c2e035a6e60f47a267833722604e1a24\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热纽约期银暴跌6%金十数据4月3日讯纽约期银日内暴跌6.00%现报32.57美元/盎司。现货白银跌4.43%报32.40美元/盎司。\n",
"last_matched_hash c2e035a6e60f47a267833722604e1a24\n",
"current_hash c2e035a6e60f47a267833722604e1a24\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"热纽约期银暴跌6%金十数据4月3日讯纽约期银日内暴跌6.00%现报32.57美元/盎司。现货白银跌4.43%报32.40美元/盎司。\n",
"last_matched_hash c2e035a6e60f47a267833722604e1a24\n",
"current_hash c2e035a6e60f47a267833722604e1a24\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"热纽约期银暴跌6%金十数据4月3日讯纽约期银日内暴跌6.00%现报32.57美元/盎司。现货白银跌4.43%报32.40美元/盎司。\n",
"last_matched_hash c2e035a6e60f47a267833722604e1a24\n",
"current_hash c2e035a6e60f47a267833722604e1a24\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"热纽约期银暴跌6%金十数据4月3日讯纽约期银日内暴跌6.00%现报32.57美元/盎司。现货白银跌4.43%报32.40美元/盎司。\n",
"last_matched_hash c2e035a6e60f47a267833722604e1a24\n",
"current_hash c2e035a6e60f47a267833722604e1a24\n",
"\n",
"共找到 2 条匹配'默认火热沸爆'的信息\n",
"火黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"邮件发送成功\n",
"火黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash b8daf18e3735013ff4c9c47aeca33fa5\n",
"current_hash b8daf18e3735013ff4c9c47aeca33fa5\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"邮件发送成功\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n",
"热黄金快速下破3090美元金十数据4月3日讯现货黄金15分钟内快速下破两道关口最低至3089.57美元/盎司日内跌1.41%。\n",
"last_matched_hash e00714f0ae93195b10a76db627c95bbe\n",
"current_hash e00714f0ae93195b10a76db627c95bbe\n",
"\n",
"共找到 1 条匹配'默认火热沸爆'的信息\n"
]
}
],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import smtplib\n",
"from email.mime.text import MIMEText\n",
"from email.mime.multipart import MIMEMultipart\n",
"import hashlib\n",
"import schedule\n",
"\n",
"# Mail account settings. The SMTP authorization code comes from the environment\n",
"# so that the secret is not stored in the notebook.\n",
"# NOTE(review): a code that was ever committed in plain text should be revoked.\n",
"import os\n",
"\n",
"from_email = os.environ.get(\"JIN10_FROM_EMAIL\", \"240884432@qq.com\")\n",
"from_password = os.environ[\"JIN10_SMTP_PASSWORD\"]  # KeyError if the variable is unset\n",
"to_email = os.environ.get(\"JIN10_TO_EMAIL\", \"240884432@qq.com\")\n",
"\n",
"# Send an HTML notification mail through the QQ SMTP server\n",
"def send_email(content):\n",
"    \"\"\"Send `content` as an HTML mail with the fixed subject '金十数据更新通知'.\n",
"\n",
"    Sender, password and recipient come from the module-level from_email,\n",
"    from_password and to_email. Any failure is printed instead of raised.\n",
"    \"\"\"\n",
"    msg = MIMEMultipart('alternative')\n",
"    msg['Subject'] = '金十数据更新通知'\n",
"    msg['From'] = from_email\n",
"    msg['To'] = to_email\n",
"    msg.attach(MIMEText(content, 'html'))\n",
"\n",
"    try:\n",
"        # The context manager closes the connection even when login or sendmail\n",
"        # raises; the timeout bounds a stalled connection.\n",
"        with smtplib.SMTP_SSL('smtp.qq.com', 465, timeout=30) as server:\n",
"            server.login(from_email, from_password)\n",
"            server.sendmail(from_email, to_email, msg.as_string())\n",
"        print(\"邮件发送成功\")\n",
"    except Exception as e:\n",
"        print(f\"邮件发送失败: {e}\")\n",
"\n",
"# MD5 hex digest of the text handled on the previous run (None until the first run)\n",
"last_matched_hash = None\n",
"\n",
"# Main scraping job\n",
"def fetch_news():\n",
"    \"\"\"Load jin10.com in headless Chrome and mail the tracked text when it changes.\n",
"\n",
"    Among the first 40 'jin-flash-item-container is-normal' containers, the\n",
"    ones whose text contains '默认火热沸爆' are counted; the text after that\n",
"    marker is hashed and compared with the module-level last_matched_hash.\n",
"    A mail is sent only when a previous hash exists and differs, so the first\n",
"    run just records a baseline. Every exception is printed, not raised, and\n",
"    the browser is always closed.\n",
"    \"\"\"\n",
"    global last_matched_hash\n",
"    from html import escape  # stdlib; local import keeps the cell self-contained\n",
"\n",
"    options = webdriver.ChromeOptions()\n",
"    options.add_argument('--headless')\n",
"    driver = webdriver.Chrome(options=options)\n",
"\n",
"    try:\n",
"        driver.get(\"https://www.jin10.com/\")\n",
"\n",
"        # Wait up to 20 s for the target element to be present in the DOM\n",
"        target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n",
"        WebDriverWait(driver, 20).until(\n",
"            EC.presence_of_element_located((By.XPATH, target_xpath))\n",
"        )\n",
"\n",
"        # Resolve the node inside the browser by XPath and overwrite its class attribute\n",
"        script = '''\n",
"        var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n",
"        element.className = \"hot-filter_item is-active\";\n",
"        '''\n",
"        driver.execute_script(script, target_xpath)\n",
"\n",
"        # Scroll the element at update_xpath into view, then pause 5 s\n",
"        update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'\n",
"        update_element = WebDriverWait(driver, 20).until(\n",
"            EC.presence_of_element_located((By.XPATH, update_xpath))\n",
"        )\n",
"        driver.execute_script('arguments[0].scrollIntoView(true);', update_element)\n",
"        time.sleep(5)\n",
"\n",
"        page_source = driver.page_source\n",
"        soup = BeautifulSoup(page_source, 'lxml')\n",
"        items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n",
"\n",
"        matched_count = 0\n",
"        modified_text = ''\n",
"\n",
"        # NOTE(review): modified_text is overwritten by every match, so the text\n",
"        # compared below is the LAST match among the first 40 items - confirm\n",
"        # that this is the item that should drive notifications.\n",
"        for item in items[:40]:\n",
"            text_content = item.get_text(strip=True)\n",
"            if '默认火热沸爆' not in text_content:\n",
"                continue\n",
"\n",
"            matched_count += 1\n",
"            # Text that follows the first occurrence of the marker\n",
"            tail = text_content.split('默认火热沸爆', 1)[1].strip()\n",
"            if tail:\n",
"                modified_text = tail\n",
"\n",
"        print(modified_text)\n",
"        if modified_text:\n",
"            current_hash = hashlib.md5(modified_text.encode()).hexdigest()\n",
"            if last_matched_hash and current_hash != last_matched_hash:\n",
"                # The text is attached as an HTML mail part, so escape it first\n",
"                send_email(escape(modified_text))\n",
"                print(modified_text)\n",
"            last_matched_hash = current_hash\n",
"            print(\"last_matched_hash\", last_matched_hash)\n",
"            print(\"current_hash\", current_hash)\n",
"\n",
"        print(f\"\\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息\")\n",
"\n",
"    except Exception as e:\n",
"        print(f\"执行出错: {e}\")\n",
"    finally:\n",
"        driver.quit()\n",
"\n",
"# Drop jobs left on schedule's shared default scheduler by earlier runs of this\n",
"# or other cells, then register the job to run once per minute.\n",
"schedule.clear()\n",
"schedule.every(1).minutes.do(fetch_news)\n",
"\n",
"print(\"开始定时监控...\")\n",
"# Poll for due jobs once per second; this loop blocks the kernel until interrupted\n",
"while True:\n",
"    schedule.run_pending()\n",
"    time.sleep(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}